1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.FileNotFoundException;
23  import java.io.IOException;
24  import java.io.InterruptedIOException;
25  import java.io.UnsupportedEncodingException;
26  import java.lang.reflect.Constructor;
27  import java.text.ParseException;
28  import java.util.AbstractList;
29  import java.util.ArrayList;
30  import java.util.Arrays;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.HashMap;
34  import java.util.Iterator;
35  import java.util.List;
36  import java.util.Map;
37  import java.util.NavigableMap;
38  import java.util.NavigableSet;
39  import java.util.RandomAccess;
40  import java.util.Set;
41  import java.util.TreeMap;
42  import java.util.UUID;
43  import java.util.concurrent.Callable;
44  import java.util.concurrent.CompletionService;
45  import java.util.concurrent.ConcurrentHashMap;
46  import java.util.concurrent.ConcurrentSkipListMap;
47  import java.util.concurrent.CountDownLatch;
48  import java.util.concurrent.ExecutionException;
49  import java.util.concurrent.ExecutorCompletionService;
50  import java.util.concurrent.ExecutorService;
51  import java.util.concurrent.Executors;
52  import java.util.concurrent.Future;
53  import java.util.concurrent.FutureTask;
54  import java.util.concurrent.ThreadFactory;
55  import java.util.concurrent.ThreadPoolExecutor;
56  import java.util.concurrent.TimeUnit;
57  import java.util.concurrent.TimeoutException;
58  import java.util.concurrent.atomic.AtomicBoolean;
59  import java.util.concurrent.atomic.AtomicInteger;
60  import java.util.concurrent.atomic.AtomicLong;
61  import java.util.concurrent.locks.Lock;
62  import java.util.concurrent.locks.ReentrantReadWriteLock;
63  
64  import org.apache.commons.logging.Log;
65  import org.apache.commons.logging.LogFactory;
66  import org.apache.hadoop.conf.Configuration;
67  import org.apache.hadoop.fs.FileStatus;
68  import org.apache.hadoop.fs.FileSystem;
69  import org.apache.hadoop.fs.Path;
70  import org.apache.hadoop.hbase.Cell;
71  import org.apache.hadoop.hbase.CellScanner;
72  import org.apache.hadoop.hbase.CellUtil;
73  import org.apache.hadoop.hbase.CompoundConfiguration;
74  import org.apache.hadoop.hbase.DoNotRetryIOException;
75  import org.apache.hadoop.hbase.DroppedSnapshotException;
76  import org.apache.hadoop.hbase.HBaseConfiguration;
77  import org.apache.hadoop.hbase.HColumnDescriptor;
78  import org.apache.hadoop.hbase.HConstants;
79  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
80  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
81  import org.apache.hadoop.hbase.HRegionInfo;
82  import org.apache.hadoop.hbase.HTableDescriptor;
83  import org.apache.hadoop.hbase.KeyValue;
84  import org.apache.hadoop.hbase.KeyValueUtil;
85  import org.apache.hadoop.hbase.NamespaceDescriptor;
86  import org.apache.hadoop.hbase.NotServingRegionException;
87  import org.apache.hadoop.hbase.RegionTooBusyException;
88  import org.apache.hadoop.hbase.TableName;
89  import org.apache.hadoop.hbase.Tag;
90  import org.apache.hadoop.hbase.TagType;
91  import org.apache.hadoop.hbase.UnknownScannerException;
92  import org.apache.hadoop.hbase.backup.HFileArchiver;
93  import org.apache.hadoop.hbase.classification.InterfaceAudience;
94  import org.apache.hadoop.hbase.client.Append;
95  import org.apache.hadoop.hbase.client.Delete;
96  import org.apache.hadoop.hbase.client.Durability;
97  import org.apache.hadoop.hbase.client.Get;
98  import org.apache.hadoop.hbase.client.Increment;
99  import org.apache.hadoop.hbase.client.IsolationLevel;
100 import org.apache.hadoop.hbase.client.Mutation;
101 import org.apache.hadoop.hbase.client.Put;
102 import org.apache.hadoop.hbase.client.Result;
103 import org.apache.hadoop.hbase.client.RowMutations;
104 import org.apache.hadoop.hbase.client.Scan;
105 import org.apache.hadoop.hbase.coprocessor.RegionObserver;
106 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
107 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
108 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
109 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
110 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
111 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
112 import org.apache.hadoop.hbase.filter.FilterWrapper;
113 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
114 import org.apache.hadoop.hbase.io.HeapSize;
115 import org.apache.hadoop.hbase.io.TimeRange;
116 import org.apache.hadoop.hbase.io.hfile.BlockCache;
117 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
118 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
119 import org.apache.hadoop.hbase.ipc.RpcCallContext;
120 import org.apache.hadoop.hbase.ipc.RpcServer;
121 import org.apache.hadoop.hbase.master.AssignmentManager;
122 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
123 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
124 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
125 import org.apache.hadoop.hbase.protobuf.ResponseConverter;
126 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
127 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
128 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
129 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
130 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
131 import org.apache.hadoop.hbase.regionserver.MultiVersionConsistencyControl.WriteEntry;
132 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
133 import org.apache.hadoop.hbase.regionserver.compactions.CompactionThroughputController;
134 import org.apache.hadoop.hbase.regionserver.compactions.NoLimitCompactionThroughputController;
135 import org.apache.hadoop.hbase.regionserver.wal.HLog;
136 import org.apache.hadoop.hbase.regionserver.wal.HLogFactory;
137 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
138 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
139 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter.MutationReplay;
140 import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
141 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
142 import org.apache.hadoop.hbase.security.User;
143 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
144 import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
145 import org.apache.hadoop.hbase.util.Bytes;
146 import org.apache.hadoop.hbase.util.CancelableProgressable;
147 import org.apache.hadoop.hbase.util.ClassSize;
148 import org.apache.hadoop.hbase.util.CompressionTest;
149 import org.apache.hadoop.hbase.util.EncryptionTest;
150 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
151 import org.apache.hadoop.hbase.util.FSTableDescriptors;
152 import org.apache.hadoop.hbase.util.FSUtils;
153 import org.apache.hadoop.hbase.util.HashedBytes;
154 import org.apache.hadoop.hbase.util.Pair;
155 import org.apache.hadoop.hbase.util.Threads;
156 import org.apache.hadoop.io.MultipleIOException;
157 import org.apache.hadoop.util.StringUtils;
158 import org.cliffc.high_scale_lib.Counter;
159 import org.cloudera.htrace.Trace;
160 import org.cloudera.htrace.TraceScope;
161 
162 import com.google.common.annotations.VisibleForTesting;
163 import com.google.common.base.Preconditions;
164 import com.google.common.collect.Lists;
165 import com.google.common.collect.Maps;
166 import com.google.common.io.Closeables;
167 import com.google.protobuf.Descriptors;
168 import com.google.protobuf.Message;
169 import com.google.protobuf.RpcCallback;
170 import com.google.protobuf.RpcController;
171 import com.google.protobuf.Service;
172 
173 /**
174  * HRegion stores data for a certain region of a table.  It stores all columns
175  * for each row. A given table consists of one or more HRegions.
176  *
177  * <p>We maintain multiple HStores for a single HRegion.
178  *
179  * <p>A Store is a set of rows with some column data; together,
180  * they make up all the data for the rows.
181  *
182  * <p>Each HRegion has a 'startKey' and 'endKey'.
183  * <p>The first is inclusive, the second is exclusive (except for
184  * the final region). The endKey of region 0 is the same as the
185  * startKey for region 1 (if it exists). The startKey for the
186  * first region is null. The endKey for the final region is null.
187  *
188  * <p>Locking at the HRegion level serves only one purpose: preventing the
189  * region from being closed (and consequently split) while other operations
190  * are ongoing. Each row level operation obtains both a row lock and a region
191  * read lock for the duration of the operation. While a scanner is being
192  * constructed, getScanner holds a read lock. If the scanner is successfully
193  * constructed, it holds a read lock until it is closed. A close takes out a
194  * write lock and consequently will block for ongoing operations and will block
195  * new operations from starting while the close is in progress.
196  *
197  * <p>An HRegion is defined by its table and its key extent.
198  *
199  * <p>It consists of at least one Store.  The number of Stores should be
200  * configurable, so that data which is accessed together is stored in the same
201  * Store.  Right now, we approximate that by building a single Store for
202  * each column family.  (This config info will be communicated via the
203  * tabledesc.)
204  *
205  * <p>The HTableDescriptor contains metainfo about the HRegion's table.
206  * regionName is a unique identifier for this HRegion. [startKey, endKey)
207  * defines the keyspace for this HRegion.
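 * <p>Illustrative caller sketch (hypothetical names; a region would normally be obtained
 * via {@link HRegion#openHRegion}):
 * <pre>
 *   HRegion region = HRegion.openHRegion(conf, fs, rootDir, hri, htd, wal);
 *   try {
 *     region.put(new Put(row).add(family, qualifier, value));
 *     Result result = region.get(new Get(row));
 *   } finally {
 *     region.close();
 *   }
 * </pre>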
208  */
209 @InterfaceAudience.Private
210 public class HRegion implements HeapSize { // , Writable{
211   public static final Log LOG = LogFactory.getLog(HRegion.class);
212 
213   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
214       "hbase.hregion.scan.loadColumnFamiliesOnDemand";
215 
216   /**
217    * This is the global default value for durability. All tables/mutations not
218    * defining a durability or using USE_DEFAULT will default to this value.
219    */
220   private static final Durability DEFAULT_DURABLITY = Durability.SYNC_WAL;
221 
222   final AtomicBoolean closed = new AtomicBoolean(false);
223   /* Closing can take some time; use the closing flag if there is stuff we don't
224   * want to do while in closing state; e.g. offering this region up to the
225    * master as a region to close if the carrying regionserver is overloaded.
226    * Once set, it is never cleared.
227    */
228   final AtomicBoolean closing = new AtomicBoolean(false);
229 
230   protected volatile long completeSequenceId = -1L;
231 
232   /**
233    * Region level sequence Id. It is used for appending WALEdits in HLog. Its default value is -1,
234    * as a marker that the region hasn't opened yet. Once it is opened, it is set to
235    * {@link #openSeqNum}.
236    */
237   private final AtomicLong sequenceId = new AtomicLong(-1L);
238 
239   /**
240    * Operation enum is used in {@link HRegion#startRegionOperation} to provide operation context for
241    * startRegionOperation to possibly invoke different checks before any region operations. Not all
242   * operations have to be defined here. It's only needed when a special check is needed in
243   * startRegionOperation.
244    */
245   public enum Operation {
246     ANY, GET, PUT, DELETE, SCAN, APPEND, INCREMENT, SPLIT_REGION, MERGE_REGION, BATCH_MUTATE,
247     REPLAY_BATCH_MUTATE, COMPACT_REGION
248   }
249 
250   //////////////////////////////////////////////////////////////////////////////
251   // Members
252   //////////////////////////////////////////////////////////////////////////////
253 
254   // map from a locked row to the context for that lock including:
255   // - CountDownLatch for threads waiting on that row
256   // - the thread that owns the lock (allow reentrancy)
257   // - reference count of (reentrant) locks held by the thread
258   // - the row itself
259   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
260       new ConcurrentHashMap<HashedBytes, RowLockContext>();
261 
262   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
263       Bytes.BYTES_RAWCOMPARATOR);
264 
265   // TODO: account for each registered handler in HeapSize computation
266   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
267 
268   public final AtomicLong memstoreSize = new AtomicLong(0);
269 
270   // Debug possible data loss due to WAL off
271   final Counter numMutationsWithoutWAL = new Counter();
272   final Counter dataInMemoryWithoutWAL = new Counter();
273 
274   // Debug why CAS operations are taking a while.
275   final Counter checkAndMutateChecksPassed = new Counter();
276   final Counter checkAndMutateChecksFailed = new Counter();
277 
278   // Number of requests
279   final Counter readRequestsCount = new Counter();
280   final Counter writeRequestsCount = new Counter();
281 
282   // Number of requests blocked by memstore size.
283   private final Counter blockedRequestsCount = new Counter();
284 
285   /**
286    * @return the number of blocked requests.
287    */
288   public long getBlockedRequestsCount() {
289     return this.blockedRequestsCount.get();
290   }
291 
292   // Compaction counters
293   final AtomicLong compactionsFinished = new AtomicLong(0L);
294   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
295   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
296 
297 
298   private final HLog log;
299   private final HRegionFileSystem fs;
300   protected final Configuration conf;
301   private final Configuration baseConf;
302   private final KeyValue.KVComparator comparator;
303   private final int rowLockWaitDuration;
304   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
305 
306   // The internal wait duration to acquire a lock before read/update
307   // from the region. It is not per row. The purpose of this wait time
308   // is to avoid waiting a long time while the region is busy, so that
309   // we can release the IPC handler soon enough to improve the
310   // availability of the region server. It can be adjusted by
311   // tuning configuration "hbase.busy.wait.duration".
312   final long busyWaitDuration;
313   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
314 
315   // If updating multiple rows in one call, wait longer,
316   // i.e. waiting for busyWaitDuration * # of rows. However,
317   // we can limit the max multiplier.
318   final int maxBusyWaitMultiplier;
319 
320   // Max busy wait duration. There is no point in waiting longer than the RPC
321   // purge timeout, at which point the RPC call will be terminated by the RPC engine.
322   final long maxBusyWaitDuration;
323 
324   // negative number indicates infinite timeout
325   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
326   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
327 
328   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
329 
330   /**
331    * The sequence ID that was encountered when this region was opened.
332    */
333   private long openSeqNum = HConstants.NO_SEQNUM;
334 
335   /**
336    * The default setting for whether to enable on-demand CF loading for
337    * scan requests to this region. Requests can override it.
338    */
339   private boolean isLoadingCfsOnDemandDefault = false;
340 
341   private final AtomicInteger majorInProgress = new AtomicInteger(0);
342   private final AtomicInteger minorInProgress = new AtomicInteger(0);
343 
344   //
345   // Context: During replay we want to ensure that we do not lose any data. So, we
346   // have to be conservative in how we replay logs. For each store, we calculate
347   // the maxSeqId up to which the store was flushed. And, skip the edits which
348   // are equal to or lower than maxSeqId for each store.
349   // The following map is populated when opening the region
350   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
351 
352   /**
353    * Config setting for whether to allow writes while a region is in recovering state.
354    */
355   private boolean disallowWritesInRecovering = false;
356 
357   // when a region is in recovering state, it can only accept writes, not reads
358   private volatile boolean isRecovering = false;
359 
360   /**
361    * @return The smallest mvcc readPoint across all the scanners in this
362    * region. Writes older than this readPoint are included in every
363    * read operation.
364    */
365   public long getSmallestReadPoint() {
366     long minimumReadPoint;
367     // We need to ensure that while we are calculating the smallestReadPoint
368     // no new RegionScanners can grab a readPoint that we are unaware of.
369     // We achieve this by synchronizing on the scannerReadPoints object.
370     synchronized(scannerReadPoints) {
371       minimumReadPoint = mvcc.memstoreReadPoint();
372 
373       for (Long readPoint: this.scannerReadPoints.values()) {
374         if (readPoint < minimumReadPoint) {
375           minimumReadPoint = readPoint;
376         }
377       }
378     }
379     return minimumReadPoint;
380   }
381   /*
382    * Data structure of write state flags used to coordinate flushes,
383    * compactions and closes.
384    */
385   static class WriteState {
386     // Set while a memstore flush is happening.
387     volatile boolean flushing = false;
388     // Set when a flush has been requested.
389     volatile boolean flushRequested = false;
390     // Number of compactions running.
391     volatile int compacting = 0;
392     // Cleared (set to false) in close. Once cleared, we cannot compact or flush again.
393     volatile boolean writesEnabled = true;
394     // Set if region is read-only
395     volatile boolean readOnly = false;
396 
397     /**
398      * Set flags that make this region read-only.
399      *
400      * @param onOff flip value for region r/o setting
401      */
402     synchronized void setReadOnly(final boolean onOff) {
403       this.writesEnabled = !onOff;
404       this.readOnly = onOff;
405     }
406 
407     boolean isReadOnly() {
408       return this.readOnly;
409     }
410 
411     boolean isFlushRequested() {
412       return this.flushRequested;
413     }
414 
415     static final long HEAP_SIZE = ClassSize.align(
416         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
417   }
418 
419   /**
420    * Objects from this class are created when flushing to describe all the different states
421    * that method ends up in. The Result enum describes those states. The sequence id should only
422    * be specified if the flush was successful, and the failure message should only be specified
423    * if it didn't flush.
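   * <p>A minimal, hypothetical caller sketch (assumes a flush call on the region that
   * returns a FlushResult):
   * <pre>
   *   FlushResult fr = region.flushcache();
   *   if (fr.isFlushSucceeded() && fr.isCompactionNeeded()) {
   *     // ask for a compaction of this region
   *   }
   * </pre>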
424    */
425   public static class FlushResult {
426     enum Result {
427       FLUSHED_NO_COMPACTION_NEEDED,
428       FLUSHED_COMPACTION_NEEDED,
429       // Special case where a flush didn't run because there's nothing in the memstores. Used when
430       // bulk loading to know when we can still load even if a flush didn't happen.
431       CANNOT_FLUSH_MEMSTORE_EMPTY,
432       CANNOT_FLUSH
433       // Be careful adding more to this enum, look at the below methods to make sure
434     }
435 
436     final Result result;
437     final String failureReason;
438     final long flushSequenceId;
439 
440     /**
441     * Convenience constructor to use when the flush is successful; the failure message is set to
442      * null.
443      * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
444      * @param flushSequenceId Generated sequence id that comes right after the edits in the
445      *                        memstores.
446      */
447     FlushResult(Result result, long flushSequenceId) {
448       this(result, flushSequenceId, null);
449       assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
450           .FLUSHED_COMPACTION_NEEDED;
451     }
452 
453     /**
454      * Convenience constructor to use when we cannot flush.
455      * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
456      * @param failureReason Reason why we couldn't flush.
457      */
458     FlushResult(Result result, String failureReason) {
459       this(result, -1, failureReason);
460       assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
461     }
462 
463     /**
464      * Constructor with all the parameters.
465      * @param result Any of the Result values.
466      * @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
467      * @param failureReason Reason why we couldn't flush, or null.
468      */
469     FlushResult(Result result, long flushSequenceId, String failureReason) {
470       this.result = result;
471       this.flushSequenceId = flushSequenceId;
472       this.failureReason = failureReason;
473     }
474 
475     /**
476      * Convenience method, the equivalent of checking if result is
477      * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
478      * @return true if the memstores were flushed, else false.
479      */
480     public boolean isFlushSucceeded() {
481       return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
482           .FLUSHED_COMPACTION_NEEDED;
483     }
484 
485     /**
486      * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
487      * @return True if the flush requested a compaction, else false (which does not even imply that a flush ran).
488      */
489     public boolean isCompactionNeeded() {
490       return result == Result.FLUSHED_COMPACTION_NEEDED;
491     }
492   }
493 
494   final WriteState writestate = new WriteState();
495 
496   long memstoreFlushSize;
497   final long timestampSlop;
498   final long rowProcessorTimeout;
499   private volatile long lastFlushTime;
500   final RegionServerServices rsServices;
501   private RegionServerAccounting rsAccounting;
502   private List<Pair<Long, Long>> recentFlushes = new ArrayList<Pair<Long,Long>>();
503   private long flushCheckInterval;
504   // flushPerChanges is used to prevent accumulating too many changes in the memstore
505   private long flushPerChanges;
506   private long blockingMemStoreSize;
507   final long threadWakeFrequency;
508   // Used to guard closes
509   final ReentrantReadWriteLock lock =
510     new ReentrantReadWriteLock();
511 
512   // Stop updates lock
513   private final ReentrantReadWriteLock updatesLock =
514     new ReentrantReadWriteLock();
515   private boolean splitRequest;
516   private byte[] explicitSplitPoint = null;
517 
518   private final MultiVersionConsistencyControl mvcc =
519       new MultiVersionConsistencyControl();
520 
521   // Coprocessor host
522   private RegionCoprocessorHost coprocessorHost;
523 
524   private HTableDescriptor htableDescriptor = null;
525   private RegionSplitPolicy splitPolicy;
526 
527   private final MetricsRegion metricsRegion;
528   private final MetricsRegionWrapperImpl metricsRegionWrapper;
529   private final Durability durability;
530   private final boolean regionStatsEnabled;
531 
532   /**
533    * HRegion constructor. This constructor should only be used for testing and
534    * extensions.  Instances of HRegion should be instantiated with the
535    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
536    *
537    * @param tableDir qualified path of directory where region should be located,
538    * usually the table directory.
539    * @param log The HLog is the outbound log for any updates to the HRegion
540    * (There's a single HLog for all the HRegions on a single HRegionServer.)
541    * The log file is a logfile from the previous execution that's
542    * custom-computed for this HRegion. The HRegionServer computes and sorts the
543    * appropriate log info for this HRegion. If there is a previous log file
544    * (implying that the HRegion has been written-to before), then read it from
545    * the supplied path.
546    * @param fs is the filesystem.
547    * @param confParam is global configuration settings.
548    * @param regionInfo - HRegionInfo that describes the region
550    * @param htd the table descriptor
551    * @param rsServices reference to {@link RegionServerServices} or null
552    */
553   @Deprecated
554   public HRegion(final Path tableDir, final HLog log, final FileSystem fs,
555       final Configuration confParam, final HRegionInfo regionInfo,
556       final HTableDescriptor htd, final RegionServerServices rsServices) {
557     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
558       log, confParam, htd, rsServices);
559   }
560 
561   /**
562    * HRegion constructor. This constructor should only be used for testing and
563    * extensions.  Instances of HRegion should be instantiated with the
564    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
565    *
566    * @param fs is the filesystem.
567    * @param log The HLog is the outbound log for any updates to the HRegion
568    * (There's a single HLog for all the HRegions on a single HRegionServer.)
569    * The log file is a logfile from the previous execution that's
570    * custom-computed for this HRegion. The HRegionServer computes and sorts the
571    * appropriate log info for this HRegion. If there is a previous log file
572    * (implying that the HRegion has been written-to before), then read it from
573    * the supplied path.
574    * @param confParam is global configuration settings.
575    * @param htd the table descriptor
576    * @param rsServices reference to {@link RegionServerServices} or null
577    */
578   public HRegion(final HRegionFileSystem fs, final HLog log, final Configuration confParam,
579       final HTableDescriptor htd, final RegionServerServices rsServices) {
580     if (htd == null) {
581       throw new IllegalArgumentException("Need table descriptor");
582     }
583 
584     if (confParam instanceof CompoundConfiguration) {
585       throw new IllegalArgumentException("Need original base configuration");
586     }
587 
588     this.comparator = fs.getRegionInfo().getComparator();
589     this.log = log;
590     this.fs = fs;
591 
592     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
593     this.baseConf = confParam;
594     this.conf = new CompoundConfiguration()
595       .add(confParam)
596       .addStringMap(htd.getConfiguration())
597       .addWritableMap(htd.getValues());
598     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
599         DEFAULT_CACHE_FLUSH_INTERVAL);
600     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
601     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
602       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
603           + MAX_FLUSH_PER_CHANGES);
604     }
605 
606     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
607                     DEFAULT_ROWLOCK_WAIT_DURATION);
608 
609     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
610     this.htableDescriptor = htd;
611     this.rsServices = rsServices;
612     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
613     setHTableSpecificConf();
614     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
615 
616     this.busyWaitDuration = conf.getLong(
617       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
618     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
619     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
620       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
621         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
622         + maxBusyWaitMultiplier + "). Their product should be positive");
623     }
624     this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
625       conf.getLong("ipc.client.call.purge.timeout", 2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT));
626 
627     /*
628      * timestamp.slop provides a server-side constraint on the timestamp. This
629      * assumes that you base your TS around currentTimeMillis(). In this case,
630      * throw an error to the user if the user-specified TS is newer than now +
631      * slop. LATEST_TIMESTAMP == don't use this functionality
632      */
633     this.timestampSlop = conf.getLong(
634         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
635         HConstants.LATEST_TIMESTAMP);
636 
637     /**
638      * Timeout for the process time in processRowsWithLocks().
639      * Use -1 to switch off time bound.
640      */
641     this.rowProcessorTimeout = conf.getLong(
642         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
643     this.durability = htd.getDurability() == Durability.USE_DEFAULT
644         ? DEFAULT_DURABLITY
645         : htd.getDurability();
646     if (rsServices != null) {
647       this.rsAccounting = this.rsServices.getRegionServerAccounting();
648       // don't initialize coprocessors if not running within a regionserver
649       // TODO: revisit if coprocessors should load in other cases
650       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
651       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
652       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
653 
654       Map<String, HRegion> recoveringRegions = rsServices.getRecoveringRegions();
655       String encodedName = getRegionInfo().getEncodedName();
656       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
657         this.isRecovering = true;
658         recoveringRegions.put(encodedName, this);
659       }
660     } else {
661       this.metricsRegionWrapper = null;
662       this.metricsRegion = null;
663     }
664     if (LOG.isDebugEnabled()) {
665       // Write out region name as string and its encoded name.
666       LOG.debug("Instantiated " + this);
667     }
668 
669     // by default, we allow writes against a region when it's in recovering
670     this.disallowWritesInRecovering =
671         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
672           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
673 
674   // disable stats tracking for system tables, but check the config for everything else
675     this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
676       NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ? false :
677         conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
678           HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
679   }
680 
681   void setHTableSpecificConf() {
682     if (this.htableDescriptor == null) return;
683     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
684 
685     if (flushSize <= 0) {
686       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
687         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
688     }
689     this.memstoreFlushSize = flushSize;
690     this.blockingMemStoreSize = this.memstoreFlushSize *
691         conf.getLong(HConstants.HREGION_MEMSTORE_BLOCK_MULTIPLIER,
692                 HConstants.DEFAULT_HREGION_MEMSTORE_BLOCK_MULTIPLIER);
693   }
694 
695   /**
696    * Initialize this region.
697    * Used only by tests and SplitTransaction to reopen the region.
698    * You should use createHRegion() or openHRegion() instead.
699    * @return What the next sequence (edit) id should be.
700    * @throws IOException e
701    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
702    */
703   @Deprecated
704   public long initialize() throws IOException {
705     return initialize(null);
706   }
707 
708   /**
709    * Initialize this region.
710    *
711    * @param reporter Progress reporter, tickled every so often if initialization is taking a while.
712    * @return What the next sequence (edit) id should be.
713    * @throws IOException e
714    */
715   private long initialize(final CancelableProgressable reporter) throws IOException {
716     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
717     long nextSeqId = -1;
718     try {
719       nextSeqId = initializeRegionInternals(reporter, status);
720       return nextSeqId;
721     } finally {
722       // nextSeqid will be -1 if the initialization fails.
723       // Otherwise it will be at least 0.
724       if (nextSeqId == -1) {
725         status
726             .abort("Exception during region " + this.getRegionNameAsString() + " initialization.");
727       }
728     }
729   }
730 
731   private long initializeRegionInternals(final CancelableProgressable reporter,
732       final MonitoredTask status) throws IOException, UnsupportedEncodingException {
733     if (coprocessorHost != null) {
734       status.setStatus("Running coprocessor pre-open hook");
735       coprocessorHost.preOpen();
736     }
737 
738     // Write HRI to a file in case we need to recover hbase:meta
739     status.setStatus("Writing region info on filesystem");
740     fs.checkRegionInfoOnFilesystem();
741 
742     // Remove temporary data left over from old regions
743     status.setStatus("Cleaning up temporary data from old regions");
744     fs.cleanupTempDir();
745 
746     // Initialize all the HStores
747     status.setStatus("Initializing all the Stores");
748     long maxSeqId = initializeRegionStores(reporter, status);
749 
750     status.setStatus("Cleaning up detritus from prior splits");
751     // Get rid of any splits or merges that were lost in-progress.  Clean out
752     // these directories here on open.  We may be opening a region that was
753     // being split but we crashed in the middle of it all.
754     fs.cleanupAnySplitDetritus();
755     fs.cleanupMergesDir();
756 
757     this.writestate.setReadOnly(this.htableDescriptor.isReadOnly());
758     this.writestate.flushRequested = false;
759     this.writestate.compacting = 0;
760 
761     // Initialize split policy
762     this.splitPolicy = RegionSplitPolicy.create(this, conf);
763 
764     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
765     // Use maximum of log sequenceid or that which was found in stores
766     // (particularly if no recovered edits, seqid will be -1).
767     long nextSeqid = maxSeqId + 1;
768     if (this.isRecovering) {
769       // In distributedLogReplay mode, we don't know the last change sequence number because region
770       // is opened before recovery completes. So we add a safety bumper to avoid the new sequence
771       // numbers overlapping already used sequence numbers.
772       nextSeqid += this.flushPerChanges + 10000000; // add another extra 10 million
773     }
774     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
775       "; next sequenceid=" + nextSeqid);
776 
777     // A region can be reopened if it failed a split; reset flags
778     this.closing.set(false);
779     this.closed.set(false);
780 
781     if (coprocessorHost != null) {
782       status.setStatus("Running coprocessor post-open hooks");
783       coprocessorHost.postOpen();
784     }
785 
786     status.markComplete("Region opened successfully");
787     return nextSeqid;
788   }
789 
790   private long initializeRegionStores(final CancelableProgressable reporter, MonitoredTask status)
791       throws IOException, UnsupportedEncodingException {
792     // Load in all the HStores.
793 
794     long maxSeqId = -1;
795     // initialized to -1 so that we pick up MemstoreTS from column families
796     long maxMemstoreTS = -1;
797 
798     if (!htableDescriptor.getFamilies().isEmpty()) {
799       // initialize the thread pool for opening stores in parallel.
800       ThreadPoolExecutor storeOpenerThreadPool =
801         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
802       CompletionService<HStore> completionService =
803         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
804 
805       // initialize each store in parallel
806       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
807         status.setStatus("Instantiating store for column family " + family);
808         completionService.submit(new Callable<HStore>() {
809           @Override
810           public HStore call() throws IOException {
811             return instantiateHStore(family);
812           }
813         });
814       }
815       boolean allStoresOpened = false;
816       try {
817         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
818           Future<HStore> future = completionService.take();
819           HStore store = future.get();
820           this.stores.put(store.getColumnFamilyName().getBytes(), store);
821 
822           long storeMaxSequenceId = store.getMaxSequenceId();
823           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
824               storeMaxSequenceId);
825           if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
826             maxSeqId = storeMaxSequenceId;
827           }
828           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
829           if (maxStoreMemstoreTS > maxMemstoreTS) {
830             maxMemstoreTS = maxStoreMemstoreTS;
831           }
832         }
833         allStoresOpened = true;
834       } catch (InterruptedException e) {
835         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
836       } catch (ExecutionException e) {
837         throw new IOException(e.getCause());
838       } finally {
839         storeOpenerThreadPool.shutdownNow();
840         if (!allStoresOpened) {
841           // something went wrong, close all opened stores
842           LOG.error("Could not initialize all stores for the region=" + this);
843           for (Store store : this.stores.values()) {
844             try {
845               store.close();
846             } catch (IOException e) {
847               LOG.warn(e.getMessage());
848             }
849           }
850         }
851       }
852     }
853     mvcc.initialize(maxMemstoreTS + 1);
854     // Recover any edits if available.
855     maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(
856         this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
857     return maxSeqId;
858   }
859 
860   /**
861    * @return True if this region has references.
862    */
863   public boolean hasReferences() {
864     for (Store store : this.stores.values()) {
865       if (store.hasReferences()) return true;
866     }
867     return false;
868   }
869 
870   /**
871    * This function will return the HDFS blocks distribution based on the data
872    * captured when HFiles are created.
873    * @return The HDFS blocks distribution for the region.
874    */
875   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
876     HDFSBlocksDistribution hdfsBlocksDistribution =
877       new HDFSBlocksDistribution();
878     synchronized (this.stores) {
879       for (Store store : this.stores.values()) {
880         for (StoreFile sf : store.getStorefiles()) {
881           HDFSBlocksDistribution storeFileBlocksDistribution =
882             sf.getHDFSBlockDistribution();
883           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
884         }
885       }
886     }
887     return hdfsBlocksDistribution;
888   }
889 
890   /**
891    * This is a helper function to compute HDFS block distribution on demand
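   * <p>Hypothetical usage sketch (e.g. to gauge a region's data locality); the
   * getBlockLocalityIndex call on HDFSBlocksDistribution is assumed:
   * <pre>
   *   HDFSBlocksDistribution dist =
   *       HRegion.computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo);
   *   float locality = dist.getBlockLocalityIndex(hostName);
   * </pre>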
892    * @param conf configuration
893    * @param tableDescriptor HTableDescriptor of the table
894    * @param regionInfo HRegionInfo of the region
895    * @return The HDFS blocks distribution for the given region.
896    * @throws IOException
897    */
898   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
899       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
900     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
901     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
902   }
903 
904   /**
905    * This is a helper function to compute HDFS block distribution on demand
906    * @param conf configuration
907    * @param tableDescriptor HTableDescriptor of the table
908    * @param regionInfo HRegionInfo of the region
909    * @param tablePath the table directory
910    * @return The HDFS blocks distribution for the given region.
911    * @throws IOException
912    */
913   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
914       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
915       throws IOException {
916     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
917     FileSystem fs = tablePath.getFileSystem(conf);
918 
919     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
920     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
921       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
922       if (storeFiles == null) continue;
923 
924       for (StoreFileInfo storeFileInfo : storeFiles) {
925         hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
926       }
927     }
928     return hdfsBlocksDistribution;
929   }
930 
931   public AtomicLong getMemstoreSize() {
932     return memstoreSize;
933   }
934 
935   /**
936    * Increase the size of the memstore in this region and the size of the global
937    * memstore.
938    * @param memStoreSize the delta (in bytes) to add to the memstore size
939    * @return the size of memstore in this region
940    */
941   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
942     if (this.rsAccounting != null) {
943       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
944     }
945     return this.memstoreSize.addAndGet(memStoreSize);
946   }
947 
948   /** @return a HRegionInfo object for this region */
949   public HRegionInfo getRegionInfo() {
950     return this.fs.getRegionInfo();
951   }
952 
953   /**
954    * @return Instance of {@link RegionServerServices} used by this HRegion.
955    * Can be null.
956    */
957   RegionServerServices getRegionServerServices() {
958     return this.rsServices;
959   }
960 
961   /**
962    * @return split policy for this region.
963    */
964   public RegionSplitPolicy getSplitPolicy() {
965     return this.splitPolicy;
966   }
967 
968   /** @return readRequestsCount for this region */
969   long getReadRequestsCount() {
970     return this.readRequestsCount.get();
971   }
972 
973   /** @return writeRequestsCount for this region */
974   long getWriteRequestsCount() {
975     return this.writeRequestsCount.get();
976   }
977 
978   public MetricsRegion getMetrics() {
979     return metricsRegion;
980   }
981 
982   /** @return true if region is closed */
983   public boolean isClosed() {
984     return this.closed.get();
985   }
986 
987   /**
988    * @return True if closing process has started.
989    */
990   public boolean isClosing() {
991     return this.closing.get();
992   }
993 
994   /**
995    * Reset the recovering state of the current region.
996    * @param newState the new recovering state
997    */
998   public void setRecovering(boolean newState) {
999     boolean wasRecovering = this.isRecovering;
1000     this.isRecovering = newState;
1001     if (wasRecovering && !isRecovering) {
1002       // Call only when log replay is over.
1003       coprocessorHost.postLogReplay();
1004     }
1005   }
1006 
1007   /**
1008    * @return True if the current region is in recovering state
1009    */
1010   public boolean isRecovering() {
1011     return this.isRecovering;
1012   }
1013 
1014   /** @return true if region is available (not closed and not closing) */
1015   public boolean isAvailable() {
1016     return !isClosed() && !isClosing();
1017   }
1018 
1019   /** @return true if region is splittable */
1020   public boolean isSplittable() {
1021     return isAvailable() && !hasReferences();
1022   }
1023 
1024   /**
1025    * @return true if region is mergeable
1026    */
1027   public boolean isMergeable() {
1028     if (!isAvailable()) {
1029       LOG.debug("Region " + this.getRegionNameAsString()
1030           + " is not mergeable because it is closing or closed");
1031       return false;
1032     }
1033     if (hasReferences()) {
1034       LOG.debug("Region " + this.getRegionNameAsString()
1035           + " is not mergeable because it has references");
1036       return false;
1037     }
1038 
1039     return true;
1040   }
1041 
1042   public boolean areWritesEnabled() {
1043     synchronized(this.writestate) {
1044       return this.writestate.writesEnabled;
1045     }
1046   }
1047 
1048    public MultiVersionConsistencyControl getMVCC() {
1049      return mvcc;
1050    }
1051 
1052    /*
1053     * Returns readpoint considering given IsolationLevel
1054     */
1055    public long getReadpoint(IsolationLevel isolationLevel) {
1056      if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1057        // This scan can read even uncommitted transactions
1058        return Long.MAX_VALUE;
1059      }
1060      return mvcc.memstoreReadPoint();
1061    }
1062 
1063    public boolean isLoadingCfsOnDemandDefault() {
1064      return this.isLoadingCfsOnDemandDefault;
1065    }
1066 
1067   /**
1068    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1069    * service any more calls.
1070    *
1071    * <p>This method could take some time to execute, so don't call it from a
1072    * time-sensitive thread.
1073    *
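   * <p>Hypothetical shutdown sketch:
   * <pre>
   *   if (!region.isClosed()) {
   *     Map&lt;byte[], List&lt;StoreFile&gt;&gt; closedFiles = region.close();
   *   }
   * </pre>
   *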
1074    * @return Vector of all the storage files that the HRegion's component
1075    * HStores make use of.  It's a list of all HStoreFile objects. Returns empty
1076    * vector if already closed and null if judged that it should not close.
1077    *
1078    * @throws IOException e
1079    * @throws DroppedSnapshotException Thrown when replay of wal is required
1080    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1081    * caller MUST abort after this.
1082    */
1083   public Map<byte[], List<StoreFile>> close() throws IOException {
1084     return close(false);
1085   }
1086 
1087   private final Object closeLock = new Object();
1088 
1089   /** Conf key for the periodic flush interval */
1090   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1091       "hbase.regionserver.optionalcacheflushinterval";
1092   /** Default interval for the memstore flush */
1093   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
1094 
1095   /** Conf key to force a flush if there are already enough changes for one region in memstore */
1096   public static final String MEMSTORE_FLUSH_PER_CHANGES =
1097       "hbase.regionserver.flush.per.changes";
1098   public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1099   /**
1100    * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes
1101    * overhead. Therefore, even 1G (one billion) empty KVs occupy at least 20GB of memstore size for a single region.
1102    */
1103   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
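
  /*
   * Illustrative (hypothetical) tuning sketch; the keys are the constants defined above,
   * the values are arbitrary examples:
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   conf.setInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, 600000);  // flush at least every 10 minutes
   *   conf.setLong(MEMSTORE_FLUSH_PER_CHANGES, 10000000L);    // or once ~10M changes accumulate
   */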
1104 
1105   /**
1106    * Close down this HRegion.  Flush the cache unless the abort parameter is true,
1107    * shut down each HStore, and don't service any more calls.
1108    *
1109    * This method could take some time to execute, so don't call it from a
1110    * time-sensitive thread.
1111    *
1112    * @param abort true if server is aborting (only during testing)
1113    * @return Vector of all the storage files that the HRegion's component
1114    * HStores make use of.  It's a list of HStoreFile objects.  Can be null if
1115    * we are not to close at this time or we are already closed.
1116    *
1117    * @throws IOException e
1118    * @throws DroppedSnapshotException Thrown when replay of wal is required
1119    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1120    * caller MUST abort after this.
1121    */
1122   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1123     // Only allow one thread to close at a time. Serialize them so dual
1124     // threads attempting to close will run up against each other.
1125     MonitoredTask status = TaskMonitor.get().createStatus(
1126         "Closing region " + this +
1127         (abort ? " due to abort" : ""));
1128 
1129     status.setStatus("Waiting for close lock");
1130     try {
1131       synchronized (closeLock) {
1132         return doClose(abort, status);
1133       }
1134     } finally {
1135       status.cleanup();
1136     }
1137   }
1138 
1139   /**
1140    * Exposed for some very specific unit tests.
1141    */
1142   @VisibleForTesting
1143   public void setClosing(boolean closing) {
1144     this.closing.set(closing);
1145   }
1146 
1147   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1148       throws IOException {
1149     if (isClosed()) {
1150       LOG.warn("Region " + this + " already closed");
1151       return null;
1152     }
1153 
1154     if (coprocessorHost != null) {
1155       status.setStatus("Running coprocessor pre-close hooks");
1156       this.coprocessorHost.preClose(abort);
1157     }
1158 
1159     status.setStatus("Disabling compacts and flushes for region");
1160     synchronized (writestate) {
1161       // Disable compacting and flushing by background threads for this
1162       // region.
1163       writestate.writesEnabled = false;
1164       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1165       waitForFlushesAndCompactions();
1166     }
1167     // If we were not just flushing, is it worth doing a preflush...one
1168     // that will clear out the bulk of the memstore before we put up
1169     // the close flag?
1170     if (!abort && worthPreFlushing()) {
1171       status.setStatus("Pre-flushing region before close");
1172       LOG.info("Running close preflush of " + this.getRegionNameAsString());
1173       try {
1174         internalFlushcache(status);
1175       } catch (IOException ioe) {
1176         // Failed to flush the region. Keep going.
1177         status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1178       }
1179     }
1180 
1181     // block waiting for the lock for closing
1182     lock.writeLock().lock();
1183     this.closing.set(true);
1184     status.setStatus("Disabling writes for close");
1185     try {
1186       if (this.isClosed()) {
1187         status.abort("Already got closed by another process");
1188         // SplitTransaction handles the null
1189         return null;
1190       }
1191       LOG.debug("Updates disabled for region " + this);
1192       // Don't flush the cache if we are aborting
1193       if (!abort) {
1194         int flushCount = 0;
1195         while (this.getMemstoreSize().get() > 0) {
1196           try {
1197             if (flushCount++ > 0) {
1198               int actualFlushes = flushCount - 1;
1199               if (actualFlushes > 5) {
1200                 // If we tried 5 times and are unable to clear memory, abort
1201                 // so we do not lose data
1202                 throw new DroppedSnapshotException("Failed clearing memory after " +
1203                   actualFlushes + " attempts on region: " + Bytes.toStringBinary(getRegionName()));
1204               }
1205               LOG.info("Running extra flush, " + actualFlushes +
1206                 " (carrying snapshot?) " + this);
1207             }
1208             internalFlushcache(status);
1209           } catch (IOException ioe) {
1210             status.setStatus("Failed flush " + this + ", putting online again");
1211             synchronized (writestate) {
1212               writestate.writesEnabled = true;
1213             }
1214             // Have to throw to upper layers.  I can't abort server from here.
1215             throw ioe;
1216           }
1217         }
1218       }
1219 
1220       Map<byte[], List<StoreFile>> result =
1221         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1222       if (!stores.isEmpty()) {
1223         // initialize the thread pool for closing stores in parallel.
1224         ThreadPoolExecutor storeCloserThreadPool =
1225           getStoreOpenAndCloseThreadPool("StoreCloserThread-" + this.getRegionNameAsString());
1226         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1227           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1228 
1229         // close each store in parallel
1230         for (final Store store : stores.values()) {
1231           long flushableSize = store.getFlushableSize();
1232           if (!(abort || flushableSize == 0)) {
1233             getRegionServerServices().abort("Assertion failed while closing store "
1234                 + getRegionInfo().getRegionNameAsString() + " " + store
1235                 + ". flushableSize expected=0, actual= " + flushableSize
1236                 + ". Current memstoreSize=" + getMemstoreSize() + ". Maybe a coprocessor "
1237                 + "operation failed and left the memstore in a partially updated state.", null);
1238           }
1239           completionService
1240               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1241                 @Override
1242                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1243                   return new Pair<byte[], Collection<StoreFile>>(
1244                     store.getFamily().getName(), store.close());
1245                 }
1246               });
1247         }
1248         try {
1249           for (int i = 0; i < stores.size(); i++) {
1250             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1251             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1252             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1253             if (familyFiles == null) {
1254               familyFiles = new ArrayList<StoreFile>();
1255               result.put(storeFiles.getFirst(), familyFiles);
1256             }
1257             familyFiles.addAll(storeFiles.getSecond());
1258           }
1259         } catch (InterruptedException e) {
1260           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1261         } catch (ExecutionException e) {
1262           throw new IOException(e.getCause());
1263         } finally {
1264           storeCloserThreadPool.shutdownNow();
1265         }
1266       }
1267       this.closed.set(true);
1268       if (memstoreSize.get() != 0) LOG.error("Memstore size is " + memstoreSize.get());
1269       if (coprocessorHost != null) {
1270         status.setStatus("Running coprocessor post-close hooks");
1271         this.coprocessorHost.postClose(abort);
1272       }
1273       if ( this.metricsRegion != null) {
1274         this.metricsRegion.close();
1275       }
1276       if ( this.metricsRegionWrapper != null) {
1277         Closeables.closeQuietly(this.metricsRegionWrapper);
1278       }
1279       status.markComplete("Closed");
1280       LOG.info("Closed " + this);
1281       return result;
1282     } finally {
1283       lock.writeLock().unlock();
1284     }
1285   }
1286 
1287   /**
1288    * Wait for all current flushes and compactions of the region to complete.
1289    * <p>
1290    * Exposed for TESTING.
1291    */
1292   public void waitForFlushesAndCompactions() {
1293     synchronized (writestate) {
1294       while (writestate.compacting > 0 || writestate.flushing) {
1295         LOG.debug("waiting for " + writestate.compacting + " compactions"
1296             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1297         try {
1298           writestate.wait();
1299         } catch (InterruptedException iex) {
1300           // essentially ignore and propagate the interrupt back up
1301           Thread.currentThread().interrupt();
1302         }
1303       }
1304     }
1305   }
1306 
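       // Test-side sketch of the wait helper above. The `admin` and `region` handles, and the
       // idea that a flush was already requested asynchronously, are assumptions for
       // illustration; the method itself only blocks until writestate reports no activity.
       //   admin.flush(region.getRegionNameAsString());   // hypothetical async trigger
       //   region.waitForFlushesAndCompactions();          // returns once flushing/compacting drain
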
1307   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1308       final String threadNamePrefix) {
1309     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1310     int maxThreads = Math.min(numStores,
1311         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1312             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1313     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1314   }
1315 
1316   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1317       final String threadNamePrefix) {
1318     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1319     int maxThreads = Math.max(1,
1320         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1321             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1322             / numStores);
1323     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1324   }
1325 
1326   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1327       final String threadNamePrefix) {
1328     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1329       new ThreadFactory() {
1330         private int count = 1;
1331 
1332         @Override
1333         public Thread newThread(Runnable r) {
1334           return new Thread(r, threadNamePrefix + "-" + count++);
1335         }
1336       });
1337   }
1338 
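       // A worked sizing example for the two helpers above, under an assumed configuration
       // (the numbers are illustrative, not defaults): with 4 column families and
       // HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX set to 8, the store-level pool is
       // capped at min(4, 8) = 4 threads, while each store's file-level pool gets
       // max(1, 8 / 4) = 2 threads, keeping the file-opening threads across all stores
       // within the configured maximum.
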
1339   /**
1340    * @return True if it's worth doing a flush before we put up the close flag.
1341    */
1342   private boolean worthPreFlushing() {
1343     return this.memstoreSize.get() >
1344       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1345   }
1346 
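       // A minimal tuning sketch for the threshold read above; the key and the 5 MB default
       // come straight from worthPreFlushing(), and the 16 MB value is only an example:
       //   conf.setLong("hbase.hregion.preclose.flush.size", 16 * 1024 * 1024);
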
1347   //////////////////////////////////////////////////////////////////////////////
1348   // HRegion accessors
1349   //////////////////////////////////////////////////////////////////////////////
1350 
1351   /** @return start key for region */
1352   public byte [] getStartKey() {
1353     return this.getRegionInfo().getStartKey();
1354   }
1355 
1356   /** @return end key for region */
1357   public byte [] getEndKey() {
1358     return this.getRegionInfo().getEndKey();
1359   }
1360 
1361   /** @return region id */
1362   public long getRegionId() {
1363     return this.getRegionInfo().getRegionId();
1364   }
1365 
1366   /** @return region name */
1367   public byte [] getRegionName() {
1368     return this.getRegionInfo().getRegionName();
1369   }
1370 
1371   /** @return region name as string for logging */
1372   public String getRegionNameAsString() {
1373     return this.getRegionInfo().getRegionNameAsString();
1374   }
1375 
1376   /** @return HTableDescriptor for this region */
1377   public HTableDescriptor getTableDesc() {
1378     return this.htableDescriptor;
1379   }
1380 
1381   /** @return HLog in use for this region */
1382   public HLog getLog() {
1383     return this.log;
1384   }
1385 
1386   /**
1387    * A split takes the config from the parent region & passes it to the daughter
1388    * region's constructor. If 'conf' was passed, you would end up using the HTD
1389    * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1390    * to the daughter regions to avoid this tricky dedupe problem.
1391    * @return Configuration object
1392    */
1393   Configuration getBaseConf() {
1394     return this.baseConf;
1395   }
1396 
1397   /** @return {@link FileSystem} being used by this region */
1398   public FileSystem getFilesystem() {
1399     return fs.getFileSystem();
1400   }
1401 
1402   /** @return the {@link HRegionFileSystem} used by this region */
1403   public HRegionFileSystem getRegionFileSystem() {
1404     return this.fs;
1405   }
1406 
1407   /** @return the last time the region was flushed */
1408   public long getLastFlushTime() {
1409     return this.lastFlushTime;
1410   }
1411 
1412   //////////////////////////////////////////////////////////////////////////////
1413   // HRegion maintenance.
1414   //
1415   // These methods are meant to be called periodically by the HRegionServer for
1416   // upkeep.
1417   //////////////////////////////////////////////////////////////////////////////
1418 
1419   /** @return size of the largest HStore. */
1420   public long getLargestHStoreSize() {
1421     long size = 0;
1422     for (Store h : stores.values()) {
1423       long storeSize = h.getSize();
1424       if (storeSize > size) {
1425         size = storeSize;
1426       }
1427     }
1428     return size;
1429   }
1430 
1431   /**
1432    * @return KeyValue Comparator
1433    */
1434   public KeyValue.KVComparator getComparator() {
1435     return this.comparator;
1436   }
1437 
1438   /*
1439    * Do preparation for pending compaction.
1440    * @throws IOException
1441    */
1442   protected void doRegionCompactionPrep() throws IOException {
1443   }
1444 
1445   void triggerMajorCompaction() {
1446     for (Store h : stores.values()) {
1447       h.triggerMajorCompaction();
1448     }
1449   }
1450 
1451   /**
1452    * This is a helper function that compacts all the stores synchronously.
1453    * It is used by utilities and testing.
1454    *
1455    * @param majorCompaction True to force a major compaction regardless of thresholds
1456    * @throws IOException e
1457    */
1458   public void compactStores(final boolean majorCompaction)
1459   throws IOException {
1460     if (majorCompaction) {
1461       this.triggerMajorCompaction();
1462     }
1463     compactStores();
1464   }
1465 
1466   /**
1467    * This is a helper function that compacts all the stores synchronously.
1468    * It is used by utilities and testing.
1469    *
1470    * @throws IOException e
1471    */
1472   public void compactStores() throws IOException {
1473     for (Store s : getStores().values()) {
1474       CompactionContext compaction = s.requestCompaction();
1475       if (compaction != null) {
1476         compact(compaction, s, NoLimitCompactionThroughputController.INSTANCE, null);
1477       }
1478     }
1479   }
1480 
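       // Utility/test usage sketch for the synchronous helpers above, assuming an open
       // HRegion referenced as `region`:
       //   region.compactStores(true);   // trigger and run a major compaction of every store
       //   region.compactStores();       // run only the compactions the stores select themselves
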
1481   /**
1482    * Called by compaction thread and after region is opened to compact the
1483    * HStores if necessary.
1484    *
1485    * <p>This operation could block for a long time, so don't call it from a
1486    * time-sensitive thread.
1487    *
1488    * Note that no locking is necessary at this level because compaction only
1489    * conflicts with a region split, and that cannot happen because the region
1490    * server does them sequentially and not in parallel.
1491    *
1492    * @param compaction Compaction details, obtained by requestCompaction()
1493    * @return whether the compaction completed
1494    */
1495   public boolean compact(CompactionContext compaction, Store store,
1496       CompactionThroughputController throughputController) throws IOException {
1497     return compact(compaction, store, throughputController, null);
1498   }
1499 
1500   public boolean compact(CompactionContext compaction, Store store,
1501       CompactionThroughputController throughputController, User user) throws IOException {
1502     assert compaction != null && compaction.hasSelection();
1503     assert !compaction.getRequest().getFiles().isEmpty();
1504     if (this.closing.get() || this.closed.get()) {
1505       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1506       store.cancelRequestedCompaction(compaction);
1507       return false;
1508     }
1509     MonitoredTask status = null;
1510     boolean requestNeedsCancellation = true;
1511     // block waiting for the lock for compaction
1512     lock.readLock().lock();
1513     try {
1514       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1515       if (stores.get(cf) != store) {
1516         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1517             + " has been re-instantiated, cancel this compaction request. "
1518             + " It may be caused by the roll back of split transaction");
1519         return false;
1520       }
1521 
1522       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1523       if (this.closed.get()) {
1524         String msg = "Skipping compaction on " + this + " because closed";
1525         LOG.debug(msg);
1526         status.abort(msg);
1527         return false;
1528       }
1529       boolean wasStateSet = false;
1530       try {
1531         synchronized (writestate) {
1532           if (writestate.writesEnabled) {
1533             wasStateSet = true;
1534             ++writestate.compacting;
1535           } else {
1536             String msg = "NOT compacting region " + this + ". Writes disabled.";
1537             LOG.info(msg);
1538             status.abort(msg);
1539             return false;
1540           }
1541         }
1542         LOG.info("Starting compaction on " + store + " in region " + this
1543             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1544         doRegionCompactionPrep();
1545         try {
1546           status.setStatus("Compacting store " + store);
1547           // We no longer need to cancel the request on the way out of this
1548           // method because Store#compact will clean up unconditionally
1549           requestNeedsCancellation = false;
1550           store.compact(compaction, throughputController, user);
1551         } catch (InterruptedIOException iioe) {
1552           String msg = "compaction interrupted";
1553           LOG.info(msg, iioe);
1554           status.abort(msg);
1555           return false;
1556         }
1557       } finally {
1558         if (wasStateSet) {
1559           synchronized (writestate) {
1560             --writestate.compacting;
1561             if (writestate.compacting <= 0) {
1562               writestate.notifyAll();
1563             }
1564           }
1565         }
1566       }
1567       status.markComplete("Compaction complete");
1568       return true;
1569     } finally {
1570       try {
1571         if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
1572         if (status != null) status.cleanup();
1573       } finally {
1574         lock.readLock().unlock();
1575       }
1576     }
1577   }
1578 
1579   /**
1580    * Flush the cache.
1581    *
1582    * When this method is called the cache will be flushed unless:
1583    * <ol>
1584    *   <li>the cache is empty</li>
1585    *   <li>the region is closed.</li>
1586    *   <li>a flush is already in progress</li>
1587    *   <li>writes are disabled</li>
1588    * </ol>
1589    *
1590    * <p>This method may block for some time, so it should not be called from a
1591    * time-sensitive thread.
1592    *
1593    * @return FlushResult describing whether the flush ran and whether compaction is needed
1594    *
1595    * @throws IOException general io exceptions
1596    * @throws DroppedSnapshotException Thrown when replay of wal is required
1597    * because a Snapshot was not properly persisted. The region is put in closing mode, and the
1598    * caller MUST abort after this.
1599    */
1600   public FlushResult flushcache() throws IOException {
1601     // fail-fast instead of waiting on the lock
1602     if (this.closing.get()) {
1603       String msg = "Skipping flush on " + this + " because closing";
1604       LOG.debug(msg);
1605       return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
1606     }
1607     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
1608     status.setStatus("Acquiring readlock on region");
1609     // block waiting for the lock for flushing cache
1610     lock.readLock().lock();
1611     try {
1612       if (this.closed.get()) {
1613         String msg = "Skipping flush on " + this + " because closed";
1614         LOG.debug(msg);
1615         status.abort(msg);
1616         return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
1617       }
1618       if (coprocessorHost != null) {
1619         status.setStatus("Running coprocessor pre-flush hooks");
1620         coprocessorHost.preFlush();
1621       }
1622       if (numMutationsWithoutWAL.get() > 0) {
1623         numMutationsWithoutWAL.set(0);
1624         dataInMemoryWithoutWAL.set(0);
1625       }
1626       synchronized (writestate) {
1627         if (!writestate.flushing && writestate.writesEnabled) {
1628           this.writestate.flushing = true;
1629         } else {
1630           if (LOG.isDebugEnabled()) {
1631             LOG.debug("NOT flushing memstore for region " + this
1632                 + ", flushing=" + writestate.flushing + ", writesEnabled="
1633                 + writestate.writesEnabled);
1634           }
1635           String msg = "Not flushing since "
1636               + (writestate.flushing ? "already flushing"
1637               : "writes not enabled");
1638           status.abort(msg);
1639           return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
1640         }
1641       }
1642       try {
1643         FlushResult fs = internalFlushcache(status);
1644 
1645         if (coprocessorHost != null) {
1646           status.setStatus("Running post-flush coprocessor hooks");
1647           coprocessorHost.postFlush();
1648         }
1649 
1650         status.markComplete("Flush successful");
1651         return fs;
1652       } finally {
1653         synchronized (writestate) {
1654           writestate.flushing = false;
1655           this.writestate.flushRequested = false;
1656           writestate.notifyAll();
1657         }
1658       }
1659     } finally {
1660       lock.readLock().unlock();
1661       status.cleanup();
1662     }
1663   }
1664 
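       // Caller-side sketch of the contract documented above; `region` and `rsServices` are
       // assumed references, and the DroppedSnapshotException handling mirrors the javadoc
       // requirement that the caller MUST abort:
       //   try {
       //     FlushResult res = region.flushcache();
       //     // res reports whether the flush ran and whether a compaction is now advised
       //   } catch (DroppedSnapshotException dse) {
       //     rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
       //   }
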
1665   /**
1666    * Should the memstore be flushed now?
1667    */
1668   boolean shouldFlush() {
1669     // This is a rough measure.
1670     if (this.completeSequenceId > 0
1671           && (this.completeSequenceId + this.flushPerChanges < this.sequenceId.get())) {
1672       return true;
1673     }
1674     if (flushCheckInterval <= 0) { //disabled
1675       return false;
1676     }
1677     long now = EnvironmentEdgeManager.currentTimeMillis();
1678     //if we flushed in the recent past, we don't need to do it again now
1679     if ((now - getLastFlushTime() < flushCheckInterval)) {
1680       return false;
1681     }
1682     //since we didn't flush in the recent past, flush now if certain conditions
1683     //are met. Return true on first such memstore hit.
1684     for (Store s : this.getStores().values()) {
1685       if (s.timeOfOldestEdit() < now - flushCheckInterval) {
1686         // we have an old enough edit in the memstore, flush
1687         return true;
1688       }
1689     }
1690     return false;
1691   }
1692 
1693   /**
1694    * Flush the memstore.
1695    *
1696    * Flushing the memstore is a little tricky. We have a lot of updates in the
1697    * memstore, all of which have also been written to the log. We need to
1698    * write those updates in the memstore out to disk, while being able to
1699    * process reads/writes as much as possible during the flush operation. Also,
1700    * the log has to state clearly the point in time at which the memstore was
1701    * flushed. (That way, during recovery, we know when we can rely on the
1702    * on-disk flushed structures and when we have to recover the memstore from
1703    * the log.)
1704    *
1705    * <p>So, we have a three-step process:
1706    *
1707    * <ul><li>A. Flush the memstore to the on-disk stores, noting the current
1708    * sequence ID for the log.<li>
1709    * sequence ID for the log.</li>
1710    * <li>B. Write a FLUSHCACHE-COMPLETE message to the log, using the sequence
1711    * ID that was current at the time of memstore-flush.</li>
1712    *
1713    * <li>C. Get rid of the memstore structures that are now redundant, as
1714    * they've been flushed to the on-disk HStores.</li>
1715    * </ul>
1716    * <p>This method is protected, but can be accessed via several public
1717    * routes.
1718    *
1719    * <p> This method may block for some time.
1720    * @param status
1721    *
1722    * @return object describing the flush's state
1723    *
1724    * @throws IOException general io exceptions
1725    * @throws DroppedSnapshotException Thrown when replay of hlog is required
1726    * because a Snapshot was not properly persisted.
1727    */
1728   protected FlushResult internalFlushcache(MonitoredTask status)
1729       throws IOException {
1730     return internalFlushcache(this.log, -1, status);
1731   }
1732 
1733   /**
1734    * @param wal Null if we're NOT to go via hlog/wal.
1735    * @param myseqid The seqid to use if <code>wal</code> is null when writing out the
1736    * flush file.
1737    * @param status
1738    * @return FlushResult describing whether the flush ran and whether compaction is needed
1739    * @throws IOException
1740    * @see #internalFlushcache(MonitoredTask)
1741    */
1742   protected FlushResult internalFlushcache(
1743       final HLog wal, final long myseqid, MonitoredTask status)
1744   throws IOException {
1745     if (this.rsServices != null && this.rsServices.isAborted()) {
1746       // Don't flush when the server is aborting; it's unsafe
1747       throw new IOException("Aborting flush because server is aborted...");
1748     }
1749     final long startTime = EnvironmentEdgeManager.currentTimeMillis();
1750     // Clear flush flag.
1751     // If nothing to flush, return and avoid logging start/stop flush.
1752     if (this.memstoreSize.get() <= 0) {
1753       if(LOG.isDebugEnabled()) {
1754         LOG.debug("Empty memstore size for the current region "+this);
1755       }
1756       return new FlushResult(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush");
1757     }
1758 
1759     LOG.info("Started memstore flush for " + this +
1760       ", current region memstore size " +
1761       StringUtils.humanReadableInt(this.memstoreSize.get()) +
1762       ((wal != null)? "": "; wal is null, using passed sequenceid=" + myseqid));
1763 
1764     // Stop updates while we snapshot the memstore of all stores. We only have
1765     // to do this for a moment.  It's quick.  The subsequent sequence id that
1766     // goes into the HLog after we've flushed all these snapshots also goes
1767     // into the info file that sits beside the flushed files.
1768     // We also set the memstore size to zero here before we allow updates
1769     // again so its value will represent the size of the updates received
1770     // during the flush
1771     MultiVersionConsistencyControl.WriteEntry w = null;
1772 
1773     // We have to take a write lock during snapshot, or else a write could
1774     // end up in both snapshot and memstore (makes it difficult to do atomic
1775     // rows then)
1776     status.setStatus("Obtaining lock to block concurrent updates");
1777     // block waiting for the lock for internal flush
1778     this.updatesLock.writeLock().lock();
1779     long totalFlushableSize = 0;
1780     status.setStatus("Preparing to flush by snapshotting stores");
1781     TreeMap<byte[], StoreFlushContext> storeFlushCtxs
1782         = new TreeMap<byte[], StoreFlushContext>(Bytes.BYTES_COMPARATOR);
1783     TreeMap<byte[], Long> storeFlushableSize = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
1784     long flushSeqId = -1L;
1785     try {
1786       // Record the mvcc for all transactions in progress.
1787       w = mvcc.beginMemstoreInsert();
1788       mvcc.advanceMemstore(w);
1789       // check if it is not closing.
1790       if (wal != null) {
1791         if (!wal.startCacheFlush(this.getRegionInfo().getEncodedNameAsBytes())) {
1792           String msg = "Flush will not be started for ["
1793               + this.getRegionInfo().getEncodedName() + "] - because the WAL is closing.";
1794           status.setStatus(msg);
1795           return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
1796         }
1797         flushSeqId = this.sequenceId.incrementAndGet();
1798       } else {
1799         // use the provided sequence Id as WAL is not being used for this flush.
1800         flushSeqId = myseqid;
1801       }
1802 
1803       for (Store s : stores.values()) {
1804         totalFlushableSize += s.getFlushableSize();
1805         byte[] storeName = s.getFamily().getName();
1806         storeFlushCtxs.put(storeName, s.createFlushContext(flushSeqId));
1807         storeFlushableSize.put(storeName, s.getFlushableSize());
1808       }
1809 
1810       // prepare flush (take a snapshot)
1811       for (StoreFlushContext flush : storeFlushCtxs.values()) {
1812         flush.prepare();
1813       }
1814     } finally {
1815       this.updatesLock.writeLock().unlock();
1816     }
1817     boolean compactionRequested = false;
1818     try {
1819       String s = "Finished memstore snapshotting " + this +
1820         ", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSize;
1821       status.setStatus(s);
1822       if (LOG.isTraceEnabled()) LOG.trace(s);
1823 
1824       // sync unflushed WAL changes when deferred log sync is enabled
1825       // see HBASE-8208 for details
1826       if (wal != null && !shouldSyncLog()) {
1827         wal.sync();
1828       }
1829 
1830       // wait for all in-progress transactions to commit to HLog before
1831       // we can start the flush. This prevents
1832       // uncommitted transactions from being written into HFiles.
1833       // We have to block before we start the flush, otherwise keys that
1834       // were removed via a rollbackMemstore could be written to Hfiles.
1835       mvcc.waitForRead(w);
1836 
1837       s = "Flushing stores of " + this;
1838       status.setStatus(s);
1839       if (LOG.isTraceEnabled()) LOG.trace(s);
1840 
1841       // Any failure from here on out will be catastrophic requiring server
1842       // restart so hlog content can be replayed and put back into the memstore.
1843       // Otherwise, the snapshot content, while backed up in the hlog, will not
1844       // be part of the current running server's state.
1845       // A.  Flush memstore to all the HStores.
1846       // Keep running vector of all store files that includes both old and the
1847       // just-made new flush store file. The new flushed file is still in the
1848       // tmp directory.
1849 
1850       for (StoreFlushContext flush : storeFlushCtxs.values()) {
1851         flush.flushCache(status);
1852       }
1853 
1854       // Switch snapshot (in memstore) -> new hfile (thus causing
1855       // all the store scanners to reset/reseek).
1856       for (Map.Entry<byte[], StoreFlushContext> flushEntry : storeFlushCtxs.entrySet()) {
1857         byte[] storeName = flushEntry.getKey();
1858         StoreFlushContext flush = flushEntry.getValue();
1859         boolean needsCompaction = flush.commit(status);
1860         if (needsCompaction) {
1861           compactionRequested = true;
1862         }
1863         if (flush.getCommittedFiles() == null || flush.getCommittedFiles().isEmpty()) {
1864           totalFlushableSize -= storeFlushableSize.get(storeName);
1865         }
1866       }
1867       storeFlushCtxs.clear();
1868 
1869       // Set down the memstore size by amount of flush.
1870       this.addAndGetGlobalMemstoreSize(-totalFlushableSize);
1871     } catch (Throwable t) {
1872       // An exception here means that the snapshot was not persisted.
1873       // The hlog needs to be replayed so its content is restored to memstore.
1874       // Currently, only a server restart will do this.
1875       // We used to only catch IOEs but it's possible that we'd get other
1876       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
1877       // all and sundry.
1878       if (wal != null) {
1879         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1880       }
1881       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
1882           Bytes.toStringBinary(getRegionName()));
1883       dse.initCause(t);
1884       status.abort("Flush failed: " + StringUtils.stringifyException(t));
1885 
1886       // Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
1887       // However, since we may have the region read lock, we cannot call close(true) here since
1888       // we cannot promote to a write lock. Instead we are setting closing so that all other region
1889       // operations except for close will be rejected.
1890       this.closing.set(true);
1891 
1892       if (rsServices != null) {
1893         // This is a safeguard against the case where the caller fails to explicitly handle aborting
1894         rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
1895       }
1896 
1897       throw dse;
1898     }
1899 
1900     // If we get to here, the HStores have been written.
1901     if (wal != null) {
1902       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1903     }
1904 
1905     // Record latest flush time
1906     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
1907 
1908     // Update the last flushed sequence id for region
1909     completeSequenceId = flushSeqId;
1910 
1911     // C. Finally notify anyone waiting on memstore to clear:
1912     // e.g. checkResources().
1913     synchronized (this) {
1914       notifyAll(); // FindBugs NN_NAKED_NOTIFY
1915     }
1916 
1917     long time = EnvironmentEdgeManager.currentTimeMillis() - startTime;
1918     long memstoresize = this.memstoreSize.get();
1919     String msg = "Finished memstore flush of ~" +
1920       StringUtils.humanReadableInt(totalFlushableSize) + "/" + totalFlushableSize +
1921       ", currentsize=" +
1922       StringUtils.humanReadableInt(memstoresize) + "/" + memstoresize +
1923       " for region " + this + " in " + time + "ms, sequenceid=" + flushSeqId +
1924       ", compaction requested=" + compactionRequested +
1925       ((wal == null)? "; wal=null": "");
1926     LOG.info(msg);
1927     status.setStatus(msg);
1928     this.recentFlushes.add(new Pair<Long,Long>(time/1000, totalFlushableSize));
1929 
1930     return new FlushResult(compactionRequested ? FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
1931         FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushSeqId);
1932   }
1933 
1934   //////////////////////////////////////////////////////////////////////////////
1935   // get() methods for client use.
1936   //////////////////////////////////////////////////////////////////////////////
1937   /**
1938    * Return all the data for the row that matches <i>row</i> exactly,
1939    * or the one that immediately precedes it, at or immediately before
1940    * <i>ts</i>.
1941    *
1942    * @param row row key
1943    * @return map of values
1944    * @throws IOException
1945    */
1946   Result getClosestRowBefore(final byte [] row)
1947   throws IOException{
1948     return getClosestRowBefore(row, HConstants.CATALOG_FAMILY);
1949   }
1950 
1951   /**
1952    * Return all the data for the row that matches <i>row</i> exactly,
1953    * or the one that immediately precedes it, at or immediately before
1954    * <i>ts</i>.
1955    *
1956    * @param row row key
1957    * @param family column family to find on
1958    * @return map of values
1959    * @throws IOException read exceptions
1960    */
1961   public Result getClosestRowBefore(final byte [] row, final byte [] family)
1962   throws IOException {
1963     if (coprocessorHost != null) {
1964       Result result = new Result();
1965       if (coprocessorHost.preGetClosestRowBefore(row, family, result)) {
1966         return result;
1967       }
1968     }
1969     // look across all the HStores for this region and determine what the
1970     // closest key is across all column families, since the data may be sparse
1971     checkRow(row, "getClosestRowBefore");
1972     startRegionOperation(Operation.GET);
1973     this.readRequestsCount.increment();
1974     try {
1975       Store store = getStore(family);
1976       // get the closest key. (HStore.getRowKeyAtOrBefore can return null)
1977       KeyValue key = store.getRowKeyAtOrBefore(row);
1978       Result result = null;
1979       if (key != null) {
1980         Get get = new Get(key.getRow());
1981         get.addFamily(family);
1982         result = get(get);
1983       }
1984       if (coprocessorHost != null) {
1985         coprocessorHost.postGetClosestRowBefore(row, family, result);
1986       }
1987       return result;
1988     } finally {
1989       closeRegionOperation(Operation.GET);
1990     }
1991   }
1992 
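       // Usage sketch for the closest-row lookup above (`region`, row and family values are
       // placeholders): the returned Result holds "row-050" itself if it exists, otherwise
       // the nearest preceding row in that family.
       //   Result r = region.getClosestRowBefore(Bytes.toBytes("row-050"), Bytes.toBytes("info"));
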
1993   /**
1994    * Return an iterator that scans over the HRegion, returning the indicated
1995    * columns and rows specified by the {@link Scan}.
1996    * <p>
1997    * This Iterator must be closed by the caller.
1998    *
1999    * @param scan configured {@link Scan}
2000    * @return RegionScanner
2001    * @throws IOException read exceptions
2002    */
2003   public RegionScanner getScanner(Scan scan) throws IOException {
2004    return getScanner(scan, null);
2005   }
2006 
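       // Scanner usage sketch matching the "must be closed by the caller" note above;
       // `region` is an assumed reference and the Scan/Cell types are the ones this class
       // already uses:
       //   RegionScanner scanner = region.getScanner(new Scan());
       //   try {
       //     List<Cell> cells = new ArrayList<Cell>();
       //     boolean more;
       //     do {
       //       cells.clear();
       //       more = scanner.next(cells);   // fills cells with the next row's data
       //       // process cells ...
       //     } while (more);
       //   } finally {
       //     scanner.close();
       //   }
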
2007   void prepareScanner(Scan scan) throws IOException {
2008     if(!scan.hasFamilies()) {
2009       // Adding all families to scanner
2010       for(byte[] family: this.htableDescriptor.getFamiliesKeys()){
2011         scan.addFamily(family);
2012       }
2013     }
2014   }
2015 
2016   protected RegionScanner getScanner(Scan scan,
2017       List<KeyValueScanner> additionalScanners) throws IOException {
2018     startRegionOperation(Operation.SCAN);
2019     try {
2020       // Verify families are all valid
2021       prepareScanner(scan);
2022       if(scan.hasFamilies()) {
2023         for(byte [] family : scan.getFamilyMap().keySet()) {
2024           checkFamily(family);
2025         }
2026       }
2027       return instantiateRegionScanner(scan, additionalScanners);
2028     } finally {
2029       closeRegionOperation(Operation.SCAN);
2030     }
2031   }
2032 
2033   protected RegionScanner instantiateRegionScanner(Scan scan,
2034       List<KeyValueScanner> additionalScanners) throws IOException {
2035     if (scan.isReversed()) {
2036       if (scan.getFilter() != null) {
2037         scan.getFilter().setReversed(true);
2038       }
2039       return new ReversedRegionScannerImpl(scan, additionalScanners, this);
2040     }
2041     return new RegionScannerImpl(scan, additionalScanners, this);
2042   }
2043 
2044   /*
2045    * @param delete The passed delete is modified by this method. WARNING!
2046    */
2047   void prepareDelete(Delete delete) throws IOException {
2048     // Check to see if this is a deleteRow insert
2049     if(delete.getFamilyCellMap().isEmpty()){
2050       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
2051         // Don't eat the timestamp
2052         delete.deleteFamily(family, delete.getTimeStamp());
2053       }
2054     } else {
2055       for(byte [] family : delete.getFamilyCellMap().keySet()) {
2056         if(family == null) {
2057           throw new NoSuchColumnFamilyException("Empty family is invalid");
2058         }
2059         checkFamily(family);
2060       }
2061     }
2062   }
2063 
2064   //////////////////////////////////////////////////////////////////////////////
2065   // set() methods for client use.
2066   //////////////////////////////////////////////////////////////////////////////
2067   /**
2068    * @param delete delete object
2069    * @throws IOException read exceptions
2070    */
2071   public void delete(Delete delete)
2072   throws IOException {
2073     checkReadOnly();
2074     checkResources();
2075     startRegionOperation(Operation.DELETE);
2076     try {
2077       delete.getRow();
2078       // All edits for the given row (across all column families) must happen atomically.
2079       doBatchMutate(delete);
2080     } finally {
2081       closeRegionOperation(Operation.DELETE);
2082     }
2083   }
2084 
2085   /**
2086    * Row needed by below method.
2087    */
2088   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2089   /**
2090    * This is used only by unit tests. Not required to be a public API.
2091    * @param familyMap map of family to edits for the given family.
2092    * @param durability
2093    * @throws IOException
2094    */
2095   void delete(NavigableMap<byte[], List<Cell>> familyMap,
2096       Durability durability) throws IOException {
2097     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2098     delete.setFamilyCellMap(familyMap);
2099     delete.setDurability(durability);
2100     doBatchMutate(delete);
2101   }
2102 
2103   /**
2104    * Set up correct timestamps in the KVs in the Delete object.
2105    * Caller should have the row and region locks.
2106    * @param mutation
2107    * @param familyMap
2108    * @param byteNow
2109    * @throws IOException
2110    */
2111   void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2112       byte[] byteNow) throws IOException {
2113     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2114 
2115       byte[] family = e.getKey();
2116       List<Cell> cells = e.getValue();
2117       assert cells instanceof RandomAccess;
2118 
2119       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
2120       int listSize = cells.size();
2121       for (int i=0; i < listSize; i++) {
2122         Cell cell = cells.get(i);
2123         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2124         //  Check if time is LATEST, change to time of most recent addition if so
2125         //  This is expensive.
2126         if (kv.isLatestTimestamp() && kv.isDeleteType()) {
2127           byte[] qual = kv.getQualifier();
2128           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
2129 
2130           Integer count = kvCount.get(qual);
2131           if (count == null) {
2132             kvCount.put(qual, 1);
2133           } else {
2134             kvCount.put(qual, count + 1);
2135           }
2136           count = kvCount.get(qual);
2137 
2138           Get get = new Get(kv.getRow());
2139           get.setMaxVersions(count);
2140           get.addColumn(family, qual);
2141           if (coprocessorHost != null) {
2142             if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2143                 byteNow, get)) {
2144               updateDeleteLatestVersionTimeStamp(kv, get, count, byteNow);
2145             }
2146           } else {
2147             updateDeleteLatestVersionTimeStamp(kv, get, count, byteNow);
2148           }
2149         } else {
2150           kv.updateLatestStamp(byteNow);
2151         }
2152       }
2153     }
2154   }
2155 
2156   void updateDeleteLatestVersionTimeStamp(KeyValue kv, Get get, int count, byte[] byteNow)
2157       throws IOException {
2158     List<Cell> result = get(get, false);
2159 
2160     if (result.size() < count) {
2161       // Nothing to delete
2162       kv.updateLatestStamp(byteNow);
2163       return;
2164     }
2165     if (result.size() > count) {
2166       throw new RuntimeException("Unexpected size: " + result.size());
2167     }
2168     KeyValue getkv = KeyValueUtil.ensureKeyValue(result.get(count - 1));
2169     Bytes.putBytes(kv.getBuffer(), kv.getTimestampOffset(), getkv.getBuffer(),
2170         getkv.getTimestampOffset(), Bytes.SIZEOF_LONG);
2171   }
2172 
2173   /**
2174    * @param put
2175    * @throws IOException
2176    */
2177   public void put(Put put)
2178   throws IOException {
2179     checkReadOnly();
2180 
2181     // Do a rough check that we have resources to accept a write.  The check is
2182     // 'rough' in that between the resource check and the call to obtain a
2183     // read lock, resources may run out.  For now, the thought is that this
2184     // will be extremely rare; we'll deal with it when it happens.
2185     checkResources();
2186     startRegionOperation(Operation.PUT);
2187     try {
2188       // All edits for the given row (across all column families) must happen atomically.
2189       doBatchMutate(put);
2190     } finally {
2191       closeRegionOperation(Operation.PUT);
2192     }
2193   }
2194 
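       // Write-path sketch for the simple put/delete entry points above; `region` is an
       // assumed reference and the row/family/qualifier values are placeholders:
       //   Put p = new Put(Bytes.toBytes("row1"));
       //   p.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v"));
       //   region.put(p);
       //   Delete d = new Delete(Bytes.toBytes("row1"));
       //   region.delete(d);               // with no families given, deletes the whole row
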
2195   /**
2196    * Struct-like class that tracks the progress of a batch operation,
2197    * accumulating status codes and tracking the index at which processing
2198    * is proceeding.
2199    */
2200   private abstract static class BatchOperationInProgress<T> {
2201     T[] operations;
2202     int nextIndexToProcess = 0;
2203     OperationStatus[] retCodeDetails;
2204     WALEdit[] walEditsFromCoprocessors;
2205 
2206     public BatchOperationInProgress(T[] operations) {
2207       this.operations = operations;
2208       this.retCodeDetails = new OperationStatus[operations.length];
2209       this.walEditsFromCoprocessors = new WALEdit[operations.length];
2210       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
2211     }
2212 
2213     public abstract Mutation getMutation(int index);
2214     public abstract long getNonceGroup(int index);
2215     public abstract long getNonce(int index);
2216     /** This method is potentially expensive and should only be used for non-replay CP path. */
2217     public abstract Mutation[] getMutationsForCoprocs();
2218     public abstract boolean isInReplay();
2219 
2220     public boolean isDone() {
2221       return nextIndexToProcess == operations.length;
2222     }
2223   }
2224 
2225   private static class MutationBatch extends BatchOperationInProgress<Mutation> {
2226     private long nonceGroup;
2227     private long nonce;
2228     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2229       super(operations);
2230       this.nonceGroup = nonceGroup;
2231       this.nonce = nonce;
2232     }
2233 
2234     public Mutation getMutation(int index) {
2235       return this.operations[index];
2236     }
2237 
2238     @Override
2239     public long getNonceGroup(int index) {
2240       return nonceGroup;
2241     }
2242 
2243     @Override
2244     public long getNonce(int index) {
2245       return nonce;
2246     }
2247 
2248     @Override
2249     public Mutation[] getMutationsForCoprocs() {
2250       return this.operations;
2251     }
2252 
2253     @Override
2254     public boolean isInReplay() {
2255       return false;
2256     }
2257   }
2258 
2259   private static class ReplayBatch extends BatchOperationInProgress<HLogSplitter.MutationReplay> {
2260     public ReplayBatch(MutationReplay[] operations) {
2261       super(operations);
2262     }
2263 
2264     @Override
2265     public Mutation getMutation(int index) {
2266       return this.operations[index].mutation;
2267     }
2268 
2269     @Override
2270     public long getNonceGroup(int index) {
2271       return this.operations[index].nonceGroup;
2272     }
2273 
2274     @Override
2275     public long getNonce(int index) {
2276       return this.operations[index].nonce;
2277     }
2278 
2279     @Override
2280     public Mutation[] getMutationsForCoprocs() {
2281       assert false;
2282       throw new RuntimeException("Should not be called for replay batch");
2283     }
2284 
2285     @Override
2286     public boolean isInReplay() {
2287       return true;
2288     }
2289   }
2290 
2291   /**
2292    * Perform a batch of mutations.
2293    * It supports only Put and Delete mutations and will ignore other types passed.
2294    * @param mutations the list of mutations
2295    * @return an array of OperationStatus which internally contains the
2296    *         OperationStatusCode and the exceptionMessage if any.
2297    * @throws IOException
2298    */
2299   public OperationStatus[] batchMutate(
2300       Mutation[] mutations, long nonceGroup, long nonce) throws IOException {
2301     // As it stands, this is used for 3 things
2302     //  * batchMutate with single mutation - put/delete, separate or from checkAndMutate.
2303     //  * coprocessor calls (see ex. BulkDeleteEndpoint).
2304     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2305     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2306   }
2307 
2308   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2309     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2310   }
2311 
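       // Batch usage sketch: only Puts and Deletes are honoured, per the javadoc above;
       // `region` is an assumed reference:
       //   Mutation[] batch = new Mutation[] {
       //       new Put(Bytes.toBytes("row1")).add(Bytes.toBytes("cf"), Bytes.toBytes("q"),
       //           Bytes.toBytes("v")),
       //       new Delete(Bytes.toBytes("row2"))
       //   };
       //   OperationStatus[] statuses = region.batchMutate(batch);
       //   for (OperationStatus st : statuses) {
       //     // st.getOperationStatusCode() is SUCCESS, or e.g. BAD_FAMILY on error
       //   }
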
2312   /**
2313    * Replay a batch of mutations.
2314    * @param mutations mutations to replay.
2315    * @return an array of OperationStatus which internally contains the
2316    *         OperationStatusCode and the exceptionMessage if any.
2317    * @throws IOException
2318    */
2319   public OperationStatus[] batchReplay(HLogSplitter.MutationReplay[] mutations)
2320       throws IOException {
2321     return batchMutate(new ReplayBatch(mutations));
2322   }
2323 
2324   /**
2325    * Perform a batch of mutations.
2326    * It supports only Put and Delete mutations and will ignore other types passed.
2327    * @param batchOp contains the list of mutations
2328    * @return an array of OperationStatus which internally contains the
2329    *         OperationStatusCode and the exceptionMessage if any.
2330    * @throws IOException
2331    */
2332   OperationStatus[] batchMutate(BatchOperationInProgress<?> batchOp) throws IOException {
2333     boolean initialized = false;
2334     Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE;
2335     startRegionOperation(op);
2336     try {
2337       while (!batchOp.isDone()) {
2338         if (!batchOp.isInReplay()) {
2339           checkReadOnly();
2340         }
2341         checkResources();
2342 
2343         if (!initialized) {
2344           this.writeRequestsCount.add(batchOp.operations.length);
2345           if (!batchOp.isInReplay()) {
2346             doPreMutationHook(batchOp);
2347           }
2348           initialized = true;
2349         }
2350         long addedSize = doMiniBatchMutation(batchOp);
2351         long newSize = this.addAndGetGlobalMemstoreSize(addedSize);
2352         if (isFlushSize(newSize)) {
2353           requestFlush();
2354         }
2355       }
2356     } finally {
2357       closeRegionOperation(op);
2358     }
2359     return batchOp.retCodeDetails;
2360   }
2361 
2362 
2363   private void doPreMutationHook(BatchOperationInProgress<?> batchOp)
2364       throws IOException {
2365     /* Run coprocessor pre hook outside of locks to avoid deadlock */
2366     WALEdit walEdit = new WALEdit();
2367     if (coprocessorHost != null) {
2368       for (int i = 0 ; i < batchOp.operations.length; i++) {
2369         Mutation m = batchOp.getMutation(i);
2370         if (m instanceof Put) {
2371           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
2372             // pre hook says skip this Put
2373             // mark as success and skip in doMiniBatchMutation
2374             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2375           }
2376         } else if (m instanceof Delete) {
2377           Delete curDel = (Delete) m;
2378           if (curDel.getFamilyCellMap().isEmpty()) {
2379             // handle deleting a row case
2380             prepareDelete(curDel);
2381           }
2382           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
2383             // pre hook says skip this Delete
2384             // mark as success and skip in doMiniBatchMutation
2385             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2386           }
2387         } else {
2388           // If Append mutations are passed along with the Puts and Deletes in batchMutate,
2389           // mark the operation return code as failure so that it will not be considered in
2390           // doMiniBatchMutation.
2391           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2392               "Put/Delete mutations only supported in batchMutate() now");
2393         }
2394         if (!walEdit.isEmpty()) {
2395           batchOp.walEditsFromCoprocessors[i] = walEdit;
2396           walEdit = new WALEdit();
2397         }
2398       }
2399     }
2400   }
2401 
2402   @SuppressWarnings("unchecked")
2403   private long doMiniBatchMutation(BatchOperationInProgress<?> batchOp) throws IOException {
2404     boolean isInReplay = batchOp.isInReplay();
2405     // variable to note if all Put items are for the same CF -- metrics related
2406     boolean putsCfSetConsistent = true;
2407     //The set of columnFamilies first seen for Put.
2408     Set<byte[]> putsCfSet = null;
2409     // variable to note if all Delete items are for the same CF -- metrics related
2410     boolean deletesCfSetConsistent = true;
2411     //The set of columnFamilies first seen for Delete.
2412     Set<byte[]> deletesCfSet = null;
2413 
2414     long currentNonceGroup = HConstants.NO_NONCE, currentNonce = HConstants.NO_NONCE;
2415     WALEdit walEdit = new WALEdit(isInReplay);
2416     MultiVersionConsistencyControl.WriteEntry w = null;
2417     long txid = 0;
2418     boolean doRollBackMemstore = false;
2419     boolean locked = false;
2420 
2421     /** Keep track of the locks we hold so we can release them in finally clause */
2422     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
2423     // reference family maps directly so coprocessors can mutate them if desired
2424     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
2425     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
2426     int firstIndex = batchOp.nextIndexToProcess;
2427     int lastIndexExclusive = firstIndex;
2428     boolean success = false;
2429     int noOfPuts = 0, noOfDeletes = 0;
2430     try {
2431       // ------------------------------------
2432       // STEP 1. Try to acquire as many locks as we can, and ensure
2433       // we acquire at least one.
2434       // ----------------------------------
2435       int numReadyToWrite = 0;
2436       long now = EnvironmentEdgeManager.currentTimeMillis();
2437       while (lastIndexExclusive < batchOp.operations.length) {
2438         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
2439         boolean isPutMutation = mutation instanceof Put;
2440 
2441         Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
2442         // store the family map reference to allow for mutations
2443         familyMaps[lastIndexExclusive] = familyMap;
2444 
2445         // skip anything that "ran" already
2446         if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
2447             != OperationStatusCode.NOT_RUN) {
2448           lastIndexExclusive++;
2449           continue;
2450         }
2451 
2452         try {
2453           if (isPutMutation) {
2454             // Check the families in the put. If bad, skip this one.
2455             if (isInReplay) {
2456               removeNonExistentColumnFamilyForReplay(familyMap);
2457             } else {
2458               checkFamilies(familyMap.keySet());
2459             }
2460             checkTimestamps(mutation.getFamilyCellMap(), now);
2461           } else {
2462             prepareDelete((Delete) mutation);
2463           }
2464           checkRow(mutation.getRow(), "doMiniBatchMutation");
2465         } catch (NoSuchColumnFamilyException nscf) {
2466           LOG.warn("No such column family in batch mutation", nscf);
2467           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2468               OperationStatusCode.BAD_FAMILY, nscf.getMessage());
2469           lastIndexExclusive++;
2470           continue;
2471         } catch (FailedSanityCheckException fsce) {
2472           LOG.warn("Batch Mutation did not pass sanity check", fsce);
2473           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2474               OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
2475           lastIndexExclusive++;
2476           continue;
2477         } catch (WrongRegionException we) {
2478           LOG.warn("Batch mutation had a row that does not belong to this region", we);
2479           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2480               OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
2481           lastIndexExclusive++;
2482           continue;
2483         }
2484 
2485         // If we haven't got any rows in our batch, we should block to
2486         // get the next one.
2487         boolean shouldBlock = numReadyToWrite == 0;
2488         RowLock rowLock = null;
2489         try {
2490           rowLock = getRowLockInternal(mutation.getRow(), shouldBlock);
2491         } catch (IOException ioe) {
2492           LOG.warn("Failed getting lock in batch put, row="
2493             + Bytes.toStringBinary(mutation.getRow()), ioe);
2494         }
2495         if (rowLock == null) {
2496           // We failed to grab another lock
2497           break; // stop acquiring more rows for this batch
2498         } else {
2499           acquiredRowLocks.add(rowLock);
2500         }
2501 
2502         lastIndexExclusive++;
2503         numReadyToWrite++;
2504 
2505         if (isPutMutation) {
2506           // If column families stay consistent throughout all of the
2507           // individual puts then metrics can be reported as a multiput across
2508           // column families in the first put.
2509           if (putsCfSet == null) {
2510             putsCfSet = mutation.getFamilyCellMap().keySet();
2511           } else {
2512             putsCfSetConsistent = putsCfSetConsistent
2513                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
2514           }
2515         } else {
2516           if (deletesCfSet == null) {
2517             deletesCfSet = mutation.getFamilyCellMap().keySet();
2518           } else {
2519             deletesCfSetConsistent = deletesCfSetConsistent
2520                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
2521           }
2522         }
2523       }
2524 
2525       // we should record the timestamp only after we have acquired the rowLock,
2526       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
2527       now = EnvironmentEdgeManager.currentTimeMillis();
2528       byte[] byteNow = Bytes.toBytes(now);
2529 
2530       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
2531       if (numReadyToWrite <= 0) return 0L;
2532 
2533       // We've now grabbed as many mutations off the list as we can
2534 
2535       // ------------------------------------
2536       // STEP 2. Update any LATEST_TIMESTAMP timestamps
2537       // ----------------------------------
2538       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2539         // skip invalid
2540         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2541             != OperationStatusCode.NOT_RUN) continue;
2542 
2543         Mutation mutation = batchOp.getMutation(i);
2544         if (mutation instanceof Put) {
2545           updateKVTimestamps(familyMaps[i].values(), byteNow);
2546           noOfPuts++;
2547         } else {
2548           if (!isInReplay) {
2549             prepareDeleteTimestamps(mutation, familyMaps[i], byteNow);
2550           }
2551           noOfDeletes++;
2552         }
2553         rewriteCellTags(familyMaps[i], mutation);
2554       }
2555 
2556       lock(this.updatesLock.readLock(), numReadyToWrite);
2557       locked = true;
2558 
2559       //
2560       // ------------------------------------
2561       // Acquire the latest mvcc number
2562       // ----------------------------------
2563       w = mvcc.beginMemstoreInsert();
2564 
2565       // calling the pre CP hook for batch mutation
2566       if (!isInReplay && coprocessorHost != null) {
2567         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2568           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2569           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2570         if (coprocessorHost.preBatchMutate(miniBatchOp)) return 0L;
2571       }
2572 
2573       // ------------------------------------
2574       // STEP 3. Write back to memstore
2575       // Write to memstore. It is ok to write to memstore
2576       // first without updating the HLog because we do not roll
2577       // forward the memstore MVCC. The MVCC will be moved up when
2578       // the complete operation is done. These changes are not yet
2579       // visible to scanners till we update the MVCC. The MVCC is
2580       // moved only when the sync is complete.
2581       // ----------------------------------
2582       long addedSize = 0;
2583       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2584         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2585             != OperationStatusCode.NOT_RUN) {
2586           continue;
2587         }
2588         doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
2589         addedSize += applyFamilyMapToMemstore(familyMaps[i], w);
2590       }
2591 
2592       // ------------------------------------
2593       // STEP 4. Build WAL edit
2594       // ----------------------------------
2595       boolean hasWalAppends = false;
2596       Durability durability = Durability.USE_DEFAULT;
2597       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2598         // Skip puts that were determined to be invalid during preprocessing
2599         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2600             != OperationStatusCode.NOT_RUN) {
2601           continue;
2602         }
2603         batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2604 
2605         Mutation m = batchOp.getMutation(i);
2606         Durability tmpDur = getEffectiveDurability(m.getDurability());
2607         if (tmpDur.ordinal() > durability.ordinal()) {
2608           durability = tmpDur;
2609         }
2610         if (tmpDur == Durability.SKIP_WAL) {
2611           recordMutationWithoutWal(m.getFamilyCellMap());
2612           continue;
2613         }
2614 
2615         long nonceGroup = batchOp.getNonceGroup(i), nonce = batchOp.getNonce(i);
2616         // In replay, the batch may contain multiple nonces. If so, write WALEdit for each.
2617         // Given how nonces are originally written, these should be contiguous.
2618         // They don't have to be; it will still work, just writing more WALEdits than needed.
2619         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
2620           if (walEdit.size() > 0) {
2621             assert isInReplay;
2622             if (!isInReplay) {
2623               throw new IOException("Multiple nonces per batch and not in replay");
2624             }
2625             // txid should always increase, so having the one from the last call is ok.
2626             txid = this.log.appendNoSync(this.getRegionInfo(), htableDescriptor.getTableName(),
2627                   walEdit, m.getClusterIds(), now, htableDescriptor, this.sequenceId, true,
2628                   currentNonceGroup, currentNonce);
2629             hasWalAppends = true;
2630             walEdit = new WALEdit(isInReplay);
2631           }
2632           currentNonceGroup = nonceGroup;
2633           currentNonce = nonce;
2634         }
2635 
2636         // Add WAL edits by CP
2637         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
2638         if (fromCP != null) {
2639           for (KeyValue kv : fromCP.getKeyValues()) {
2640             walEdit.add(kv);
2641           }
2642         }
2643         addFamilyMapToWALEdit(familyMaps[i], walEdit);
2644       }
2645 
2646       // -------------------------
2647       // STEP 5. Append the final edit to WAL. Do not sync wal.
2648       // -------------------------
2649       Mutation mutation = batchOp.getMutation(firstIndex);
2650       if (walEdit.size() > 0) {
2651         txid = this.log.appendNoSync(this.getRegionInfo(), this.htableDescriptor.getTableName(),
2652               walEdit, mutation.getClusterIds(), now, this.htableDescriptor, this.sequenceId,
2653               true, currentNonceGroup, currentNonce);
2654         hasWalAppends = true;
2655       }
2656 
2657       // -------------------------------
2658       // STEP 6. Release row locks, etc.
2659       // -------------------------------
2660       if (locked) {
2661         this.updatesLock.readLock().unlock();
2662         locked = false;
2663       }
2664       releaseRowLocks(acquiredRowLocks);
2665 
2666       // -------------------------
2667       // STEP 7. Sync wal.
2668       // -------------------------
2669       if (hasWalAppends) {
2670         syncOrDefer(txid, durability);
2671       }
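           // Note (descriptive of the code above): the WAL sync is deferred until after the row
           // locks and the updatesLock read lock have been released, so other writers are not
           // blocked while we wait on the sync. The strongest durability seen across the batch
           // (computed in STEP 4) is passed to syncOrDefer to decide whether to block or defer.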
2672       doRollBackMemstore = false;
2673       // calling the post CP hook for batch mutation
2674       if (!isInReplay && coprocessorHost != null) {
2675         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2676           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2677           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2678         coprocessorHost.postBatchMutate(miniBatchOp);
2679       }
2680 
2681       // ------------------------------------------------------------------
2682       // STEP 8. Advance mvcc. This will make this put visible to scanners and getters.
2683       // ------------------------------------------------------------------
2684       if (w != null) {
2685         mvcc.completeMemstoreInsert(w);
2686         w = null;
2687       }
2688 
2689       // ------------------------------------
2690       // STEP 9. Run coprocessor post hooks. This should be done after the wal is
2691       // synced so that the coprocessor contract is adhered to.
2692       // ------------------------------------
2693       if (!isInReplay && coprocessorHost != null) {
2694         for (int i = firstIndex; i < lastIndexExclusive; i++) {
2695           // only for successful puts
2696           if (batchOp.retCodeDetails[i].getOperationStatusCode()
2697               != OperationStatusCode.SUCCESS) {
2698             continue;
2699           }
2700           Mutation m = batchOp.getMutation(i);
2701           if (m instanceof Put) {
2702             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
2703           } else {
2704             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
2705           }
2706         }
2707       }
2708 
2709       success = true;
2710       return addedSize;
2711     } finally {
2712 
2713       // if the wal sync was unsuccessful, remove keys from memstore
2714       if (doRollBackMemstore) {
2715         rollbackMemstore(batchOp, familyMaps, firstIndex, lastIndexExclusive);
2716       }
2717       if (w != null) mvcc.completeMemstoreInsert(w);
2718 
2719       if (locked) {
2720         this.updatesLock.readLock().unlock();
2721       }
2722       releaseRowLocks(acquiredRowLocks);
2723 
2724       // See if the column families were consistent through the whole thing.
2725       // If they were, then keep them; if they were not, then pass a null.
2726       // Null will be treated as unknown.
2727       // The total time taken may cover both Puts and Deletes.
2728       // Split the time between puts and deletes based on the total number of Puts and Deletes.
2729 
2730       if (noOfPuts > 0) {
2731         // There were some Puts in the batch.
2732         if (this.metricsRegion != null) {
2733           this.metricsRegion.updatePut();
2734         }
2735       }
2736       if (noOfDeletes > 0) {
2737         // There were some Deletes in the batch.
2738         if (this.metricsRegion != null) {
2739           this.metricsRegion.updateDelete();
2740         }
2741       }
2742       if (!success) {
2743         for (int i = firstIndex; i < lastIndexExclusive; i++) {
2744           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
2745             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
2746           }
2747         }
2748       }
2749       if (coprocessorHost != null && !batchOp.isInReplay()) {
2750         // call the coprocessor hook to do any finalization steps
2751         // after the put is done
2752         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2753             new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2754                 batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex,
2755                 lastIndexExclusive);
2756         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
2757       }
2758 
2759       batchOp.nextIndexToProcess = lastIndexExclusive;
2760     }
2761   }
2762 
2763   /**
2764    * Returns effective durability from the passed durability and
2765    * the table descriptor.
2766    */
2767   protected Durability getEffectiveDurability(Durability d) {
2768     return d == Durability.USE_DEFAULT ? this.durability : d;
2769   }
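  // Illustrative example (not from the original source): if the table-level default durability is
  // ASYNC_WAL and a client submits a Mutation with Durability.USE_DEFAULT, the effective
  // durability is ASYNC_WAL; an explicit per-mutation setting such as SKIP_WAL overrides the
  // table default.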
2770 
2771   //TODO, Think that gets/puts and deletes should be refactored a bit so that
2772   //the getting of the lock happens before, so that you would just pass it into
2773   //the methods. So in the case of checkAndMutate you could just do lockRow,
2774   //get, put, unlockRow or something
2775   /**
2776    *
2777    * @param row the row to check
2778    * @param family the column family of the cell to check
2779    * @param qualifier the column qualifier of the cell to check
2780    * @param compareOp the comparison operator to apply
2781    * @param comparator the comparator holding the expected value
2782    * @param w the Put or Delete to apply if the check succeeds
2783    * @param writeToWAL whether the mutation should be written to the WAL
2784    * @throws IOException
2785    * @return true if the new put was executed, false otherwise
2786    */
2787   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
2788       CompareOp compareOp, ByteArrayComparable comparator, Mutation w,
2789       boolean writeToWAL)
2790   throws IOException{
2791     checkReadOnly();
2792     //TODO, add check for value length or maybe even better move this to the
2793     //client if this becomes a global setting
2794     checkResources();
2795     boolean isPut = w instanceof Put;
2796     if (!isPut && !(w instanceof Delete))
2797       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " +
2798           "be Put or Delete");
2799     if (!Bytes.equals(row, w.getRow())) {
2800       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " +
2801           "getRow must match the passed row");
2802     }
2803 
2804     startRegionOperation();
2805     try {
2806       Get get = new Get(row);
2807       checkFamily(family);
2808       get.addColumn(family, qualifier);
2809 
2810       // Lock row - note that doBatchMutate will relock this row if called
2811       RowLock rowLock = getRowLock(get.getRow());
2812       // wait for all previous transactions to complete (with lock held)
2813       mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
2814       try {
2815         if (this.getCoprocessorHost() != null) {
2816           Boolean processed = null;
2817           if (w instanceof Put) {
2818             processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
2819                 qualifier, compareOp, comparator, (Put) w);
2820           } else if (w instanceof Delete) {
2821             processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
2822                 qualifier, compareOp, comparator, (Delete) w);
2823           }
2824           if (processed != null) {
2825             return processed;
2826           }
2827         }
2828         List<Cell> result = get(get, false);
2829 
2830         boolean valueIsNull = comparator.getValue() == null ||
2831           comparator.getValue().length == 0;
2832         boolean matches = false;
2833         long cellTs = 0;
2834         if (result.size() == 0 && valueIsNull) {
2835           matches = true;
2836         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
2837             valueIsNull) {
2838           matches = true;
2839           cellTs = result.get(0).getTimestamp();
2840         } else if (result.size() == 1 && !valueIsNull) {
2841           Cell kv = result.get(0);
2842           cellTs = kv.getTimestamp();
2843           int compareResult = comparator.compareTo(kv.getValueArray(),
2844               kv.getValueOffset(), kv.getValueLength());
2845           switch (compareOp) {
2846           case LESS:
2847             matches = compareResult < 0;
2848             break;
2849           case LESS_OR_EQUAL:
2850             matches = compareResult <= 0;
2851             break;
2852           case EQUAL:
2853             matches = compareResult == 0;
2854             break;
2855           case NOT_EQUAL:
2856             matches = compareResult != 0;
2857             break;
2858           case GREATER_OR_EQUAL:
2859             matches = compareResult >= 0;
2860             break;
2861           case GREATER:
2862             matches = compareResult > 0;
2863             break;
2864           default:
2865             throw new RuntimeException("Unknown Compare op " + compareOp.name());
2866           }
2867         }
2868         // If the check matches, apply the new Put or Delete.
2869         if (matches) {
2870           // We have acquired the row lock already. If the system clock is NOT monotonically
2871           // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
2872           // larger timestamp than what was observed via Get. doBatchMutate already does this, but
2873           // there is no way to pass the cellTs. See HBASE-14054.
2874           long now = EnvironmentEdgeManager.currentTimeMillis();
2875           long ts = Math.max(now, cellTs); // ensure write is not eclipsed
2876           byte[] byteTs = Bytes.toBytes(ts);
2877 
2878           if (w instanceof Put) {
2879             updateKVTimestamps(w.getFamilyCellMap().values(), byteTs);
2880           }
2881           // else delete is not needed since it already does a second get, and sets the timestamp
2882           // from get (see prepareDeleteTimestamps).
2883 
2884           // All edits for the given row (across all column families) must
2885           // happen atomically.
2886           doBatchMutate((Mutation)w);
2887           this.checkAndMutateChecksPassed.increment();
2888           return true;
2889         }
2890         this.checkAndMutateChecksFailed.increment();
2891         return false;
2892       } finally {
2893         rowLock.release();
2894       }
2895     } finally {
2896       closeRegionOperation();
2897     }
2898   }
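  // Hypothetical usage sketch (names and values are illustrative only): a server-side conditional
  // put through this method might look like
  //
  //   Put put = new Put(Bytes.toBytes("row1"));
  //   put.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("newValue"));
  //   boolean applied = region.checkAndMutate(Bytes.toBytes("row1"), Bytes.toBytes("cf"),
  //       Bytes.toBytes("q"), CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes("oldValue")),
  //       put, true);
  //
  // which applies the Put only if the current value of cf:q equals "oldValue".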
2899 
2900   //TODO, Think that gets/puts and deletes should be refactored a bit so that
2901   //the getting of the lock happens before, so that you would just pass it into
2902   //the methods. So in the case of checkAndMutate you could just do lockRow,
2903   //get, put, unlockRow or something
2904   /**
2905    *
2906    * @throws IOException
2907    * @return true if the new put was executed, false otherwise
2908    */
2909   public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
2910       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm,
2911       boolean writeToWAL)
2912       throws IOException{
2913     checkReadOnly();
2914     //TODO, add check for value length or maybe even better move this to the
2915     //client if this becomes a global setting
2916     checkResources();
2917 
2918     startRegionOperation();
2919     try {
2920       Get get = new Get(row);
2921       checkFamily(family);
2922       get.addColumn(family, qualifier);
2923 
2924       // Lock row - note that doBatchMutate will relock this row if called
2925       RowLock rowLock = getRowLock(get.getRow());
2926       // wait for all previous transactions to complete (with lock held)
2927       mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
2928       try {
2929         List<Cell> result = get(get, false);
2930 
2931         boolean valueIsNull = comparator.getValue() == null ||
2932             comparator.getValue().length == 0;
2933         boolean matches = false;
2934         long cellTs = 0;
2935         if (result.size() == 0 && valueIsNull) {
2936           matches = true;
2937         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
2938             valueIsNull) {
2939           matches = true;
2940           cellTs = result.get(0).getTimestamp();
2941         } else if (result.size() == 1 && !valueIsNull) {
2942           Cell kv = result.get(0);
2943           cellTs = kv.getTimestamp();
2944           int compareResult = comparator.compareTo(kv.getValueArray(),
2945               kv.getValueOffset(), kv.getValueLength());
2946           switch (compareOp) {
2947           case LESS:
2948             matches = compareResult < 0;
2949             break;
2950           case LESS_OR_EQUAL:
2951             matches = compareResult <= 0;
2952             break;
2953           case EQUAL:
2954             matches = compareResult == 0;
2955             break;
2956           case NOT_EQUAL:
2957             matches = compareResult != 0;
2958             break;
2959           case GREATER_OR_EQUAL:
2960             matches = compareResult >= 0;
2961             break;
2962           case GREATER:
2963             matches = compareResult > 0;
2964             break;
2965           default:
2966             throw new RuntimeException("Unknown Compare op " + compareOp.name());
2967           }
2968         }
2969         // If the check matches, apply the new Put or Delete.
2970         if (matches) {
2971           // We have acquired the row lock already. If the system clock is NOT monotonically
2972           // non-decreasing (see HBASE-14070) we should make sure that the mutation has a
2973           // larger timestamp than what was observed via Get. doBatchMutate already does this, but
2974           // there is no way to pass the cellTs. See HBASE-14054.
2975           long now = EnvironmentEdgeManager.currentTimeMillis();
2976           long ts = Math.max(now, cellTs); // ensure write is not eclipsed
2977           byte[] byteTs = Bytes.toBytes(ts);
2978 
2979           for (Mutation w : rm.getMutations()) {
2980             if (w instanceof Put) {
2981               updateKVTimestamps(w.getFamilyCellMap().values(), byteTs);
2982             }
2983             // else delete is not needed since it already does a second get, and sets the timestamp
2984             // from get (see prepareDeleteTimestamps).
2985           }
2986 
2987           // All edits for the given row (across all column families) must
2988           // happen atomically.
2989           mutateRow(rm);
2990           this.checkAndMutateChecksPassed.increment();
2991           return true;
2992         }
2993         this.checkAndMutateChecksFailed.increment();
2994         return false;
2995       } finally {
2996         rowLock.release();
2997       }
2998     } finally {
2999       closeRegionOperation();
3000     }
3001   }
3002 
3003   private void doBatchMutate(Mutation mutation) throws IOException, DoNotRetryIOException {
3004     // Currently this is only called for puts and deletes, so no nonces.
3005     OperationStatus[] batchMutate = this.batchMutate(new Mutation[] { mutation },
3006         HConstants.NO_NONCE, HConstants.NO_NONCE);
3007     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
3008       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
3009     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
3010       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
3011     }
3012   }
3013 
3014   /**
3015    * Complete taking the snapshot on the region. Writes the region info and adds references to the
3016    * working snapshot directory.
3017    *
3018    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
3019    * arg.  (In the future other cancellable HRegion methods could eventually add a
3020    * {@link ForeignExceptionSnare}, or we could do something fancier).
3021    *
3022    * @param desc snapshot description object
3023    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
3024    *   bail out.  This is allowed to be null and will just be ignored in that case.
3025    * @throws IOException if there is an external or internal error causing the snapshot to fail
3026    */
3027   public void addRegionToSnapshot(SnapshotDescription desc,
3028       ForeignExceptionSnare exnSnare) throws IOException {
3029     Path rootDir = FSUtils.getRootDir(conf);
3030     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
3031 
3032     SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
3033                                                         snapshotDir, desc, exnSnare);
3034     manifest.addRegion(this);
3035   }
3036 
3037   /**
3038    * Replaces any KV timestamps set to {@link HConstants#LATEST_TIMESTAMP} with the
3039    * provided current timestamp.
3040    */
3041   void updateKVTimestamps(final Iterable<List<Cell>> keyLists, final byte[] now) {
3042     for (List<Cell> cells: keyLists) {
3043       if (cells == null) continue;
3044       assert cells instanceof RandomAccess;
3045       int listSize = cells.size();
3046       for (int i=0; i < listSize; i++) {
3047         Cell cell = cells.get(i);
3048         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
3049         kv.updateLatestStamp(now);
3050       }
3051     }
3052   }
3053 
3054   /**
3055    * Possibly rewrite incoming cell tags.
3056    */
3057   void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
3058     // Check if we have any work to do and early out otherwise
3059     // Update these checks as more logic is added here
3060 
3061     if (m.getTTL() == Long.MAX_VALUE) {
3062       return;
3063     }
3064 
3065     // From this point we know we have some work to do
3066 
3067     for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
3068       List<Cell> cells = e.getValue();
3069       assert cells instanceof RandomAccess;
3070       int listSize = cells.size();
3071       for (int i = 0; i < listSize; i++) {
3072         Cell cell = cells.get(i);
3073         List<Tag> newTags = new ArrayList<Tag>();
3074         Iterator<Tag> tagIterator = CellUtil.tagsIterator(cell.getTagsArray(),
3075           cell.getTagsOffset(), cell.getTagsLengthUnsigned());
3076 
3077         // Carry forward existing tags
3078 
3079         while (tagIterator.hasNext()) {
3080 
3081           // Add any filters or tag specific rewrites here
3082 
3083           newTags.add(tagIterator.next());
3084         }
3085 
3086         // Cell TTL handling
3087 
3088         // Check again if we need to add a cell TTL because early out logic
3089         // above may change when there are more tag based features in core.
3090         if (m.getTTL() != Long.MAX_VALUE) {
3091           // Add a cell TTL tag
3092           newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(m.getTTL())));
3093         }
3094 
3095         // Rewrite the cell with the updated set of tags
3096 
3097         cells.set(i, new KeyValue(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
3098           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
3099           cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength(),
3100           cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()),
3101           cell.getValueArray(), cell.getValueOffset(), cell.getValueLength(),
3102           newTags));
3103       }
3104     }
3105   }
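  // Note (descriptive of the code above): a TTL of Long.MAX_VALUE means no per-mutation TTL was
  // set, which is why the early-out avoids copying cells when there is nothing to rewrite. When a
  // TTL is present, every cell in the mutation is rebuilt with an additional TTL_TAG_TYPE tag
  // carrying the value from Mutation#getTTL().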
3106 
3107   /*
3108    * Check whether we have the resources to support an update.
3109    *
3110    * We throw RegionTooBusyException if we are above the memstore limit
3111    * and expect the client to retry using some kind of backoff.
3112    */
3113   private void checkResources()
3114     throws RegionTooBusyException {
3115     // If catalog region, do not impose resource constraints or block updates.
3116     if (this.getRegionInfo().isMetaRegion()) return;
3117 
3118     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
3119       blockedRequestsCount.increment();
3120       requestFlush();
3121       throw new RegionTooBusyException("Above memstore limit, " +
3122           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
3123           this.getRegionInfo().getRegionNameAsString()) +
3124           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
3125           this.getRegionServerServices().getServerName()) +
3126           ", memstoreSize=" + memstoreSize.get() +
3127           ", blockingMemStoreSize=" + blockingMemStoreSize);
3128     }
3129   }
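  // Note: blockingMemStoreSize is typically derived from the region memstore flush size and
  // hbase.hregion.memstore.block.multiplier (an assumption about the surrounding configuration,
  // not shown in this method), so a client seeing RegionTooBusyException should back off and
  // retry while the requested flush catches up.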
3130 
3131   /**
3132    * @throws IOException Throws exception if region is in read-only mode.
3133    */
3134   protected void checkReadOnly() throws IOException {
3135     if (this.writestate.isReadOnly()) {
3136       throw new DoNotRetryIOException("region is read only");
3137     }
3138   }
3139 
3140   /**
3141    * Add updates first to the hlog and then add values to memstore.
3142    * Warning: assumes the caller holds a lock on the passed-in row.
3143    * @param row the row to update
3144    * @param family the column family the edits belong to
3145    * @param edits Cell updates by column
3146    * @throws IOException
3147    */
3148   private void put(final byte [] row, byte [] family, List<Cell> edits)
3149   throws IOException {
3150     NavigableMap<byte[], List<Cell>> familyMap;
3151     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
3152 
3153     familyMap.put(family, edits);
3154     Put p = new Put(row);
3155     p.setFamilyCellMap(familyMap);
3156     doBatchMutate(p);
3157   }
3158 
3159   /**
3160    * Atomically apply the given map of family->edits to the memstore.
3161    * This handles the consistency control on its own, but the caller
3162    * should already have locked updatesLock.readLock(). This also does
3163    * <b>not</b> check the families for validity.
3164    *
3165    * @param familyMap Map of kvs per family
3166    * @param localizedWriteEntry The WriteEntry of the MVCC for this transaction.
3167    *        If null, then this method internally creates a mvcc transaction.
3168    * @return the additional memory usage of the memstore caused by the
3169    * new entries.
3170    */
3171   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
3172     MultiVersionConsistencyControl.WriteEntry localizedWriteEntry) {
3173     long size = 0;
3174     boolean freemvcc = false;
3175 
3176     try {
3177       if (localizedWriteEntry == null) {
3178         localizedWriteEntry = mvcc.beginMemstoreInsert();
3179         freemvcc = true;
3180       }
3181 
3182       for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3183         byte[] family = e.getKey();
3184         List<Cell> cells = e.getValue();
3185         assert cells instanceof RandomAccess;
3186         Store store = getStore(family);
3187         int listSize = cells.size();
3188         for (int i=0; i < listSize; i++) {
3189           Cell cell = cells.get(i);
3190           KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
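               // Stamp the cell with this transaction's MVCC write number so that scanners with
               // an older read point do not see it until the write entry is completed.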
3191           kv.setMvccVersion(localizedWriteEntry.getWriteNumber());
3192           size += store.add(kv);
3193         }
3194       }
3195     } finally {
3196       if (freemvcc) {
3197         mvcc.completeMemstoreInsert(localizedWriteEntry);
3198       }
3199     }
3200 
3201     return size;
3202   }
3203 
3204   /**
3205    * Remove all the keys listed in the map from the memstore. This method is
3206    * called when a Put/Delete has updated memstore but subsequently fails to update
3207    * the wal. This method is then invoked to roll back the memstore.
3208    */
3209   private void rollbackMemstore(BatchOperationInProgress<?> batchOp,
3210                                 Map<byte[], List<Cell>>[] familyMaps,
3211                                 int start, int end) {
3212     int kvsRolledback = 0;
3213     for (int i = start; i < end; i++) {
3214       // skip over request that never succeeded in the first place.
3215       if (batchOp.retCodeDetails[i].getOperationStatusCode()
3216             != OperationStatusCode.SUCCESS) {
3217         continue;
3218       }
3219 
3220       // Rollback all the kvs for this row.
3221       Map<byte[], List<Cell>> familyMap  = familyMaps[i];
3222       for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3223         byte[] family = e.getKey();
3224         List<Cell> cells = e.getValue();
3225 
3226         // Remove those keys from the memstore that match our
3227         // key's (row, cf, cq, timestamp, memstoreTS). The interesting part is
3228         // that even the memstoreTS has to match for keys that will be rolled back.
3229         Store store = getStore(family);
3230         for (Cell cell: cells) {
3231           store.rollback(KeyValueUtil.ensureKeyValue(cell));
3232           kvsRolledback++;
3233         }
3234       }
3235     }
3236     LOG.debug("rollbackMemstore rolled back " + kvsRolledback +
3237         " keyvalues from start:" + start + " to end:" + end);
3238   }
3239 
3240   /**
3241    * Check the collection of families for validity.
3242    * @throws NoSuchColumnFamilyException if a family does not exist.
3243    */
3244   void checkFamilies(Collection<byte[]> families)
3245   throws NoSuchColumnFamilyException {
3246     for (byte[] family : families) {
3247       checkFamily(family);
3248     }
3249   }
3250 
3251   /**
3252    * During replay, there could exist column families which are removed between region server
3253    * failure and replay
3254    */
3255   private void removeNonExistentColumnFamilyForReplay(
3256       final Map<byte[], List<Cell>> familyMap) {
3257     List<byte[]> nonExistentList = null;
3258     for (byte[] family : familyMap.keySet()) {
3259       if (!this.htableDescriptor.hasFamily(family)) {
3260         if (nonExistentList == null) {
3261           nonExistentList = new ArrayList<byte[]>();
3262         }
3263         nonExistentList.add(family);
3264       }
3265     }
3266     if (nonExistentList != null) {
3267       for (byte[] family : nonExistentList) {
3268         // Perhaps schema was changed between crash and replay
3269         LOG.info("No family for " + Bytes.toString(family) + ", omitting from replay.");
3270         familyMap.remove(family);
3271       }
3272     }
3273   }
3274 
3275   void checkTimestamps(final Map<byte[], List<Cell>> familyMap,
3276       long now) throws FailedSanityCheckException {
3277     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
3278       return;
3279     }
3280     long maxTs = now + timestampSlop;
3281     for (List<Cell> kvs : familyMap.values()) {
3282       assert kvs instanceof RandomAccess;
3283       int listSize  = kvs.size();
3284       for (int i=0; i < listSize; i++) {
3285         Cell cell = kvs.get(i);
3286         // see if the user-side TS is out of range. latest = server-side
3287         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
3288         if (!kv.isLatestTimestamp() && kv.getTimestamp() > maxTs) {
3289           throw new FailedSanityCheckException("Timestamp for KV out of range "
3290               + cell + " (too.new=" + timestampSlop + ")");
3291         }
3292       }
3293     }
3294   }
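  // Illustrative example (assumed configuration): with a slop of 1000 ms, a Put carrying a
  // client-supplied timestamp more than one second ahead of the server clock fails this sanity
  // check, while cells left at LATEST_TIMESTAMP always pass because the server fills in the
  // timestamp itself (see updateKVTimestamps above).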
3295 
3296   /**
3297    * Append the given map of family->edits to a WALEdit data structure.
3298    * This does not write to the HLog itself.
3299    * @param familyMap map of family->edits
3300    * @param walEdit the destination entry to append into
3301    */
3302   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
3303       WALEdit walEdit) {
3304     for (List<Cell> edits : familyMap.values()) {
3305       assert edits instanceof RandomAccess;
3306       int listSize = edits.size();
3307       for (int i=0; i < listSize; i++) {
3308         Cell cell = edits.get(i);
3309         walEdit.add(KeyValueUtil.ensureKeyValue(cell));
3310       }
3311     }
3312   }
3313 
3314   private void requestFlush() {
3315     if (this.rsServices == null) {
3316       return;
3317     }
3318     synchronized (writestate) {
3319       if (this.writestate.isFlushRequested()) {
3320         return;
3321       }
3322       writestate.flushRequested = true;
3323     }
3324     // Make request outside of synchronize block; HBASE-818.
3325     this.rsServices.getFlushRequester().requestFlush(this);
3326     if (LOG.isDebugEnabled()) {
3327       LOG.debug("Flush requested on " + this);
3328     }
3329   }
3330 
3331   /*
3332    * @param size
3333    * @return True if size is over the flush threshold
3334    */
3335   private boolean isFlushSize(final long size) {
3336     return size > this.memstoreFlushSize;
3337   }
3338 
3339   /**
3340    * Read the edits log placed under this region by the WAL log splitting process.  Put
3341    * the recovered edits back up into this region.
3342    *
3343    * <p>We can ignore any log message that has a sequence ID that's equal to or
3344    * lower than minSeqId.  (Because we know such log messages are already
3345    * reflected in the HFiles.)
3346    *
3347    * <p>While this is running we are putting pressure on memory yet we are
3348    * outside of our usual accounting because we are not yet an onlined region
3349    * (this stuff is being run as part of Region initialization).  This means
3350    * that if we're up against global memory limits, we'll not be flagged to flush
3351    * because we are not online. We can't be flushed by the usual mechanisms anyway;
3352    * we're not yet online so our relative sequenceids are not yet aligned with
3353    * HLog sequenceids -- not till we come up online, post processing of split
3354    * edits.
3355    *
3356    * <p>But to help relieve memory pressure, at least manage our own heap size by
3357    * flushing if we are in excess of per-region limits.  When flushing, though, we have
3358    * to be careful and avoid using the regionserver/hlog sequenceid.  It's running
3359    * on a different timeline from what's going on here in this region context, so if we
3360    * crashed replaying these edits, but in the midst had a flush that used the
3361    * regionserver log with a sequenceid in excess of what's going on here
3362    * in this region and its split editlogs, then we could miss edits the
3363    * next time we go to recover. So, we have to flush inline, using seqids that
3364    * make sense in this single-region context only -- until we come online.
3365    *
3366    * @param regiondir
3367    * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
3368    * the maxSeqId for the store to be applied, else it is skipped.
3369    * @param reporter
3370    * @return the sequence id of the last edit added to this region out of the
3371    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3372    * @throws UnsupportedEncodingException
3373    * @throws IOException
3374    */
3375   protected long replayRecoveredEditsIfAny(final Path regiondir,
3376       Map<byte[], Long> maxSeqIdInStores,
3377       final CancelableProgressable reporter, final MonitoredTask status)
3378       throws UnsupportedEncodingException, IOException {
3379     long minSeqIdForTheRegion = -1;
3380     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
3381       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
3382         minSeqIdForTheRegion = maxSeqIdInStore;
3383       }
3384     }
3385     long seqid = minSeqIdForTheRegion;
3386 
3387     FileSystem fs = this.fs.getFileSystem();
3388     NavigableSet<Path> files = HLogUtil.getSplitEditFilesSorted(fs, regiondir);
3389     if (LOG.isDebugEnabled()) {
3390       LOG.debug("Found " + (files == null ? 0 : files.size())
3391         + " recovered edits file(s) under " + regiondir);
3392     }
3393 
3394     if (files == null || files.isEmpty()) return seqid;
3395 
3396     for (Path edits: files) {
3397       if (edits == null || !fs.exists(edits)) {
3398         LOG.warn("Null or non-existent edits file: " + edits);
3399         continue;
3400       }
3401       if (isZeroLengthThenDelete(fs, edits)) continue;
3402 
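           // The recovered edits file name encodes a sequence id; if it is not greater than the
           // minSeqIdForTheRegion computed above, every edit in the file is already reflected in
           // the HFiles and the whole file can be skipped (see the check below).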
3403       long maxSeqId;
3404       String fileName = edits.getName();
3405       maxSeqId = Math.abs(Long.parseLong(fileName));
3406       if (maxSeqId <= minSeqIdForTheRegion) {
3407         if (LOG.isDebugEnabled()) {
3408           String msg = "Maximum sequenceid for this log is " + maxSeqId
3409             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
3410             + ", skipped the whole file, path=" + edits;
3411           LOG.debug(msg);
3412         }
3413         continue;
3414       }
3415 
3416       try {
3417         // replay the edits. Replay can return -1 if everything is skipped, only update if seqId is greater
3418         seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter));
3419       } catch (IOException e) {
3420         boolean skipErrors = conf.getBoolean(
3421             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
3422             conf.getBoolean(
3423                 "hbase.skip.errors",
3424                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
3425         if (conf.get("hbase.skip.errors") != null) {
3426           LOG.warn(
3427               "The property 'hbase.skip.errors' has been deprecated. Please use " +
3428               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
3429         }
3430         if (skipErrors) {
3431           Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3432           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
3433               + "=true so continuing. Renamed " + edits +
3434               " as " + p, e);
3435         } else {
3436           throw e;
3437         }
3438       }
3439     }
3440     // The edits size added into rsAccounting during this replaying will not
3441     // be required any more. So just clear it.
3442     if (this.rsAccounting != null) {
3443       this.rsAccounting.clearRegionReplayEditsSize(this.getRegionName());
3444     }
3445     if (seqid > minSeqIdForTheRegion) {
3446       // Then we added some edits to memory. Flush and cleanup split edit files.
3447       internalFlushcache(null, seqid, status);
3448     }
3449     // Now delete the content of recovered edits.  We're done w/ them.
3450     for (Path file: files) {
3451       if (!fs.delete(file, false)) {
3452         LOG.error("Failed delete of " + file);
3453       } else {
3454         LOG.debug("Deleted recovered.edits file=" + file);
3455       }
3456     }
3457     return seqid;
3458   }
3459 
3460   /*
3461    * @param edits File of recovered edits.
3462    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in log
3463    * must be larger than this to be replayed for each store.
3464    * @param reporter
3465    * @return the sequence id of the last edit added to this region out of the
3466    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3467    * @throws IOException
3468    */
3469   private long replayRecoveredEdits(final Path edits,
3470       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
3471     throws IOException {
3472     String msg = "Replaying edits from " + edits;
3473     LOG.info(msg);
3474     MonitoredTask status = TaskMonitor.get().createStatus(msg);
3475     FileSystem fs = this.fs.getFileSystem();
3476 
3477     status.setStatus("Opening logs");
3478     HLog.Reader reader = null;
3479     try {
3480       reader = HLogFactory.createReader(fs, edits, conf);
3481       long currentEditSeqId = -1;
3482       long firstSeqIdInLog = -1;
3483       long skippedEdits = 0;
3484       long editsCount = 0;
3485       long intervalEdits = 0;
3486       HLog.Entry entry;
3487       Store store = null;
3488       boolean reported_once = false;
3489       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
3490 
3491       try {
3492         // How many edits seen before we check elapsed time
3493         int interval = this.conf.getInt("hbase.hstore.report.interval.edits",
3494             2000);
3495         // How often to send a progress report (default 1/2 master timeout)
3496         int period = this.conf.getInt("hbase.hstore.report.period",
3497           this.conf.getInt(AssignmentManager.ASSIGNMENT_TIMEOUT,
3498             AssignmentManager.DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT) / 2);
3499         long lastReport = EnvironmentEdgeManager.currentTimeMillis();
3500 
3501         while ((entry = reader.next()) != null) {
3502           HLogKey key = entry.getKey();
3503           WALEdit val = entry.getEdit();
3504 
3505           if (ng != null) { // some test, or nonces disabled
3506             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
3507           }
3508 
3509           if (reporter != null) {
3510             intervalEdits += val.size();
3511             if (intervalEdits >= interval) {
3512               // Number of edits interval reached
3513               intervalEdits = 0;
3514               long cur = EnvironmentEdgeManager.currentTimeMillis();
3515               if (lastReport + period <= cur) {
3516                 status.setStatus("Replaying edits..." +
3517                     " skipped=" + skippedEdits +
3518                     " edits=" + editsCount);
3519                 // Timeout reached
3520                 if(!reporter.progress()) {
3521                   msg = "Progressable reporter failed, stopping replay";
3522                   LOG.warn(msg);
3523                   status.abort(msg);
3524                   throw new IOException(msg);
3525                 }
3526                 reported_once = true;
3527                 lastReport = cur;
3528               }
3529             }
3530           }
3531 
3532           if (firstSeqIdInLog == -1) {
3533             firstSeqIdInLog = key.getLogSeqNum();
3534           }
3535           currentEditSeqId = key.getLogSeqNum();
3536 
3537           // Start coprocessor replay here. The coprocessor is for each WALEdit
3538           // instead of a KeyValue.
3539           if (coprocessorHost != null) {
3540             status.setStatus("Running pre-WAL-restore hook in coprocessors");
3541             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
3542               // if bypass this log entry, ignore it ...
3543               continue;
3544             }
3545           }
3546           boolean checkRowWithinBoundary = false;
3547           // Check this edit is for this region.
3548           if (!Bytes.equals(key.getEncodedRegionName(),
3549               this.getRegionInfo().getEncodedNameAsBytes())) {
3550             checkRowWithinBoundary = true;
3551           }
3552 
3553           boolean flush = false;
3554           for (KeyValue kv: val.getKeyValues()) {
3555             // Check this edit is for me. Also, guard against writing the special
3556             // METACOLUMN info such as HBASE::CACHEFLUSH entries
3557             if (kv.matchingFamily(WALEdit.METAFAMILY) ||
3558                 !Bytes.equals(key.getEncodedRegionName(),
3559                   this.getRegionInfo().getEncodedNameAsBytes())) {
3560               // if region names don't match, skip replaying the compaction marker
3561               if (!checkRowWithinBoundary) {
3562                 //this is a special edit, we should handle it
3563                 CompactionDescriptor compaction = WALEdit.getCompaction(kv);
3564                 if (compaction != null) {
3565                   //replay the compaction
3566                   completeCompactionMarker(compaction);
3567                 }
3568               }
3569 
3570               skippedEdits++;
3571               continue;
3572             }
3573             // Figure which store the edit is meant for.
3574             if (store == null || !kv.matchingFamily(store.getFamily().getName())) {
3575               store = this.stores.get(kv.getFamily());
3576             }
3577             if (store == null) {
3578               // This should never happen.  Perhaps schema was changed between
3579               // crash and redeploy?
3580               LOG.warn("No family for " + kv);
3581               skippedEdits++;
3582               continue;
3583             }
3584             if (checkRowWithinBoundary && !rowIsInRange(this.getRegionInfo(),
3585               kv.getRowArray(), kv.getRowOffset(), kv.getRowLength())) {
3586               LOG.warn("Row of " + kv + " is not within region boundary");
3587               skippedEdits++;
3588               continue;
3589             }
3590             // Now, figure if we should skip this edit.
3591             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
3592                 .getName())) {
3593               skippedEdits++;
3594               continue;
3595             }
3596             // Once we are over the limit, restoreEdit will keep returning true to
3597             // flush -- but don't flush until we've played all the kvs that make up
3598             // the WALEdit.
3599             flush |= restoreEdit(store, kv);
3600             editsCount++;
3601           }
3602           if (flush) {
3603             internalFlushcache(null, currentEditSeqId, status);
3604           }
3605 
3606           if (coprocessorHost != null) {
3607             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
3608           }
3609         }
3610       } catch (EOFException eof) {
3611         Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3612         msg = "Encountered EOF. Most likely due to Master failure during " +
3613             "log spliting, so we have this data in another edit.  " +
3614             "Continuing, but renaming " + edits + " as " + p;
3615         LOG.warn(msg, eof);
3616         status.abort(msg);
3617       } catch (IOException ioe) {
3618         // If the IOE resulted from bad file format,
3619         // then this problem is idempotent and retrying won't help
3620         if (ioe.getCause() instanceof ParseException) {
3621           Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3622           msg = "File corruption encountered!  " +
3623               "Continuing, but renaming " + edits + " as " + p;
3624           LOG.warn(msg, ioe);
3625           status.setStatus(msg);
3626         } else {
3627           status.abort(StringUtils.stringifyException(ioe));
3628           // other IO errors may be transient (bad network connection,
3629           // checksum exception on one datanode, etc).  throw & retry
3630           throw ioe;
3631         }
3632       }
3633       if (reporter != null && !reported_once) {
3634         reporter.progress();
3635       }
3636       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
3637         ", firstSequenceidInLog=" + firstSeqIdInLog +
3638         ", maxSequenceidInLog=" + currentEditSeqId + ", path=" + edits;
3639       status.markComplete(msg);
3640       LOG.debug(msg);
3641       return currentEditSeqId;
3642     } finally {
3643       status.cleanup();
3644       if (reader != null) {
3645          reader.close();
3646       }
3647     }
3648   }
3649 
3650   /**
3651    * Call to complete a compaction. It's for the case where we find in the WAL a compaction
3652    * that was not finished.  We could find one recovering a WAL after a regionserver crash.
3653    * See HBASE-2331.
3654    * @param compaction
3655    */
3656   void completeCompactionMarker(CompactionDescriptor compaction)
3657       throws IOException {
3658     Store store = this.getStore(compaction.getFamilyName().toByteArray());
3659     if (store == null) {
3660       LOG.warn("Found Compaction WAL edit for deleted family:" +
3661           Bytes.toString(compaction.getFamilyName().toByteArray()));
3662       return;
3663     }
3664     store.completeCompactionMarker(compaction);
3665   }
3666 
3667   /**
3668    * Used by tests
3669    * @param s Store to add edit to.
3670    * @param kv KeyValue to add.
3671    * @return True if we should flush.
3672    */
3673   protected boolean restoreEdit(final Store s, final KeyValue kv) {
3674     long kvSize = s.add(kv);
3675     if (this.rsAccounting != null) {
3676       rsAccounting.addAndGetRegionReplayEditsSize(this.getRegionName(), kvSize);
3677     }
3678     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
3679   }
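  // Note (descriptive of the code above): replayed edit sizes are tracked separately in
  // rsAccounting so that the global memstore accounting can be corrected once replay finishes --
  // see the clearRegionReplayEditsSize call in replayRecoveredEditsIfAny above.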
3680 
3681   /*
3682    * @param fs
3683    * @param p File to check.
3684    * @return True if file was zero-length (and if so, we'll delete it in here).
3685    * @throws IOException
3686    */
3687   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
3688       throws IOException {
3689     FileStatus stat = fs.getFileStatus(p);
3690     if (stat.getLen() > 0) return false;
3691     LOG.warn("File " + p + " is zero-length, deleting.");
3692     fs.delete(p, false);
3693     return true;
3694   }
3695 
3696   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
3697     return new HStore(this, family, this.conf);
3698   }
3699 
3700   /**
3701    * Return HStore instance.
3702    * Use with caution.  Exposed for use of fixup utilities.
3703    * @param column Name of column family hosted by this region.
3704    * @return Store that goes with the family on passed <code>column</code>.
3705    * TODO: Make this lookup faster.
3706    */
3707   public Store getStore(final byte[] column) {
3708     return this.stores.get(column);
3709   }
3710 
3711   public Map<byte[], Store> getStores() {
3712     return this.stores;
3713   }
3714 
3715   /**
3716    * Return list of storeFiles for the set of CFs.
3717    * Uses closeLock to prevent the race condition where a region closes
3718    * in the middle of the for loop; if the stores were closed one by one,
3719    * some stores would return 0 files.
3720    * @return List of storeFiles.
3721    */
3722   public List<String> getStoreFileList(final byte [][] columns)
3723     throws IllegalArgumentException {
3724     List<String> storeFileNames = new ArrayList<String>();
3725     synchronized(closeLock) {
3726       for(byte[] column : columns) {
3727         Store store = this.stores.get(column);
3728         if (store == null) {
3729           throw new IllegalArgumentException("No column family : " +
3730               new String(column) + " available");
3731         }
3732         for (StoreFile storeFile: store.getStorefiles()) {
3733           storeFileNames.add(storeFile.getPath().toString());
3734         }
3735       }
3736     }
3737     return storeFileNames;
3738   }
3739   //////////////////////////////////////////////////////////////////////////////
3740   // Support code
3741   //////////////////////////////////////////////////////////////////////////////
3742 
3743   /** Make sure this is a valid row for the HRegion */
3744   void checkRow(final byte [] row, String op) throws IOException {
3745     if (!rowIsInRange(getRegionInfo(), row)) {
3746       throw new WrongRegionException("Requested row out of range for " +
3747           op + " on HRegion " + this + ", startKey='" +
3748           Bytes.toStringBinary(getStartKey()) + "', getEndKey()='" +
3749           Bytes.toStringBinary(getEndKey()) + "', row='" +
3750           Bytes.toStringBinary(row) + "'");
3751     }
3752   }
3753 
3754   /**
3755    * Tries to acquire a lock on the given row.
3756    * @param waitForLock if true, will block until the lock is available.
3757    *        Otherwise, just tries to obtain the lock and returns
3758    *        false if unavailable.
3759    * @return the row lock if acquired,
3760    *   null if waitForLock was false and the lock was not acquired
3761    * @throws IOException if waitForLock was true and the lock could not be acquired after waiting
3762    */
3763   public RowLock getRowLock(byte[] row, boolean waitForLock) throws IOException {
3764     startRegionOperation();
3765     try {
3766       return getRowLockInternal(row, waitForLock);
3767     } finally {
3768       closeRegionOperation();
3769     }
3770   }
3771 
3772   /**
3773    * A version of getRowLock(byte[], boolean) to use when a region operation has already been
3774    * started (the calling thread has already acquired the region-close-lock).
3775    */
3776   protected RowLock getRowLockInternal(byte[] row, boolean waitForLock) throws IOException {
3777     HashedBytes rowKey = new HashedBytes(row);
3778     RowLockContext rowLockContext = new RowLockContext(rowKey);
3779 
3780     // loop until we acquire the row lock (unless !waitForLock)
3781     while (true) {
3782       RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
3783       if (existingContext == null) {
3784         // Row is not already locked by any thread, use newly created context.
3785         break;
3786       } else if (existingContext.ownedByCurrentThread()) {
3787         // Row is already locked by current thread, reuse existing context instead.
3788         rowLockContext = existingContext;
3789         break;
3790       } else {
3791         // Row is already locked by some other thread, give up or wait for it
3792         if (!waitForLock) {
3793           return null;
3794         }
3795         TraceScope traceScope = null;
3796         try {
3797           if (Trace.isTracing()) {
3798             traceScope = Trace.startSpan("HRegion.getRowLockInternal");
3799           }
3800           if (!existingContext.latch.await(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
3801             if(traceScope != null) {
3802               traceScope.getSpan().addTimelineAnnotation("Failed to get row lock");
3803             }
3804             throw new IOException("Timed out waiting for lock for row: " + rowKey);
3805           }
3806           if (traceScope != null) traceScope.close();
3807           traceScope = null;
3808         } catch (InterruptedException ie) {
3809           LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
3810           InterruptedIOException iie = new InterruptedIOException();
3811           iie.initCause(ie);
3812           throw iie;
3813         } finally {
3814           if (traceScope != null) traceScope.close();
3815         }
3816       }
3817     }
3818 
3819     // allocate new lock for this thread
3820     return rowLockContext.newLock();
3821   }
3822 
3823   /**
3824    * Acquires a lock on the given row.
3825    * The same thread may acquire multiple locks on the same row.
3826    * @return the acquired row lock
3827    * @throws IOException if the lock could not be acquired after waiting
3828    */
3829   public RowLock getRowLock(byte[] row) throws IOException {
3830     return getRowLock(row, true);
3831   }
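  // Typical usage pattern (illustrative only): acquire the lock, mutate, and always release it
  // in a finally block, e.g.
  //
  //   RowLock lock = region.getRowLock(row);
  //   try {
  //     // ... mutate under the lock ...
  //   } finally {
  //     lock.release();
  //   }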
3832 
3833   /**
3834    * If the given list of row locks is not null, releases all locks.
3835    */
3836   public void releaseRowLocks(List<RowLock> rowLocks) {
3837     if (rowLocks != null) {
3838       for (RowLock rowLock : rowLocks) {
3839         rowLock.release();
3840       }
3841       rowLocks.clear();
3842     }
3843   }
3844 
3845   /**
3846    * Determines whether multiple column families are present.
3847    * Precondition: familyPaths is not null
3848    *
3849    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
3850    */
3851   private static boolean hasMultipleColumnFamilies(
3852       List<Pair<byte[], String>> familyPaths) {
3853     boolean multipleFamilies = false;
3854     byte[] family = null;
3855     for (Pair<byte[], String> pair : familyPaths) {
3856       byte[] fam = pair.getFirst();
3857       if (family == null) {
3858         family = fam;
3859       } else if (!Bytes.equals(family, fam)) {
3860         multipleFamilies = true;
3861         break;
3862       }
3863     }
3864     return multipleFamilies;
3865   }
3866 
3867 
3868   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths,
3869                                 boolean assignSeqId) throws IOException {
3870     return bulkLoadHFiles(familyPaths, assignSeqId, null);
3871   }
3872 
3873   /**
3874    * Attempts to atomically load a group of hfiles.  This is critical for loading
3875    * rows with multiple column families atomically.
3876    *
3877    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
3878    * @param bulkLoadListener Internal hooks enabling massaging/preparation of a
3879    * file about to be bulk loaded
3880    * @param assignSeqId
3881    * @return true if successful, false if failed recoverably
3882    * @throws IOException if failed unrecoverably.
3883    */
3884   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths, boolean assignSeqId,
3885       BulkLoadListener bulkLoadListener) throws IOException {
3886     Preconditions.checkNotNull(familyPaths);
3887     // we need writeLock for multi-family bulk load
3888     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
3889     try {
3890       this.writeRequestsCount.increment();
3891 
3892       // There possibly was a split that happened between when the split keys
3893       // were gathered and before the HRegion's write lock was taken.  We need
3894       // to validate each HFile against the region before attempting to bulk load all of them.
3895       List<IOException> ioes = new ArrayList<IOException>();
3896       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
3897       for (Pair<byte[], String> p : familyPaths) {
3898         byte[] familyName = p.getFirst();
3899         String path = p.getSecond();
3900 
3901         Store store = getStore(familyName);
3902         if (store == null) {
3903           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
3904               "No such column family " + Bytes.toStringBinary(familyName));
3905           ioes.add(ioe);
3906         } else {
3907           try {
3908             store.assertBulkLoadHFileOk(new Path(path));
3909           } catch (WrongRegionException wre) {
3910             // recoverable (file doesn't fit in region)
3911             failures.add(p);
3912           } catch (IOException ioe) {
3913             // unrecoverable (hdfs problem)
3914             ioes.add(ioe);
3915           }
3916         }
3917       }
3918 
3919       // validation failed because of some sort of IO problem.
3920       if (ioes.size() != 0) {
3921         IOException e = MultipleIOException.createIOException(ioes);
3922         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
3923         throw e;
3924       }
3925 
3926       // validation failed, bail out before doing anything permanent.
3927       if (failures.size() != 0) {
3928         StringBuilder list = new StringBuilder();
3929         for (Pair<byte[], String> p : failures) {
3930           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
3931             .append(p.getSecond());
3932         }
3933         // problem when validating
3934         LOG.warn("There was a recoverable bulk load failure likely due to a" +
3935             " split.  These (family, HFile) pairs were not loaded: " + list);
3936         return false;
3937       }
3938 
3939       long seqId = -1;
3940       // We need to assign a sequential ID that's in between two memstores in order to preserve
3941       // the guarantee that all the edits lower than the highest sequential ID from all the
3942       // HFiles are flushed on disk. See HBASE-10958.
3943       if (assignSeqId) {
3944         FlushResult fs = this.flushcache();
3945         if (fs.isFlushSucceeded()) {
3946           seqId = fs.flushSequenceId;
3947         } else if (fs.result == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
3948           seqId = this.sequenceId.incrementAndGet();
3949         } else {
3950           throw new IOException("Could not bulk load with an assigned sequential ID because the " +
3951               "flush didn't run. Reason for not flushing: " + fs.failureReason);
3952         }
3953       }
3954 
3955       for (Pair<byte[], String> p : familyPaths) {
3956         byte[] familyName = p.getFirst();
3957         String path = p.getSecond();
3958         Store store = getStore(familyName);
3959         try {
3960           String finalPath = path;
3961           if(bulkLoadListener != null) {
3962             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
3963           }
3964           store.bulkLoadHFile(finalPath, seqId);
3965           if(bulkLoadListener != null) {
3966             bulkLoadListener.doneBulkLoad(familyName, path);
3967           }
3968         } catch (IOException ioe) {
3969           // A failure here can cause an atomicity violation that we currently
3970           // cannot recover from since it is likely a failed HDFS operation.
3971 
3972           // TODO Need a better story for reverting partial failures due to HDFS.
3973           LOG.error("There was a partial failure due to IO when attempting to" +
3974               " load " + Bytes.toString(p.getFirst()) + " : "+ p.getSecond(), ioe);
3975           if(bulkLoadListener != null) {
3976             try {
3977               bulkLoadListener.failedBulkLoad(familyName, path);
3978             } catch (Exception ex) {
3979               LOG.error("Error while calling failedBulkLoad for family "+
3980                   Bytes.toString(familyName)+" with path "+path, ex);
3981             }
3982           }
3983           throw ioe;
3984         }
3985       }
3986       return true;
3987     } finally {
3988       closeBulkRegionOperation();
3989     }
3990   }
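
  // Illustrative usage sketch (not part of the original source): how a caller might drive
  // bulkLoadHFiles(). The region instance, column family name and staging path below are
  // assumed example values.
  //
  //   List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
  //   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf"),
  //       "/hbase/staging/cf/hfile-0001"));
  //   boolean loaded = region.bulkLoadHFiles(familyPaths, true, null);
  //   if (!loaded) {
  //     // Recoverable validation failure (e.g. a concurrent split); re-split the HFiles
  //     // along the new region boundaries and retry.
  //   }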
3991 
3992   @Override
3993   public boolean equals(Object o) {
3994     return o instanceof HRegion && Bytes.equals(this.getRegionName(),
3995                                                 ((HRegion) o).getRegionName());
3996   }
3997 
3998   @Override
3999   public int hashCode() {
4000     return Bytes.hashCode(this.getRegionName());
4001   }
4002 
4003   @Override
4004   public String toString() {
4005     return this.getRegionNameAsString();
4006   }
4007 
4008   /**
4009    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
4010    */
4011   class RegionScannerImpl implements RegionScanner {
4012     // Package local for testability
4013     KeyValueHeap storeHeap = null;
4014     /** Heap of key-values that are not essential for the provided filters and are thus read
4015      * on demand, if on-demand column family loading is enabled.*/
4016     KeyValueHeap joinedHeap = null;
4017     /**
4018      * If the joined heap data gathering is interrupted due to scan limits, this will
4019      * contain the row for which we are populating the values.*/
4020     protected KeyValue joinedContinuationRow = null;
4021     // KeyValue indicating that limit is reached when scanning
4022     private final KeyValue KV_LIMIT = new KeyValue();
4023     protected final byte[] stopRow;
4024     private final FilterWrapper filter;
4025     private int batch;
4026     protected int isScan;
4027     private boolean filterClosed = false;
4028     private long readPt;
4029     private long maxResultSize;
4030     protected HRegion region;
4031 
4032     @Override
4033     public HRegionInfo getRegionInfo() {
4034       return region.getRegionInfo();
4035     }
4036 
4037     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
4038         throws IOException {
4039 
4040       this.region = region;
4041       this.maxResultSize = scan.getMaxResultSize();
4042       if (scan.hasFilter()) {
4043         this.filter = new FilterWrapper(scan.getFilter());
4044       } else {
4045         this.filter = null;
4046       }
4047 
4048       this.batch = scan.getBatch();
4049       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
4050         this.stopRow = null;
4051       } else {
4052         this.stopRow = scan.getStopRow();
4053       }
4054       // If we are doing a get, we want the range to be [startRow,endRow]; normally
4055       // it is [startRow,endRow) and if startRow=endRow we would get nothing.
4056       this.isScan = scan.isGetScan() ? -1 : 0;
4057 
4058       // synchronize on scannerReadPoints so that nobody calculates
4059       // getSmallestReadPoint, before scannerReadPoints is updated.
4060       IsolationLevel isolationLevel = scan.getIsolationLevel();
4061       synchronized(scannerReadPoints) {
4062         this.readPt = getReadpoint(isolationLevel);
4063         scannerReadPoints.put(this, this.readPt);
4064       }
4065 
4066       // Here we separate all scanners into two lists - scanner that provide data required
4067       // by the filter to operate (scanners list) and all others (joinedScanners list).
4068       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
4069       List<KeyValueScanner> joinedScanners =
4070           new ArrayList<KeyValueScanner>(scan.getFamilyMap().size());
4071       if (additionalScanners != null) {
4072         scanners.addAll(additionalScanners);
4073       }
4074 
4075       for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
4076           scan.getFamilyMap().entrySet()) {
4077         Store store = stores.get(entry.getKey());
4078         KeyValueScanner scanner;
4079         try {
4080           scanner = store.getScanner(scan, entry.getValue(), this.readPt);
4081         } catch (FileNotFoundException e) {
4082           abortRegionServer(e.getMessage());
4083           throw new NotServingRegionException(region.getRegionNameAsString() + " is closing");
4084         }
4085         if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
4086           || this.filter.isFamilyEssential(entry.getKey())) {
4087           scanners.add(scanner);
4088         } else {
4089           joinedScanners.add(scanner);
4090         }
4091       }
4092       initializeKVHeap(scanners, joinedScanners, region);
4093     }
4094 
4095     RegionScannerImpl(Scan scan, HRegion region) throws IOException {
4096       this(scan, null, region);
4097     }
4098 
4099     protected void initializeKVHeap(List<KeyValueScanner> scanners,
4100         List<KeyValueScanner> joinedScanners, HRegion region)
4101         throws IOException {
4102       this.storeHeap = new KeyValueHeap(scanners, region.comparator);
4103       if (!joinedScanners.isEmpty()) {
4104         this.joinedHeap = new KeyValueHeap(joinedScanners, region.comparator);
4105       }
4106     }
4107 
4108     @Override
4109     public long getMaxResultSize() {
4110       return maxResultSize;
4111     }
4112 
4113     @Override
4114     public long getMvccReadPoint() {
4115       return this.readPt;
4116     }
4117 
4118     /**
4119      * Reset the filter, if one has been set.
4120      *
4121      * @throws IOException in case a filter raises an I/O exception.
4122      */
4123     protected void resetFilters() throws IOException {
4124       if (filter != null) {
4125         filter.reset();
4126       }
4127     }
4128 
4129     @Override
4130     public boolean next(List<Cell> outResults)
4131         throws IOException {
4132       // apply the batching limit by default
4133       return next(outResults, batch);
4134     }
4135 
4136     @Override
4137     public synchronized boolean next(List<Cell> outResults, int limit) throws IOException {
4138       if (this.filterClosed) {
4139         throw new UnknownScannerException("Scanner was closed (timed out?) " +
4140             "after we renewed it. Could be caused by a very slow scanner " +
4141             "or a lengthy garbage collection");
4142       }
4143       startRegionOperation(Operation.SCAN);
4144       readRequestsCount.increment();
4145       try {
4146         boolean returnResult = nextRaw(outResults, limit);
4147         if (region != null && region.metricsRegion != null) {
4148           long totalSize = 0;
4149           for (Cell cell: outResults) {
4150             KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
4151             totalSize += kv.getLength();
4152           }
4153           region.metricsRegion.updateScanNext(totalSize);
4154         }
4155         return returnResult;
4156       } finally {
4157         closeRegionOperation(Operation.SCAN);
4158       }
4159     }
4160 
4161     @Override
4162     public boolean nextRaw(List<Cell> outResults)
4163         throws IOException {
4164       return nextRaw(outResults, batch);
4165     }
4166 
4167     @Override
4168     public boolean nextRaw(List<Cell> outResults, int limit) throws IOException {
4169       if (storeHeap == null) {
4170         // scanner is closed
4171         throw new UnknownScannerException("Scanner was closed");
4172       }
4173       boolean returnResult;
4174       if (outResults.isEmpty()) {
4175         // Usually outResults is empty. This is true when next is called
4176         // to handle scan or get operation.
4177         returnResult = nextInternal(outResults, limit);
4178       } else {
4179         List<Cell> tmpList = new ArrayList<Cell>();
4180         returnResult = nextInternal(tmpList, limit);
4181         outResults.addAll(tmpList);
4182       }
4183       resetFilters();
4184       if (isFilterDoneInternal()) {
4185         returnResult = false;
4186       }
4187       return returnResult;
4188     }
4189 
4190     private void populateFromJoinedHeap(List<Cell> results, int limit)
4191         throws IOException {
4192       assert joinedContinuationRow != null;
4193       KeyValue kv = populateResult(results, this.joinedHeap, limit,
4194           joinedContinuationRow.getBuffer(), joinedContinuationRow.getRowOffset(),
4195           joinedContinuationRow.getRowLength());
4196       if (kv != KV_LIMIT) {
4197         // We are done with this row, reset the continuation.
4198         joinedContinuationRow = null;
4199       }
4200       // As the data is obtained from two independent heaps, we need to
4201       // ensure that result list is sorted, because Result relies on that.
4202       Collections.sort(results, comparator);
4203     }
4204 
4205     /**
4206      * Fetches records for the current row into the results list, until the next row or the limit (if not -1).
4207      * @param results output list the fetched cells are added to
4208      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
4209      * @param limit Max amount of KVs to place in result list, -1 means no limit.
4210      * @param currentRow Byte array with key we are fetching.
4211      * @param offset offset for currentRow
4212      * @param length length for currentRow
4213      * @return KV_LIMIT if limit reached, next KeyValue otherwise.
4214      */
4215     private KeyValue populateResult(List<Cell> results, KeyValueHeap heap, int limit,
4216         byte[] currentRow, int offset, short length) throws IOException {
4217       KeyValue nextKv;
4218       try {
4219         do {
4220           heap.next(results, limit - results.size());
4221           if (limit > 0 && results.size() == limit) {
4222             return KV_LIMIT;
4223           }
4224           nextKv = heap.peek();
4225         } while (nextKv != null && nextKv.matchingRow(currentRow, offset, length));
4226       } catch (FileNotFoundException e) {
4227         abortRegionServer(e.getMessage());
4228         throw new NotServingRegionException(region.getRegionNameAsString() + " is closing");
4229       }
4230       return nextKv;
4231     }
4232 
4233     /*
4234      * @return True if a filter rules that the scan is over (done).
4235      */
4236     @Override
4237     public synchronized boolean isFilterDone() throws IOException {
4238       return isFilterDoneInternal();
4239     }
4240 
4241     private boolean isFilterDoneInternal() throws IOException {
4242       return this.filter != null && this.filter.filterAllRemaining();
4243     }
4244 
4245     private boolean nextInternal(List<Cell> results, int limit)
4246     throws IOException {
4247       if (!results.isEmpty()) {
4248         throw new IllegalArgumentException("First parameter should be an empty list");
4249       }
4250       RpcCallContext rpcCall = RpcServer.getCurrentCall();
4251       // The loop here is used only when, at some point during next(), we determine
4252       // that due to effects of filters or otherwise, we have an empty row in the result.
4253       // Then we loop and try again. Otherwise, we must get out on the first iteration via return:
4254       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
4255       // and joinedHeap has no more data to read for the last row, joinedContinuationRow, if set).
4256       while (true) {
4257         if (rpcCall != null) {
4258           // If a user specifies a too-restrictive or too-slow scanner, the
4259           // client might time out and disconnect while the server side
4260           // is still processing the request. We should abort aggressively
4261           // in that case.
4262           long afterTime = rpcCall.disconnectSince();
4263           if (afterTime >= 0) {
4264             throw new CallerDisconnectedException(
4265                 "Aborting on region " + getRegionNameAsString() + ", call " +
4266                     this + " after " + afterTime + " ms, since " +
4267                     "caller disconnected");
4268           }
4269         }
4270 
4271         // Let's see what we have in the storeHeap.
4272         KeyValue current = this.storeHeap.peek();
4273 
4274         byte[] currentRow = null;
4275         int offset = 0;
4276         short length = 0;
4277         if (current != null) {
4278           currentRow = current.getBuffer();
4279           offset = current.getRowOffset();
4280           length = current.getRowLength();
4281         }
4282         boolean stopRow = isStopRow(currentRow, offset, length);
4283         // Check if we were getting data from the joinedHeap and hit the limit.
4284         // If not, then it's main path - getting results from storeHeap.
4285         if (joinedContinuationRow == null) {
4286           // First, check if we are at a stop row. If so, there are no more results.
4287           if (stopRow) {
4288             if (filter != null && filter.hasFilterRow()) {
4289               filter.filterRowCells(results);
4290             }
4291             return false;
4292           }
4293 
4294           // Check if rowkey filter wants to exclude this row. If so, loop to next.
4295           // Technically, if we hit limits before on this row, we don't need this call.
4296           if (filterRowKey(currentRow, offset, length)) {
4297             boolean moreRows = nextRow(currentRow, offset, length);
4298             if (!moreRows) return false;
4299             results.clear();
4300             continue;
4301           }
4302 
4303           KeyValue nextKv = populateResult(results, this.storeHeap, limit, currentRow, offset,
4304               length);
4305           // Ok, we are good, let's try to get some results from the main heap.
4306           if (nextKv == KV_LIMIT) {
4307             if (this.filter != null && filter.hasFilterRow()) {
4308               throw new IncompatibleFilterException(
4309                 "Filter whose hasFilterRow() returns true is incompatible with scan with limit!");
4310             }
4311             return true; // We hit the limit.
4312           }
4313 
4314           stopRow = nextKv == null ||
4315               isStopRow(nextKv.getBuffer(), nextKv.getRowOffset(), nextKv.getRowLength());
4316           // Save whether the row was empty before filters were applied to it.
4317           final boolean isEmptyRow = results.isEmpty();
4318 
4319           // We have the part of the row necessary for filtering (all of it, usually).
4320           // First filter with the filterRow(List).
4321           FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
4322           if (filter != null && filter.hasFilterRow()) {
4323             ret = filter.filterRowCellsWithRet(results);
4324           }
4325 
4326           if ((isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE) || filterRow()) {
4327             results.clear();
4328             boolean moreRows = nextRow(currentRow, offset, length);
4329             if (!moreRows) return false;
4330 
4331             // This row was totally filtered out, if this is NOT the last row,
4332             // we should continue on. Otherwise, nothing else to do.
4333             if (!stopRow) continue;
4334             return false;
4335           }
4336 
4337           // Ok, we are done with storeHeap for this row.
4338           // Now we may need to fetch additional, non-essential data into row.
4339           // These values are not needed for filter to work, so we postpone their
4340           // fetch to (possibly) reduce amount of data loads from disk.
4341           if (this.joinedHeap != null) {
4342             KeyValue nextJoinedKv = joinedHeap.peek();
4343             // If joinedHeap is pointing to some other row, try to seek to a correct one.
4344             boolean mayHaveData =
4345               (nextJoinedKv != null && nextJoinedKv.matchingRow(currentRow, offset, length))
4346               || (this.joinedHeap.requestSeek(KeyValue.createFirstOnRow(currentRow, offset, length),
4347                 true, true)
4348                 && joinedHeap.peek() != null
4349                 && joinedHeap.peek().matchingRow(currentRow, offset, length));
4350             if (mayHaveData) {
4351               joinedContinuationRow = current;
4352               populateFromJoinedHeap(results, limit);
4353             }
4354           }
4355         } else {
4356           // Populating from the joined heap was stopped by limits, populate some more.
4357           populateFromJoinedHeap(results, limit);
4358         }
4359 
4360         // We may have just called populateFromJoinedHeap and hit the limits. If that is
4361         // the case, we need to call it again on the next next() invocation.
4362         if (joinedContinuationRow != null) {
4363           return true;
4364         }
4365 
4366         // Finally, we are done with both joinedHeap and storeHeap.
4367         // Double check to prevent empty rows from appearing in result. It could be
4368         // the case when SingleColumnValueExcludeFilter is used.
4369         if (results.isEmpty()) {
4370           boolean moreRows = nextRow(currentRow, offset, length);
4371           if (!moreRows) return false;
4372           if (!stopRow) continue;
4373         }
4374 
4375         // We are done. Return the result.
4376         return !stopRow;
4377       }
4378     }
4379 
4380     /**
4381     * This function is here to maintain backward compatibility for 0.94 filters. HBASE-6429
4382     * combines the filterRow() and filterRow(List<KeyValue> kvs) functions. Code written for
4383     * 0.94 or older may not implement hasFilterRow() as HBASE-6429 expects, because in 0.94
4384     * hasFilterRow() only returns true when filterRow(List<KeyValue> kvs) is overridden, not
4385     * filterRow(). Without this method, filterRow() would be skipped for such filters.
4386      */
4387     private boolean filterRow() throws IOException {
4388       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
4389       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
4390       return filter != null && (!filter.hasFilterRow())
4391           && filter.filterRow();
4392     }
4393 
4394     private boolean filterRowKey(byte[] row, int offset, short length) throws IOException {
4395       return filter != null
4396           && filter.filterRowKey(row, offset, length);
4397     }
4398 
4399     protected boolean nextRow(byte [] currentRow, int offset, short length) throws IOException {
4400       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
4401       KeyValue next;
4402       while ((next = this.storeHeap.peek()) != null &&
4403              next.matchingRow(currentRow, offset, length)) {
4404         this.storeHeap.next(MOCKED_LIST);
4405       }
4406       resetFilters();
4407       // Calling the hook in CP which allows it to do a fast forward
4408       return this.region.getCoprocessorHost() == null
4409           || this.region.getCoprocessorHost()
4410               .postScannerFilterRow(this, currentRow, offset, length);
4411     }
4412 
4413     protected boolean isStopRow(byte[] currentRow, int offset, short length) {
4414       return currentRow == null ||
4415           (stopRow != null &&
4416           comparator.compareRows(stopRow, 0, stopRow.length,
4417             currentRow, offset, length) <= isScan);
4418     }
4419 
4420     @Override
4421     public synchronized void close() {
4422       if (storeHeap != null) {
4423         storeHeap.close();
4424         storeHeap = null;
4425       }
4426       if (joinedHeap != null) {
4427         joinedHeap.close();
4428         joinedHeap = null;
4429       }
4430       // no need to synchronize here.
4431       scannerReadPoints.remove(this);
4432       this.filterClosed = true;
4433     }
4434 
4435     KeyValueHeap getStoreHeapForTesting() {
4436       return storeHeap;
4437     }
4438 
4439     @Override
4440     public synchronized boolean reseek(byte[] row) throws IOException {
4441       if (row == null) {
4442         throw new IllegalArgumentException("Row cannot be null.");
4443       }
4444       boolean result = false;
4445       startRegionOperation();
4446       try {
4447         KeyValue kv = KeyValue.createFirstOnRow(row);
4448         // use request seek to make use of the lazy seek option. See HBASE-5520
4449         result = this.storeHeap.requestSeek(kv, true, true);
4450         if (this.joinedHeap != null) {
4451           result = this.joinedHeap.requestSeek(kv, true, true) || result;
4452         }
4453       } catch (FileNotFoundException e) {
4454         abortRegionServer(e.getMessage());
4455         throw new NotServingRegionException(region.getRegionNameAsString() + " is closing");
4456       } finally {
4457         closeRegionOperation();
4458       }
4459       return result;
4460     }
4461   }
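
  // Illustrative sketch (not part of the original source): driving a RegionScanner obtained
  // from HRegion#getScanner(Scan) until it is exhausted. The region and Scan are assumed
  // example values.
  //
  //   RegionScanner scanner = region.getScanner(new Scan());
  //   try {
  //     List<Cell> cells = new ArrayList<Cell>();
  //     boolean moreRows;
  //     do {
  //       moreRows = scanner.next(cells);
  //       // ... consume the cells for the current row ...
  //       cells.clear();
  //     } while (moreRows);
  //   } finally {
  //     scanner.close();
  //   }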
4462 
4463   // Utility methods
4464   /**
4465    * A utility method to create new instances of HRegion based on the
4466    * {@link HConstants#REGION_IMPL} configuration property.
4467    * @param tableDir qualified path of directory where region should be located,
4468    * usually the table directory.
4469    * @param log The HLog is the outbound log for any updates to the HRegion
4470    * (There's a single HLog for all the HRegions on a single HRegionServer.)
4471    * The log file is a logfile from the previous execution that's
4472    * custom-computed for this HRegion. The HRegionServer computes and sorts the
4473    * appropriate log info for this HRegion. If there is a previous log file
4474    * (implying that the HRegion has been written-to before), then read it from
4475    * the supplied path.
4476    * @param fs is the filesystem.
4477    * @param conf is global configuration settings.
4478    * @param regionInfo - HRegionInfo that describes the region to instantiate
4480    * @param htd the table descriptor
4481    * @param rsServices
4482    * @return the new instance
4483    */
4484   static HRegion newHRegion(Path tableDir, HLog log, FileSystem fs,
4485       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
4486       RegionServerServices rsServices) {
4487     try {
4488       @SuppressWarnings("unchecked")
4489       Class<? extends HRegion> regionClass =
4490           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
4491 
4492       Constructor<? extends HRegion> c =
4493           regionClass.getConstructor(Path.class, HLog.class, FileSystem.class,
4494               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
4495               RegionServerServices.class);
4496 
4497       return c.newInstance(tableDir, log, fs, conf, regionInfo, htd, rsServices);
4498     } catch (Throwable e) {
4499       // todo: what should I throw here?
4500       throw new IllegalStateException("Could not instantiate a region instance.", e);
4501     }
4502   }
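
  // Illustrative sketch (not part of the original source): plugging a custom HRegion
  // subclass into newHRegion() via the HConstants#REGION_IMPL property. MyCustomRegion is
  // a hypothetical class used only for this example.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setClass(HConstants.REGION_IMPL, MyCustomRegion.class, HRegion.class);
  //   // newHRegion() will now reflectively construct MyCustomRegion, provided it declares
  //   // the expected (Path, HLog, FileSystem, Configuration, HRegionInfo, HTableDescriptor,
  //   // RegionServerServices) constructor.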
4503 
4504   /**
4505    * Convenience method creating new HRegions. Used by createTable and by the
4506    * bootstrap code in the HMaster constructor.
4507    * Note, this method creates an {@link HLog} for the created region. It
4508    * needs to be closed explicitly.  Use {@link HRegion#getLog()} to get
4509    * access.  <b>When done with a region created using this method, you will
4510    * need to explicitly close the {@link HLog} it created too; it will not be
4511    * done for you.  Not closing the log will leave at least a daemon thread
4512    * running.</b>  Call {@link #closeHRegion(HRegion)} and it will do
4513    * necessary cleanup for you.
4514    * @param info Info for region to create.
4515    * @param rootDir Root directory for HBase instance
4516    * @param conf
4517    * @param hTableDescriptor
4518    * @return new HRegion
4519    *
4520    * @throws IOException
4521    */
4522   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4523       final Configuration conf, final HTableDescriptor hTableDescriptor)
4524   throws IOException {
4525     return createHRegion(info, rootDir, conf, hTableDescriptor, null);
4526   }
4527 
4528   /**
4529    * This will do the necessary cleanup a call to
4530    * {@link #createHRegion(HRegionInfo, Path, Configuration, HTableDescriptor)}
4531    * requires.  This method will close the region and then close its
4532    * associated {@link HLog} file.  You can also use it after calling the other
4533    * createHRegion, the one that takes an {@link HLog} instance, but do not be
4534    * surprised by the call to {@link HLog#closeAndDelete()} on the {@link HLog} the
4535    * HRegion was carrying.
4536    * @param r
4537    * @throws IOException
4538    */
4539   public static void closeHRegion(final HRegion r) throws IOException {
4540     if (r == null) return;
4541     r.close();
4542     if (r.getLog() == null) return;
4543     r.getLog().closeAndDelete();
4544   }
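
  // Illustrative sketch (not part of the original source): pairing createHRegion with
  // closeHRegion so the HLog created for the region is shut down as well. The info, rootDir
  // and table descriptor are assumed example values.
  //
  //   HRegion region = HRegion.createHRegion(info, rootDir, conf, htd);
  //   try {
  //     // ... use the region ...
  //   } finally {
  //     HRegion.closeHRegion(region);  // closes the region and its HLog
  //   }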
4545 
4546   /**
4547    * Convenience method creating new HRegions. Used by createTable.
4548    * The {@link HLog} for the created region needs to be closed explicitly.
4549    * Use {@link HRegion#getLog()} to get access.
4550    *
4551    * @param info Info for region to create.
4552    * @param rootDir Root directory for HBase instance
4553    * @param conf
4554    * @param hTableDescriptor
4555    * @param hlog shared HLog
4556    * @param initialize - true to initialize the region
4557    * @return new HRegion
4558    *
4559    * @throws IOException
4560    */
4561   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4562                                       final Configuration conf,
4563                                       final HTableDescriptor hTableDescriptor,
4564                                       final HLog hlog,
4565                                       final boolean initialize)
4566       throws IOException {
4567     return createHRegion(info, rootDir, conf, hTableDescriptor,
4568         hlog, initialize, false);
4569   }
4570 
4571   /**
4572    * Convenience method creating new HRegions. Used by createTable.
4573    * The {@link HLog} for the created region needs to be closed
4574    * explicitly, if it is not null.
4575    * Use {@link HRegion#getLog()} to get access.
4576    *
4577    * @param info Info for region to create.
4578    * @param rootDir Root directory for HBase instance
4579    * @param conf
4580    * @param hTableDescriptor
4581    * @param hlog shared HLog
4582    * @param initialize - true to initialize the region
4583    * @param ignoreHLog - true to skip generating a new hlog if it is null, mostly for createTable
4584    * @return new HRegion
4585    * @throws IOException
4586    */
4587   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4588                                       final Configuration conf,
4589                                       final HTableDescriptor hTableDescriptor,
4590                                       final HLog hlog,
4591                                       final boolean initialize, final boolean ignoreHLog)
4592       throws IOException {
4593       Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
4594       return createHRegion(info, rootDir, tableDir, conf, hTableDescriptor, hlog, initialize, ignoreHLog);
4595   }
4596 
4597   /**
4598    * Convenience method creating new HRegions. Used by createTable.
4599    * The {@link HLog} for the created region needs to be closed
4600    * explicitly, if it is not null.
4601    * Use {@link HRegion#getLog()} to get access.
4602    *
4603    * @param info Info for region to create.
4604    * @param rootDir Root directory for HBase instance
4605    * @param tableDir table directory
4606    * @param conf
4607    * @param hTableDescriptor
4608    * @param hlog shared HLog
4609    * @param initialize - true to initialize the region
4610    * @param ignoreHLog - true to skip generating a new hlog if it is null, mostly for createTable
4611    * @return new HRegion
4612    * @throws IOException
4613    */
4614   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir, final Path tableDir,
4615                                       final Configuration conf,
4616                                       final HTableDescriptor hTableDescriptor,
4617                                       final HLog hlog,
4618                                       final boolean initialize, final boolean ignoreHLog)
4619       throws IOException {
4620     LOG.info("creating HRegion " + info.getTable().getNameAsString()
4621         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir +
4622         " Table name == " + info.getTable().getNameAsString());
4623     FileSystem fs = FileSystem.get(conf);
4624     HRegionFileSystem rfs = HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
4625     HLog effectiveHLog = hlog;
4626     if (hlog == null && !ignoreHLog) {
4627       effectiveHLog = HLogFactory.createHLog(fs, rfs.getRegionDir(),
4628                                              HConstants.HREGION_LOGDIR_NAME, conf);
4629     }
4630     HRegion region = HRegion.newHRegion(tableDir,
4631         effectiveHLog, fs, conf, info, hTableDescriptor, null);
4632     if (initialize) {
4633       // If initializing, set the sequenceId. It is also required by HLogPerformanceEvaluation when
4634       // verifying the WALEdits.
4635       region.setSequenceId(region.initialize());
4636     }
4637     return region;
4638   }
4639 
4640   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4641                                       final Configuration conf,
4642                                       final HTableDescriptor hTableDescriptor,
4643                                       final HLog hlog)
4644     throws IOException {
4645     return createHRegion(info, rootDir, conf, hTableDescriptor, hlog, true);
4646   }
4647 
4648 
4649   /**
4650    * Open a Region.
4651    * @param info Info for region to be opened.
4652    * @param wal HLog for region to use. This method will call
4653    * HLog#setSequenceNumber(long) passing the result of the call to
4654    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4655    * up.  HRegionServer does this every time it opens a new region.
4656    * @param conf
4657    * @return new HRegion
4658    *
4659    * @throws IOException
4660    */
4661   public static HRegion openHRegion(final HRegionInfo info,
4662       final HTableDescriptor htd, final HLog wal,
4663       final Configuration conf)
4664   throws IOException {
4665     return openHRegion(info, htd, wal, conf, null, null);
4666   }
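
  // Illustrative sketch (not part of the original source): opening an existing region from
  // a standalone tool and reading from it. The regionInfo, tableDescriptor and hlog values
  // are assumed to have been obtained elsewhere.
  //
  //   HRegion region = HRegion.openHRegion(regionInfo, tableDescriptor, hlog, conf);
  //   try {
  //     Result r = region.get(new Get(Bytes.toBytes("row-1")));
  //   } finally {
  //     HRegion.closeHRegion(region);
  //   }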
4667 
4668   /**
4669    * Open a Region.
4670    * @param info Info for region to be opened
4671    * @param htd the table descriptor
4672    * @param wal HLog for region to use. This method will call
4673    * HLog#setSequenceNumber(long) passing the result of the call to
4674    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4675    * up.  HRegionServer does this every time it opens a new region.
4676    * @param conf The Configuration object to use.
4677    * @param rsServices An interface we can request flushes against.
4678    * @param reporter An interface we can report progress against.
4679    * @return new HRegion
4680    *
4681    * @throws IOException
4682    */
4683   public static HRegion openHRegion(final HRegionInfo info,
4684     final HTableDescriptor htd, final HLog wal, final Configuration conf,
4685     final RegionServerServices rsServices,
4686     final CancelableProgressable reporter)
4687   throws IOException {
4688     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
4689   }
4690 
4691   /**
4692    * Open a Region.
4693    * @param rootDir Root directory for HBase instance
4694    * @param info Info for region to be opened.
4695    * @param htd the table descriptor
4696    * @param wal HLog for region to use. This method will call
4697    * HLog#setSequenceNumber(long) passing the result of the call to
4698    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4699    * up.  HRegionServer does this every time it opens a new region.
4700    * @param conf The Configuration object to use.
4701    * @return new HRegion
4702    * @throws IOException
4703    */
4704   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
4705       final HTableDescriptor htd, final HLog wal, final Configuration conf)
4706   throws IOException {
4707     return openHRegion(rootDir, info, htd, wal, conf, null, null);
4708   }
4709 
4710   /**
4711    * Open a Region.
4712    * @param rootDir Root directory for HBase instance
4713    * @param info Info for region to be opened.
4714    * @param htd the table descriptor
4715    * @param wal HLog for region to use. This method will call
4716    * HLog#setSequenceNumber(long) passing the result of the call to
4717    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4718    * up.  HRegionServer does this every time it opens a new region.
4719    * @param conf The Configuration object to use.
4720    * @param rsServices An interface we can request flushes against.
4721    * @param reporter An interface we can report progress against.
4722    * @return new HRegion
4723    * @throws IOException
4724    */
4725   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
4726       final HTableDescriptor htd, final HLog wal, final Configuration conf,
4727       final RegionServerServices rsServices,
4728       final CancelableProgressable reporter)
4729   throws IOException {
4730     FileSystem fs = null;
4731     if (rsServices != null) {
4732       fs = rsServices.getFileSystem();
4733     }
4734     if (fs == null) {
4735       fs = FileSystem.get(conf);
4736     }
4737     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
4738   }
4739 
4740   /**
4741    * Open a Region.
4742    * @param conf The Configuration object to use.
4743    * @param fs Filesystem to use
4744    * @param rootDir Root directory for HBase instance
4745    * @param info Info for region to be opened.
4746    * @param htd the table descriptor
4747    * @param wal HLog for region to use. This method will call
4748    * HLog#setSequenceNumber(long) passing the result of the call to
4749    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4750    * up.  HRegionServer does this every time it opens a new region.
4751    * @return new HRegion
4752    * @throws IOException
4753    */
4754   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4755       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal)
4756       throws IOException {
4757     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
4758   }
4759 
4760   /**
4761    * Open a Region.
4762    * @param conf The Configuration object to use.
4763    * @param fs Filesystem to use
4764    * @param rootDir Root directory for HBase instance
4765    * @param info Info for region to be opened.
4766    * @param htd the table descriptor
4767    * @param wal HLog for region to use. This method will call
4768    * HLog#setSequenceNumber(long) passing the result of the call to
4769    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4770    * up.  HRegionServer does this every time it opens a new region.
4771    * @param rsServices An interface we can request flushes against.
4772    * @param reporter An interface we can report progress against.
4773    * @return new HRegion
4774    * @throws IOException
4775    */
4776   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4777       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal,
4778       final RegionServerServices rsServices, final CancelableProgressable reporter)
4779       throws IOException {
4780     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
4781     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
4782   }
4783 
4784   /**
4785    * Open a Region.
4786    * @param conf The Configuration object to use.
4787    * @param fs Filesystem to use
4788    * @param rootDir Root directory for HBase instance
4789    * @param info Info for region to be opened.
4790    * @param htd the table descriptor
4791    * @param wal HLog for region to use. This method will call
4792    * HLog#setSequenceNumber(long) passing the result of the call to
4793    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4794    * up.  HRegionServer does this every time it opens a new region.
4795    * @param rsServices An interface we can request flushes against.
4796    * @param reporter An interface we can report progress against.
4797    * @return new HRegion
4798    * @throws IOException
4799    */
4800   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4801       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal,
4802       final RegionServerServices rsServices, final CancelableProgressable reporter)
4803       throws IOException {
4804     if (info == null) throw new NullPointerException("Passed region info is null");
4805     if (LOG.isDebugEnabled()) {
4806       LOG.debug("Opening region: " + info);
4807     }
4808     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
4809     return r.openHRegion(reporter);
4810   }
4811 
4812 
4813   /**
4814    * Useful when reopening a closed region (normally for unit tests)
4815    * @param other original object
4816    * @param reporter An interface we can report progress against.
4817    * @return new HRegion
4818    * @throws IOException
4819    */
4820   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
4821       throws IOException {
4822     HRegionFileSystem regionFs = other.getRegionFileSystem();
4823     HRegion r = newHRegion(regionFs.getTableDir(), other.getLog(), regionFs.getFileSystem(),
4824         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
4825     return r.openHRegion(reporter);
4826   }
4827 
4828   /**
4829    * Open HRegion.
4830    * Calls initialize and sets sequenceid.
4831    * @param reporter
4832    * @return Returns <code>this</code>
4833    * @throws IOException
4834    */
4835   protected HRegion openHRegion(final CancelableProgressable reporter)
4836   throws IOException {
4837     // Refuse to open the region if we are missing local compression support
4838     checkCompressionCodecs();
4839     // Refuse to open the region if encryption configuration is incorrect or
4840     // codec support is missing
4841     checkEncryption();
4842     // Refuse to open the region if a required class cannot be loaded
4843     checkClassLoading();
4844     this.openSeqNum = initialize(reporter);
4845     this.setSequenceId(openSeqNum);
4846     return this;
4847   }
4848 
4849   private void checkCompressionCodecs() throws IOException {
4850     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
4851       CompressionTest.testCompression(fam.getCompression());
4852       CompressionTest.testCompression(fam.getCompactionCompression());
4853     }
4854   }
4855 
4856   private void checkEncryption() throws IOException {
4857     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
4858       EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
4859     }
4860   }
4861 
4862   private void checkClassLoading() throws IOException {
4863     RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
4864     RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
4865   }
4866 
4867   /**
4868    * Create a daughter region from a given temp directory with the region data.
4869    * @param hri Spec. for daughter region to open.
4870    * @throws IOException
4871    */
4872   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
4873     // Move the files from the temporary .splits to the final /table/region directory
4874     fs.commitDaughterRegion(hri);
4875 
4876     // Create the daughter HRegion instance
4877     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getLog(), fs.getFileSystem(),
4878         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
4879     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
4880     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
4881     return r;
4882   }
4883 
4884   /**
4885    * Create a merged region given a temp directory with the region data.
4886    * @param mergedRegionInfo HRegionInfo of the merged region
4887    * @param region_b another merging region
4888    * @return merged hregion
4889    * @throws IOException
4890    */
4891   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
4892       final HRegion region_b) throws IOException {
4893     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getLog(),
4894         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
4895         this.getTableDesc(), this.rsServices);
4896     r.readRequestsCount.set(this.getReadRequestsCount()
4897         + region_b.getReadRequestsCount());
4898     r.writeRequestsCount.set(this.getWriteRequestsCount()
4899         + region_b.getWriteRequestsCount());
4900 
4901     this.fs.commitMergedRegion(mergedRegionInfo);
4902     return r;
4903   }
4904 
4905   /**
4906    * Inserts a new region's meta information into the passed
4907    * <code>meta</code> region. Used by the HMaster bootstrap code adding
4908    * new table to hbase:meta table.
4909    *
4910    * @param meta hbase:meta HRegion to be updated
4911    * @param r HRegion to add to <code>meta</code>
4912    *
4913    * @throws IOException
4914    */
4915   // TODO remove since only test and merge use this
4916   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
4917     meta.checkResources();
4918     // The row key is the region name
4919     byte[] row = r.getRegionName();
4920     final long now = EnvironmentEdgeManager.currentTimeMillis();
4921     final List<Cell> cells = new ArrayList<Cell>(2);
4922     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
4923       HConstants.REGIONINFO_QUALIFIER, now,
4924       r.getRegionInfo().toByteArray()));
4925     // Set into the root table the version of the meta table.
4926     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
4927       HConstants.META_VERSION_QUALIFIER, now,
4928       Bytes.toBytes(HConstants.META_VERSION)));
4929     meta.put(row, HConstants.CATALOG_FAMILY, cells);
4930   }
4931 
4932   /**
4933    * Computes the Path of the HRegion
4934    *
4935    * @param tabledir qualified path for table
4936    * @param name ENCODED region name
4937    * @return Path of HRegion directory
4938    */
4939   @Deprecated
4940   public static Path getRegionDir(final Path tabledir, final String name) {
4941     return new Path(tabledir, name);
4942   }
4943 
4944   /**
4945    * Computes the Path of the HRegion
4946    *
4947    * @param rootdir qualified path of HBase root directory
4948    * @param info HRegionInfo for the region
4949    * @return qualified path of region directory
4950    */
4951   @Deprecated
4952   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
4953     return new Path(
4954       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
4955   }
4956 
4957   /**
4958    * Determines if the specified row is within the row range specified by the
4959    * specified HRegionInfo
4960    *
4961    * @param info HRegionInfo that specifies the row range
4962    * @param row row to be checked
4963    * @return true if the row is within the range specified by the HRegionInfo
4964    */
4965   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
4966     return ((info.getStartKey().length == 0) ||
4967         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
4968         ((info.getEndKey().length == 0) ||
4969             (Bytes.compareTo(info.getEndKey(), row) > 0));
4970   }
4971 
4972   public static boolean rowIsInRange(HRegionInfo info, final byte [] row, final int offset,
4973       final short length) {
4974     return ((info.getStartKey().length == 0) ||
4975         (Bytes.compareTo(info.getStartKey(), 0, info.getStartKey().length,
4976           row, offset, length) <= 0)) &&
4977         ((info.getEndKey().length == 0) ||
4978           (Bytes.compareTo(info.getEndKey(), 0, info.getEndKey().length, row, offset, length) > 0));
4979   }
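
  // Illustrative sketch (not part of the original source): checking that a row belongs to a
  // region before routing an operation to it. The region instance is an assumed example.
  //
  //   byte[] row = Bytes.toBytes("user-42");
  //   if (HRegion.rowIsInRange(region.getRegionInfo(), row)) {
  //     // safe to apply the operation to this region
  //   }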
4980 
4981   /**
4982    * Merge two HRegions.  The regions must be adjacent and must not overlap.
4983    *
4984    * @param srcA
4985    * @param srcB
4986    * @return new merged HRegion
4987    * @throws IOException
4988    */
4989   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
4990   throws IOException {
4991     HRegion a = srcA;
4992     HRegion b = srcB;
4993 
4994     // Make sure that srcA comes first; important for key-ordering during
4995     // write of the merged file.
4996     if (srcA.getStartKey() == null) {
4997       if (srcB.getStartKey() == null) {
4998         throw new IOException("Cannot merge two regions with null start key");
4999       }
5000       // A's start key is null but B's isn't. Assume A comes before B
5001     } else if ((srcB.getStartKey() == null) ||
5002       (Bytes.compareTo(srcA.getStartKey(), srcB.getStartKey()) > 0)) {
5003       a = srcB;
5004       b = srcA;
5005     }
5006 
5007     if (!(Bytes.compareTo(a.getEndKey(), b.getStartKey()) == 0)) {
5008       throw new IOException("Cannot merge non-adjacent regions");
5009     }
5010     return merge(a, b);
5011   }
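
  // Illustrative sketch (not part of the original source): merging two adjacent regions from
  // an offline maintenance tool. regionA and regionB are assumed example instances that are
  // not being served.
  //
  //   HRegion merged = HRegion.mergeAdjacent(regionA, regionB);
  //   LOG.info("Merged into " + merged.getRegionNameAsString());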
5012 
5013   /**
5014    * Merge two regions whether they are adjacent or not.
5015    *
5016    * @param a region a
5017    * @param b region b
5018    * @return new merged region
5019    * @throws IOException
5020    */
5021   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
5022     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
5023       throw new IOException("Regions do not belong to the same table");
5024     }
5025 
5026     FileSystem fs = a.getRegionFileSystem().getFileSystem();
5027     // Make sure each region's cache is empty
5028     a.flushcache();
5029     b.flushcache();
5030 
5031     // Compact each region so we only have one store file per family
5032     a.compactStores(true);
5033     if (LOG.isDebugEnabled()) {
5034       LOG.debug("Files for region: " + a);
5035       a.getRegionFileSystem().logFileSystemState(LOG);
5036     }
5037     b.compactStores(true);
5038     if (LOG.isDebugEnabled()) {
5039       LOG.debug("Files for region: " + b);
5040       b.getRegionFileSystem().logFileSystemState(LOG);
5041     }
5042 
5043     RegionMergeTransaction rmt = new RegionMergeTransaction(a, b, true);
5044     if (!rmt.prepare(null)) {
5045       throw new IOException("Unable to merge regions " + a + " and " + b);
5046     }
5047     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
5048     LOG.info("starting merge of regions: " + a + " and " + b
5049         + " into new region " + mergedRegionInfo.getRegionNameAsString()
5050         + " with start key <"
5051         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
5052         + "> and end key <"
5053         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
5054     HRegion dstRegion;
5055     try {
5056       dstRegion = rmt.execute(null, null);
5057     } catch (IOException ioe) {
5058       rmt.rollback(null, null);
5059       throw new IOException("Failed merging region " + a + " and " + b
5060           + ", and successfully rolled back");
5061     }
5062     dstRegion.compactStores(true);
5063 
5064     if (LOG.isDebugEnabled()) {
5065       LOG.debug("Files for new region");
5066       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
5067     }
5068 
5069     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
5070       throw new IOException("Merged region " + dstRegion
5071           + " still has references after the compaction, is compaction canceled?");
5072     }
5073 
5074     // Archiving the 'A' region
5075     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
5076     // Archiving the 'B' region
5077     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
5078 
5079     LOG.info("merge completed. New region is " + dstRegion);
5080     return dstRegion;
5081   }
5082 
5083   /**
5084    * @return True if needs a major compaction.
5085    * @throws IOException
5086    */
5087   boolean isMajorCompaction() throws IOException {
5088     for (Store store : this.stores.values()) {
5089       if (store.isMajorCompaction()) {
5090         return true;
5091       }
5092     }
5093     return false;
5094   }
5095 
5096   //
5097   // HBASE-880
5098   //
5099   /**
5100    * @param get get object
5101    * @return result
5102    * @throws IOException read exceptions
5103    */
5104   public Result get(final Get get) throws IOException {
5105     checkRow(get.getRow(), "Get");
5106     // Verify families are all valid
5107     if (get.hasFamilies()) {
5108       for (byte [] family: get.familySet()) {
5109         checkFamily(family);
5110       }
5111     } else { // Adding all families to scanner
5112       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
5113         get.addFamily(family);
5114       }
5115     }
5116     List<Cell> results = get(get, true);
5117     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null);
5118   }
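
  // Illustrative sketch (not part of the original source): an in-process Get against the
  // region. The family and qualifier names are assumed example values.
  //
  //   Get g = new Get(Bytes.toBytes("row-1"));
  //   g.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"));
  //   Result result = region.get(g);
  //   byte[] value = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("q"));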
5119 
5120   /*
5121    * Do a get based on the get parameter.
5122    * @param withCoprocessor invoke coprocessor or not. We don't want to
5123    * always invoke cp for this method.
5124    */
5125   public List<Cell> get(Get get, boolean withCoprocessor)
5126   throws IOException {
5127 
5128     List<Cell> results = new ArrayList<Cell>();
5129 
5130     // pre-get CP hook
5131     if (withCoprocessor && (coprocessorHost != null)) {
5132        if (coprocessorHost.preGet(get, results)) {
5133          return results;
5134        }
5135     }
5136 
5137     Scan scan = new Scan(get);
5138 
5139     RegionScanner scanner = null;
5140     try {
5141       scanner = getScanner(scan);
5142       scanner.next(results);
5143     } finally {
5144       if (scanner != null)
5145         scanner.close();
5146     }
5147 
5148     // post-get CP hook
5149     if (withCoprocessor && (coprocessorHost != null)) {
5150       coprocessorHost.postGet(get, results);
5151     }
5152 
5153     // do after lock
5154     if (this.metricsRegion != null) {
5155       long totalSize = 0l;
5156       if (results != null) {
5157         for (Cell kv:results) {
5158           totalSize += KeyValueUtil.ensureKeyValue(kv).getLength();
5159         }
5160       }
5161       this.metricsRegion.updateGet(totalSize);
5162     }
5163 
5164     return results;
5165   }
5166 
5167   public void mutateRow(RowMutations rm) throws IOException {
5168     // Don't need nonces here - RowMutations only supports puts and deletes
5169     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
5170   }
5171 
5172   /**
5173    * Perform atomic mutations within the region w/o nonces.
5174    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
5175    */
5176   public void mutateRowsWithLocks(Collection<Mutation> mutations,
5177       Collection<byte[]> rowsToLock) throws IOException {
5178     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
5179   }
5180 
5181   /**
5182    * Perform atomic mutations within the region.
5183    * @param mutations The list of mutations to perform.
5184    * <code>mutations</code> can contain operations for multiple rows.
5185    * Caller has to ensure that all rows are contained in this region.
5186    * @param rowsToLock Rows to lock. If multiple rows are locked, care should be taken
5187    * that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
5188    * @param nonceGroup Optional nonce group of the operation (client Id)
5189    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
5191    * @throws IOException
5192    */
5193   public void mutateRowsWithLocks(Collection<Mutation> mutations,
5194       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
5195     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
5196     processRowsWithLocks(proc, -1, nonceGroup, nonce);
5197   }
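
  // Illustrative sketch (not part of the original source): atomically applying a Put and a
  // Delete to two rows of the same region. Family, qualifier and value are assumed example
  // values; rows are passed in sorted order to avoid deadlocks.
  //
  //   List<Mutation> mutations = new ArrayList<Mutation>();
  //   mutations.add(new Put(Bytes.toBytes("row-a"))
  //       .add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v")));
  //   mutations.add(new Delete(Bytes.toBytes("row-b")));
  //   region.mutateRowsWithLocks(mutations,
  //       Arrays.asList(Bytes.toBytes("row-a"), Bytes.toBytes("row-b")));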
5198 
5199   /**
5200    * @return the current load statistics for the region
5201    */
5202   public ClientProtos.RegionLoadStats getRegionStats() {
5203     if (!regionStatsEnabled) {
5204       return null;
5205     }
5206     ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
5207     stats.setMemstoreLoad((int) (Math.min(100, (this.memstoreSize.get() * 100) / this
5208         .memstoreFlushSize)));
5209     stats.setHeapOccupancy((int)rsServices.getHeapMemoryManager().getHeapOccupancyPercent()*100);
5210     stats.setCompactionPressure((int)rsServices.getCompactionPressure()*100 > 100 ? 100 :
5211                 (int)rsServices.getCompactionPressure()*100);
5212     return stats.build();
5213   }
5214 
5215   /**
5216    * Performs atomic multiple reads and writes on a given row.
5217    *
5218    * @param processor The object defines the reads and writes to a row.
5219    * @param nonceGroup Optional nonce group of the operation (client Id)
5220    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
5221    */
5222   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
5223       throws IOException {
5224     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
5225   }
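
  // Illustrative sketch (not part of the original source): running a custom RowProcessor for
  // an atomic read-modify-write. MyRowProcessor is a hypothetical RowProcessor implementation
  // that declares the rows it needs locked.
  //
  //   RowProcessor<?, ?> proc = new MyRowProcessor(Bytes.toBytes("row-1"));
  //   region.processRowsWithLocks(proc, HConstants.NO_NONCE, HConstants.NO_NONCE);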
5226 
5227   /**
5228    * Performs atomic multiple reads and writes on a given row.
5229    *
5230    * @param processor The object defines the reads and writes to a row.
5231    * @param timeout The timeout of the processor.process() execution
5232    *                Use a negative number to switch off the time bound
5233    * @param nonceGroup Optional nonce group of the operation (client Id)
5234    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
5235    */
5236   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
5237       long nonceGroup, long nonce) throws IOException {
5238 
5239     for (byte[] row : processor.getRowsToLock()) {
5240       checkRow(row, "processRowsWithLocks");
5241     }
5242     if (!processor.readOnly()) {
5243       checkReadOnly();
5244     }
5245     checkResources();
5246 
5247     startRegionOperation();
5248     WALEdit walEdit = new WALEdit();
5249 
5250     // 1. Run pre-process hook
5251     try {
5252       processor.preProcess(this, walEdit);
5253     } catch (IOException e) {
5254       closeRegionOperation();
5255       throw e;
5256     }
5257     // Short circuit the read only case
5258     if (processor.readOnly()) {
5259       try {
5260         long now = EnvironmentEdgeManager.currentTimeMillis();
5261         doProcessRowWithTimeout(
5262             processor, now, this, null, null, timeout);
5263         processor.postProcess(this, walEdit, true);
5264       } catch (IOException e) {
5265         throw e;
5266       } finally {
5267         closeRegionOperation();
5268       }
5269       return;
5270     }
5271 
5272     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
5273     boolean locked = false;
5274     boolean walSyncSuccessful = false;
5275     List<RowLock> acquiredRowLocks = null;
5276     long addedSize = 0;
5277     List<Mutation> mutations = new ArrayList<Mutation>();
5278     Collection<byte[]> rowsToLock = processor.getRowsToLock();
5279     try {
5280       // 2. Acquire the row lock(s)
5281       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
5282       for (byte[] row : rowsToLock) {
5283         // Attempt to lock all involved rows, throw if any lock times out
5284         acquiredRowLocks.add(getRowLock(row));
5285       }
5286       // 3. Region lock
5287       lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size());
5288       locked = true;
5289 
5290       long now = EnvironmentEdgeManager.currentTimeMillis();
5291       try {
5292         // 4. Let the processor scan the rows, generate mutations and add
5293         //    waledits
5294         doProcessRowWithTimeout(
5295             processor, now, this, mutations, walEdit, timeout);
5296 
5297         if (!mutations.isEmpty()) {
5298           // 5. Get a mvcc write number
5299           writeEntry = mvcc.beginMemstoreInsert();
5300           // 6. Call the preBatchMutate hook
5301           processor.preBatchMutate(this, walEdit);
5302           // 7. Apply to memstore
5303           for (Mutation m : mutations) {
5304             // Handle any tag based cell features
5305             rewriteCellTags(m.getFamilyCellMap(), m);
5306 
5307             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
5308               KeyValue kv = KeyValueUtil.ensureKeyValue(cellScanner.current());
5309               kv.setMvccVersion(writeEntry.getWriteNumber());
5310               byte[] family = kv.getFamily();
5311               checkFamily(family);
5312               addedSize += stores.get(family).add(kv);
5313             }
5314           }
5315 
5316           long txid = 0;
5317           // 8. Append no sync
5318           if (!walEdit.isEmpty()) {
5319             txid = this.log.appendNoSync(this.getRegionInfo(),
5320               this.htableDescriptor.getTableName(), walEdit, processor.getClusterIds(), now,
5321               this.htableDescriptor, this.sequenceId, true, nonceGroup, nonce);
5322           }
5323           // 9. Release region lock
5324           if (locked) {
5325             this.updatesLock.readLock().unlock();
5326             locked = false;
5327           }
5328 
5329           // 10. Release row lock(s)
5330           releaseRowLocks(acquiredRowLocks);
5331 
5332           // 11. Sync edit log
5333           if (txid != 0) {
5334             syncOrDefer(txid, getEffectiveDurability(processor.useDurability()));
5335           }
5336           walSyncSuccessful = true;
5337           // 12. call postBatchMutate hook
5338           processor.postBatchMutate(this);
5339         }
5340       } finally {
5341         if (!mutations.isEmpty() && !walSyncSuccessful) {
5342           LOG.warn("Wal sync failed. Roll back " + mutations.size() +
5343               " memstore keyvalues for row(s):" +
5344             Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...");
5345           for (Mutation m : mutations) {
5346             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
5347               KeyValue kv = KeyValueUtil.ensureKeyValue(cellScanner.current());
5348               stores.get(kv.getFamily()).rollback(kv);
5349             }
5350           }
5351         }
5352         // 13. Roll mvcc forward
5353         if (writeEntry != null) {
5354           mvcc.completeMemstoreInsert(writeEntry);
5355           writeEntry = null;
5356         }
5357         if (locked) {
5358           this.updatesLock.readLock().unlock();
5359           locked = false;
5360         }
5361         // release locks if some were acquired but another timed out
5362         releaseRowLocks(acquiredRowLocks);
5363       }
5364 
5365       // 14. Run post-process hook
5366       processor.postProcess(this, walEdit, walSyncSuccessful);
5367 
5368     } catch (IOException e) {
5369       throw e;
5370     } finally {
5371       closeRegionOperation();
5372       if (!mutations.isEmpty() &&
5373           isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) {
5374         requestFlush();
5375       }
5376     }
5377   }
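  // Illustrative sketch of how a caller might drive the method above with a custom
  // processor. The concrete processor class ("MyRowProcessor") is assumed; typically it
  // would extend BaseRowProcessor, return the rows it needs from getRowsToLock(), and
  // generate its mutations and WAL edits inside process().
  //
  //   RowProcessor<?, ?> proc = new MyRowProcessor(row);
  //   // time-bounded variant; a negative timeout switches off the time bound
  //   region.processRowsWithLocks(proc, 30000, HConstants.NO_NONCE, HConstants.NO_NONCE);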
5378 
5379   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
5380                                        final long now,
5381                                        final HRegion region,
5382                                        final List<Mutation> mutations,
5383                                        final WALEdit walEdit,
5384                                        final long timeout) throws IOException {
5385     // Short circuit the no time bound case.
5386     if (timeout < 0) {
5387       try {
5388         processor.process(now, region, mutations, walEdit);
5389       } catch (IOException e) {
5390         LOG.warn("RowProcessor:" + processor.getClass().getName() +
5391             " throws Exception on row(s):" +
5392             Bytes.toStringBinary(
5393               processor.getRowsToLock().iterator().next()) + "...", e);
5394         throw e;
5395       }
5396       return;
5397     }
5398 
5399     // Case with time bound
5400     FutureTask<Void> task =
5401       new FutureTask<Void>(new Callable<Void>() {
5402         @Override
5403         public Void call() throws IOException {
5404           try {
5405             processor.process(now, region, mutations, walEdit);
5406             return null;
5407           } catch (IOException e) {
5408             LOG.warn("RowProcessor:" + processor.getClass().getName() +
5409                 " throws Exception on row(s):" +
5410                 Bytes.toStringBinary(
5411                     processor.getRowsToLock().iterator().next()) + "...", e);
5412             throw e;
5413           }
5414         }
5415       });
5416     rowProcessorExecutor.execute(task);
5417     try {
5418       task.get(timeout, TimeUnit.MILLISECONDS);
5419     } catch (TimeoutException te) {
5420       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
5421           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
5422           "...");
5423       throw new IOException(te);
5424     } catch (Exception e) {
5425       throw new IOException(e);
5426     }
5427   }
5428 
5429   public Result append(Append append) throws IOException {
5430     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
5431   }
5432 
5433   // TODO: There's a lot of boiler plate code identical to increment.
5434   // We should refactor append and increment as local get-mutate-put
5435   // transactions, so all stores only go through one code path for puts.
5436   /**
5437    * Perform one or more append operations on a row.
5438    *
5439    * @param append the append operation to perform
5440    * @return new keyvalues after append
5441    * @throws IOException
5442    */
5443   public Result append(Append append, long nonceGroup, long nonce)
5444       throws IOException {
5445     byte[] row = append.getRow();
5446     checkRow(row, "append");
5447     boolean flush = false;
5448     Durability durability = getEffectiveDurability(append.getDurability());
5449     boolean writeToWAL = durability != Durability.SKIP_WAL;
5450     WALEdit walEdits = null;
5451     List<Cell> allKVs = new ArrayList<Cell>(append.size());
5452     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
5453 
5454     long size = 0;
5455     long txid = 0;
5456 
5457     checkReadOnly();
5458     checkResources();
5459     // Lock row
5460     startRegionOperation(Operation.APPEND);
5461     this.writeRequestsCount.increment();
5462     WriteEntry w = null;
5463     RowLock rowLock;
5464     try {
5465       rowLock = getRowLock(row);
5466       try {
5467         lock(this.updatesLock.readLock());
5468         try {
5469           // wait for all prior MVCC transactions to finish - while we hold the row lock
5470           // (so that we are guaranteed to see the latest state)
5471           mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
5472           if (this.coprocessorHost != null) {
5473             Result r = this.coprocessorHost.preAppendAfterRowLock(append);
5474             if(r!= null) {
5475               return r;
5476             }
5477           }
5478           // now start my own transaction
5479           w = mvcc.beginMemstoreInsert();
5480           long now = EnvironmentEdgeManager.currentTimeMillis();
5481           // Process each family
5482           for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) {
5483 
5484             Store store = stores.get(family.getKey());
5485             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
5486 
5487             // Sort the cells so that they match the order that they
5488             // appear in the Get results. Otherwise, we won't be able to
5489             // find the existing values if the cells are not specified
5490             // in order by the client since cells are in an array list.
5491             Collections.sort(family.getValue(), store.getComparator());
5492             // Get previous values for all columns in this family
5493             Get get = new Get(row);
5494             for (Cell cell : family.getValue()) {
5495               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5496               get.addColumn(family.getKey(), kv.getQualifier());
5497             }
5498             List<Cell> results = get(get, false);
5499 
5500             // Iterate the input columns and update existing values if they were
5501             // found, otherwise add new column initialized to the append value
5502 
5503             // Avoid as much copying as possible. We may need to rewrite and
5504             // consolidate tags. Bytes are only copied once.
5505             // Would be nice if KeyValue had scatter/gather logic
5506             int idx = 0;
5507             for (Cell cell : family.getValue()) {
5508               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5509               KeyValue newKv;
5510               KeyValue oldKv = null;
5511               if (idx < results.size()
5512                   && CellUtil.matchingQualifier(results.get(idx), kv)) {
5513                 oldKv = KeyValueUtil.ensureKeyValue(results.get(idx));
5514                 long ts = Math.max(now, oldKv.getTimestamp());
5515 
5516                 // Process cell tags
5517                 List<Tag> newTags = new ArrayList<Tag>();
5518 
5519                 // Make a union of the set of tags in the old and new KVs
5520 
5521                 if (oldKv.getTagsLengthUnsigned() > 0) {
5522                   Iterator<Tag> i = CellUtil.tagsIterator(oldKv.getTagsArray(),
5523                     oldKv.getTagsOffset(), oldKv.getTagsLengthUnsigned());
5524                   while (i.hasNext()) {
5525                     newTags.add(i.next());
5526                   }
5527                 }
5528                 if (kv.getTagsLengthUnsigned() > 0) {
5529                   Iterator<Tag> i  = CellUtil.tagsIterator(kv.getTagsArray(), kv.getTagsOffset(),
5530                     kv.getTagsLengthUnsigned());
5531                   while (i.hasNext()) {
5532                     newTags.add(i.next());
5533                   }
5534                 }
5535 
5536                 // Cell TTL handling
5537 
5538                 if (append.getTTL() != Long.MAX_VALUE) {
5539                   // Add the new TTL tag
5540                   newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(append.getTTL())));
5541                 }
5542 
5543                 // Rebuild tags
5544                 byte[] tagBytes = Tag.fromList(newTags);
5545 
5546                 // allocate an empty cell once
5547                 newKv = new KeyValue(row.length, kv.getFamilyLength(),
5548                     kv.getQualifierLength(), ts, KeyValue.Type.Put,
5549                     oldKv.getValueLength() + kv.getValueLength(),
5550                     tagBytes.length);
5551                 // copy in row, family, and qualifier
5552                 System.arraycopy(kv.getRowArray(), kv.getRowOffset(),
5553                   newKv.getRowArray(), newKv.getRowOffset(), kv.getRowLength());
5554                 System.arraycopy(kv.getFamilyArray(), kv.getFamilyOffset(),
5555                   newKv.getFamilyArray(), newKv.getFamilyOffset(),
5556                   kv.getFamilyLength());
5557                 System.arraycopy(kv.getQualifierArray(), kv.getQualifierOffset(),
5558                   newKv.getQualifierArray(), newKv.getQualifierOffset(),
5559                   kv.getQualifierLength());
5560                 // copy in the value
5561                 System.arraycopy(oldKv.getValueArray(), oldKv.getValueOffset(),
5562                   newKv.getValueArray(), newKv.getValueOffset(),
5563                   oldKv.getValueLength());
5564                 System.arraycopy(kv.getValueArray(), kv.getValueOffset(),
5565                   newKv.getValueArray(),
5566                   newKv.getValueOffset() + oldKv.getValueLength(),
5567                   kv.getValueLength());
5568                 // Copy in tag data
5569                 System.arraycopy(tagBytes, 0, newKv.getTagsArray(), newKv.getTagsOffset(),
5570                   tagBytes.length);
5571                 idx++;
5572               } else {
5573                 // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP,
5574                 // so only need to update the timestamp to 'now'
5575                 kv.updateLatestStamp(Bytes.toBytes(now));
5576 
5577                 // Cell TTL handling
5578 
5579                 if (append.getTTL() != Long.MAX_VALUE) {
5580                   List<Tag> newTags = new ArrayList<Tag>(1);
5581                   newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(append.getTTL())));
5582                   // Add the new TTL tag
5583                   newKv = new KeyValue(kv.getRowArray(), kv.getRowOffset(), kv.getRowLength(),
5584                     kv.getFamilyArray(), kv.getFamilyOffset(), kv.getFamilyLength(),
5585                     kv.getQualifierArray(), kv.getQualifierOffset(), kv.getQualifierLength(),
5586                     kv.getTimestamp(), KeyValue.Type.codeToType(kv.getTypeByte()),
5587                     kv.getValueArray(), kv.getValueOffset(), kv.getValueLength(),
5588                     newTags);
5589                 } else {
5590                   newKv = kv;
5591                 }
5592               }
5593               newKv.setMvccVersion(w.getWriteNumber());
5594 
5595               // Give coprocessors a chance to update the new cell
5596               if (coprocessorHost != null) {
5597                 newKv = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL(
5598                     RegionObserver.MutationType.APPEND, append, oldKv, (Cell) newKv));
5599               }
5600               kvs.add(newKv);
5601 
5602               // Append update to WAL
5603               if (writeToWAL) {
5604                 if (walEdits == null) {
5605                   walEdits = new WALEdit();
5606                 }
5607                 walEdits.add(newKv);
5608               }
5609             }
5610 
5611             //store the kvs to the temporary memstore before writing HLog
5612             tempMemstore.put(store, kvs);
5613           }
5614 
5615           // Actually write to WAL now
5616           if (writeToWAL) {
5617             // Using default cluster id, as this can only happen in the originating
5618             // cluster. A slave cluster receives the final value (not the delta)
5619             // as a Put.
5620             txid = this.log.appendNoSync(this.getRegionInfo(),
5621               this.htableDescriptor.getTableName(), walEdits, new ArrayList<UUID>(),
5622               EnvironmentEdgeManager.currentTimeMillis(), this.htableDescriptor, this.sequenceId,
5623               true, nonceGroup, nonce);
5624           } else {
5625             recordMutationWithoutWal(append.getFamilyCellMap());
5626           }
5627 
5628           //Actually write to Memstore now
5629           for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
5630             Store store = entry.getKey();
5631             if (store.getFamily().getMaxVersions() == 1) {
5632               // upsert if VERSIONS for this CF == 1
5633               size += store.upsert(entry.getValue(), getSmallestReadPoint());
5634             } else {
5635               // otherwise keep older versions around
5636               for (Cell cell: entry.getValue()) {
5637                 KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5638                 size += store.add(kv);
5639               }
5640             }
5641             allKVs.addAll(entry.getValue());
5642           }
5643           size = this.addAndGetGlobalMemstoreSize(size);
5644           flush = isFlushSize(size);
5645         } finally {
5646           this.updatesLock.readLock().unlock();
5647         }
5648       } finally {
5649         rowLock.release();
5650       }
5651       if (writeToWAL) {
5652         // sync the transaction log outside the rowlock
5653         syncOrDefer(txid, durability);
5654       }
5655     } finally {
5656       if (w != null) {
5657         mvcc.completeMemstoreInsert(w);
5658       }
5659       closeRegionOperation(Operation.APPEND);
5660     }
5661 
5662     if (this.metricsRegion != null) {
5663       this.metricsRegion.updateAppend();
5664     }
5665 
5666     if (flush) {
5667       // Request a cache flush. Do it outside update lock.
5668       requestFlush();
5669     }
5670 
5671 
5672     return append.isReturnResults() ? Result.create(allKVs) : null;
5673   }
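  // Client-side sketch of the operation implemented above (names such as "table", "fam"
  // and "qual" are assumed): under the row lock, the current value is read and the new
  // bytes are concatenated to it.
  //
  //   Append append = new Append(Bytes.toBytes("row1"));
  //   append.add(fam, qual, Bytes.toBytes("-suffix"));
  //   Result result = table.append(append);  // the new values, unless results are disabled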
5674 
5675   public Result increment(Increment increment) throws IOException {
5676     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
5677   }
5678 
5679   // TODO: There's a lot of boiler plate code identical to append.
5680   // We should refactor append and increment as local get-mutate-put
5681   // transactions, so all stores only go through one code path for puts.
5682   /**
5683    * Perform one or more increment operations on a row.
5684    * @param increment the increment operation to perform
5685    * @return new keyvalues after increment
5686    * @throws IOException
5687    */
5688   public Result increment(Increment increment, long nonceGroup, long nonce)
5689   throws IOException {
5690     byte [] row = increment.getRow();
5691     checkRow(row, "increment");
5692     TimeRange tr = increment.getTimeRange();
5693     boolean flush = false;
5694     Durability durability = getEffectiveDurability(increment.getDurability());
5695     boolean writeToWAL = durability != Durability.SKIP_WAL;
5696     WALEdit walEdits = null;
5697     List<Cell> allKVs = new ArrayList<Cell>(increment.size());
5698     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
5699 
5700     long size = 0;
5701     long txid = 0;
5702 
5703     checkReadOnly();
5704     checkResources();
5705     // Lock row
5706     startRegionOperation(Operation.INCREMENT);
5707     this.writeRequestsCount.increment();
5708     WriteEntry w = null;
5709     try {
5710       RowLock rowLock = getRowLock(row);
5711       try {
5712         lock(this.updatesLock.readLock());
5713         try {
5714           // wait for all prior MVCC transactions to finish - while we hold the row lock
5715           // (so that we are guaranteed to see the latest state)
5716           mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
5717           if (this.coprocessorHost != null) {
5718             Result r = this.coprocessorHost.preIncrementAfterRowLock(increment);
5719             if (r != null) {
5720               return r;
5721             }
5722           }
5723           // now start my own transaction
5724           w = mvcc.beginMemstoreInsert();
5725           long now = EnvironmentEdgeManager.currentTimeMillis();
5726           // Process each family
5727           for (Map.Entry<byte [], List<Cell>> family:
5728               increment.getFamilyCellMap().entrySet()) {
5729 
5730             Store store = stores.get(family.getKey());
5731             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
5732 
5733             // Sort the cells so that they match the order that they
5734             // appear in the Get results. Otherwise, we won't be able to
5735             // find the existing values if the cells are not specified
5736             // in order by the client since cells are in an array list.
5737             Collections.sort(family.getValue(), store.getComparator());
5738             // Get previous values for all columns in this family
5739             Get get = new Get(row);
5740             for (Cell cell: family.getValue()) {
5741               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5742               get.addColumn(family.getKey(), kv.getQualifier());
5743             }
5744             get.setTimeRange(tr.getMin(), tr.getMax());
5745             List<Cell> results = get(get, false);
5746 
5747             // Iterate the input columns and update existing values if they were
5748             // found, otherwise add new column initialized to the increment amount
5749             int idx = 0;
5750             List<Cell> edits = family.getValue();
5751             for (int i = 0; i < edits.size(); i++) {
5752               Cell cell = edits.get(i);
5753               long amount = Bytes.toLong(CellUtil.cloneValue(cell));
5754               boolean noWriteBack = (amount == 0);
5755               List<Tag> newTags = new ArrayList<Tag>();
5756 
5757               // Carry forward any tags that might have been added by a coprocessor
5758               if (cell.getTagsLengthUnsigned() > 0) {
5759                 Iterator<Tag> itr = CellUtil.tagsIterator(cell.getTagsArray(),
5760                   cell.getTagsOffset(), cell.getTagsLengthUnsigned());
5761                 while (itr.hasNext()) {
5762                   newTags.add(itr.next());
5763                 }
5764               }
5765 
5766               Cell c = null;
5767               long ts = now;
5768               if (idx < results.size() && CellUtil.matchingQualifier(results.get(idx), cell)) {
5769                 c = results.get(idx);
5770                 ts = Math.max(now, c.getTimestamp());
5771                 if(c.getValueLength() == Bytes.SIZEOF_LONG) {
5772                   amount += Bytes.toLong(c.getValueArray(), c.getValueOffset(), Bytes.SIZEOF_LONG);
5773                 } else {
5774                   // throw DoNotRetryIOException instead of IllegalArgumentException
5775                   throw new org.apache.hadoop.hbase.DoNotRetryIOException(
5776                       "Attempted to increment field that isn't 64 bits wide");
5777                 }
5778                 // Carry tags forward from previous version
5779                 if (c.getTagsLength() > 0) {
5780                   Iterator<Tag> itr = CellUtil.tagsIterator(c.getTagsArray(),
5781                     c.getTagsOffset(), c.getTagsLength());
5782                   while (itr.hasNext()) {
5783                     newTags.add(itr.next());
5784                   }
5785                 }
5786                 if (i < ( edits.size() - 1) && !CellUtil.matchingQualifier(cell, edits.get(i + 1)))
5787                   idx++;
5788               }
5789 
5790               // Append new incremented KeyValue to list
5791               byte[] q = CellUtil.cloneQualifier(cell);
5792               byte[] val = Bytes.toBytes(amount);
5793 
5794               // Add the TTL tag if the mutation carried one
5795               if (increment.getTTL() != Long.MAX_VALUE) {
5796                 newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(increment.getTTL())));
5797               }
5798 
5799               KeyValue newKv = new KeyValue(row, 0, row.length,
5800                 family.getKey(), 0, family.getKey().length,
5801                 q, 0, q.length,
5802                 ts,
5803                 KeyValue.Type.Put,
5804                 val, 0, val.length,
5805                 newTags);
5806 
5807               newKv.setMvccVersion(w.getWriteNumber());
5808 
5809               // Give coprocessors a chance to update the new cell
5810               if (coprocessorHost != null) {
5811                 newKv = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL(
5812                     RegionObserver.MutationType.INCREMENT, increment, c, (Cell) newKv));
5813               }
5814               allKVs.add(newKv);
5815 
5816               if (!noWriteBack) {
5817                 kvs.add(newKv);
5818 
5819                 // Prepare WAL updates
5820                 if (writeToWAL) {
5821                   if (walEdits == null) {
5822                     walEdits = new WALEdit();
5823                   }
5824                   walEdits.add(newKv);
5825                 }
5826               }
5827             }
5828 
5829             //store the kvs to the temporary memstore before writing HLog
5830             if (!kvs.isEmpty()) {
5831               tempMemstore.put(store, kvs);
5832             }
5833           }
5834 
5835           // Actually write to WAL now
5836           if (walEdits != null && !walEdits.isEmpty()) {
5837             if (writeToWAL) {
5838               // Using default cluster id, as this can only happen in the originating
5839               // cluster. A slave cluster receives the final value (not the delta)
5840               // as a Put.
5841               txid = this.log.appendNoSync(this.getRegionInfo(),
5842                   this.htableDescriptor.getTableName(), walEdits, new ArrayList<UUID>(),
5843                   EnvironmentEdgeManager.currentTimeMillis(), this.htableDescriptor, this.sequenceId,
5844                   true, nonceGroup, nonce);
5845             } else {
5846               recordMutationWithoutWal(increment.getFamilyCellMap());
5847             }
5848           }
5849           //Actually write to Memstore now
5850           if (!tempMemstore.isEmpty()) {
5851             for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
5852               Store store = entry.getKey();
5853               if (store.getFamily().getMaxVersions() == 1) {
5854                 // upsert if VERSIONS for this CF == 1
5855                 size += store.upsert(entry.getValue(), getSmallestReadPoint());
5856               } else {
5857                 // otherwise keep older versions around
5858                 for (Cell cell : entry.getValue()) {
5859                   KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5860                   size += store.add(kv);
5861                 }
5862               }
5863             }
5864             size = this.addAndGetGlobalMemstoreSize(size);
5865             flush = isFlushSize(size);
5866           }
5867         } finally {
5868           this.updatesLock.readLock().unlock();
5869         }
5870       } finally {
5871         rowLock.release();
5872       }
5873       if (writeToWAL && (walEdits != null) && !walEdits.isEmpty()) {
5874         // sync the transaction log outside the rowlock
5875         syncOrDefer(txid, durability);
5876       }
5877     } finally {
5878       if (w != null) {
5879         mvcc.completeMemstoreInsert(w);
5880       }
5881       closeRegionOperation(Operation.INCREMENT);
5882       if (this.metricsRegion != null) {
5883         this.metricsRegion.updateIncrement();
5884       }
5885     }
5886 
5887     if (flush) {
5888       // Request a cache flush.  Do it outside update lock.
5889       requestFlush();
5890     }
5891 
5892     return Result.create(allKVs);
5893   }
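  // Client-side sketch of the operation implemented above (names such as "table", "fam"
  // and "qual" are assumed): the stored value must be an 8-byte long, otherwise the
  // region throws DoNotRetryIOException as seen above.
  //
  //   Increment inc = new Increment(Bytes.toBytes("row1"));
  //   inc.addColumn(fam, qual, 1L);          // add 1 to the counter
  //   Result result = table.increment(inc);  // contains the new counter value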
5894 
5895   //
5896   // New HBASE-880 Helpers
5897   //
5898 
5899   private void checkFamily(final byte [] family)
5900   throws NoSuchColumnFamilyException {
5901     if (!this.htableDescriptor.hasFamily(family)) {
5902       throw new NoSuchColumnFamilyException("Column family " +
5903           Bytes.toString(family) + " does not exist in region " + this
5904           + " in table " + this.htableDescriptor);
5905     }
5906   }
5907 
5908   public static final long FIXED_OVERHEAD = ClassSize.align(
5909       ClassSize.OBJECT +
5910       ClassSize.ARRAY +
5911       42 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT +
5912       (12 * Bytes.SIZEOF_LONG) +
5913       5 * Bytes.SIZEOF_BOOLEAN);
5914 
5915   // woefully out of date - currently missing:
5916   // 1 x HashMap - coprocessorServiceHandlers
5917   // 6 org.cliffc.high_scale_lib.Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
5918   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
5919   //   writeRequestsCount, updatesBlockedMs
5920   // 1 x HRegion$WriteState - writestate
5921   // 1 x RegionCoprocessorHost - coprocessorHost
5922   // 1 x RegionSplitPolicy - splitPolicy
5923   // 1 x MetricsRegion - metricsRegion
5924   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
5925   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
5926       ClassSize.OBJECT + // closeLock
5927       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
5928       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
5929       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
5930       WriteState.HEAP_SIZE + // writestate
5931       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
5932       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
5933       ClassSize.ARRAYLIST + // recentFlushes
5934       MultiVersionConsistencyControl.FIXED_SIZE // mvcc
5935       + ClassSize.TREEMAP // maxSeqIdInStores
5936       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
5937       ;
5938 
5939   @Override
5940   public long heapSize() {
5941     long heapSize = DEEP_OVERHEAD;
5942     for (Store store : this.stores.values()) {
5943       heapSize += store.heapSize();
5944     }
5945     // this does not take into account row locks, recent flushes, mvcc entries, and more
5946     return heapSize;
5947   }
5948 
5949   /*
5950    * This method calls System.exit.
5951    * @param message Message to print out.  May be null.
5952    */
5953   private static void printUsageAndExit(final String message) {
5954     if (message != null && message.length() > 0) System.out.println(message);
5955     System.out.println("Usage: HRegion CATALOG_TABLE_DIR [major_compact]");
5956     System.out.println("Options:");
5957     System.out.println(" major_compact  Pass this option to major compact " +
5958       "passed region.");
5959     System.out.println("Default outputs scan of passed region.");
5960     System.exit(1);
5961   }
5962 
5963   /**
5964    * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to
5965    * be available for handling
5966    * {@link HRegion#execService(com.google.protobuf.RpcController,
5967    *    org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall)} calls.
5968    *
5969    * <p>
5970    * Only a single instance may be registered per region for a given {@link Service} subclass (the
5971    * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}).
5972    * After the first registration, subsequent calls with the same service name will fail with
5973    * a return value of {@code false}.
5974    * </p>
5975    * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint
5976    * @return {@code true} if the registration was successful, {@code false}
5977    * otherwise
5978    */
5979   public boolean registerService(Service instance) {
5980     /*
5981      * No stacking of instances is allowed for a single service name
5982      */
5983     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
5984     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
5985       LOG.error("Coprocessor service " + serviceDesc.getFullName() +
5986               " already registered, rejecting request from " + instance
5987       );
5988       return false;
5989     }
5990 
5991     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
5992     if (LOG.isDebugEnabled()) {
5993       LOG.debug("Registered coprocessor service: region="+
5994           Bytes.toStringBinary(getRegionName())+" service="+serviceDesc.getFullName());
5995     }
5996     return true;
5997   }
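  // Illustrative sketch of an endpoint coprocessor registering itself with the method
  // above during startup; "MyProtos.MyService" stands in for a generated protobuf
  // service and is assumed, as is the surrounding coprocessor class.
  //
  //   com.google.protobuf.Service endpoint = new MyProtos.MyService() { /* RPC methods */ };
  //   if (!region.registerService(endpoint)) {
  //     // a handler for this service name was already registered for the region
  //   }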
5998 
5999   /**
6000    * Executes a single protocol buffer coprocessor endpoint {@link Service} method using
6001    * the registered protocol handlers.  {@link Service} implementations must be registered via the
6002    * {@link HRegion#registerService(com.google.protobuf.Service)}
6003    * method before they are available.
6004    *
6005    * @param controller an {@code RpcController} implementation to pass to the invoked service
6006    * @param call a {@code CoprocessorServiceCall} instance identifying the service, method,
6007    *     and parameters for the method invocation
6008    * @return a protocol buffer {@code Message} instance containing the method's result
6009    * @throws IOException if no registered service handler is found or an error
6010    *     occurs during the invocation
6011    * @see org.apache.hadoop.hbase.regionserver.HRegion#registerService(com.google.protobuf.Service)
6012    */
6013   public Message execService(RpcController controller, CoprocessorServiceCall call)
6014       throws IOException {
6015     String serviceName = call.getServiceName();
6016     String methodName = call.getMethodName();
6017     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
6018       throw new UnknownProtocolException(null,
6019           "No registered coprocessor service found for name "+serviceName+
6020           " in region "+Bytes.toStringBinary(getRegionName()));
6021     }
6022 
6023     Service service = coprocessorServiceHandlers.get(serviceName);
6024     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
6025     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
6026     if (methodDesc == null) {
6027       throw new UnknownProtocolException(service.getClass(),
6028           "Unknown method "+methodName+" called on service "+serviceName+
6029               " in region "+Bytes.toStringBinary(getRegionName()));
6030     }
6031 
6032     Message.Builder builder = service.getRequestPrototype(methodDesc).newBuilderForType();
6033     ProtobufUtil.mergeFrom(builder, call.getRequest());
6034     Message request = builder.build();
6035 
6036     if (coprocessorHost != null) {
6037       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
6038     }
6039 
6040     final Message.Builder responseBuilder =
6041         service.getResponsePrototype(methodDesc).newBuilderForType();
6042     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
6043       @Override
6044       public void run(Message message) {
6045         if (message != null) {
6046           responseBuilder.mergeFrom(message);
6047         }
6048       }
6049     });
6050 
6051     if (coprocessorHost != null) {
6052       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
6053     }
6054 
6055     IOException exception = ResponseConverter.getControllerException(controller);
6056     if (exception != null) {
6057       throw exception;
6058     }
6059 
6060     return responseBuilder.build();
6061   }
6062 
6063   /*
6064    * Process table.
6065    * Do major compaction or list content.
6066    * @param fs
6067    * @param p
6068    * @param log
6069    * @param c
6070    * @param majorCompact
6071    * @throws IOException
6072    */
6073   private static void processTable(final FileSystem fs, final Path p,
6074       final HLog log, final Configuration c,
6075       final boolean majorCompact)
6076   throws IOException {
6077     HRegion region = null;
6078     FSTableDescriptors fst = new FSTableDescriptors(c);
6079     // Currently expects tables have one region only.
6080     if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) {
6081       region = HRegion.newHRegion(p, log, fs, c,
6082         HRegionInfo.FIRST_META_REGIONINFO, fst.get(TableName.META_TABLE_NAME), null);
6083     } else {
6084       throw new IOException("Not a known catalog table: " + p.toString());
6085     }
6086     try {
6087       region.initialize();
6088       if (majorCompact) {
6089         region.compactStores(true);
6090       } else {
6091         // Default behavior
6092         Scan scan = new Scan();
6093         // scan.addFamily(HConstants.CATALOG_FAMILY);
6094         RegionScanner scanner = region.getScanner(scan);
6095         try {
6096           List<Cell> kvs = new ArrayList<Cell>();
6097           boolean done;
6098           do {
6099             kvs.clear();
6100             done = scanner.next(kvs);
6101             if (kvs.size() > 0) LOG.info(kvs);
6102           } while (done);
6103         } finally {
6104           scanner.close();
6105         }
6106       }
6107     } finally {
6108       region.close();
6109     }
6110   }
6111 
6112   boolean shouldForceSplit() {
6113     return this.splitRequest;
6114   }
6115 
6116   byte[] getExplicitSplitPoint() {
6117     return this.explicitSplitPoint;
6118   }
6119 
6120   void forceSplit(byte[] sp) {
6121     // This HRegion will go away after the forced split is successful
6122     // But if a forced split fails, we need to clear forced split.
6123     this.splitRequest = true;
6124     if (sp != null) {
6125       this.explicitSplitPoint = sp;
6126     }
6127   }
6128 
6129   void clearSplit() {
6130     this.splitRequest = false;
6131     this.explicitSplitPoint = null;
6132   }
6133 
6134   /**
6135    * Give the region a chance to prepare before it is split.
6136    */
6137   protected void prepareToSplit() {
6138     // nothing
6139   }
6140 
6141   /**
6142    * Return the split point. A null return value indicates the region isn't splittable.
6143    * If the split point isn't explicitly specified, this method goes over the stores
6144    * to find the best split point. Currently the criterion for the best split point
6145    * is the size of the store.
6146    */
6147   public byte[] checkSplit() {
6148     // Can't split META
6149     if (this.getRegionInfo().isMetaTable() ||
6150         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
6151       if (shouldForceSplit()) {
6152         LOG.warn("Cannot split meta region in HBase 0.20 and above");
6153       }
6154       return null;
6155     }
6156 
6157     // Can't split region which is in recovering state
6158     if (this.isRecovering()) {
6159       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
6160       return null;
6161     }
6162 
6163     if (!splitPolicy.shouldSplit()) {
6164       return null;
6165     }
6166 
6167     byte[] ret = splitPolicy.getSplitPoint();
6168 
6169     if (ret != null) {
6170       try {
6171         checkRow(ret, "calculated split");
6172       } catch (IOException e) {
6173         LOG.error("Ignoring invalid split", e);
6174         return null;
6175       }
6176     }
6177     return ret;
6178   }
6179 
6180   /**
6181    * @return The priority that this region should have in the compaction queue
6182    */
6183   public int getCompactPriority() {
6184     int count = Integer.MAX_VALUE;
6185     for (Store store : stores.values()) {
6186       count = Math.min(count, store.getCompactPriority());
6187     }
6188     return count;
6189   }
6190 
6191   /**
6192    * Checks every store to see if one has too many
6193    * store files
6194    * @return true if any store has too many store files
6195    */
6196   public boolean needsCompaction() {
6197     for (Store store : stores.values()) {
6198       if(store.needsCompaction()) {
6199         return true;
6200       }
6201     }
6202     return false;
6203   }
6204 
6205   /** @return the coprocessor host */
6206   public RegionCoprocessorHost getCoprocessorHost() {
6207     return coprocessorHost;
6208   }
6209 
6210   /** @param coprocessorHost the new coprocessor host */
6211   public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
6212     this.coprocessorHost = coprocessorHost;
6213   }
6214 
6215   public void abortRegionServer(String msg) throws IOException {
6216     RegionServerServices rs = getRegionServerServices();
6217     if (rs instanceof HRegionServer) {
6218       ((HRegionServer)rs).abort(msg);
6219     }
6220   }
6221 
6222   /**
6223    * This method needs to be called before any public call that reads or
6224    * modifies data. It has to be called just before a try block, and
6225    * #closeRegionOperation needs to be called in the try's finally block.
6226    * Acquires a read lock and checks if the region is closing or closed.
6227    * @throws IOException
6228    */
6229   public void startRegionOperation() throws IOException {
6230     startRegionOperation(Operation.ANY);
6231   }
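  // Usage pattern for the region operation bracket described above: acquire just before
  // the try block and release in its finally block.
  //
  //   startRegionOperation(Operation.GET);
  //   try {
  //     // read or modify data here
  //   } finally {
  //     closeRegionOperation(Operation.GET);
  //   }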
6232 
6233   /**
6234    * @param op The operation is about to be taken on the region
6235    * @throws IOException
6236    */
6237   protected void startRegionOperation(Operation op) throws IOException {
6238     switch (op) {
6239     case INCREMENT:
6240     case APPEND:
6241     case GET:
6242     case SCAN:
6243     case SPLIT_REGION:
6244     case MERGE_REGION:
6245     case PUT:
6246     case DELETE:
6247     case BATCH_MUTATE:
6248     case COMPACT_REGION:
6249       // when a region is in recovering state, no read, split or merge is allowed
6250       if (isRecovering() && (this.disallowWritesInRecovering ||
6251               (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
6252         throw new RegionInRecoveryException(this.getRegionNameAsString() +
6253           " is recovering; cannot take reads");
6254       }
6255       break;
6256     default:
6257       break;
6258     }
6259     if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
6260         || op == Operation.COMPACT_REGION) {
6261       // split, merge or compact region doesn't need to check the closing/closed state or lock the
6262       // region
6263       return;
6264     }
6265     if (this.closing.get()) {
6266       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
6267     }
6268     lock(lock.readLock());
6269     if (this.closed.get()) {
6270       lock.readLock().unlock();
6271       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
6272     }
6273     try {
6274       if (coprocessorHost != null) {
6275         coprocessorHost.postStartRegionOperation(op);
6276       }
6277     } catch (Exception e) {
6278       lock.readLock().unlock();
6279       throw new IOException(e);
6280     }
6281   }
6282 
6283   /**
6284    * Closes the lock. This needs to be called in the finally block corresponding
6285    * to the try block of #startRegionOperation
6286    * @throws IOException
6287    */
6288   public void closeRegionOperation() throws IOException {
6289     closeRegionOperation(Operation.ANY);
6290   }
6291 
6292   /**
6293    * Closes the lock. This needs to be called in the finally block corresponding
6294    * to the try block of {@link #startRegionOperation(Operation)}
6295    * @param operation
6296    * @throws IOException
6297    */
6298   public void closeRegionOperation(Operation operation) throws IOException {
6299     lock.readLock().unlock();
6300     if (coprocessorHost != null) {
6301       coprocessorHost.postCloseRegionOperation(operation);
6302     }
6303   }
6304 
6305   /**
6306    * This method needs to be called before any public call that reads or
6307    * modifies stores in bulk. It has to be called just before a try block, and
6308    * #closeBulkRegionOperation needs to be called in the try's finally block.
6309    * Acquires a write lock and checks if the region is closing or closed.
6310    * @throws NotServingRegionException when the region is closing or closed
6311    * @throws RegionTooBusyException if failed to get the lock in time
6312    * @throws InterruptedIOException if interrupted while waiting for a lock
6313    */
6314   private void startBulkRegionOperation(boolean writeLockNeeded)
6315       throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
6316     if (this.closing.get()) {
6317       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
6318     }
6319     if (writeLockNeeded) lock(lock.writeLock());
6320     else lock(lock.readLock());
6321     if (this.closed.get()) {
6322       if (writeLockNeeded) lock.writeLock().unlock();
6323       else lock.readLock().unlock();
6324       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
6325     }
6326   }
6327 
6328   /**
6329    * Closes the lock. This needs to be called in the finally block corresponding
6330    * to the try block of #startBulkRegionOperation
6331    */
6332   private void closeBulkRegionOperation(){
6333     if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
6334     else lock.readLock().unlock();
6335   }
6336 
6337   /**
6338    * Update counters for the number of puts without WAL and the size of possible data loss.
6339    * This information is exposed by the region server metrics.
6340    */
6341   private void recordMutationWithoutWal(final Map<byte [], List<Cell>> familyMap) {
6342     numMutationsWithoutWAL.increment();
6343     if (numMutationsWithoutWAL.get() <= 1) {
6344       LOG.info("writing data to region " + this +
6345                " with WAL disabled. Data may be lost in the event of a crash.");
6346     }
6347 
6348     long mutationSize = 0;
6349     for (List<Cell> cells: familyMap.values()) {
6350       assert cells instanceof RandomAccess;
6351       int listSize = cells.size();
6352       for (int i=0; i < listSize; i++) {
6353         Cell cell = cells.get(i);
6354         // TODO we need include tags length also here.
6355         mutationSize += KeyValueUtil.keyLength(cell) + cell.getValueLength();
6356       }
6357     }
6358 
6359     dataInMemoryWithoutWAL.add(mutationSize);
6360   }
6361 
6362   private void lock(final Lock lock)
6363       throws RegionTooBusyException, InterruptedIOException {
6364     lock(lock, 1);
6365   }
6366 
6367   /**
6368    * Try to acquire a lock.  Throw RegionTooBusyException
6369    * if failed to get the lock in time. Throw InterruptedIOException
6370    * if interrupted while waiting for the lock.
6371    */
6372   private void lock(final Lock lock, final int multiplier)
6373       throws RegionTooBusyException, InterruptedIOException {
6374     try {
6375       final long waitTime = Math.min(maxBusyWaitDuration,
6376           busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
6377       if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
6378         throw new RegionTooBusyException(
6379             "failed to get a lock in " + waitTime + " ms. " +
6380                 "regionName=" + (this.getRegionInfo() == null ? "unknown" :
6381                 this.getRegionInfo().getRegionNameAsString()) +
6382                 ", server=" + (this.getRegionServerServices() == null ? "unknown" :
6383                 this.getRegionServerServices().getServerName()));
6384       }
6385     } catch (InterruptedException ie) {
6386       LOG.info("Interrupted while waiting for a lock");
6387       InterruptedIOException iie = new InterruptedIOException();
6388       iie.initCause(ie);
6389       throw iie;
6390     }
6391   }
6392 
6393   /**
6394    * Calls sync with the given transaction ID if the region's table is not
6395    * deferring it.
6396    * @param txid the transaction ID to sync up to
6397    * @throws IOException If anything goes wrong with DFS
6398    */
6399   private void syncOrDefer(long txid, Durability durability) throws IOException {
6400     if (this.getRegionInfo().isMetaRegion()) {
6401       this.log.sync(txid);
6402     } else {
6403       switch(durability) {
6404       case USE_DEFAULT:
6405         // do what table defaults to
6406         if (shouldSyncLog()) {
6407           this.log.sync(txid);
6408         }
6409         break;
6410       case SKIP_WAL:
6411         // nothing to do
6412         break;
6413       case ASYNC_WAL:
6414         // nothing to do
6415         break;
6416       case SYNC_WAL:
6417       case FSYNC_WAL:
6418         // sync the WAL edit (SYNC and FSYNC treated the same for now)
6419         this.log.sync(txid);
6420         break;
6421       }
6422     }
6423   }
6424 
6425   /**
6426    * Check whether we should sync the log from the table's durability settings
6427    */
6428   private boolean shouldSyncLog() {
6429     return durability.ordinal() >  Durability.ASYNC_WAL.ordinal();
6430   }
6431 
6432   /**
6433    * A mocked list implementation - discards all updates.
6434    */
6435   private static final List<Cell> MOCKED_LIST = new AbstractList<Cell>() {
6436 
6437     @Override
6438     public void add(int index, Cell element) {
6439       // do nothing
6440     }
6441 
6442     @Override
6443     public boolean addAll(int index, Collection<? extends Cell> c) {
6444       return false; // this list is never changed as a result of an update
6445     }
6446 
6447     @Override
6448     public KeyValue get(int index) {
6449       throw new UnsupportedOperationException();
6450     }
6451 
6452     @Override
6453     public int size() {
6454       return 0;
6455     }
6456   };
6457 
6458   /**
6459    * Facility for dumping and compacting catalog tables.
6460    * Only does catalog tables since these are the only tables whose schema
6461    * we know for sure.  For usage run:
6462    * <pre>
6463    *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
6464    * </pre>
6465    * @param args
6466    * @throws IOException
6467    */
6468   public static void main(String[] args) throws IOException {
6469     if (args.length < 1) {
6470       printUsageAndExit(null);
6471     }
6472     boolean majorCompact = false;
6473     if (args.length > 1) {
6474       if (!args[1].toLowerCase().startsWith("major")) {
6475         printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">");
6476       }
6477       majorCompact = true;
6478     }
6479     final Path tableDir = new Path(args[0]);
6480     final Configuration c = HBaseConfiguration.create();
6481     final FileSystem fs = FileSystem.get(c);
6482     final Path logdir = new Path(c.get("hbase.tmp.dir"));
6483     final String logname = "hlog" + FSUtils.getTableName(tableDir) + System.currentTimeMillis();
6484 
6485     final HLog log = HLogFactory.createHLog(fs, logdir, logname, c);
6486     try {
6487       processTable(fs, tableDir, log, c, majorCompact);
6488     } finally {
6489        log.close();
6490        // TODO: is this still right?
6491        BlockCache bc = new CacheConfig(c).getBlockCache();
6492        if (bc != null) bc.shutdown();
6493     }
6494   }
6495 
6496   /**
6497    * Gets the latest sequence number that was read from storage when this region was opened.
6498    */
6499   public long getOpenSeqNum() {
6500     return this.openSeqNum;
6501   }
6502 
6503   /**
6504    * Gets the max sequence ids of stores that were read from storage when this region was
6505    * opened. WAL edits with a smaller or equal sequence number will be skipped during replay.
6506    */
6507   public Map<byte[], Long> getMaxStoreSeqIdForLogReplay() {
6508     return this.maxSeqIdInStores;
6509   }
6510 
6511   /**
6512    * @return the current compaction state of this region (major, minor, both, or none).
6513    */
6514   public CompactionState getCompactionState() {
6515     boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
6516     return (hasMajor ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
6517         : (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
6518   }
6519 
6520   public void reportCompactionRequestStart(boolean isMajor){
6521     (isMajor ? majorInProgress : minorInProgress).incrementAndGet();
6522   }
6523 
6524   public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted){
6525     int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();
6526 
6527     // metrics
6528     compactionsFinished.incrementAndGet();
6529     compactionNumFilesCompacted.addAndGet(numFiles);
6530     compactionNumBytesCompacted.addAndGet(filesSizeCompacted);
6531 
6532     assert newValue >= 0;
6533   }
6534 
6535   /**
6536    * @return sequenceId.
6537    */
6538   public AtomicLong getSequenceId() {
6539     return this.sequenceId;
6540   }
6541 
6542   /**
6543    * sets this region's sequenceId.
6544    * @param value new value
6545    */
6546   private void setSequenceId(long value) {
6547     this.sequenceId.set(value);
6548   }
6549 
6550   /**
6551    * Listener class to enable callers of
6552    * bulkLoadHFile() to perform any necessary
6553    * pre/post processing of a given bulkload call
6554    */
6555   public interface BulkLoadListener {
6556 
6557     /**
6558      * Called before an HFile is actually loaded
6559      * @param family family being loaded to
6560      * @param srcPath path of HFile
6561      * @return final path to be used for actual loading
6562      * @throws IOException
6563      */
6564     String prepareBulkLoad(byte[] family, String srcPath) throws IOException;
6565 
6566     /**
6567      * Called after a successful HFile load
6568      * @param family family being loaded to
6569      * @param srcPath path of HFile
6570      * @throws IOException
6571      */
6572     void doneBulkLoad(byte[] family, String srcPath) throws IOException;
6573 
6574     /**
6575      * Called after a failed HFile load
6576      * @param family family being loaded to
6577      * @param srcPath path of HFile
6578      * @throws IOException
6579      */
6580     void failedBulkLoad(byte[] family, String srcPath) throws IOException;
6581   }
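  // Minimal sketch of a BulkLoadListener implementation (purely illustrative): it could,
  // for instance, stage the HFile under a temporary directory before the load and clean
  // up afterwards. The staging logic itself is assumed and omitted here.
  //
  //   BulkLoadListener listener = new BulkLoadListener() {
  //     @Override
  //     public String prepareBulkLoad(byte[] family, String srcPath) throws IOException {
  //       return srcPath;               // e.g. return a staged copy of the file instead
  //     }
  //     @Override
  //     public void doneBulkLoad(byte[] family, String srcPath) throws IOException {
  //       // e.g. remove the staged copy
  //     }
  //     @Override
  //     public void failedBulkLoad(byte[] family, String srcPath) throws IOException {
  //       // e.g. restore the original file
  //     }
  //   };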
6582 
6583   @VisibleForTesting class RowLockContext {
6584     private final HashedBytes row;
6585     private final CountDownLatch latch = new CountDownLatch(1);
6586     private final Thread thread;
6587     private int lockCount = 0;
6588 
6589     RowLockContext(HashedBytes row) {
6590       this.row = row;
6591       this.thread = Thread.currentThread();
6592     }
6593 
6594     boolean ownedByCurrentThread() {
6595       return thread == Thread.currentThread();
6596     }
6597 
6598     RowLock newLock() {
6599       lockCount++;
6600       return new RowLock(this);
6601     }
6602 
6603     void releaseLock() {
6604       if (!ownedByCurrentThread()) {
6605         throw new IllegalArgumentException("Lock held by thread: " + thread
6606           + " cannot be released by different thread: " + Thread.currentThread());
6607       }
6608       lockCount--;
6609       if (lockCount == 0) {
6610         // no remaining locks by the thread, unlock and allow other threads to access
6611         RowLockContext existingContext = lockedRows.remove(row);
6612         if (existingContext != this) {
6613           throw new RuntimeException(
6614               "Internal row lock state inconsistent, should not happen, row: " + row);
6615         }
6616         latch.countDown();
6617       }
6618     }
6619   }
6620 
6621   /**
6622    * Row lock held by a given thread.
6623    * One thread may acquire multiple locks on the same row simultaneously.
6624    * The locks must be released by calling release() from the same thread.
6625    */
6626   public static class RowLock {
6627     @VisibleForTesting final RowLockContext context;
6628     private boolean released = false;
6629 
6630     @VisibleForTesting RowLock(RowLockContext context) {
6631       this.context = context;
6632     }
6633 
6634     /**
6635      * Release the given lock.  If there are no remaining locks held by the current thread
6636      * then unlock the row and allow other threads to acquire the lock.
6637      * @throws IllegalArgumentException if called by a different thread than the lock owning thread
6638      */
6639     public void release() {
6640       if (!released) {
6641         context.releaseLock();
6642         released = true;
6643       }
6644     }
6645   }
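  // Typical acquire/release pattern for the row lock described above (names such as
  // "region" and "row" are assumed); the lock must be released by the same thread that
  // acquired it.
  //
  //   RowLock rowLock = region.getRowLock(row);
  //   try {
  //     // read-modify-write the row while holding the lock
  //   } finally {
  //     rowLock.release();
  //   }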
6646 
6647   /**
6648    * Lock the updates' readLock first, so that we can safely append logs in coprocessors.
6649    * @throws RegionTooBusyException
6650    * @throws InterruptedIOException
6651    */
6652   public void updatesLock() throws RegionTooBusyException, InterruptedIOException {
6653     lock(updatesLock.readLock());
6654   }
6655 
6656   /**
6657    * Unlock the updates' readLock after appending logs in coprocessors.
6658    * @throws InterruptedIOException
6659    */
6660   public void updatesUnlock() throws InterruptedIOException {
6661     updatesLock.readLock().unlock();
6662   }
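  // Sketch of the coprocessor usage the two methods above are meant for (names such as
  // "region" are assumed): take the updates read lock around any log append performed
  // from coprocessor code, mirroring what the region itself does for normal mutations.
  //
  //   region.updatesLock();
  //   try {
  //     // append to the WAL from the coprocessor here
  //   } finally {
  //     region.updatesUnlock();
  //   }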
6663 }