1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.IOException;
23  import java.io.InterruptedIOException;
24  import java.io.UnsupportedEncodingException;
25  import java.lang.reflect.Constructor;
26  import java.text.ParseException;
27  import java.util.AbstractList;
28  import java.util.ArrayList;
29  import java.util.Arrays;
30  import java.util.Collection;
31  import java.util.Collections;
32  import java.util.HashMap;
33  import java.util.Iterator;
34  import java.util.List;
35  import java.util.Map;
36  import java.util.NavigableMap;
37  import java.util.NavigableSet;
38  import java.util.RandomAccess;
39  import java.util.Set;
40  import java.util.TreeMap;
41  import java.util.UUID;
42  import java.util.concurrent.Callable;
43  import java.util.concurrent.CompletionService;
44  import java.util.concurrent.ConcurrentHashMap;
45  import java.util.concurrent.ConcurrentSkipListMap;
46  import java.util.concurrent.CountDownLatch;
47  import java.util.concurrent.ExecutionException;
48  import java.util.concurrent.ExecutorCompletionService;
49  import java.util.concurrent.ExecutorService;
50  import java.util.concurrent.Executors;
51  import java.util.concurrent.Future;
52  import java.util.concurrent.FutureTask;
53  import java.util.concurrent.ThreadFactory;
54  import java.util.concurrent.ThreadPoolExecutor;
55  import java.util.concurrent.TimeUnit;
56  import java.util.concurrent.TimeoutException;
57  import java.util.concurrent.atomic.AtomicBoolean;
58  import java.util.concurrent.atomic.AtomicInteger;
59  import java.util.concurrent.atomic.AtomicLong;
60  import java.util.concurrent.locks.Lock;
61  import java.util.concurrent.locks.ReentrantReadWriteLock;
62  
63  import org.apache.commons.logging.Log;
64  import org.apache.commons.logging.LogFactory;
65  import org.apache.hadoop.hbase.classification.InterfaceAudience;
66  import org.apache.hadoop.conf.Configuration;
67  import org.apache.hadoop.fs.FileStatus;
68  import org.apache.hadoop.fs.FileSystem;
69  import org.apache.hadoop.fs.Path;
70  import org.apache.hadoop.hbase.Cell;
71  import org.apache.hadoop.hbase.CellScanner;
72  import org.apache.hadoop.hbase.CellUtil;
73  import org.apache.hadoop.hbase.CompoundConfiguration;
74  import org.apache.hadoop.hbase.DoNotRetryIOException;
75  import org.apache.hadoop.hbase.DroppedSnapshotException;
76  import org.apache.hadoop.hbase.HBaseConfiguration;
77  import org.apache.hadoop.hbase.HColumnDescriptor;
78  import org.apache.hadoop.hbase.HConstants;
79  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
80  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
81  import org.apache.hadoop.hbase.HRegionInfo;
82  import org.apache.hadoop.hbase.HTableDescriptor;
83  import org.apache.hadoop.hbase.KeyValue;
84  import org.apache.hadoop.hbase.KeyValueUtil;
85  import org.apache.hadoop.hbase.NamespaceDescriptor;
86  import org.apache.hadoop.hbase.NotServingRegionException;
87  import org.apache.hadoop.hbase.RegionTooBusyException;
88  import org.apache.hadoop.hbase.TableName;
89  import org.apache.hadoop.hbase.Tag;
90  import org.apache.hadoop.hbase.TagType;
91  import org.apache.hadoop.hbase.UnknownScannerException;
92  import org.apache.hadoop.hbase.backup.HFileArchiver;
93  import org.apache.hadoop.hbase.client.Append;
94  import org.apache.hadoop.hbase.client.Delete;
95  import org.apache.hadoop.hbase.client.Durability;
96  import org.apache.hadoop.hbase.client.Get;
97  import org.apache.hadoop.hbase.client.Increment;
98  import org.apache.hadoop.hbase.client.IsolationLevel;
99  import org.apache.hadoop.hbase.client.Mutation;
100 import org.apache.hadoop.hbase.client.Put;
101 import org.apache.hadoop.hbase.client.Result;
102 import org.apache.hadoop.hbase.client.RowMutations;
103 import org.apache.hadoop.hbase.client.Scan;
104 import org.apache.hadoop.hbase.coprocessor.RegionObserver;
105 import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
106 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
107 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
108 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
109 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
110 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
111 import org.apache.hadoop.hbase.filter.FilterWrapper;
112 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
113 import org.apache.hadoop.hbase.io.HeapSize;
114 import org.apache.hadoop.hbase.io.TimeRange;
115 import org.apache.hadoop.hbase.io.hfile.BlockCache;
116 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
117 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
118 import org.apache.hadoop.hbase.ipc.RpcCallContext;
119 import org.apache.hadoop.hbase.ipc.RpcServer;
120 import org.apache.hadoop.hbase.master.AssignmentManager;
121 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
122 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
123 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
124 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
125 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
126 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
127 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
128 import org.apache.hadoop.hbase.regionserver.MultiVersionConsistencyControl.WriteEntry;
129 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
130 import org.apache.hadoop.hbase.regionserver.wal.HLog;
131 import org.apache.hadoop.hbase.regionserver.wal.HLogFactory;
132 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
133 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
134 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter.MutationReplay;
135 import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
136 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
137 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
138 import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
139 import org.apache.hadoop.hbase.util.Bytes;
140 import org.apache.hadoop.hbase.util.CancelableProgressable;
141 import org.apache.hadoop.hbase.util.ClassSize;
142 import org.apache.hadoop.hbase.util.CompressionTest;
143 import org.apache.hadoop.hbase.util.EncryptionTest;
144 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
145 import org.apache.hadoop.hbase.util.FSTableDescriptors;
146 import org.apache.hadoop.hbase.util.FSUtils;
147 import org.apache.hadoop.hbase.util.HashedBytes;
148 import org.apache.hadoop.hbase.util.Pair;
149 import org.apache.hadoop.hbase.util.Threads;
150 import org.apache.hadoop.io.MultipleIOException;
151 import org.apache.hadoop.util.StringUtils;
152 import org.cliffc.high_scale_lib.Counter;
153 
154 import com.google.common.annotations.VisibleForTesting;
155 import com.google.common.base.Preconditions;
156 import com.google.common.collect.Lists;
157 import com.google.common.collect.Maps;
158 import com.google.common.io.Closeables;
159 import com.google.protobuf.Descriptors;
160 import com.google.protobuf.Message;
161 import com.google.protobuf.RpcCallback;
162 import com.google.protobuf.RpcController;
163 import com.google.protobuf.Service;
164 
165 /**
166  * HRegion stores data for a certain region of a table.  It stores all columns
167  * for each row. A given table consists of one or more HRegions.
168  *
169  * <p>We maintain multiple HStores for a single HRegion.
170  *
171  * <p>A Store is a set of rows with some column data; together,
172  * they make up all the data for the rows.
173  *
174  * <p>Each HRegion has a 'startKey' and 'endKey'.
175  * <p>The first is inclusive, the second is exclusive (except for
176  * the final region).  The endKey of region 0 is the same as
177  * startKey for region 1 (if it exists).  The startKey for the
178  * first region is null. The endKey for the final region is null.
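 * For example, if a table is split at row 'm', the first region covers
 * [null, 'm') and the second covers ['m', null); a read or write of row 'm'
 * is served by the second region.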
179  *
180  * <p>Locking at the HRegion level serves only one purpose: preventing the
181  * region from being closed (and consequently split) while other operations
182  * are ongoing. Each row level operation obtains both a row lock and a region
183  * read lock for the duration of the operation. While a scanner is being
184  * constructed, getScanner holds a read lock. If the scanner is successfully
185  * constructed, it holds a read lock until it is closed. A close takes out a
186  * write lock and consequently will block for ongoing operations and will block
187  * new operations from starting while the close is in progress.
188  *
189  * <p>An HRegion is defined by its table and its key extent.
190  *
191  * <p>It consists of at least one Store.  The number of Stores should be
192  * configurable, so that data which is accessed together is stored in the same
193  * Store.  Right now, we approximate that by building a single Store for
194  * each column family.  (This config info will be communicated via the
195  * tabledesc.)
196  *
197  * <p>The HTableDescriptor contains metainfo about the HRegion's table.
198  * regionName is a unique identifier for this HRegion. [startKey, endKey)
199  * defines the keyspace for this HRegion.
200  */
201 @InterfaceAudience.Private
202 public class HRegion implements HeapSize { // , Writable{
203   public static final Log LOG = LogFactory.getLog(HRegion.class);
204 
205   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
206       "hbase.hregion.scan.loadColumnFamiliesOnDemand";
207 
208   /**
209    * This is the global default value for durability. All tables/mutations not
210    * defining a durability or using USE_DEFAULT will default to this value.
211    */
212   private static final Durability DEFAULT_DURABLITY = Durability.SYNC_WAL;
213 
214   final AtomicBoolean closed = new AtomicBoolean(false);
215   /* Closing can take some time; use the closing flag if there is stuff we don't
216    * want to do while in closing state; e.g. offer this region up to the
217    * master as a region to close if the carrying regionserver is overloaded.
218    * Once set, it is never cleared.
219    */
220   final AtomicBoolean closing = new AtomicBoolean(false);
221 
222   protected volatile long completeSequenceId = -1L;
223 
224   /**
225    * Region level sequence Id. It is used for appending WALEdits in HLog. Its default value is -1,
226    * as a marker that the region hasn't opened yet. Once it is opened, it is set to
227    * {@link #openSeqNum}.
228    */
229   private final AtomicLong sequenceId = new AtomicLong(-1L);
230 
231   /**
232    * Operation enum is used in {@link HRegion#startRegionOperation} to provide operation context,
233    * allowing startRegionOperation to invoke different checks before any region operation. Not all
234    * operations have to be defined here. It's only needed when a special check is needed in
235    * startRegionOperation.
236    */
237   public enum Operation {
238     ANY, GET, PUT, DELETE, SCAN, APPEND, INCREMENT, SPLIT_REGION, MERGE_REGION, BATCH_MUTATE,
239     REPLAY_BATCH_MUTATE, COMPACT_REGION
240   }
241 
242   //////////////////////////////////////////////////////////////////////////////
243   // Members
244   //////////////////////////////////////////////////////////////////////////////
245 
246   // map from a locked row to the context for that lock including:
247   // - CountDownLatch for threads waiting on that row
248   // - the thread that owns the lock (allow reentrancy)
249   // - reference count of (reentrant) locks held by the thread
250   // - the row itself
251   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
252       new ConcurrentHashMap<HashedBytes, RowLockContext>();
253 
254   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
255       Bytes.BYTES_RAWCOMPARATOR);
256 
257   // TODO: account for each registered handler in HeapSize computation
258   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
259 
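  // Current total size, in bytes, of all memstores in this region; kept in sync
  // with the global memstore size via addAndGetGlobalMemstoreSize().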
260   public final AtomicLong memstoreSize = new AtomicLong(0);
261 
262   // Debug possible data loss due to WAL off
263   final Counter numMutationsWithoutWAL = new Counter();
264   final Counter dataInMemoryWithoutWAL = new Counter();
265 
266   // Debug why CAS operations are taking a while.
267   final Counter checkAndMutateChecksPassed = new Counter();
268   final Counter checkAndMutateChecksFailed = new Counter();
269 
270   //Number of requests
271   final Counter readRequestsCount = new Counter();
272   final Counter writeRequestsCount = new Counter();
273 
274   // Number of requests blocked by memstore size.
275   private final Counter blockedRequestsCount = new Counter();
276 
277   /**
278    * @return the number of blocked requests.
279    */
280   public long getBlockedRequestsCount() {
281     return this.blockedRequestsCount.get();
282   }
283 
284   // Compaction counters
285   final AtomicLong compactionsFinished = new AtomicLong(0L);
286   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
287   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
288 
289 
290   private final HLog log;
291   private final HRegionFileSystem fs;
292   protected final Configuration conf;
293   private final Configuration baseConf;
294   private final KeyValue.KVComparator comparator;
295   private final int rowLockWaitDuration;
296   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
297 
298   // The internal wait duration to acquire a lock before read/update
299   // from the region. It is not per row. The purpose of this wait time
300   // is to avoid waiting a long time while the region is busy, so that
301   // we can release the IPC handler soon enough to improve the
302   // availability of the region server. It can be adjusted by
303   // tuning configuration "hbase.busy.wait.duration".
304   final long busyWaitDuration;
305   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
306 
307   // If updating multiple rows in one call, wait longer,
308   // i.e. waiting for busyWaitDuration * # of rows. However,
309   // we can limit the max multiplier.
310   final int maxBusyWaitMultiplier;
311 
312   // Max busy wait duration. There is no point to wait longer than the RPC
313   // purge timeout, when a RPC call will be terminated by the RPC engine.
314   final long maxBusyWaitDuration;
315 
316   // negative number indicates infinite timeout
317   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
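  // Pool used to run row processor tasks so processRowsWithLocks() can bound
  // their execution time by rowProcessorTimeout.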
318   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
319 
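  // Map of open region scanners to the mvcc read point each captured when it was
  // created; used by getSmallestReadPoint() to find the smallest read point still
  // in use by any scanner.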
320   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
321 
322   /**
323    * The sequence ID that was encountered when this region was opened.
324    */
325   private long openSeqNum = HConstants.NO_SEQNUM;
326 
327   /**
328    * The default setting for whether to enable on-demand CF loading for
329    * scan requests to this region. Requests can override it.
330    */
331   private boolean isLoadingCfsOnDemandDefault = false;
332 
333   private final AtomicInteger majorInProgress = new AtomicInteger(0);
334   private final AtomicInteger minorInProgress = new AtomicInteger(0);
335 
336   //
337   // Context: During replay we want to ensure that we do not lose any data. So, we
338   // have to be conservative in how we replay logs. For each store, we calculate
339   // the maxSeqId up to which the store was flushed. And, skip the edits which
340   // are equal to or lower than maxSeqId for each store.
341   // The following map is populated when opening the region
342   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
343 
344   /**
345    * Config setting for whether to allow writes when a region is in recovering or not.
346    */
347   private boolean disallowWritesInRecovering = false;
348 
349   // when a region is in recovering state, it can only accept writes, not reads
350   private volatile boolean isRecovering = false;
351 
352   /**
353    * @return The smallest mvcc readPoint across all the scanners in this
354    * region. Writes older than this readPoint are included in every
355    * read operation.
356    */
357   public long getSmallestReadPoint() {
358     long minimumReadPoint;
359     // We need to ensure that while we are calculating the smallestReadPoint
360     // no new RegionScanners can grab a readPoint that we are unaware of.
361     // We achieve this by synchronizing on the scannerReadPoints object.
362     synchronized(scannerReadPoints) {
363       minimumReadPoint = mvcc.memstoreReadPoint();
364 
365       for (Long readPoint: this.scannerReadPoints.values()) {
366         if (readPoint < minimumReadPoint) {
367           minimumReadPoint = readPoint;
368         }
369       }
370     }
371     return minimumReadPoint;
372   }
373   /*
374    * Data structure of write state flags used for coordinating flushes,
375    * compactions and closes.
376    */
377   static class WriteState {
378     // Set while a memstore flush is happening.
379     volatile boolean flushing = false;
380     // Set when a flush has been requested.
381     volatile boolean flushRequested = false;
382     // Number of compactions running.
383     volatile int compacting = 0;
384     // Cleared in close. Once cleared, we can no longer compact or flush.
385     volatile boolean writesEnabled = true;
386     // Set if region is read-only
387     volatile boolean readOnly = false;
388 
389     /**
390      * Set flags that make this region read-only.
391      *
392      * @param onOff flip value for region r/o setting
393      */
394     synchronized void setReadOnly(final boolean onOff) {
395       this.writesEnabled = !onOff;
396       this.readOnly = onOff;
397     }
398 
399     boolean isReadOnly() {
400       return this.readOnly;
401     }
402 
403     boolean isFlushRequested() {
404       return this.flushRequested;
405     }
406 
407     static final long HEAP_SIZE = ClassSize.align(
408         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
409   }
410 
411   /**
412   * Objects of this class are created when flushing to describe all the different states
413   * that method can end up in. The Result enum describes those states. The sequence id should only
414   * be specified if the flush was successful, and the failure message should only be specified
415   * if it didn't flush.
416    */
417   public static class FlushResult {
418     enum Result {
419       FLUSHED_NO_COMPACTION_NEEDED,
420       FLUSHED_COMPACTION_NEEDED,
421       // Special case where a flush didn't run because there's nothing in the memstores. Used when
422       // bulk loading to know when we can still load even if a flush didn't happen.
423       CANNOT_FLUSH_MEMSTORE_EMPTY,
424       CANNOT_FLUSH
425       // Be careful adding more to this enum, look at the below methods to make sure
426     }
427 
428     final Result result;
429     final String failureReason;
430     final long flushSequenceId;
431 
432     /**
433      * Convenience constructor to use when the flush is successful, the failure message is set to
434      * null.
435      * @param result Expecting FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
436      * @param flushSequenceId Generated sequence id that comes right after the edits in the
437      *                        memstores.
438      */
439     FlushResult(Result result, long flushSequenceId) {
440       this(result, flushSequenceId, null);
441       assert result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
442           .FLUSHED_COMPACTION_NEEDED;
443     }
444 
445     /**
446      * Convenience constructor to use when we cannot flush.
447      * @param result Expecting CANNOT_FLUSH_MEMSTORE_EMPTY or CANNOT_FLUSH.
448      * @param failureReason Reason why we couldn't flush.
449      */
450     FlushResult(Result result, String failureReason) {
451       this(result, -1, failureReason);
452       assert result == Result.CANNOT_FLUSH_MEMSTORE_EMPTY || result == Result.CANNOT_FLUSH;
453     }
454 
455     /**
456      * Constructor with all the parameters.
457     * @param result Any of the Result values.
458      * @param flushSequenceId Generated sequence id if the memstores were flushed else -1.
459      * @param failureReason Reason why we couldn't flush, or null.
460      */
461     FlushResult(Result result, long flushSequenceId, String failureReason) {
462       this.result = result;
463       this.flushSequenceId = flushSequenceId;
464       this.failureReason = failureReason;
465     }
466 
467     /**
468      * Convenience method, the equivalent of checking if result is
469     * FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_COMPACTION_NEEDED.
470      * @return true if the memstores were flushed, else false.
471      */
472     public boolean isFlushSucceeded() {
473       return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
474           .FLUSHED_COMPACTION_NEEDED;
475     }
476 
477     /**
478      * Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
479     * @return True if the flush requested a compaction, else false (a false value says
480     * nothing about whether the flush itself happened).
480      */
481     public boolean isCompactionNeeded() {
482       return result == Result.FLUSHED_COMPACTION_NEEDED;
483     }
484   }
485 
486   final WriteState writestate = new WriteState();
487 
488   long memstoreFlushSize;
489   final long timestampSlop;
490   final long rowProcessorTimeout;
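  // Time, in ms, when the memstore was last flushed; initialized at region open.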
491   private volatile long lastFlushTime;
492   final RegionServerServices rsServices;
493   private RegionServerAccounting rsAccounting;
494   private List<Pair<Long, Long>> recentFlushes = new ArrayList<Pair<Long,Long>>();
495   private long flushCheckInterval;
496   // flushPerChanges caps the number of unflushed changes allowed in the memstore
497   private long flushPerChanges;
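  // Updates are blocked once the memstore grows past this size
  // (memstoreFlushSize * hbase.hregion.memstore.block.multiplier).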
498   private long blockingMemStoreSize;
499   final long threadWakeFrequency;
500   // Used to guard closes
501   final ReentrantReadWriteLock lock =
502     new ReentrantReadWriteLock();
503 
504   // Stop updates lock
505   private final ReentrantReadWriteLock updatesLock =
506     new ReentrantReadWriteLock();
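  // Whether a split has been requested for this region, and an optional
  // caller-supplied split point.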
507   private boolean splitRequest;
508   private byte[] explicitSplitPoint = null;
509 
510   private final MultiVersionConsistencyControl mvcc =
511       new MultiVersionConsistencyControl();
512 
513   // Coprocessor host
514   private RegionCoprocessorHost coprocessorHost;
515 
516   private HTableDescriptor htableDescriptor = null;
517   private RegionSplitPolicy splitPolicy;
518 
519   private final MetricsRegion metricsRegion;
520   private final MetricsRegionWrapperImpl metricsRegionWrapper;
521   private final Durability durability;
522   private final boolean regionStatsEnabled;
523 
524   /**
525    * HRegion constructor. This constructor should only be used for testing and
526    * extensions.  Instances of HRegion should be instantiated with the
527    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
528    *
529    * @param tableDir qualified path of directory where region should be located,
530    * usually the table directory.
531    * @param log The HLog is the outbound log for any updates to the HRegion
532    * (There's a single HLog for all the HRegions on a single HRegionServer.)
533    * The log file is a logfile from the previous execution that's
534    * custom-computed for this HRegion. The HRegionServer computes and sorts the
535    * appropriate log info for this HRegion. If there is a previous log file
536    * (implying that the HRegion has been written-to before), then read it from
537    * the supplied path.
538    * @param fs is the filesystem.
539    * @param confParam is global configuration settings.
540   * @param regionInfo HRegionInfo that describes the region
542    * @param htd the table descriptor
543    * @param rsServices reference to {@link RegionServerServices} or null
544    */
545   @Deprecated
546   public HRegion(final Path tableDir, final HLog log, final FileSystem fs,
547       final Configuration confParam, final HRegionInfo regionInfo,
548       final HTableDescriptor htd, final RegionServerServices rsServices) {
549     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
550       log, confParam, htd, rsServices);
551   }
552 
553   /**
554    * HRegion constructor. This constructor should only be used for testing and
555    * extensions.  Instances of HRegion should be instantiated with the
556    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
557    *
558    * @param fs is the filesystem.
559    * @param log The HLog is the outbound log for any updates to the HRegion
560    * (There's a single HLog for all the HRegions on a single HRegionServer.)
561    * The log file is a logfile from the previous execution that's
562    * custom-computed for this HRegion. The HRegionServer computes and sorts the
563    * appropriate log info for this HRegion. If there is a previous log file
564    * (implying that the HRegion has been written-to before), then read it from
565    * the supplied path.
566    * @param confParam is global configuration settings.
567    * @param htd the table descriptor
568    * @param rsServices reference to {@link RegionServerServices} or null
569    */
570   public HRegion(final HRegionFileSystem fs, final HLog log, final Configuration confParam,
571       final HTableDescriptor htd, final RegionServerServices rsServices) {
572     if (htd == null) {
573       throw new IllegalArgumentException("Need table descriptor");
574     }
575 
576     if (confParam instanceof CompoundConfiguration) {
577       throw new IllegalArgumentException("Need original base configuration");
578     }
579 
580     this.comparator = fs.getRegionInfo().getComparator();
581     this.log = log;
582     this.fs = fs;
583 
584     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
585     this.baseConf = confParam;
586     this.conf = new CompoundConfiguration()
587       .add(confParam)
588       .addStringMap(htd.getConfiguration())
589       .addWritableMap(htd.getValues());
590     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
591         DEFAULT_CACHE_FLUSH_INTERVAL);
592     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
593     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
594       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
595           + MAX_FLUSH_PER_CHANGES);
596     }
597 
598     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
599                     DEFAULT_ROWLOCK_WAIT_DURATION);
600 
601     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
602     this.htableDescriptor = htd;
603     this.rsServices = rsServices;
604     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
605     setHTableSpecificConf();
606     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
607 
608     this.busyWaitDuration = conf.getLong(
609       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
610     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
611     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
612       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
613         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
614         + maxBusyWaitMultiplier + "). Their product should be positive");
615     }
616     this.maxBusyWaitDuration = conf.getLong("hbase.ipc.client.call.purge.timeout",
617       conf.getLong("ipc.client.call.purge.timeout", 2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT));
618 
619     /*
620      * timestamp.slop provides a server-side constraint on the timestamp. This
621      * assumes that you base your TS around currentTimeMillis(). In this case,
622      * throw an error to the user if the user-specified TS is newer than now +
623      * slop. A value of LATEST_TIMESTAMP disables this check.
624      */
625     this.timestampSlop = conf.getLong(
626         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
627         HConstants.LATEST_TIMESTAMP);
628 
629     /**
630      * Timeout for the process time in processRowsWithLocks().
631      * Use -1 to switch off time bound.
632      */
633     this.rowProcessorTimeout = conf.getLong(
634         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
635     this.durability = htd.getDurability() == Durability.USE_DEFAULT
636         ? DEFAULT_DURABLITY
637         : htd.getDurability();
638     if (rsServices != null) {
639       this.rsAccounting = this.rsServices.getRegionServerAccounting();
640       // don't initialize coprocessors if not running within a regionserver
641       // TODO: revisit if coprocessors should load in other cases
642       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
643       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
644       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
645 
646       Map<String, HRegion> recoveringRegions = rsServices.getRecoveringRegions();
647       String encodedName = getRegionInfo().getEncodedName();
648       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
649         this.isRecovering = true;
650         recoveringRegions.put(encodedName, this);
651       }
652     } else {
653       this.metricsRegionWrapper = null;
654       this.metricsRegion = null;
655     }
656     if (LOG.isDebugEnabled()) {
657       // Write out region name as string and its encoded name.
658       LOG.debug("Instantiated " + this);
659     }
660 
661     // by default, we allow writes against a region when it's in recovering
662     this.disallowWritesInRecovering =
663         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
664           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
665 
666     // disable stats tracking for system tables, but check the config for everything else
667     this.regionStatsEnabled = htd.getTableName().getNamespaceAsString().equals(
668       NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR) ? false :
669         conf.getBoolean(HConstants.ENABLE_CLIENT_BACKPRESSURE,
670           HConstants.DEFAULT_ENABLE_CLIENT_BACKPRESSURE);
671   }
672 
673   void setHTableSpecificConf() {
674     if (this.htableDescriptor == null) return;
675     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
676 
677     if (flushSize <= 0) {
678       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
679         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
680     }
681     this.memstoreFlushSize = flushSize;
682     this.blockingMemStoreSize = this.memstoreFlushSize *
683         conf.getLong("hbase.hregion.memstore.block.multiplier", 2);
684   }
685 
686   /**
687    * Initialize this region.
688    * Used only by tests and SplitTransaction to reopen the region.
689    * You should use createHRegion() or openHRegion() instead.
690    * @return What the next sequence (edit) id should be.
691    * @throws IOException e
692    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
693    */
694   @Deprecated
695   public long initialize() throws IOException {
696     return initialize(null);
697   }
698 
699   /**
700    * Initialize this region.
701    *
702    * @param reporter Tickle every so often if initialize is taking a while.
703    * @return What the next sequence (edit) id should be.
704    * @throws IOException e
705    */
706   private long initialize(final CancelableProgressable reporter) throws IOException {
707     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
708     long nextSeqId = -1;
709     try {
710       nextSeqId = initializeRegionInternals(reporter, status);
711       return nextSeqId;
712     } finally {
713       // nextSeqid will be -1 if the initialization fails.
714       // Otherwise it will be at least 0.
715       if (nextSeqId == -1) {
716         status
717             .abort("Exception during region " + this.getRegionNameAsString() + " initialization.");
718       }
719     }
720   }
721 
722   private long initializeRegionInternals(final CancelableProgressable reporter,
723       final MonitoredTask status) throws IOException, UnsupportedEncodingException {
724     if (coprocessorHost != null) {
725       status.setStatus("Running coprocessor pre-open hook");
726       coprocessorHost.preOpen();
727     }
728 
729     // Write HRI to a file in case we need to recover hbase:meta
730     status.setStatus("Writing region info on filesystem");
731     fs.checkRegionInfoOnFilesystem();
732 
733     // Remove temporary data left over from old regions
734     status.setStatus("Cleaning up temporary data from old regions");
735     fs.cleanupTempDir();
736 
737     // Initialize all the HStores
738     status.setStatus("Initializing all the Stores");
739     long maxSeqId = initializeRegionStores(reporter, status);
740 
741     status.setStatus("Cleaning up detritus from prior splits");
742     // Get rid of any splits or merges that were lost in-progress.  Clean out
743     // these directories here on open.  We may be opening a region that was
744     // being split but we crashed in the middle of it all.
745     fs.cleanupAnySplitDetritus();
746     fs.cleanupMergesDir();
747 
748     this.writestate.setReadOnly(this.htableDescriptor.isReadOnly());
749     this.writestate.flushRequested = false;
750     this.writestate.compacting = 0;
751 
752     // Initialize split policy
753     this.splitPolicy = RegionSplitPolicy.create(this, conf);
754 
755     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
756     // Use maximum of log sequenceid or that which was found in stores
757     // (particularly if no recovered edits, seqid will be -1).
758     long nextSeqid = maxSeqId + 1;
759     if (this.isRecovering) {
760       // In distributedLogReplay mode, we don't know the last change sequence number because region
761       // is opened before recovery completes. So we add a safety bumper to avoid new sequence
762       // numbers overlapping already used sequence numbers
763       nextSeqid += this.flushPerChanges + 10000000; // add another extra 10million
764     }
765     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
766       "; next sequenceid=" + nextSeqid);
767 
768     // A region can be reopened if failed a split; reset flags
769     this.closing.set(false);
770     this.closed.set(false);
771 
772     if (coprocessorHost != null) {
773       status.setStatus("Running coprocessor post-open hooks");
774       coprocessorHost.postOpen();
775     }
776 
777     status.markComplete("Region opened successfully");
778     return nextSeqid;
779   }
780 
781   private long initializeRegionStores(final CancelableProgressable reporter, MonitoredTask status)
782       throws IOException, UnsupportedEncodingException {
783     // Load in all the HStores.
784 
785     long maxSeqId = -1;
786     // initialized to -1 so that we pick up MemstoreTS from column families
787     long maxMemstoreTS = -1;
788 
789     if (!htableDescriptor.getFamilies().isEmpty()) {
790       // initialize the thread pool for opening stores in parallel.
791       ThreadPoolExecutor storeOpenerThreadPool =
792         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
793       CompletionService<HStore> completionService =
794         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
795 
796       // initialize each store in parallel
797       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
798         status.setStatus("Instantiating store for column family " + family);
799         completionService.submit(new Callable<HStore>() {
800           @Override
801           public HStore call() throws IOException {
802             return instantiateHStore(family);
803           }
804         });
805       }
806       boolean allStoresOpened = false;
807       try {
808         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
809           Future<HStore> future = completionService.take();
810           HStore store = future.get();
811           this.stores.put(store.getColumnFamilyName().getBytes(), store);
812 
813           long storeMaxSequenceId = store.getMaxSequenceId();
814           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
815               storeMaxSequenceId);
816           if (maxSeqId == -1 || storeMaxSequenceId > maxSeqId) {
817             maxSeqId = storeMaxSequenceId;
818           }
819           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
820           if (maxStoreMemstoreTS > maxMemstoreTS) {
821             maxMemstoreTS = maxStoreMemstoreTS;
822           }
823         }
824         allStoresOpened = true;
825       } catch (InterruptedException e) {
826         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
827       } catch (ExecutionException e) {
828         throw new IOException(e.getCause());
829       } finally {
830         storeOpenerThreadPool.shutdownNow();
831         if (!allStoresOpened) {
832           // something went wrong, close all opened stores
833           LOG.error("Could not initialize all stores for the region=" + this);
834           for (Store store : this.stores.values()) {
835             try {
836               store.close();
837             } catch (IOException e) {
838               LOG.warn(e.getMessage());
839             }
840           }
841         }
842       }
843     }
844     mvcc.initialize(maxMemstoreTS + 1);
845     // Recover any edits if available.
846     maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(
847         this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
848     return maxSeqId;
849   }
850 
851   /**
852    * @return True if this region has references.
853    */
854   public boolean hasReferences() {
855     for (Store store : this.stores.values()) {
856       if (store.hasReferences()) return true;
857     }
858     return false;
859   }
860 
861   /**
862    * This function will return the HDFS blocks distribution based on the data
863    * captured when each HFile is created
864    * @return The HDFS blocks distribution for the region.
865    */
866   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
867     HDFSBlocksDistribution hdfsBlocksDistribution =
868       new HDFSBlocksDistribution();
869     synchronized (this.stores) {
870       for (Store store : this.stores.values()) {
871         for (StoreFile sf : store.getStorefiles()) {
872           HDFSBlocksDistribution storeFileBlocksDistribution =
873             sf.getHDFSBlockDistribution();
874           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
875         }
876       }
877     }
878     return hdfsBlocksDistribution;
879   }
880 
881   /**
882    * This is a helper function to compute HDFS block distribution on demand
883    * @param conf configuration
884    * @param tableDescriptor HTableDescriptor of the table
885    * @param regionInfo the HRegionInfo describing the region
886    * @return The HDFS blocks distribution for the given region.
887    * @throws IOException
888    */
889   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
890       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
891     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
892     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
893   }
894 
895   /**
896    * This is a helper function to compute HDFS block distribution on demand
897    * @param conf configuration
898    * @param tableDescriptor HTableDescriptor of the table
899    * @param regionInfo the HRegionInfo describing the region
900    * @param tablePath the table directory
901    * @return The HDFS blocks distribution for the given region.
902    * @throws IOException
903    */
904   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
905       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
906       throws IOException {
907     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
908     FileSystem fs = tablePath.getFileSystem(conf);
909 
910     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
911     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
912       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
913       if (storeFiles == null) continue;
914 
915       for (StoreFileInfo storeFileInfo : storeFiles) {
916         hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
917       }
918     }
919     return hdfsBlocksDistribution;
920   }
921 
922   public AtomicLong getMemstoreSize() {
923     return memstoreSize;
924   }
925 
926   /**
927    * Increase the size of the memstore in this region and the size of the global
928    * memstore.
929    * @param memStoreSize the amount to add to the memstore size
930    * @return the new size of the memstore in this region
931    */
932   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
933     if (this.rsAccounting != null) {
934       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
935     }
936     return this.memstoreSize.addAndGet(memStoreSize);
937   }
938 
939   /** @return a HRegionInfo object for this region */
940   public HRegionInfo getRegionInfo() {
941     return this.fs.getRegionInfo();
942   }
943 
944   /**
945    * @return Instance of {@link RegionServerServices} used by this HRegion.
946    * Can be null.
947    */
948   RegionServerServices getRegionServerServices() {
949     return this.rsServices;
950   }
951 
952   /**
953    * @return split policy for this region.
954    */
955   public RegionSplitPolicy getSplitPolicy() {
956     return this.splitPolicy;
957   }
958 
959   /** @return readRequestsCount for this region */
960   long getReadRequestsCount() {
961     return this.readRequestsCount.get();
962   }
963 
964   /** @return writeRequestsCount for this region */
965   long getWriteRequestsCount() {
966     return this.writeRequestsCount.get();
967   }
968 
969   public MetricsRegion getMetrics() {
970     return metricsRegion;
971   }
972 
973   /** @return true if region is closed */
974   public boolean isClosed() {
975     return this.closed.get();
976   }
977 
978   /**
979    * @return True if closing process has started.
980    */
981   public boolean isClosing() {
982     return this.closing.get();
983   }
984 
985   /**
986    * Reset recovering state of current region
987    * @param newState the new recovering state
988    */
989   public void setRecovering(boolean newState) {
990     boolean wasRecovering = this.isRecovering;
991     this.isRecovering = newState;
992     if (wasRecovering && !isRecovering) {
993       // Call only when log replay is over.
994       coprocessorHost.postLogReplay();
995     }
996   }
997 
998   /**
999    * @return True if current region is in recovering
1000    */
1001   public boolean isRecovering() {
1002     return this.isRecovering;
1003   }
1004 
1005   /** @return true if region is available (not closed and not closing) */
1006   public boolean isAvailable() {
1007     return !isClosed() && !isClosing();
1008   }
1009 
1010   /** @return true if region is splittable */
1011   public boolean isSplittable() {
1012     return isAvailable() && !hasReferences();
1013   }
1014 
1015   /**
1016    * @return true if region is mergeable
1017    */
1018   public boolean isMergeable() {
1019     if (!isAvailable()) {
1020       LOG.debug("Region " + this.getRegionNameAsString()
1021           + " is not mergeable because it is closing or closed");
1022       return false;
1023     }
1024     if (hasReferences()) {
1025       LOG.debug("Region " + this.getRegionNameAsString()
1026           + " is not mergeable because it has references");
1027       return false;
1028     }
1029 
1030     return true;
1031   }
1032 
1033   public boolean areWritesEnabled() {
1034     synchronized(this.writestate) {
1035       return this.writestate.writesEnabled;
1036     }
1037   }
1038 
1039    public MultiVersionConsistencyControl getMVCC() {
1040      return mvcc;
1041    }
1042 
1043    /*
1044     * Returns readpoint considering given IsolationLevel
1045     */
1046    public long getReadpoint(IsolationLevel isolationLevel) {
1047      if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
1048        // This scan can read even uncommitted transactions
1049        return Long.MAX_VALUE;
1050      }
1051      return mvcc.memstoreReadPoint();
1052    }
1053 
1054    public boolean isLoadingCfsOnDemandDefault() {
1055      return this.isLoadingCfsOnDemandDefault;
1056    }
1057 
1058   /**
1059    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
1060    * service any more calls.
1061    *
1062    * <p>This method could take some time to execute, so don't call it from a
1063    * time-sensitive thread.
1064    *
1065   * @return Map of all the storage files that the HRegion's component
1066   * HStores make use of, keyed by column family name.  Can be null if we are
1067   * not to close at this time or we are already closed.
1068    *
1069    * @throws IOException e
1070    */
1071   public Map<byte[], List<StoreFile>> close() throws IOException {
1072     return close(false);
1073   }
1074 
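  // Serializes close() calls so only one thread runs doClose() at a time.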
1075   private final Object closeLock = new Object();
1076 
1077   /** Conf key for the periodic flush interval */
1078   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
1079       "hbase.regionserver.optionalcacheflushinterval";
1080   /** Default interval for the memstore flush */
1081   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
1082 
1083   /** Conf key to force a flush if there are already enough changes for one region in memstore */
1084   public static final String MEMSTORE_FLUSH_PER_CHANGES =
1085       "hbase.regionserver.flush.per.changes";
1086  public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
1087   /**
1088    * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes
1089   * overhead. Therefore, even 1 billion empty KVs occupy at least 20GB of memstore for a single region
1090    */
1091   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
1092 
1093   /**
1094   * Close down this HRegion.  Flush the cache unless the abort parameter is true,
1095   * shut down each HStore, and don't service any more calls.
1096    *
1097    * This method could take some time to execute, so don't call it from a
1098    * time-sensitive thread.
1099    *
1100    * @param abort true if server is aborting (only during testing)
1101    * @return Vector of all the storage files that the HRegion's component
1102    * HStores make use of.  It's a list of HStoreFile objects.  Can be null if
1103    * we are not to close at this time or we are already closed.
1104    *
1105    * @throws IOException e
1106    */
1107   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1108     // Only allow one thread to close at a time. Serialize them so dual
1109     // threads attempting to close will run up against each other.
1110     MonitoredTask status = TaskMonitor.get().createStatus(
1111         "Closing region " + this +
1112         (abort ? " due to abort" : ""));
1113 
1114     status.setStatus("Waiting for close lock");
1115     try {
1116       synchronized (closeLock) {
1117         return doClose(abort, status);
1118       }
1119     } finally {
1120       status.cleanup();
1121     }
1122   }
1123 
1124   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1125       throws IOException {
1126     if (isClosed()) {
1127       LOG.warn("Region " + this + " already closed");
1128       return null;
1129     }
1130 
1131     if (coprocessorHost != null) {
1132       status.setStatus("Running coprocessor pre-close hooks");
1133       this.coprocessorHost.preClose(abort);
1134     }
1135 
1136     status.setStatus("Disabling compacts and flushes for region");
1137     synchronized (writestate) {
1138       // Disable compacting and flushing by background threads for this
1139       // region.
1140       writestate.writesEnabled = false;
1141       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1142       waitForFlushesAndCompactions();
1143     }
1144     // If we were not just flushing, is it worth doing a preflush...one
1145     // that will clear out the bulk of the memstore before we put up
1146     // the close flag?
1147     if (!abort && worthPreFlushing()) {
1148       status.setStatus("Pre-flushing region before close");
1149       LOG.info("Running close preflush of " + this.getRegionNameAsString());
1150       try {
1151         internalFlushcache(status);
1152       } catch (IOException ioe) {
1153         // Failed to flush the region. Keep going.
1154         status.setStatus("Failed pre-flush " + this + "; " + ioe.getMessage());
1155       }
1156     }
1157 
1158     this.closing.set(true);
1159     status.setStatus("Disabling writes for close");
1160     // block waiting for the lock for closing
1161     lock.writeLock().lock();
1162     try {
1163       if (this.isClosed()) {
1164         status.abort("Already got closed by another process");
1165         // SplitTransaction handles the null
1166         return null;
1167       }
1168       LOG.debug("Updates disabled for region " + this);
1169       // Don't flush the cache if we are aborting
1170       if (!abort) {
1171         int flushCount = 0;
1172         while (this.getMemstoreSize().get() > 0) {
1173           try {
1174             if (flushCount++ > 0) {
1175               int actualFlushes = flushCount - 1;
1176               if (actualFlushes > 5) {
1177                 // If we tried 5 times and are unable to clear memory, abort
1178                 // so we do not lose data
1179                 throw new DroppedSnapshotException("Failed clearing memory after " +
1180                   actualFlushes + " attempts on region: " + Bytes.toStringBinary(getRegionName()));
1181               }
1182               LOG.info("Running extra flush, " + actualFlushes +
1183                 " (carrying snapshot?) " + this);
1184             }
1185             internalFlushcache(status);
1186           } catch (IOException ioe) {
1187             status.setStatus("Failed flush " + this + ", putting online again");
1188             synchronized (writestate) {
1189               writestate.writesEnabled = true;
1190             }
1191             // Have to throw to upper layers.  I can't abort server from here.
1192             throw ioe;
1193           }
1194         }
1195       }
1196 
1197       Map<byte[], List<StoreFile>> result =
1198         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1199       if (!stores.isEmpty()) {
1200         // initialize the thread pool for closing stores in parallel.
1201         ThreadPoolExecutor storeCloserThreadPool =
1202           getStoreOpenAndCloseThreadPool("StoreCloserThread-" + this.getRegionNameAsString());
1203         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1204           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1205 
1206         // close each store in parallel
1207         for (final Store store : stores.values()) {
1208           assert abort || store.getFlushableSize() == 0;
1209           completionService
1210               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1211                 @Override
1212                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1213                   return new Pair<byte[], Collection<StoreFile>>(
1214                     store.getFamily().getName(), store.close());
1215                 }
1216               });
1217         }
1218         try {
1219           for (int i = 0; i < stores.size(); i++) {
1220             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1221             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1222             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1223             if (familyFiles == null) {
1224               familyFiles = new ArrayList<StoreFile>();
1225               result.put(storeFiles.getFirst(), familyFiles);
1226             }
1227             familyFiles.addAll(storeFiles.getSecond());
1228           }
1229         } catch (InterruptedException e) {
1230           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1231         } catch (ExecutionException e) {
1232           throw new IOException(e.getCause());
1233         } finally {
1234           storeCloserThreadPool.shutdownNow();
1235         }
1236       }
1237       this.closed.set(true);
1238       if (memstoreSize.get() != 0) LOG.error("Memstore size is " + memstoreSize.get());
1239       if (coprocessorHost != null) {
1240         status.setStatus("Running coprocessor post-close hooks");
1241         this.coprocessorHost.postClose(abort);
1242       }
1243       if ( this.metricsRegion != null) {
1244         this.metricsRegion.close();
1245       }
1246       if ( this.metricsRegionWrapper != null) {
1247         Closeables.closeQuietly(this.metricsRegionWrapper);
1248       }
1249       status.markComplete("Closed");
1250       LOG.info("Closed " + this);
1251       return result;
1252     } finally {
1253       lock.writeLock().unlock();
1254     }
1255   }
1256 
1257   /**
1258    * Wait for all current flushes and compactions of the region to complete.
1259    * <p>
1260    * Exposed for TESTING.
1261    */
1262   public void waitForFlushesAndCompactions() {
1263     synchronized (writestate) {
1264       while (writestate.compacting > 0 || writestate.flushing) {
1265         LOG.debug("waiting for " + writestate.compacting + " compactions"
1266             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1267         try {
1268           writestate.wait();
1269         } catch (InterruptedException iex) {
1270           // essentially ignore and propagate the interrupt back up
1271           Thread.currentThread().interrupt();
1272         }
1273       }
1274     }
1275   }
1276 
1277   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1278       final String threadNamePrefix) {
1279     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1280     int maxThreads = Math.min(numStores,
1281         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1282             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1283     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1284   }
1285 
1286   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1287       final String threadNamePrefix) {
1288     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1289     int maxThreads = Math.max(1,
1290         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1291             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1292             / numStores);
1293     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1294   }
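       // A small worked example of the thread budgets computed above (the numbers are assumed,
       // the constant is the one referenced in the code): if
       // conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX, ...) yields 16 and the table
       // has 4 column families, getStoreOpenAndCloseThreadPool() is capped at min(4, 16) = 4
       // threads, while getStoreFileOpenAndCloseThreadPool() gets max(1, 16 / 4) = 4 threads
       // per store for opening and closing that store's files.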
1295 
1296   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1297       final String threadNamePrefix) {
1298     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1299       new ThreadFactory() {
1300         private int count = 1;
1301 
1302         @Override
1303         public Thread newThread(Runnable r) {
1304           return new Thread(r, threadNamePrefix + "-" + count++);
1305         }
1306       });
1307   }
1308 
1309   /**
1310    * @return True if it's worth doing a flush before we put up the close flag.
1311    */
1312   private boolean worthPreFlushing() {
1313     return this.memstoreSize.get() >
1314       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1315   }
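       // Illustrative sketch of tuning the pre-close flush threshold used above (the 10MB value
       // is an assumption, not a recommendation); only memstores larger than the threshold get a
       // pre-flush before the close flag goes up:
       //
       //   Configuration conf = HBaseConfiguration.create();
       //   conf.setLong("hbase.hregion.preclose.flush.size", 10 * 1024 * 1024);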
1316 
1317   //////////////////////////////////////////////////////////////////////////////
1318   // HRegion accessors
1319   //////////////////////////////////////////////////////////////////////////////
1320 
1321   /** @return start key for region */
1322   public byte [] getStartKey() {
1323     return this.getRegionInfo().getStartKey();
1324   }
1325 
1326   /** @return end key for region */
1327   public byte [] getEndKey() {
1328     return this.getRegionInfo().getEndKey();
1329   }
1330 
1331   /** @return region id */
1332   public long getRegionId() {
1333     return this.getRegionInfo().getRegionId();
1334   }
1335 
1336   /** @return region name */
1337   public byte [] getRegionName() {
1338     return this.getRegionInfo().getRegionName();
1339   }
1340 
1341   /** @return region name as string for logging */
1342   public String getRegionNameAsString() {
1343     return this.getRegionInfo().getRegionNameAsString();
1344   }
1345 
1346   /** @return HTableDescriptor for this region */
1347   public HTableDescriptor getTableDesc() {
1348     return this.htableDescriptor;
1349   }
1350 
1351   /** @return HLog in use for this region */
1352   public HLog getLog() {
1353     return this.log;
1354   }
1355 
1356   /**
1357    * A split takes the config from the parent region and passes it to the daughter
1358    * regions' constructors. If 'conf' were passed instead, the daughters would end up
1359    * using the parent region's HTD settings in addition to their own new HTD. Passing
1360    * 'baseConf' to the daughter regions avoids this tricky dedupe problem.
1361    * @return Configuration object
1362    */
1363   Configuration getBaseConf() {
1364     return this.baseConf;
1365   }
1366 
1367   /** @return {@link FileSystem} being used by this region */
1368   public FileSystem getFilesystem() {
1369     return fs.getFileSystem();
1370   }
1371 
1372   /** @return the {@link HRegionFileSystem} used by this region */
1373   public HRegionFileSystem getRegionFileSystem() {
1374     return this.fs;
1375   }
1376 
1377   /** @return the last time the region was flushed */
1378   public long getLastFlushTime() {
1379     return this.lastFlushTime;
1380   }
1381 
1382   //////////////////////////////////////////////////////////////////////////////
1383   // HRegion maintenance.
1384   //
1385   // These methods are meant to be called periodically by the HRegionServer for
1386   // upkeep.
1387   //////////////////////////////////////////////////////////////////////////////
1388 
1389   /** @return size of the largest HStore. */
1390   public long getLargestHStoreSize() {
1391     long size = 0;
1392     for (Store h : stores.values()) {
1393       long storeSize = h.getSize();
1394       if (storeSize > size) {
1395         size = storeSize;
1396       }
1397     }
1398     return size;
1399   }
1400 
1401   /**
1402    * @return KeyValue Comparator
1403    */
1404   public KeyValue.KVComparator getComparator() {
1405     return this.comparator;
1406   }
1407 
1408   /*
1409    * Do preparation for pending compaction.
1410    * @throws IOException
1411    */
1412   protected void doRegionCompactionPrep() throws IOException {
1413   }
1414 
1415   void triggerMajorCompaction() {
1416     for (Store h : stores.values()) {
1417       h.triggerMajorCompaction();
1418     }
1419   }
1420 
1421   /**
1422    * This is a helper function that compacts all the stores synchronously.
1423    * It is used by utilities and testing.
1424    *
1425    * @param majorCompaction True to force a major compaction regardless of thresholds
1426    * @throws IOException e
1427    */
1428   public void compactStores(final boolean majorCompaction)
1429   throws IOException {
1430     if (majorCompaction) {
1431       this.triggerMajorCompaction();
1432     }
1433     compactStores();
1434   }
1435 
1436   /**
1437    * This is a helper function that compacts all the stores synchronously.
1438    * It is used by utilities and testing.
1439    *
1440    * @throws IOException e
1441    */
1442   public void compactStores() throws IOException {
1443     for (Store s : getStores().values()) {
1444       CompactionContext compaction = s.requestCompaction();
1445       if (compaction != null) {
1446         compact(compaction, s);
1447       }
1448     }
1449   }
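       // Minimal usage sketch for the two helpers above, as they might appear in a test or
       // utility (the 'region' reference is assumed):
       //
       //   region.compactStores(true);   // trigger and run a major compaction on every store
       //   region.compactStores();       // or compact only the stores that request a compaction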
1450 
1451   /*
1452    * Called by compaction thread and after region is opened to compact the
1453    * HStores if necessary.
1454    *
1455    * <p>This operation could block for a long time, so don't call it from a
1456    * time-sensitive thread.
1457    *
1458    * Note that no locking is necessary at this level because compaction only
1459    * conflicts with a region split, and that cannot happen because the region
1460    * server does them sequentially and not in parallel.
1461    *
1462    * @param compaction Compaction details, obtained by requestCompaction()
1463    * @return whether the compaction completed
1464    * @throws IOException e
1465    */
1466   public boolean compact(CompactionContext compaction, Store store) throws IOException {
1467     assert compaction != null && compaction.hasSelection();
1468     assert !compaction.getRequest().getFiles().isEmpty();
1469     if (this.closing.get() || this.closed.get()) {
1470       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1471       store.cancelRequestedCompaction(compaction);
1472       return false;
1473     }
1474     MonitoredTask status = null;
1475     boolean requestNeedsCancellation = true;
1476     // block waiting for the lock for compaction
1477     lock.readLock().lock();
1478     try {
1479       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1480       if (stores.get(cf) != store) {
1481         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1482             + " has been re-instantiated, cancelling this compaction request. "
1483             + "It may be caused by the rollback of a split transaction");
1484         return false;
1485       }
1486 
1487       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1488       if (this.closed.get()) {
1489         String msg = "Skipping compaction on " + this + " because closed";
1490         LOG.debug(msg);
1491         status.abort(msg);
1492         return false;
1493       }
1494       boolean wasStateSet = false;
1495       try {
1496         synchronized (writestate) {
1497           if (writestate.writesEnabled) {
1498             wasStateSet = true;
1499             ++writestate.compacting;
1500           } else {
1501             String msg = "NOT compacting region " + this + ". Writes disabled.";
1502             LOG.info(msg);
1503             status.abort(msg);
1504             return false;
1505           }
1506         }
1507         LOG.info("Starting compaction on " + store + " in region " + this
1508             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1509         doRegionCompactionPrep();
1510         try {
1511           status.setStatus("Compacting store " + store);
1512           // We no longer need to cancel the request on the way out of this
1513           // method because Store#compact will clean up unconditionally
1514           requestNeedsCancellation = false;
1515           store.compact(compaction);
1516         } catch (InterruptedIOException iioe) {
1517           String msg = "compaction interrupted";
1518           LOG.info(msg, iioe);
1519           status.abort(msg);
1520           return false;
1521         }
1522       } finally {
1523         if (wasStateSet) {
1524           synchronized (writestate) {
1525             --writestate.compacting;
1526             if (writestate.compacting <= 0) {
1527               writestate.notifyAll();
1528             }
1529           }
1530         }
1531       }
1532       status.markComplete("Compaction complete");
1533       return true;
1534     } finally {
1535       try {
1536         if (requestNeedsCancellation) store.cancelRequestedCompaction(compaction);
1537         if (status != null) status.cleanup();
1538       } finally {
1539         lock.readLock().unlock();
1540       }
1541     }
1542   }
1543 
1544   /**
1545    * Flush the cache.
1546    *
1547    * When this method is called the cache will be flushed unless:
1548    * <ol>
1549    *   <li>the cache is empty</li>
1550    *   <li>the region is closed.</li>
1551    *   <li>a flush is already in progress</li>
1552    *   <li>writes are disabled</li>
1553    * </ol>
1554    *
1555    * <p>This method may block for some time, so it should not be called from a
1556    * time-sensitive thread.
1557    *
1558    * @return the FlushResult describing whether the flush succeeded and whether the region needs compacting
1559    *
1560    * @throws IOException general io exceptions
1561    * @throws DroppedSnapshotException Thrown when replay of hlog is required
1562    * because a Snapshot was not properly persisted.
1563    */
1564   public FlushResult flushcache() throws IOException {
1565     // fail-fast instead of waiting on the lock
1566     if (this.closing.get()) {
1567       String msg = "Skipping flush on " + this + " because closing";
1568       LOG.debug(msg);
1569       return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
1570     }
1571     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
1572     status.setStatus("Acquiring readlock on region");
1573     // block waiting for the lock for flushing cache
1574     lock.readLock().lock();
1575     try {
1576       if (this.closed.get()) {
1577         String msg = "Skipping flush on " + this + " because closed";
1578         LOG.debug(msg);
1579         status.abort(msg);
1580         return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
1581       }
1582       if (coprocessorHost != null) {
1583         status.setStatus("Running coprocessor pre-flush hooks");
1584         coprocessorHost.preFlush();
1585       }
1586       if (numMutationsWithoutWAL.get() > 0) {
1587         numMutationsWithoutWAL.set(0);
1588         dataInMemoryWithoutWAL.set(0);
1589       }
1590       synchronized (writestate) {
1591         if (!writestate.flushing && writestate.writesEnabled) {
1592           this.writestate.flushing = true;
1593         } else {
1594           if (LOG.isDebugEnabled()) {
1595             LOG.debug("NOT flushing memstore for region " + this
1596                 + ", flushing=" + writestate.flushing + ", writesEnabled="
1597                 + writestate.writesEnabled);
1598           }
1599           String msg = "Not flushing since "
1600               + (writestate.flushing ? "already flushing"
1601               : "writes not enabled");
1602           status.abort(msg);
1603           return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
1604         }
1605       }
1606       try {
1607         FlushResult fs = internalFlushcache(status);
1608 
1609         if (coprocessorHost != null) {
1610           status.setStatus("Running post-flush coprocessor hooks");
1611           coprocessorHost.postFlush();
1612         }
1613 
1614         status.markComplete("Flush successful");
1615         return fs;
1616       } finally {
1617         synchronized (writestate) {
1618           writestate.flushing = false;
1619           this.writestate.flushRequested = false;
1620           writestate.notifyAll();
1621         }
1622       }
1623     } finally {
1624       lock.readLock().unlock();
1625       status.cleanup();
1626     }
1627   }
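       // Minimal usage sketch (the 'region' reference is assumed): callers typically inspect the
       // returned FlushResult to see whether anything was flushed and whether a follow-up
       // compaction was requested.
       //
       //   FlushResult fr = region.flushcache();
       //   // fr carries one of the FlushResult.Result codes, e.g. FLUSHED_COMPACTION_NEEDED or
       //   // CANNOT_FLUSH_MEMSTORE_EMPTY, plus the flush sequence id when a flush happened.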
1628 
1629   /**
1630    * Should the memstore be flushed now?
1631    */
1632   boolean shouldFlush() {
1633     // This is a rough measure.
1634     if (this.completeSequenceId > 0
1635           && (this.completeSequenceId + this.flushPerChanges < this.sequenceId.get())) {
1636       return true;
1637     }
1638     if (flushCheckInterval <= 0) { //disabled
1639       return false;
1640     }
1641     long now = EnvironmentEdgeManager.currentTimeMillis();
1642     // if we flushed in the recent past, we don't need to do it again now
1643     if ((now - getLastFlushTime() < flushCheckInterval)) {
1644       return false;
1645     }
1646     //since we didn't flush in the recent past, flush now if certain conditions
1647     //are met. Return true on first such memstore hit.
1648     for (Store s : this.getStores().values()) {
1649       if (s.timeOfOldestEdit() < now - flushCheckInterval) {
1650         // we have an old enough edit in the memstore, flush
1651         return true;
1652       }
1653     }
1654     return false;
1655   }
1656 
1657   /**
1658    * Flush the memstore.
1659    *
1660    * Flushing the memstore is a little tricky. We have a lot of updates in the
1661    * memstore, all of which have also been written to the log. We need to
1662    * write those updates in the memstore out to disk, while being able to
1663    * process reads/writes as much as possible during the flush operation. Also,
1664    * the log has to state clearly the point in time at which the memstore was
1665    * flushed. (That way, during recovery, we know when we can rely on the
1666    * on-disk flushed structures and when we have to recover the memstore from
1667    * the log.)
1668    *
1669    * <p>So, we have a three-step process:
1670    *
1671    * <ul><li>A. Flush the memstore to the on-disk stores, noting the current
1672    * sequence ID for the log.</li>
1673    *
1674    * <li>B. Write a FLUSHCACHE-COMPLETE message to the log, using the sequence
1675    * ID that was current at the time of memstore-flush.</li>
1676    *
1677    * <li>C. Get rid of the memstore structures that are now redundant, as
1678    * they've been flushed to the on-disk HStores.</li>
1679    * </ul>
1680    * <p>This method is protected, but can be accessed via several public
1681    * routes.
1682    *
1683    * <p> This method may block for some time.
1684    * @param status
1685    *
1686    * @return object describing the flush's state
1687    *
1688    * @throws IOException general io exceptions
1689    * @throws DroppedSnapshotException Thrown when replay of hlog is required
1690    * because a Snapshot was not properly persisted.
1691    */
1692   protected FlushResult internalFlushcache(MonitoredTask status)
1693       throws IOException {
1694     return internalFlushcache(this.log, -1, status);
1695   }
1696 
1697   /**
1698    * @param wal Null if we're NOT to go via hlog/wal.
1699    * @param myseqid The seqid to use if <code>wal</code> is null when writing out the
1700    * flush file.
1701    * @param status
1702    * @return the FlushResult describing whether the flush succeeded and whether the region needs compacting
1703    * @throws IOException
1704    * @see #internalFlushcache(MonitoredTask)
1705    */
1706   protected FlushResult internalFlushcache(
1707       final HLog wal, final long myseqid, MonitoredTask status)
1708   throws IOException {
1709     if (this.rsServices != null && this.rsServices.isAborted()) {
1710       // Don't flush when the server is aborting; it's unsafe
1711       throw new IOException("Aborting flush because server is aborted...");
1712     }
1713     final long startTime = EnvironmentEdgeManager.currentTimeMillis();
1714     // Clear flush flag.
1715     // If nothing to flush, return and avoid logging start/stop flush.
1716     if (this.memstoreSize.get() <= 0) {
1717       if(LOG.isDebugEnabled()) {
1718         LOG.debug("Empty memstore size for the current region "+this);
1719       }
1720       return new FlushResult(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush");
1721     }
1722 
1723     LOG.info("Started memstore flush for " + this +
1724       ", current region memstore size " +
1725       StringUtils.humanReadableInt(this.memstoreSize.get()) +
1726       ((wal != null)? "": "; wal is null, using passed sequenceid=" + myseqid));
1727 
1728     // Stop updates while we snapshot the memstore of all stores. We only have
1729     // to do this for a moment.  It's quick.  The subsequent sequence id that
1730     // goes into the HLog after we've flushed all these snapshots also goes
1731     // into the info file that sits beside the flushed files.
1732     // We also set the memstore size to zero here before we allow updates
1733     // again so its value will represent the size of the updates received
1734     // during the flush
1735     MultiVersionConsistencyControl.WriteEntry w = null;
1736 
1737     // We have to take a write lock during snapshot, or else a write could
1738     // end up in both snapshot and memstore (makes it difficult to do atomic
1739     // rows then)
1740     status.setStatus("Obtaining lock to block concurrent updates");
1741     // block waiting for the lock for internal flush
1742     this.updatesLock.writeLock().lock();
1743     long totalFlushableSize = 0;
1744     status.setStatus("Preparing to flush by snapshotting stores");
1745     List<StoreFlushContext> storeFlushCtxs = new ArrayList<StoreFlushContext>(stores.size());
1746     long flushSeqId = -1L;
1747     try {
1748       // Record the mvcc for all transactions in progress.
1749       w = mvcc.beginMemstoreInsert();
1750       mvcc.advanceMemstore(w);
1751       // check if it is not closing.
1752       if (wal != null) {
1753         if (!wal.startCacheFlush(this.getRegionInfo().getEncodedNameAsBytes())) {
1754           String msg = "Flush will not be started for ["
1755               + this.getRegionInfo().getEncodedName() + "] - because the WAL is closing.";
1756           status.setStatus(msg);
1757           return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
1758         }
1759         flushSeqId = this.sequenceId.incrementAndGet();
1760       } else {
1761         // use the provided sequence Id as WAL is not being used for this flush.
1762         flushSeqId = myseqid;
1763       }
1764 
1765       for (Store s : stores.values()) {
1766         totalFlushableSize += s.getFlushableSize();
1767         storeFlushCtxs.add(s.createFlushContext(flushSeqId));
1768       }
1769 
1770       // prepare flush (take a snapshot)
1771       for (StoreFlushContext flush : storeFlushCtxs) {
1772         flush.prepare();
1773       }
1774     } finally {
1775       this.updatesLock.writeLock().unlock();
1776     }
1777     String s = "Finished memstore snapshotting " + this +
1778       ", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSize;
1779     status.setStatus(s);
1780     if (LOG.isTraceEnabled()) LOG.trace(s);
1781 
1782     // sync unflushed WAL changes when deferred log sync is enabled
1783     // see HBASE-8208 for details
1784     if (wal != null && !shouldSyncLog()) {
1785       wal.sync();
1786     }
1787 
1788     // wait for all in-progress transactions to commit to HLog before
1789     // we can start the flush. This prevents
1790     // uncommitted transactions from being written into HFiles.
1791     // We have to block before we start the flush, otherwise keys that
1792     // were removed via a rollbackMemstore could be written to Hfiles.
1793     mvcc.waitForRead(w);
1794 
1795     s = "Flushing stores of " + this;
1796     status.setStatus(s);
1797     if (LOG.isTraceEnabled()) LOG.trace(s);
1798 
1799     // Any failure from here on out will be catastrophic, requiring a server
1800     // restart so the hlog content can be replayed and put back into the memstore.
1801     // Otherwise the snapshot content, while backed up in the hlog, will not
1802     // be part of the current running server's state.
1803     boolean compactionRequested = false;
1804     try {
1805       // A.  Flush memstore to all the HStores.
1806       // Keep running vector of all store files that includes both old and the
1807       // just-made new flush store file. The new flushed file is still in the
1808       // tmp directory.
1809 
1810       for (StoreFlushContext flush : storeFlushCtxs) {
1811         flush.flushCache(status);
1812       }
1813 
1814       // Switch snapshot (in memstore) -> new hfile (thus causing
1815       // all the store scanners to reset/reseek).
1816       for (StoreFlushContext flush : storeFlushCtxs) {
1817         boolean needsCompaction = flush.commit(status);
1818         if (needsCompaction) {
1819           compactionRequested = true;
1820         }
1821       }
1822       storeFlushCtxs.clear();
1823 
1824       // Set down the memstore size by amount of flush.
1825       this.addAndGetGlobalMemstoreSize(-totalFlushableSize);
1826     } catch (Throwable t) {
1827       // An exception here means that the snapshot was not persisted.
1828       // The hlog needs to be replayed so its content is restored to memstore.
1829       // Currently, only a server restart will do this.
1830       // We used to only catch IOEs but it's possible that we'd get other
1831       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
1832       // all and sundry.
1833       if (wal != null) {
1834         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1835       }
1836       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
1837           Bytes.toStringBinary(getRegionName()));
1838       dse.initCause(t);
1839       status.abort("Flush failed: " + StringUtils.stringifyException(t));
1840       throw dse;
1841     }
1842 
1843     // If we get to here, the HStores have been written.
1844     if (wal != null) {
1845       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1846     }
1847 
1848     // Record latest flush time
1849     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
1850 
1851     // Update the last flushed sequence id for region
1852     completeSequenceId = flushSeqId;
1853 
1854     // C. Finally notify anyone waiting on memstore to clear:
1855     // e.g. checkResources().
1856     synchronized (this) {
1857       notifyAll(); // FindBugs NN_NAKED_NOTIFY
1858     }
1859 
1860     long time = EnvironmentEdgeManager.currentTimeMillis() - startTime;
1861     long memstoresize = this.memstoreSize.get();
1862     String msg = "Finished memstore flush of ~" +
1863       StringUtils.humanReadableInt(totalFlushableSize) + "/" + totalFlushableSize +
1864       ", currentsize=" +
1865       StringUtils.humanReadableInt(memstoresize) + "/" + memstoresize +
1866       " for region " + this + " in " + time + "ms, sequenceid=" + flushSeqId +
1867       ", compaction requested=" + compactionRequested +
1868       ((wal == null)? "; wal=null": "");
1869     LOG.info(msg);
1870     status.setStatus(msg);
1871     this.recentFlushes.add(new Pair<Long,Long>(time/1000, totalFlushableSize));
1872 
1873     return new FlushResult(compactionRequested ? FlushResult.Result.FLUSHED_COMPACTION_NEEDED :
1874         FlushResult.Result.FLUSHED_NO_COMPACTION_NEEDED, flushSeqId);
1875   }
1876 
1877   //////////////////////////////////////////////////////////////////////////////
1878   // get() methods for client use.
1879   //////////////////////////////////////////////////////////////////////////////
1880   /**
1881    * Return all the data for the row that matches <i>row</i> exactly,
1882    * or the one that immediately precedes it, at or immediately before
1883    * <i>ts</i>.
1884    *
1885    * @param row row key
1886    * @return map of values
1887    * @throws IOException
1888    */
1889   Result getClosestRowBefore(final byte [] row)
1890   throws IOException{
1891     return getClosestRowBefore(row, HConstants.CATALOG_FAMILY);
1892   }
1893 
1894   /**
1895    * Return all the data for the row that matches <i>row</i> exactly,
1896    * or the one that immediately precedes it, at or immediately before
1897    * <i>ts</i>.
1898    *
1899    * @param row row key
1900    * @param family column family to find on
1901    * @return map of values
1902    * @throws IOException read exceptions
1903    */
1904   public Result getClosestRowBefore(final byte [] row, final byte [] family)
1905   throws IOException {
1906     if (coprocessorHost != null) {
1907       Result result = new Result();
1908       if (coprocessorHost.preGetClosestRowBefore(row, family, result)) {
1909         return result;
1910       }
1911     }
1912     // look across all the HStores for this region and determine what the
1913     // closest key is across all column families, since the data may be sparse
1914     checkRow(row, "getClosestRowBefore");
1915     startRegionOperation(Operation.GET);
1916     this.readRequestsCount.increment();
1917     try {
1918       Store store = getStore(family);
1919       // get the closest key. (HStore.getRowKeyAtOrBefore can return null)
1920       KeyValue key = store.getRowKeyAtOrBefore(row);
1921       Result result = null;
1922       if (key != null) {
1923         Get get = new Get(key.getRow());
1924         get.addFamily(family);
1925         result = get(get);
1926       }
1927       if (coprocessorHost != null) {
1928         coprocessorHost.postGetClosestRowBefore(row, family, result);
1929       }
1930       return result;
1931     } finally {
1932       closeRegionOperation(Operation.GET);
1933     }
1934   }
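       // Minimal usage sketch (the 'region', 'rowKey' and 'family' variables are assumed):
       //
       //   Result r = region.getClosestRowBefore(rowKey, family);
       //   if (r != null && !r.isEmpty()) {
       //     // rowKey itself, or the closest row sorting before it, was found in this family
       //   }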
1935 
1936   /**
1937    * Return an iterator that scans over the HRegion, returning the indicated
1938    * columns and rows specified by the {@link Scan}.
1939    * <p>
1940    * This Iterator must be closed by the caller.
1941    *
1942    * @param scan configured {@link Scan}
1943    * @return RegionScanner
1944    * @throws IOException read exceptions
1945    */
1946   public RegionScanner getScanner(Scan scan) throws IOException {
1947    return getScanner(scan, null);
1948   }
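       // Minimal usage sketch (the 'region' reference is assumed): the returned RegionScanner
       // must be closed by the caller, so wrap iteration in try/finally.
       //
       //   RegionScanner scanner = region.getScanner(new Scan());
       //   try {
       //     List<Cell> cells = new ArrayList<Cell>();
       //     boolean more;
       //     do {
       //       more = scanner.next(cells);
       //       // ... process cells ...
       //       cells.clear();
       //     } while (more);
       //   } finally {
       //     scanner.close();
       //   }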
1949 
1950   void prepareScanner(Scan scan) throws IOException {
1951     if(!scan.hasFamilies()) {
1952       // Adding all families to scanner
1953       for(byte[] family: this.htableDescriptor.getFamiliesKeys()){
1954         scan.addFamily(family);
1955       }
1956     }
1957   }
1958 
1959   protected RegionScanner getScanner(Scan scan,
1960       List<KeyValueScanner> additionalScanners) throws IOException {
1961     startRegionOperation(Operation.SCAN);
1962     try {
1963       // Verify families are all valid
1964       prepareScanner(scan);
1965       if(scan.hasFamilies()) {
1966         for(byte [] family : scan.getFamilyMap().keySet()) {
1967           checkFamily(family);
1968         }
1969       }
1970       return instantiateRegionScanner(scan, additionalScanners);
1971     } finally {
1972       closeRegionOperation(Operation.SCAN);
1973     }
1974   }
1975 
1976   protected RegionScanner instantiateRegionScanner(Scan scan,
1977       List<KeyValueScanner> additionalScanners) throws IOException {
1978     if (scan.isReversed()) {
1979       if (scan.getFilter() != null) {
1980         scan.getFilter().setReversed(true);
1981       }
1982       return new ReversedRegionScannerImpl(scan, additionalScanners, this);
1983     }
1984     return new RegionScannerImpl(scan, additionalScanners, this);
1985   }
1986 
1987   /*
1988    * @param delete The passed delete is modified by this method. WARNING!
1989    */
1990   void prepareDelete(Delete delete) throws IOException {
1991     // Check to see if this is a delete of an entire row
1992     if(delete.getFamilyCellMap().isEmpty()){
1993       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
1994         // Don't eat the timestamp
1995         delete.deleteFamily(family, delete.getTimeStamp());
1996       }
1997     } else {
1998       for(byte [] family : delete.getFamilyCellMap().keySet()) {
1999         if(family == null) {
2000           throw new NoSuchColumnFamilyException("Empty family is invalid");
2001         }
2002         checkFamily(family);
2003       }
2004     }
2005   }
2006 
2007   //////////////////////////////////////////////////////////////////////////////
2008   // set() methods for client use.
2009   //////////////////////////////////////////////////////////////////////////////
2010   /**
2011    * @param delete delete object
2012    * @throws IOException read exceptions
2013    */
2014   public void delete(Delete delete)
2015   throws IOException {
2016     checkReadOnly();
2017     checkResources();
2018     startRegionOperation(Operation.DELETE);
2019     try {
2020       delete.getRow();
2021       // All edits for the given row (across all column families) must happen atomically.
2022       doBatchMutate(delete);
2023     } finally {
2024       closeRegionOperation(Operation.DELETE);
2025     }
2026   }
2027 
2028   /**
2029    * Row needed by below method.
2030    */
2031   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
2032   /**
2033    * This is used only by unit tests. Not required to be a public API.
2034    * @param familyMap map of family to edits for the given family.
2035    * @param durability
2036    * @throws IOException
2037    */
2038   void delete(NavigableMap<byte[], List<Cell>> familyMap,
2039       Durability durability) throws IOException {
2040     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
2041     delete.setFamilyCellMap(familyMap);
2042     delete.setDurability(durability);
2043     doBatchMutate(delete);
2044   }
2045 
2046   /**
2047    * Set up correct timestamps in the KVs in the Delete object.
2048    * Caller should have the row and region locks.
2049    * @param mutation
2050    * @param familyMap
2051    * @param byteNow
2052    * @throws IOException
2053    */
2054   void prepareDeleteTimestamps(Mutation mutation, Map<byte[], List<Cell>> familyMap,
2055       byte[] byteNow) throws IOException {
2056     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2057 
2058       byte[] family = e.getKey();
2059       List<Cell> cells = e.getValue();
2060       assert cells instanceof RandomAccess;
2061 
2062       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
2063       int listSize = cells.size();
2064       for (int i=0; i < listSize; i++) {
2065         Cell cell = cells.get(i);
2066         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2067         //  Check if time is LATEST, change to time of most recent addition if so
2068         //  This is expensive.
2069         if (kv.isLatestTimestamp() && kv.isDeleteType()) {
2070           byte[] qual = kv.getQualifier();
2071           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
2072 
2073           Integer count = kvCount.get(qual);
2074           if (count == null) {
2075             kvCount.put(qual, 1);
2076           } else {
2077             kvCount.put(qual, count + 1);
2078           }
2079           count = kvCount.get(qual);
2080 
2081           Get get = new Get(kv.getRow());
2082           get.setMaxVersions(count);
2083           get.addColumn(family, qual);
2084           if (coprocessorHost != null) {
2085             if (!coprocessorHost.prePrepareTimeStampForDeleteVersion(mutation, cell,
2086                 byteNow, get)) {
2087               updateDeleteLatestVersionTimeStamp(kv, get, count, byteNow);
2088             }
2089           } else {
2090             updateDeleteLatestVersionTimeStamp(kv, get, count, byteNow);
2091           }
2092         } else {
2093           kv.updateLatestStamp(byteNow);
2094         }
2095       }
2096     }
2097   }
2098 
2099   void updateDeleteLatestVersionTimeStamp(KeyValue kv, Get get, int count, byte[] byteNow)
2100       throws IOException {
2101     List<Cell> result = get(get, false);
2102 
2103     if (result.size() < count) {
2104       // Nothing to delete
2105       kv.updateLatestStamp(byteNow);
2106       return;
2107     }
2108     if (result.size() > count) {
2109       throw new RuntimeException("Unexpected size: " + result.size());
2110     }
2111     KeyValue getkv = KeyValueUtil.ensureKeyValue(result.get(count - 1));
2112     Bytes.putBytes(kv.getBuffer(), kv.getTimestampOffset(), getkv.getBuffer(),
2113         getkv.getTimestampOffset(), Bytes.SIZEOF_LONG);
2114   }
2115 
2116   /**
2117    * @param put
2118    * @throws IOException
2119    */
2120   public void put(Put put)
2121   throws IOException {
2122     checkReadOnly();
2123 
2124     // Do a rough check that we have resources to accept a write.  The check is
2125     // 'rough' in that between the resource check and the call to obtain a
2126     // read lock, resources may run out.  For now, the thought is that this
2127     // will be extremely rare; we'll deal with it when it happens.
2128     checkResources();
2129     startRegionOperation(Operation.PUT);
2130     try {
2131       // All edits for the given row (across all column families) must happen atomically.
2132       doBatchMutate(put);
2133     } finally {
2134       closeRegionOperation(Operation.PUT);
2135     }
2136   }
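       // Minimal usage sketch for the single-mutation write path above (the 'region', 'row',
       // 'family', 'qualifier' and 'value' byte[] variables are assumed):
       //
       //   Put p = new Put(row);
       //   p.add(family, qualifier, value);      // one cell in the put
       //   region.put(p);                        // routed through doBatchMutate internally
       //
       //   Delete d = new Delete(row);
       //   d.deleteColumns(family, qualifier);   // remove all versions of one column
       //   region.delete(d);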
2137 
2138   /**
2139    * Struct-like class that tracks the progress of a batch operation,
2140    * accumulating status codes and tracking the index at which processing
2141    * is proceeding.
2142    */
2143   private abstract static class BatchOperationInProgress<T> {
2144     T[] operations;
2145     int nextIndexToProcess = 0;
2146     OperationStatus[] retCodeDetails;
2147     WALEdit[] walEditsFromCoprocessors;
2148 
2149     public BatchOperationInProgress(T[] operations) {
2150       this.operations = operations;
2151       this.retCodeDetails = new OperationStatus[operations.length];
2152       this.walEditsFromCoprocessors = new WALEdit[operations.length];
2153       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
2154     }
2155 
2156     public abstract Mutation getMutation(int index);
2157     public abstract long getNonceGroup(int index);
2158     public abstract long getNonce(int index);
2159     /** This method is potentially expensive and should only be used for non-replay CP path. */
2160     public abstract Mutation[] getMutationsForCoprocs();
2161     public abstract boolean isInReplay();
2162 
2163     public boolean isDone() {
2164       return nextIndexToProcess == operations.length;
2165     }
2166   }
2167 
2168   private static class MutationBatch extends BatchOperationInProgress<Mutation> {
2169     private long nonceGroup;
2170     private long nonce;
2171     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2172       super(operations);
2173       this.nonceGroup = nonceGroup;
2174       this.nonce = nonce;
2175     }
2176 
2177     public Mutation getMutation(int index) {
2178       return this.operations[index];
2179     }
2180 
2181     @Override
2182     public long getNonceGroup(int index) {
2183       return nonceGroup;
2184     }
2185 
2186     @Override
2187     public long getNonce(int index) {
2188       return nonce;
2189     }
2190 
2191     @Override
2192     public Mutation[] getMutationsForCoprocs() {
2193       return this.operations;
2194     }
2195 
2196     @Override
2197     public boolean isInReplay() {
2198       return false;
2199     }
2200   }
2201 
2202   private static class ReplayBatch extends BatchOperationInProgress<HLogSplitter.MutationReplay> {
2203     public ReplayBatch(MutationReplay[] operations) {
2204       super(operations);
2205     }
2206 
2207     @Override
2208     public Mutation getMutation(int index) {
2209       return this.operations[index].mutation;
2210     }
2211 
2212     @Override
2213     public long getNonceGroup(int index) {
2214       return this.operations[index].nonceGroup;
2215     }
2216 
2217     @Override
2218     public long getNonce(int index) {
2219       return this.operations[index].nonce;
2220     }
2221 
2222     @Override
2223     public Mutation[] getMutationsForCoprocs() {
2224       assert false;
2225       throw new RuntimeException("Should not be called for replay batch");
2226     }
2227 
2228     @Override
2229     public boolean isInReplay() {
2230       return true;
2231     }
2232   }
2233 
2234   /**
2235    * Perform a batch of mutations.
2236    * It supports only Put and Delete mutations and will ignore other types passed.
2237    * @param mutations the list of mutations
2238    * @return an array of OperationStatus which internally contains the
2239    *         OperationStatusCode and the exceptionMessage if any.
2240    * @throws IOException
2241    */
2242   public OperationStatus[] batchMutate(
2243       Mutation[] mutations, long nonceGroup, long nonce) throws IOException {
2244     // As it stands, this is used for 3 things
2245     //  * batchMutate with single mutation - put/delete, separate or from checkAndMutate.
2246     //  * coprocessor calls (see ex. BulkDeleteEndpoint).
2247     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2248     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2249   }
2250 
2251   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2252     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2253   }
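       // Minimal usage sketch (the 'region' reference and the row/family/qualifier/value byte[]
       // variables are assumed): each mutation gets its own OperationStatus, so per-mutation
       // problems such as a bad family are reported in the returned array rather than thrown.
       //
       //   Mutation[] batch = new Mutation[] {
       //     new Put(rowA).add(family, qualifier, value),
       //     new Delete(rowB)
       //   };
       //   OperationStatus[] statuses = region.batchMutate(batch);
       //   for (OperationStatus status : statuses) {
       //     if (status.getOperationStatusCode() != OperationStatusCode.SUCCESS) {
       //       // e.g. BAD_FAMILY or SANITY_CHECK_FAILURE for that particular mutation
       //     }
       //   }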
2254 
2255   /**
2256    * Replay a batch of mutations.
2257    * @param mutations mutations to replay.
2258    * @return an array of OperationStatus which internally contains the
2259    *         OperationStatusCode and the exceptionMessage if any.
2260    * @throws IOException
2261    */
2262   public OperationStatus[] batchReplay(HLogSplitter.MutationReplay[] mutations)
2263       throws IOException {
2264     return batchMutate(new ReplayBatch(mutations));
2265   }
2266 
2267   /**
2268    * Perform a batch of mutations.
2269    * It supports only Put and Delete mutations and will ignore other types passed.
2270    * @param batchOp contains the list of mutations
2271    * @return an array of OperationStatus which internally contains the
2272    *         OperationStatusCode and the exceptionMessage if any.
2273    * @throws IOException
2274    */
2275   OperationStatus[] batchMutate(BatchOperationInProgress<?> batchOp) throws IOException {
2276     boolean initialized = false;
2277     Operation op = batchOp.isInReplay() ? Operation.REPLAY_BATCH_MUTATE : Operation.BATCH_MUTATE;
2278     startRegionOperation(op);
2279     try {
2280       while (!batchOp.isDone()) {
2281         if (!batchOp.isInReplay()) {
2282           checkReadOnly();
2283         }
2284         checkResources();
2285 
2286         if (!initialized) {
2287           this.writeRequestsCount.add(batchOp.operations.length);
2288           if (!batchOp.isInReplay()) {
2289             doPreMutationHook(batchOp);
2290           }
2291           initialized = true;
2292         }
2293         long addedSize = doMiniBatchMutation(batchOp);
2294         long newSize = this.addAndGetGlobalMemstoreSize(addedSize);
2295         if (isFlushSize(newSize)) {
2296           requestFlush();
2297         }
2298       }
2299     } finally {
2300       closeRegionOperation(op);
2301     }
2302     return batchOp.retCodeDetails;
2303   }
2304 
2305 
2306   private void doPreMutationHook(BatchOperationInProgress<?> batchOp)
2307       throws IOException {
2308     /* Run coprocessor pre hook outside of locks to avoid deadlock */
2309     WALEdit walEdit = new WALEdit();
2310     if (coprocessorHost != null) {
2311       for (int i = 0 ; i < batchOp.operations.length; i++) {
2312         Mutation m = batchOp.getMutation(i);
2313         if (m instanceof Put) {
2314           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
2315             // pre hook says skip this Put
2316             // mark as success and skip in doMiniBatchMutation
2317             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2318           }
2319         } else if (m instanceof Delete) {
2320           Delete curDel = (Delete) m;
2321           if (curDel.getFamilyCellMap().isEmpty()) {
2322             // handle deleting a row case
2323             prepareDelete(curDel);
2324           }
2325           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
2326             // pre hook says skip this Delete
2327             // mark as success and skip in doMiniBatchMutation
2328             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2329           }
2330         } else {
2331           // If mutations other than Puts and Deletes (e.g. Append) are passed to batchMutate,
2332           // mark the operation return code as failure so that it will not be considered in
2333           // doMiniBatchMutation
2334           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2335               "Put/Delete mutations only supported in batchMutate() now");
2336         }
2337         if (!walEdit.isEmpty()) {
2338           batchOp.walEditsFromCoprocessors[i] = walEdit;
2339           walEdit = new WALEdit();
2340         }
2341       }
2342     }
2343   }
2344 
2345   @SuppressWarnings("unchecked")
2346   private long doMiniBatchMutation(BatchOperationInProgress<?> batchOp) throws IOException {
2347     boolean isInReplay = batchOp.isInReplay();
2348     // variable to note if all Put items are for the same CF -- metrics related
2349     boolean putsCfSetConsistent = true;
2350     //The set of columnFamilies first seen for Put.
2351     Set<byte[]> putsCfSet = null;
2352     // variable to note if all Delete items are for the same CF -- metrics related
2353     boolean deletesCfSetConsistent = true;
2354     //The set of columnFamilies first seen for Delete.
2355     Set<byte[]> deletesCfSet = null;
2356 
2357     long currentNonceGroup = HConstants.NO_NONCE, currentNonce = HConstants.NO_NONCE;
2358     WALEdit walEdit = new WALEdit(isInReplay);
2359     MultiVersionConsistencyControl.WriteEntry w = null;
2360     long txid = 0;
2361     boolean doRollBackMemstore = false;
2362     boolean locked = false;
2363 
2364     /** Keep track of the locks we hold so we can release them in finally clause */
2365     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
2366     // reference family maps directly so coprocessors can mutate them if desired
2367     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
2368     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
2369     int firstIndex = batchOp.nextIndexToProcess;
2370     int lastIndexExclusive = firstIndex;
2371     boolean success = false;
2372     int noOfPuts = 0, noOfDeletes = 0;
2373     try {
2374       // ------------------------------------
2375       // STEP 1. Try to acquire as many locks as we can, and ensure
2376       // we acquire at least one.
2377       // ----------------------------------
2378       int numReadyToWrite = 0;
2379       long now = EnvironmentEdgeManager.currentTimeMillis();
2380       while (lastIndexExclusive < batchOp.operations.length) {
2381         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
2382         boolean isPutMutation = mutation instanceof Put;
2383 
2384         Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
2385         // store the family map reference to allow for mutations
2386         familyMaps[lastIndexExclusive] = familyMap;
2387 
2388         // skip anything that "ran" already
2389         if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
2390             != OperationStatusCode.NOT_RUN) {
2391           lastIndexExclusive++;
2392           continue;
2393         }
2394 
2395         try {
2396           if (isPutMutation) {
2397             // Check the families in the put. If bad, skip this one.
2398             if (isInReplay) {
2399               removeNonExistentColumnFamilyForReplay(familyMap);
2400             } else {
2401               checkFamilies(familyMap.keySet());
2402             }
2403             checkTimestamps(mutation.getFamilyCellMap(), now);
2404           } else {
2405             prepareDelete((Delete) mutation);
2406           }
2407         } catch (NoSuchColumnFamilyException nscf) {
2408           LOG.warn("No such column family in batch mutation", nscf);
2409           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2410               OperationStatusCode.BAD_FAMILY, nscf.getMessage());
2411           lastIndexExclusive++;
2412           continue;
2413         } catch (FailedSanityCheckException fsce) {
2414           LOG.warn("Batch Mutation did not pass sanity check", fsce);
2415           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2416               OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
2417           lastIndexExclusive++;
2418           continue;
2419         }
2420 
2421         // If we haven't got any rows in our batch, we should block to
2422         // get the next one.
2423         boolean shouldBlock = numReadyToWrite == 0;
2424         RowLock rowLock = null;
2425         try {
2426           rowLock = getRowLockInternal(mutation.getRow(), shouldBlock);
2427         } catch (IOException ioe) {
2428           LOG.warn("Failed getting lock in batch put, row="
2429             + Bytes.toStringBinary(mutation.getRow()), ioe);
2430         }
2431         if (rowLock == null) {
2432           // We failed to grab another lock
2433           assert !shouldBlock : "Should never fail to get lock when blocking";
2434           break; // stop acquiring more rows for this batch
2435         } else {
2436           acquiredRowLocks.add(rowLock);
2437         }
2438 
2439         lastIndexExclusive++;
2440         numReadyToWrite++;
2441 
2442         if (isPutMutation) {
2443           // If column families stay consistent throughout all of the
2444           // individual puts then metrics can be reported as a multiput across
2445           // column families in the first put.
2446           if (putsCfSet == null) {
2447             putsCfSet = mutation.getFamilyCellMap().keySet();
2448           } else {
2449             putsCfSetConsistent = putsCfSetConsistent
2450                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
2451           }
2452         } else {
2453           if (deletesCfSet == null) {
2454             deletesCfSet = mutation.getFamilyCellMap().keySet();
2455           } else {
2456             deletesCfSetConsistent = deletesCfSetConsistent
2457                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
2458           }
2459         }
2460       }
2461 
2462       // we should record the timestamp only after we have acquired the rowLock,
2463       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
2464       now = EnvironmentEdgeManager.currentTimeMillis();
2465       byte[] byteNow = Bytes.toBytes(now);
2466 
2467       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
2468       if (numReadyToWrite <= 0) return 0L;
2469 
2470       // We've now grabbed as many mutations off the list as we can
2471 
2472       // ------------------------------------
2473       // STEP 2. Update any LATEST_TIMESTAMP timestamps
2474       // ----------------------------------
2475       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2476         // skip invalid
2477         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2478             != OperationStatusCode.NOT_RUN) continue;
2479 
2480         Mutation mutation = batchOp.getMutation(i);
2481         if (mutation instanceof Put) {
2482           updateKVTimestamps(familyMaps[i].values(), byteNow);
2483           noOfPuts++;
2484         } else {
2485           if (!isInReplay) {
2486             prepareDeleteTimestamps(mutation, familyMaps[i], byteNow);
2487           }
2488           noOfDeletes++;
2489         }
2490         rewriteCellTags(familyMaps[i], mutation);
2491       }
2492 
2493       lock(this.updatesLock.readLock(), numReadyToWrite);
2494       locked = true;
2495 
2496       //
2497       // ------------------------------------
2498       // Acquire the latest mvcc number
2499       // ----------------------------------
2500       w = mvcc.beginMemstoreInsert();
2501 
2502       // calling the pre CP hook for batch mutation
2503       if (!isInReplay && coprocessorHost != null) {
2504         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2505           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2506           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2507         if (coprocessorHost.preBatchMutate(miniBatchOp)) return 0L;
2508       }
2509 
2510       // ------------------------------------
2511       // STEP 3. Write back to memstore
2512       // Write to memstore. It is ok to write to memstore
2513       // first without updating the HLog because we do not roll
2514       // forward the memstore MVCC. The MVCC will be moved up when
2515       // the complete operation is done. These changes are not yet
2516       // visible to scanners till we update the MVCC. The MVCC is
2517       // moved only when the sync is complete.
2518       // ----------------------------------
2519       long addedSize = 0;
2520       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2521         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2522             != OperationStatusCode.NOT_RUN) {
2523           continue;
2524         }
2525         doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
2526         addedSize += applyFamilyMapToMemstore(familyMaps[i], w);
2527       }
2528 
2529       // ------------------------------------
2530       // STEP 4. Build WAL edit
2531       // ----------------------------------
2532       boolean hasWalAppends = false;
2533       Durability durability = Durability.USE_DEFAULT;
2534       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2535         // Skip puts that were determined to be invalid during preprocessing
2536         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2537             != OperationStatusCode.NOT_RUN) {
2538           continue;
2539         }
2540         batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2541 
2542         Mutation m = batchOp.getMutation(i);
2543         Durability tmpDur = getEffectiveDurability(m.getDurability());
2544         if (tmpDur.ordinal() > durability.ordinal()) {
2545           durability = tmpDur;
2546         }
2547         if (tmpDur == Durability.SKIP_WAL) {
2548           recordMutationWithoutWal(m.getFamilyCellMap());
2549           continue;
2550         }
2551 
2552         long nonceGroup = batchOp.getNonceGroup(i), nonce = batchOp.getNonce(i);
2553         // In replay, the batch may contain multiple nonces. If so, write WALEdit for each.
2554         // Given how nonces are originally written, these should be contiguous.
2555         // They don't have to be; it will still work, just writing more WALEdits than needed.
2556         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
2557           if (walEdit.size() > 0) {
2558             assert isInReplay;
2559             if (!isInReplay) {
2560               throw new IOException("Multiple nonces per batch and not in replay");
2561             }
2562             // txid should always increase, so having the one from the last call is ok.
2563             txid = this.log.appendNoSync(this.getRegionInfo(), htableDescriptor.getTableName(),
2564                   walEdit, m.getClusterIds(), now, htableDescriptor, this.sequenceId, true,
2565                   currentNonceGroup, currentNonce);
2566             hasWalAppends = true;
2567             walEdit = new WALEdit(isInReplay);
2568           }
2569           currentNonceGroup = nonceGroup;
2570           currentNonce = nonce;
2571         }
2572 
2573         // Add WAL edits by CP
2574         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
2575         if (fromCP != null) {
2576           for (KeyValue kv : fromCP.getKeyValues()) {
2577             walEdit.add(kv);
2578           }
2579         }
2580         addFamilyMapToWALEdit(familyMaps[i], walEdit);
2581       }
2582 
2583       // -------------------------
2584       // STEP 5. Append the final edit to WAL. Do not sync wal.
2585       // -------------------------
2586       Mutation mutation = batchOp.getMutation(firstIndex);
2587       if (walEdit.size() > 0) {
2588         txid = this.log.appendNoSync(this.getRegionInfo(), this.htableDescriptor.getTableName(),
2589               walEdit, mutation.getClusterIds(), now, this.htableDescriptor, this.sequenceId,
2590               true, currentNonceGroup, currentNonce);
2591         hasWalAppends = true;
2592       }
2593 
2594       // -------------------------------
2595       // STEP 6. Release row locks, etc.
2596       // -------------------------------
2597       if (locked) {
2598         this.updatesLock.readLock().unlock();
2599         locked = false;
2600       }
2601       releaseRowLocks(acquiredRowLocks);
2602 
2603       // -------------------------
2604       // STEP 7. Sync wal.
2605       // -------------------------
2606       if (hasWalAppends) {
2607         syncOrDefer(txid, durability);
2608       }
2609       doRollBackMemstore = false;
2610       // calling the post CP hook for batch mutation
2611       if (!isInReplay && coprocessorHost != null) {
2612         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2613           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2614           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2615         coprocessorHost.postBatchMutate(miniBatchOp);
2616       }
2617 
2618       // ------------------------------------------------------------------
2619       // STEP 8. Advance mvcc. This will make this put visible to scanners and getters.
2620       // ------------------------------------------------------------------
2621       if (w != null) {
2622         mvcc.completeMemstoreInsert(w);
2623         w = null;
2624       }
2625 
2626       // ------------------------------------
2627       // STEP 9. Run coprocessor post hooks. This should be done after the wal is
2628       // synced so that the coprocessor contract is adhered to.
2629       // ------------------------------------
2630       if (!isInReplay && coprocessorHost != null) {
2631         for (int i = firstIndex; i < lastIndexExclusive; i++) {
2632           // only for successful puts
2633           if (batchOp.retCodeDetails[i].getOperationStatusCode()
2634               != OperationStatusCode.SUCCESS) {
2635             continue;
2636           }
2637           Mutation m = batchOp.getMutation(i);
2638           if (m instanceof Put) {
2639             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
2640           } else {
2641             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
2642           }
2643         }
2644       }
2645 
2646       success = true;
2647       return addedSize;
2648     } finally {
2649 
2650       // if the wal sync was unsuccessful, remove keys from memstore
2651       if (doRollBackMemstore) {
2652         rollbackMemstore(batchOp, familyMaps, firstIndex, lastIndexExclusive);
2653       }
2654       if (w != null) mvcc.completeMemstoreInsert(w);
2655 
2656       if (locked) {
2657         this.updatesLock.readLock().unlock();
2658       }
2659       releaseRowLocks(acquiredRowLocks);
2660 
2661       // See if the column families were consistent through the whole thing.
2662       // If they were, then keep them. If they were not, then pass a null.
2663       // null will be treated as unknown.
2664       // The total time taken might involve both Puts and Deletes.
2665       // Split the time for puts and deletes based on the total number of Puts and Deletes.
2666 
2667       if (noOfPuts > 0) {
2668         // There were some Puts in the batch.
2669         if (this.metricsRegion != null) {
2670           this.metricsRegion.updatePut();
2671         }
2672       }
2673       if (noOfDeletes > 0) {
2674         // There were some Deletes in the batch.
2675         if (this.metricsRegion != null) {
2676           this.metricsRegion.updateDelete();
2677         }
2678       }
2679       if (!success) {
2680         for (int i = firstIndex; i < lastIndexExclusive; i++) {
2681           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
2682             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
2683           }
2684         }
2685       }
2686       if (coprocessorHost != null && !batchOp.isInReplay()) {
2687         // call the coprocessor hook to do any finalization steps
2688         // after the put is done
2689         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2690             new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2691                 batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex,
2692                 lastIndexExclusive);
2693         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
2694       }
2695 
2696       batchOp.nextIndexToProcess = lastIndexExclusive;
2697     }
2698   }
2699 
2700   /**
2701    * Returns effective durability from the passed durability and
2702    * the table descriptor.
2703    */
2704   protected Durability getEffectiveDurability(Durability d) {
2705     return d == Durability.USE_DEFAULT ? this.durability : d;
2706   }
2707 
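       /*
        * Illustrative sketch (not part of the original source): how effective durability
        * is resolved for a mutation. A Put created without an explicit durability carries
        * Durability.USE_DEFAULT, so the region/table default applies; an explicit setting
        * on the mutation wins otherwise.
        *
        *   Put p = new Put(row);                                          // USE_DEFAULT
        *   Durability d1 = getEffectiveDurability(p.getDurability());     // table default
        *   p.setDurability(Durability.SKIP_WAL);
        *   Durability d2 = getEffectiveDurability(p.getDurability());     // SKIP_WAL
        */
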
2708   //TODO: gets/puts and deletes should be refactored a bit so that acquiring
2709   //the lock happens earlier and the lock can simply be passed into these
2710   //methods. Then checkAndMutate could just do lockRow, get, put, unlockRow
2711   //or something similar.
2712   /**
2713    * Atomically checks a cell value and, if the check passes, applies the given mutation.
2714    * @param row row to check; must match the row of the mutation
2715    * @param family column family to check
2716    * @param qualifier column qualifier to check
2717    * @param compareOp comparison operator
2718    * @param comparator comparator supplying the expected value
2719    * @param w the Put or Delete to apply if the check succeeds
2720    * @param writeToWAL whether the mutation should be written to the WAL
2721    * @throws IOException
2722    * @return true if the mutation was applied, false otherwise
2723    */
2724   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
2725       CompareOp compareOp, ByteArrayComparable comparator, Mutation w,
2726       boolean writeToWAL)
2727   throws IOException{
2728     checkReadOnly();
2729     //TODO, add check for value length or maybe even better move this to the
2730     //client if this becomes a global setting
2731     checkResources();
2732     boolean isPut = w instanceof Put;
2733     if (!isPut && !(w instanceof Delete))
2734       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " +
2735           "be Put or Delete");
2736     if (!Bytes.equals(row, w.getRow())) {
2737       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " +
2738           "getRow must match the passed row");
2739     }
2740 
2741     startRegionOperation();
2742     try {
2743       Get get = new Get(row);
2744       checkFamily(family);
2745       get.addColumn(family, qualifier);
2746 
2747       // Lock row - note that doBatchMutate will relock this row if called
2748       RowLock rowLock = getRowLock(get.getRow());
2749       // wait for all previous transactions to complete (with lock held)
2750       mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
2751       try {
2752         if (this.getCoprocessorHost() != null) {
2753           Boolean processed = null;
2754           if (w instanceof Put) {
2755             processed = this.getCoprocessorHost().preCheckAndPutAfterRowLock(row, family,
2756                 qualifier, compareOp, comparator, (Put) w);
2757           } else if (w instanceof Delete) {
2758             processed = this.getCoprocessorHost().preCheckAndDeleteAfterRowLock(row, family,
2759                 qualifier, compareOp, comparator, (Delete) w);
2760           }
2761           if (processed != null) {
2762             return processed;
2763           }
2764         }
2765         List<Cell> result = get(get, false);
2766 
2767         boolean valueIsNull = comparator.getValue() == null ||
2768           comparator.getValue().length == 0;
2769         boolean matches = false;
2770         if (result.size() == 0 && valueIsNull) {
2771           matches = true;
2772         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
2773             valueIsNull) {
2774           matches = true;
2775         } else if (result.size() == 1 && !valueIsNull) {
2776           Cell kv = result.get(0);
2777           int compareResult = comparator.compareTo(kv.getValueArray(),
2778               kv.getValueOffset(), kv.getValueLength());
2779           switch (compareOp) {
2780           case LESS:
2781             matches = compareResult < 0;
2782             break;
2783           case LESS_OR_EQUAL:
2784             matches = compareResult <= 0;
2785             break;
2786           case EQUAL:
2787             matches = compareResult == 0;
2788             break;
2789           case NOT_EQUAL:
2790             matches = compareResult != 0;
2791             break;
2792           case GREATER_OR_EQUAL:
2793             matches = compareResult >= 0;
2794             break;
2795           case GREATER:
2796             matches = compareResult > 0;
2797             break;
2798           default:
2799             throw new RuntimeException("Unknown Compare op " + compareOp.name());
2800           }
2801         }
2802         // If the check matched, apply the new Put or Delete.
2803         if (matches) {
2804           // All edits for the given row (across all column families) must
2805           // happen atomically.
2806           doBatchMutate((Mutation)w);
2807           this.checkAndMutateChecksPassed.increment();
2808           return true;
2809         }
2810         this.checkAndMutateChecksFailed.increment();
2811         return false;
2812       } finally {
2813         rowLock.release();
2814       }
2815     } finally {
2816       closeRegionOperation();
2817     }
2818   }
2819 
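       /*
        * Illustrative sketch (not part of the original source): calling the check-and-mutate
        * path above. The row/family/qualifier/value names are placeholders; CompareOp and
        * BinaryComparator come from org.apache.hadoop.hbase.filter.
        *
        *   Put put = new Put(row);
        *   put.add(family, qualifier, newValue);
        *   boolean applied = region.checkAndMutate(row, family, qualifier,
        *       CompareOp.EQUAL, new BinaryComparator(expectedValue), put, true);
        *   // applied == false means the stored value did not match expectedValue
        */
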
2820   //TODO: gets/puts and deletes should be refactored a bit so that acquiring
2821   //the lock happens earlier and the lock can simply be passed into these
2822   //methods. Then checkAndMutate could just do lockRow, get, put, unlockRow
2823   //or something similar.
2824   /**
2825    * Atomically checks a cell value and, if the check passes, applies the given row mutations.
2826    * @throws IOException
2827    * @return true if the row mutations were applied, false otherwise
2828    */
2829   public boolean checkAndRowMutate(byte [] row, byte [] family, byte [] qualifier,
2830       CompareOp compareOp, ByteArrayComparable comparator, RowMutations rm,
2831       boolean writeToWAL)
2832       throws IOException{
2833     checkReadOnly();
2834     //TODO, add check for value length or maybe even better move this to the
2835     //client if this becomes a global setting
2836     checkResources();
2837 
2838     startRegionOperation();
2839     try {
2840       Get get = new Get(row);
2841       checkFamily(family);
2842       get.addColumn(family, qualifier);
2843 
2844       // Lock row - note that doBatchMutate will relock this row if called
2845       RowLock rowLock = getRowLock(get.getRow());
2846       // wait for all previous transactions to complete (with lock held)
2847       mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
2848       try {
2849         List<Cell> result = get(get, false);
2850 
2851         boolean valueIsNull = comparator.getValue() == null ||
2852             comparator.getValue().length == 0;
2853         boolean matches = false;
2854         if (result.size() == 0 && valueIsNull) {
2855           matches = true;
2856         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
2857             valueIsNull) {
2858           matches = true;
2859         } else if (result.size() == 1 && !valueIsNull) {
2860           Cell kv = result.get(0);
2861           int compareResult = comparator.compareTo(kv.getValueArray(),
2862               kv.getValueOffset(), kv.getValueLength());
2863           switch (compareOp) {
2864           case LESS:
2865             matches = compareResult < 0;
2866             break;
2867           case LESS_OR_EQUAL:
2868             matches = compareResult <= 0;
2869             break;
2870           case EQUAL:
2871             matches = compareResult == 0;
2872             break;
2873           case NOT_EQUAL:
2874             matches = compareResult != 0;
2875             break;
2876           case GREATER_OR_EQUAL:
2877             matches = compareResult >= 0;
2878             break;
2879           case GREATER:
2880             matches = compareResult > 0;
2881             break;
2882           default:
2883             throw new RuntimeException("Unknown Compare op " + compareOp.name());
2884           }
2885         }
2886         // If the check matched, apply the row mutations.
2887         if (matches) {
2888           // All edits for the given row (across all column families) must
2889           // happen atomically.
2890           mutateRow(rm);
2891           this.checkAndMutateChecksPassed.increment();
2892           return true;
2893         }
2894         this.checkAndMutateChecksFailed.increment();
2895         return false;
2896       } finally {
2897         rowLock.release();
2898       }
2899     } finally {
2900       closeRegionOperation();
2901     }
2902   }
2903 
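       /*
        * Illustrative sketch (not part of the original source): checkAndRowMutate applies a
        * group of mutations to a single row atomically when the check passes. All names and
        * values below are placeholders.
        *
        *   RowMutations rm = new RowMutations(row);
        *   Put put = new Put(row);
        *   put.add(family, qualifier, newValue);
        *   rm.add(put);
        *   Delete delete = new Delete(row);
        *   delete.deleteColumns(family, staleQualifier);
        *   rm.add(delete);
        *   boolean applied = region.checkAndRowMutate(row, family, qualifier,
        *       CompareOp.EQUAL, new BinaryComparator(expectedValue), rm, true);
        */
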
2904   private void doBatchMutate(Mutation mutation) throws IOException, DoNotRetryIOException {
2905     // Currently this is only called for puts and deletes, so no nonces.
2906     OperationStatus[] batchMutate = this.batchMutate(new Mutation[] { mutation },
2907         HConstants.NO_NONCE, HConstants.NO_NONCE);
2908     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
2909       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
2910     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
2911       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
2912     }
2913   }
2914 
2915   /**
2916    * Complete taking the snapshot on the region. Writes the region info and adds references to the
2917    * working snapshot directory.
2918    *
2919    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
2920    * arg.  (In the future other cancellable HRegion methods could eventually add a
2921    * {@link ForeignExceptionSnare}, or we could do something fancier).
2922    *
2923    * @param desc snapshot description object
2924    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
2925    *   bail out.  This is allowed to be null and will just be ignored in that case.
2926    * @throws IOException if there is an external or internal error causing the snapshot to fail
2927    */
2928   public void addRegionToSnapshot(SnapshotDescription desc,
2929       ForeignExceptionSnare exnSnare) throws IOException {
2930     Path rootDir = FSUtils.getRootDir(conf);
2931     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
2932 
2933     SnapshotManifest manifest = SnapshotManifest.create(conf, getFilesystem(),
2934                                                         snapshotDir, desc, exnSnare);
2935     manifest.addRegion(this);
2936   }
2937 
2938   /**
2939    * Replaces any KV timestamps set to {@link HConstants#LATEST_TIMESTAMP} with the
2940    * provided current timestamp.
2941    */
2942   void updateKVTimestamps(final Iterable<List<Cell>> keyLists, final byte[] now) {
2943     for (List<Cell> cells: keyLists) {
2944       if (cells == null) continue;
2945       assert cells instanceof RandomAccess;
2946       int listSize = cells.size();
2947       for (int i=0; i < listSize; i++) {
2948         Cell cell = cells.get(i);
2949         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2950         kv.updateLatestStamp(now);
2951       }
2952     }
2953   }
2954 
2955   /**
2956    * Possibly rewrite incoming cell tags.
2957    */
2958   void rewriteCellTags(Map<byte[], List<Cell>> familyMap, final Mutation m) {
2959     // Check if we have any work to do and early out otherwise
2960     // Update these checks as more logic is added here
2961 
2962     if (m.getTTL() == Long.MAX_VALUE) {
2963       return;
2964     }
2965 
2966     // From this point we know we have some work to do
2967 
2968     for (Map.Entry<byte[], List<Cell>> e: familyMap.entrySet()) {
2969       List<Cell> cells = e.getValue();
2970       assert cells instanceof RandomAccess;
2971       int listSize = cells.size();
2972       for (int i = 0; i < listSize; i++) {
2973         Cell cell = cells.get(i);
2974         List<Tag> newTags = new ArrayList<Tag>();
2975         Iterator<Tag> tagIterator = CellUtil.tagsIterator(cell.getTagsArray(),
2976           cell.getTagsOffset(), cell.getTagsLengthUnsigned());
2977 
2978         // Carry forward existing tags
2979 
2980         while (tagIterator.hasNext()) {
2981 
2982           // Add any filters or tag specific rewrites here
2983 
2984           newTags.add(tagIterator.next());
2985         }
2986 
2987         // Cell TTL handling
2988 
2989         // Check again if we need to add a cell TTL because early out logic
2990         // above may change when there are more tag based features in core.
2991         if (m.getTTL() != Long.MAX_VALUE) {
2992           // Add a cell TTL tag
2993           newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(m.getTTL())));
2994         }
2995 
2996         // Rewrite the cell with the updated set of tags
2997 
2998         cells.set(i, new KeyValue(cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
2999           cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
3000           cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength(),
3001           cell.getTimestamp(), KeyValue.Type.codeToType(cell.getTypeByte()),
3002           cell.getValueArray(), cell.getValueOffset(), cell.getValueLength(),
3003           newTags));
3004       }
3005     }
3006   }
3007 
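       /*
        * Illustrative sketch (not part of the original source): the effect of the tag rewrite
        * above when a mutation carries a cell TTL. This assumes the client-side setter that
        * pairs with the Mutation#getTTL() read used here (Mutation#setTTL); once the TTL is
        * anything other than Long.MAX_VALUE, every cell of the mutation is rewritten with an
        * extra TTL tag:
        *
        *   Put put = new Put(row);
        *   put.add(family, qualifier, value);
        *   put.setTTL(5 * 60 * 1000L);   // assumed setter; cells expire ~5 minutes after write
        *   // rewriteCellTags then appends, per cell:
        *   //   new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(put.getTTL()))
        */
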
3008   /*
3009    * Check that we have the resources to support an update.
3010    *
3011    * We throw RegionTooBusyException if we are above the memstore limit
3012    * and expect the client to retry using some kind of backoff.
3013    */
3014   private void checkResources()
3015     throws RegionTooBusyException {
3016     // If catalog region, do not impose resource constraints or block updates.
3017     if (this.getRegionInfo().isMetaRegion()) return;
3018 
3019     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
3020       blockedRequestsCount.increment();
3021       requestFlush();
3022       throw new RegionTooBusyException("Above memstore limit, " +
3023           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
3024           this.getRegionInfo().getRegionNameAsString()) +
3025           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
3026           this.getRegionServerServices().getServerName()) +
3027           ", memstoreSize=" + memstoreSize.get() +
3028           ", blockingMemStoreSize=" + blockingMemStoreSize);
3029     }
3030   }
3031 
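       /*
        * Illustrative sketch (not part of the original source): a minimal client-side retry
        * loop for the RegionTooBusyException thrown above. maxRetries and backoffMs are
        * hypothetical values; the stock HBase client performs similar retries internally.
        *
        *   for (int attempt = 0; attempt < maxRetries; attempt++) {
        *     try {
        *       table.put(put);
        *       break;
        *     } catch (RegionTooBusyException e) {
        *       Thread.sleep(backoffMs << attempt);   // exponential backoff before retrying
        *     }
        *   }
        */
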
3032   /**
3033    * @throws IOException Throws exception if region is in read-only mode.
3034    */
3035   protected void checkReadOnly() throws IOException {
3036     if (this.writestate.isReadOnly()) {
3037       throw new IOException("region is read only");
3038     }
3039   }
3040 
3041   /**
3042    * Add updates first to the hlog and then add values to memstore.
3043    * Warning: Assumption is caller has lock on passed in row.
3044    * @param row the row being updated
3045    * @param family the column family being updated
3046    * @param edits Cell updates by column
3047    * @throws IOException
3048    */
3049   private void put(final byte [] row, byte [] family, List<Cell> edits)
3050   throws IOException {
3051     NavigableMap<byte[], List<Cell>> familyMap;
3052     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
3053 
3054     familyMap.put(family, edits);
3055     Put p = new Put(row);
3056     p.setFamilyCellMap(familyMap);
3057     doBatchMutate(p);
3058   }
3059 
3060   /**
3061    * Atomically apply the given map of family->edits to the memstore.
3062    * This handles the consistency control on its own, but the caller
3063    * should already have locked updatesLock.readLock(). This also does
3064    * <b>not</b> check the families for validity.
3065    *
3066    * @param familyMap Map of kvs per family
3067    * @param localizedWriteEntry The WriteEntry of the MVCC for this transaction.
3068    *        If null, then this method internally creates a mvcc transaction.
3069    * @return the additional memory usage of the memstore caused by the
3070    * new entries.
3071    */
3072   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
3073     MultiVersionConsistencyControl.WriteEntry localizedWriteEntry) {
3074     long size = 0;
3075     boolean freemvcc = false;
3076 
3077     try {
3078       if (localizedWriteEntry == null) {
3079         localizedWriteEntry = mvcc.beginMemstoreInsert();
3080         freemvcc = true;
3081       }
3082 
3083       for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3084         byte[] family = e.getKey();
3085         List<Cell> cells = e.getValue();
3086         assert cells instanceof RandomAccess;
3087         Store store = getStore(family);
3088         int listSize = cells.size();
3089         for (int i=0; i < listSize; i++) {
3090           Cell cell = cells.get(i);
3091           KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
3092           kv.setMvccVersion(localizedWriteEntry.getWriteNumber());
3093           size += store.add(kv);
3094         }
3095       }
3096     } finally {
3097       if (freemvcc) {
3098         mvcc.completeMemstoreInsert(localizedWriteEntry);
3099       }
3100     }
3101 
3102     return size;
3103   }
3104 
3105   /**
3106    * Remove all the keys listed in the map from the memstore. This method is
3107    * called when a Put/Delete has updated memstore but subsequently fails to update
3108    * the wal. This method is then invoked to rollback the memstore.
3109    */
3110   private void rollbackMemstore(BatchOperationInProgress<?> batchOp,
3111                                 Map<byte[], List<Cell>>[] familyMaps,
3112                                 int start, int end) {
3113     int kvsRolledback = 0;
3114     for (int i = start; i < end; i++) {
3115       // skip over requests that never succeeded in the first place.
3116       if (batchOp.retCodeDetails[i].getOperationStatusCode()
3117             != OperationStatusCode.SUCCESS) {
3118         continue;
3119       }
3120 
3121       // Rollback all the kvs for this row.
3122       Map<byte[], List<Cell>> familyMap  = familyMaps[i];
3123       for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
3124         byte[] family = e.getKey();
3125         List<Cell> cells = e.getValue();
3126 
3127         // Remove those keys from the memstore that match our
3128         // key's (row, cf, cq, timestamp, memstoreTS). The interesting part is
3129         // that even the memstoreTS has to match for keys that will be rolled back.
3130         Store store = getStore(family);
3131         for (Cell cell: cells) {
3132           store.rollback(KeyValueUtil.ensureKeyValue(cell));
3133           kvsRolledback++;
3134         }
3135       }
3136     }
3137     LOG.debug("rollbackMemstore rolled back " + kvsRolledback +
3138         " keyvalues from start:" + start + " to end:" + end);
3139   }
3140 
3141   /**
3142    * Check the collection of families for validity.
3143    * @throws NoSuchColumnFamilyException if a family does not exist.
3144    */
3145   void checkFamilies(Collection<byte[]> families)
3146   throws NoSuchColumnFamilyException {
3147     for (byte[] family : families) {
3148       checkFamily(family);
3149     }
3150   }
3151 
3152   /**
3153    * During replay, there could exist column families which were removed between region server
3154    * failure and replay.
3155    */
3156   private void removeNonExistentColumnFamilyForReplay(
3157       final Map<byte[], List<Cell>> familyMap) {
3158     List<byte[]> nonExistentList = null;
3159     for (byte[] family : familyMap.keySet()) {
3160       if (!this.htableDescriptor.hasFamily(family)) {
3161         if (nonExistentList == null) {
3162           nonExistentList = new ArrayList<byte[]>();
3163         }
3164         nonExistentList.add(family);
3165       }
3166     }
3167     if (nonExistentList != null) {
3168       for (byte[] family : nonExistentList) {
3169         // Perhaps schema was changed between crash and replay
3170         LOG.info("No family for " + Bytes.toString(family) + " omitted from replay.");
3171         familyMap.remove(family);
3172       }
3173     }
3174   }
3175 
3176   void checkTimestamps(final Map<byte[], List<Cell>> familyMap,
3177       long now) throws FailedSanityCheckException {
3178     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
3179       return;
3180     }
3181     long maxTs = now + timestampSlop;
3182     for (List<Cell> kvs : familyMap.values()) {
3183       assert kvs instanceof RandomAccess;
3184       int listSize  = kvs.size();
3185       for (int i=0; i < listSize; i++) {
3186         Cell cell = kvs.get(i);
3187         // see if the user-side TS is out of range. latest = server-side
3188         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
3189         if (!kv.isLatestTimestamp() && kv.getTimestamp() > maxTs) {
3190           throw new FailedSanityCheckException("Timestamp for KV out of range "
3191               + cell + " (too.new=" + timestampSlop + ")");
3192         }
3193       }
3194     }
3195   }
3196 
3197   /**
3198    * Append the given map of family->edits to a WALEdit data structure.
3199    * This does not write to the HLog itself.
3200    * @param familyMap map of family->edits
3201    * @param walEdit the destination entry to append into
3202    */
3203   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
3204       WALEdit walEdit) {
3205     for (List<Cell> edits : familyMap.values()) {
3206       assert edits instanceof RandomAccess;
3207       int listSize = edits.size();
3208       for (int i=0; i < listSize; i++) {
3209         Cell cell = edits.get(i);
3210         walEdit.add(KeyValueUtil.ensureKeyValue(cell));
3211       }
3212     }
3213   }
3214 
3215   private void requestFlush() {
3216     if (this.rsServices == null) {
3217       return;
3218     }
3219     synchronized (writestate) {
3220       if (this.writestate.isFlushRequested()) {
3221         return;
3222       }
3223       writestate.flushRequested = true;
3224     }
3225     // Make request outside of synchronize block; HBASE-818.
3226     this.rsServices.getFlushRequester().requestFlush(this);
3227     if (LOG.isDebugEnabled()) {
3228       LOG.debug("Flush requested on " + this);
3229     }
3230   }
3231 
3232   /*
3233    * @param size
3234    * @return True if size is over the flush threshold
3235    */
3236   private boolean isFlushSize(final long size) {
3237     return size > this.memstoreFlushSize;
3238   }
3239 
3240   /**
3241    * Read the edits log put under this region by wal log splitting process.  Put
3242    * the recovered edits back up into this region.
3243    *
3244    * <p>We can ignore any log message that has a sequence ID that's equal to or
3245    * lower than minSeqId.  (Because we know such log messages are already
3246    * reflected in the HFiles.)
3247    *
3248    * <p>While this is running we are putting pressure on memory yet we are
3249    * outside of our usual accounting because we are not yet an onlined region
3250    * (this stuff is being run as part of Region initialization).  This means
3251    * that if we're up against global memory limits, we'll not be flagged to flush
3252    * because we are not online. We can't be flushed by usual mechanisms anyways;
3253    * we're not yet online so our relative sequenceids are not yet aligned with
3254    * HLog sequenceids -- not till we come up online, post processing of split
3255    * edits.
3256    *
3257    * <p>But to help relieve memory pressure, at least manage our own heap size
3258    * flushing if we are in excess of per-region limits.  Flushing, though, we have
3259    * to be careful and avoid using the regionserver/hlog sequenceid.  It's running
3260    * on a different line to what's going on in here in this region context, so if we
3261    * crashed replaying these edits, but in the midst had a flush that used the
3262    * regionserver log with a sequenceid in excess of what's going on in here
3263    * in this region and with its split editlogs, then we could miss edits the
3264    * next time we go to recover. So, we have to flush inline, using seqids that
3265    * make sense in this single-region context only -- until we come online.
3266    *
3267    * @param regiondir
3268    * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
3269    * the maxSeqId for the store to be applied, else it is skipped.
3270    * @param reporter
3271    * @return the sequence id of the last edit added to this region out of the
3272    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3273    * @throws UnsupportedEncodingException
3274    * @throws IOException
3275    */
3276   protected long replayRecoveredEditsIfAny(final Path regiondir,
3277       Map<byte[], Long> maxSeqIdInStores,
3278       final CancelableProgressable reporter, final MonitoredTask status)
3279       throws UnsupportedEncodingException, IOException {
3280     long minSeqIdForTheRegion = -1;
3281     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
3282       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
3283         minSeqIdForTheRegion = maxSeqIdInStore;
3284       }
3285     }
3286     long seqid = minSeqIdForTheRegion;
3287 
3288     FileSystem fs = this.fs.getFileSystem();
3289     NavigableSet<Path> files = HLogUtil.getSplitEditFilesSorted(fs, regiondir);
3290     if (LOG.isDebugEnabled()) {
3291       LOG.debug("Found " + (files == null ? 0 : files.size())
3292         + " recovered edits file(s) under " + regiondir);
3293     }
3294 
3295     if (files == null || files.isEmpty()) return seqid;
3296 
3297     for (Path edits: files) {
3298       if (edits == null || !fs.exists(edits)) {
3299         LOG.warn("Null or non-existent edits file: " + edits);
3300         continue;
3301       }
3302       if (isZeroLengthThenDelete(fs, edits)) continue;
3303 
3304       long maxSeqId;
3305       String fileName = edits.getName();
3306       maxSeqId = Math.abs(Long.parseLong(fileName));
3307       if (maxSeqId <= minSeqIdForTheRegion) {
3308         if (LOG.isDebugEnabled()) {
3309           String msg = "Maximum sequenceid for this log is " + maxSeqId
3310             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
3311             + ", skipped the whole file, path=" + edits;
3312           LOG.debug(msg);
3313         }
3314         continue;
3315       }
3316 
3317       try {
3318         // replay the edits. Replay can return -1 if everything is skipped, only update if seqId is greater
3319         seqid = Math.max(seqid, replayRecoveredEdits(edits, maxSeqIdInStores, reporter));
3320       } catch (IOException e) {
3321         boolean skipErrors = conf.getBoolean(
3322             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
3323             conf.getBoolean(
3324                 "hbase.skip.errors",
3325                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
3326         if (conf.get("hbase.skip.errors") != null) {
3327           LOG.warn(
3328               "The property 'hbase.skip.errors' has been deprecated. Please use " +
3329               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
3330         }
3331         if (skipErrors) {
3332           Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3333           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
3334               + "=true so continuing. Renamed " + edits +
3335               " as " + p, e);
3336         } else {
3337           throw e;
3338         }
3339       }
3340     }
3341     // The edits size added into rsAccounting during this replaying will not
3342     // be required any more. So just clear it.
3343     if (this.rsAccounting != null) {
3344       this.rsAccounting.clearRegionReplayEditsSize(this.getRegionName());
3345     }
3346     if (seqid > minSeqIdForTheRegion) {
3347       // Then we added some edits to memory. Flush and cleanup split edit files.
3348       internalFlushcache(null, seqid, status);
3349     }
3350     // Now delete the content of recovered edits.  We're done w/ them.
3351     for (Path file: files) {
3352       if (!fs.delete(file, false)) {
3353         LOG.error("Failed delete of " + file);
3354       } else {
3355         LOG.debug("Deleted recovered.edits file=" + file);
3356       }
3357     }
3358     return seqid;
3359   }
3360 
3361   /*
3362    * @param edits File of recovered edits.
3363    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in log
3364    * must be larger than this to be replayed for each store.
3365    * @param reporter
3366    * @return the sequence id of the last edit added to this region out of the
3367    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3368    * @throws IOException
3369    */
3370   private long replayRecoveredEdits(final Path edits,
3371       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
3372     throws IOException {
3373     String msg = "Replaying edits from " + edits;
3374     LOG.info(msg);
3375     MonitoredTask status = TaskMonitor.get().createStatus(msg);
3376     FileSystem fs = this.fs.getFileSystem();
3377 
3378     status.setStatus("Opening logs");
3379     HLog.Reader reader = null;
3380     try {
3381       reader = HLogFactory.createReader(fs, edits, conf);
3382       long currentEditSeqId = -1;
3383       long firstSeqIdInLog = -1;
3384       long skippedEdits = 0;
3385       long editsCount = 0;
3386       long intervalEdits = 0;
3387       HLog.Entry entry;
3388       Store store = null;
3389       boolean reported_once = false;
3390       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
3391 
3392       try {
3393         // How many edits seen before we check elapsed time
3394         int interval = this.conf.getInt("hbase.hstore.report.interval.edits",
3395             2000);
3396         // How often to send a progress report (default 1/2 master timeout)
3397         int period = this.conf.getInt("hbase.hstore.report.period",
3398           this.conf.getInt(AssignmentManager.ASSIGNMENT_TIMEOUT,
3399             AssignmentManager.DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT) / 2);
3400         long lastReport = EnvironmentEdgeManager.currentTimeMillis();
3401 
3402         while ((entry = reader.next()) != null) {
3403           HLogKey key = entry.getKey();
3404           WALEdit val = entry.getEdit();
3405 
3406           if (ng != null) { // ng is null in some tests and when nonces are disabled
3407             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
3408           }
3409 
3410           if (reporter != null) {
3411             intervalEdits += val.size();
3412             if (intervalEdits >= interval) {
3413               // Number of edits interval reached
3414               intervalEdits = 0;
3415               long cur = EnvironmentEdgeManager.currentTimeMillis();
3416               if (lastReport + period <= cur) {
3417                 status.setStatus("Replaying edits..." +
3418                     " skipped=" + skippedEdits +
3419                     " edits=" + editsCount);
3420                 // Timeout reached
3421                 if(!reporter.progress()) {
3422                   msg = "Progressable reporter failed, stopping replay";
3423                   LOG.warn(msg);
3424                   status.abort(msg);
3425                   throw new IOException(msg);
3426                 }
3427                 reported_once = true;
3428                 lastReport = cur;
3429               }
3430             }
3431           }
3432 
3433           if (firstSeqIdInLog == -1) {
3434             firstSeqIdInLog = key.getLogSeqNum();
3435           }
3436           currentEditSeqId = key.getLogSeqNum();
3437 
3438           // Start coprocessor replay here. The coprocessor is for each WALEdit
3439           // instead of a KeyValue.
3440           if (coprocessorHost != null) {
3441             status.setStatus("Running pre-WAL-restore hook in coprocessors");
3442             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
3443               // if bypass this log entry, ignore it ...
3444               continue;
3445             }
3446           }
3447 
3448           boolean flush = false;
3449           for (KeyValue kv: val.getKeyValues()) {
3450             // Check this edit is for me. Also, guard against writing the special
3451             // METACOLUMN info such as HBASE::CACHEFLUSH entries
3452             if (kv.matchingFamily(WALEdit.METAFAMILY) ||
3453                 !Bytes.equals(key.getEncodedRegionName(),
3454                   this.getRegionInfo().getEncodedNameAsBytes())) {
3455               //this is a special edit, we should handle it
3456               CompactionDescriptor compaction = WALEdit.getCompaction(kv);
3457               if (compaction != null) {
3458                 //replay the compaction
3459                 completeCompactionMarker(compaction);
3460               }
3461 
3462               skippedEdits++;
3463               continue;
3464             }
3465             // Figure which store the edit is meant for.
3466             if (store == null || !kv.matchingFamily(store.getFamily().getName())) {
3467               store = this.stores.get(kv.getFamily());
3468             }
3469             if (store == null) {
3470               // This should never happen.  Perhaps schema was changed between
3471               // crash and redeploy?
3472               LOG.warn("No family for " + kv);
3473               skippedEdits++;
3474               continue;
3475             }
3476             // Now, figure if we should skip this edit.
3477             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
3478                 .getName())) {
3479               skippedEdits++;
3480               continue;
3481             }
3482             // Once we are over the limit, restoreEdit will keep returning true to
3483             // flush -- but don't flush until we've played all the kvs that make up
3484             // the WALEdit.
3485             flush |= restoreEdit(store, kv);
3486             editsCount++;
3487           }
3488           if (flush) {
3489             internalFlushcache(null, currentEditSeqId, status);
3490           }
3491 
3492           if (coprocessorHost != null) {
3493             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
3494           }
3495         }
3496       } catch (EOFException eof) {
3497         Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3498         msg = "Encountered EOF. Most likely due to Master failure during " +
3499             "log splitting, so we have this data in another edit.  " +
3500             "Continuing, but renaming " + edits + " as " + p;
3501         LOG.warn(msg, eof);
3502         status.abort(msg);
3503       } catch (IOException ioe) {
3504         // If the IOE resulted from bad file format,
3505         // then this problem is idempotent and retrying won't help
3506         if (ioe.getCause() instanceof ParseException) {
3507           Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3508           msg = "File corruption encountered!  " +
3509               "Continuing, but renaming " + edits + " as " + p;
3510           LOG.warn(msg, ioe);
3511           status.setStatus(msg);
3512         } else {
3513           status.abort(StringUtils.stringifyException(ioe));
3514           // other IO errors may be transient (bad network connection,
3515           // checksum exception on one datanode, etc).  throw & retry
3516           throw ioe;
3517         }
3518       }
3519       if (reporter != null && !reported_once) {
3520         reporter.progress();
3521       }
3522       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
3523         ", firstSequenceidInLog=" + firstSeqIdInLog +
3524         ", maxSequenceidInLog=" + currentEditSeqId + ", path=" + edits;
3525       status.markComplete(msg);
3526       LOG.debug(msg);
3527       return currentEditSeqId;
3528     } finally {
3529       status.cleanup();
3530       if (reader != null) {
3531          reader.close();
3532       }
3533     }
3534   }
3535 
3536   /**
3537    * Call to complete a compaction. It's for the case where we find in the WAL a compaction
3538    * that was not finished.  We could find one recovering a WAL after a regionserver crash.
3539    * See HBASE-2331.
3540    * @param compaction
3541    */
3542   void completeCompactionMarker(CompactionDescriptor compaction)
3543       throws IOException {
3544     Store store = this.getStore(compaction.getFamilyName().toByteArray());
3545     if (store == null) {
3546       LOG.warn("Found Compaction WAL edit for deleted family:" +
3547           Bytes.toString(compaction.getFamilyName().toByteArray()));
3548       return;
3549     }
3550     store.completeCompactionMarker(compaction);
3551   }
3552 
3553   /**
3554    * Used by tests
3555    * @param s Store to add edit to.
3556    * @param kv KeyValue to add.
3557    * @return True if we should flush.
3558    */
3559   protected boolean restoreEdit(final Store s, final KeyValue kv) {
3560     long kvSize = s.add(kv);
3561     if (this.rsAccounting != null) {
3562       rsAccounting.addAndGetRegionReplayEditsSize(this.getRegionName(), kvSize);
3563     }
3564     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
3565   }
3566 
3567   /*
3568    * @param fs
3569    * @param p File to check.
3570    * @return True if file was zero-length (and if so, we'll delete it in here).
3571    * @throws IOException
3572    */
3573   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
3574       throws IOException {
3575     FileStatus stat = fs.getFileStatus(p);
3576     if (stat.getLen() > 0) return false;
3577     LOG.warn("File " + p + " is zero-length, deleting.");
3578     fs.delete(p, false);
3579     return true;
3580   }
3581 
3582   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
3583     return new HStore(this, family, this.conf);
3584   }
3585 
3586   /**
3587    * Return HStore instance.
3588    * Use with caution.  Exposed for use of fixup utilities.
3589    * @param column Name of column family hosted by this region.
3590    * @return Store that goes with the family on passed <code>column</code>.
3591    * TODO: Make this lookup faster.
3592    */
3593   public Store getStore(final byte[] column) {
3594     return this.stores.get(column);
3595   }
3596 
3597   public Map<byte[], Store> getStores() {
3598     return this.stores;
3599   }
3600 
3601   /**
3602    * Return list of storeFiles for the set of CFs.
3603    * Uses closeLock to prevent the race condition where, if the region closes
3604    * in the middle of the loop (the stores being closed one by one), some stores
3605    * would return 0 files.
3606    * @return List of storeFiles.
3607    */
3608   public List<String> getStoreFileList(final byte [][] columns)
3609     throws IllegalArgumentException {
3610     List<String> storeFileNames = new ArrayList<String>();
3611     synchronized(closeLock) {
3612       for(byte[] column : columns) {
3613         Store store = this.stores.get(column);
3614         if (store == null) {
3615           throw new IllegalArgumentException("No column family : " +
3616               new String(column) + " available");
3617         }
3618         for (StoreFile storeFile: store.getStorefiles()) {
3619           storeFileNames.add(storeFile.getPath().toString());
3620         }
3621       }
3622     }
3623     return storeFileNames;
3624   }
3625   //////////////////////////////////////////////////////////////////////////////
3626   // Support code
3627   //////////////////////////////////////////////////////////////////////////////
3628 
3629   /** Make sure this is a valid row for the HRegion */
3630   void checkRow(final byte [] row, String op) throws IOException {
3631     if (!rowIsInRange(getRegionInfo(), row)) {
3632       throw new WrongRegionException("Requested row out of range for " +
3633           op + " on HRegion " + this + ", startKey='" +
3634           Bytes.toStringBinary(getStartKey()) + "', getEndKey()='" +
3635           Bytes.toStringBinary(getEndKey()) + "', row='" +
3636           Bytes.toStringBinary(row) + "'");
3637     }
3638   }
3639 
3640   /**
3641    * Tries to acquire a lock on the given row.
3642    * @param waitForLock if true, will block until the lock is available.
3643    *        Otherwise, just tries to obtain the lock and returns
3644    *        false if unavailable.
3645    * @return the row lock if acquired,
3646    *   null if waitForLock was false and the lock was not acquired
3647    * @throws IOException if waitForLock was true and the lock could not be acquired after waiting
3648    */
3649   public RowLock getRowLock(byte[] row, boolean waitForLock) throws IOException {
3650     startRegionOperation();
3651     try {
3652       return getRowLockInternal(row, waitForLock);
3653     } finally {
3654       closeRegionOperation();
3655     }
3656   }
3657 
3658   /**
3659    * A version of getRowLock(byte[], boolean) to use when a region operation has already been
3660    * started (the calling thread has already acquired the region-close-lock).
3661    */
3662   protected RowLock getRowLockInternal(byte[] row, boolean waitForLock) throws IOException {
3663     checkRow(row, "row lock");
3664     HashedBytes rowKey = new HashedBytes(row);
3665     RowLockContext rowLockContext = new RowLockContext(rowKey);
3666 
3667     // loop until we acquire the row lock (unless !waitForLock)
3668     while (true) {
3669       RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
3670       if (existingContext == null) {
3671         // Row is not already locked by any thread, use newly created context.
3672         break;
3673       } else if (existingContext.ownedByCurrentThread()) {
3674         // Row is already locked by current thread, reuse existing context instead.
3675         rowLockContext = existingContext;
3676         break;
3677       } else {
3678         // Row is already locked by some other thread, give up or wait for it
3679         if (!waitForLock) {
3680           return null;
3681         }
3682         try {
3683           if (!existingContext.latch.await(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
3684             throw new IOException("Timed out waiting for lock for row: " + rowKey);
3685           }
3686         } catch (InterruptedException ie) {
3687           LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
3688           InterruptedIOException iie = new InterruptedIOException();
3689           iie.initCause(ie);
3690           throw iie;
3691         }
3692       }
3693     }
3694 
3695     // allocate new lock for this thread
3696     return rowLockContext.newLock();
3697   }
3698 
3699   /**
3700    * Acquires a lock on the given row.
3701    * The same thread may acquire multiple locks on the same row.
3702    * @return the acquired row lock
3703    * @throws IOException if the lock could not be acquired after waiting
3704    */
3705   public RowLock getRowLock(byte[] row) throws IOException {
3706     return getRowLock(row, true);
3707   }
3708 
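       /*
        * Illustrative sketch (not part of the original source): callers pair getRowLock with a
        * try/finally so the lock is always released, as the check-and-mutate methods above do.
        *
        *   RowLock lock = region.getRowLock(row);
        *   try {
        *     // read-modify-write against this row
        *   } finally {
        *     lock.release();
        *   }
        */
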
3709   /**
3710    * If the given list of row locks is not null, releases all locks.
3711    */
3712   public void releaseRowLocks(List<RowLock> rowLocks) {
3713     if (rowLocks != null) {
3714       for (RowLock rowLock : rowLocks) {
3715         rowLock.release();
3716       }
3717       rowLocks.clear();
3718     }
3719   }
3720 
3721   /**
3722    * Determines whether multiple column families are present.
3723    * Precondition: familyPaths is not null.
3724    *
3725    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
3726    */
3727   private static boolean hasMultipleColumnFamilies(
3728       List<Pair<byte[], String>> familyPaths) {
3729     boolean multipleFamilies = false;
3730     byte[] family = null;
3731     for (Pair<byte[], String> pair : familyPaths) {
3732       byte[] fam = pair.getFirst();
3733       if (family == null) {
3734         family = fam;
3735       } else if (!Bytes.equals(family, fam)) {
3736         multipleFamilies = true;
3737         break;
3738       }
3739     }
3740     return multipleFamilies;
3741   }
3742 
3743 
3744   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths,
3745                                 boolean assignSeqId) throws IOException {
3746     return bulkLoadHFiles(familyPaths, assignSeqId, null);
3747   }
3748 
3749   /**
3750    * Attempts to atomically load a group of hfiles.  This is critical for loading
3751    * rows with multiple column families atomically.
3752    *
3753    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
3754    * @param bulkLoadListener Internal hooks enabling massaging/preparation of a
3755    * file about to be bulk loaded
3756    * @param assignSeqId
3757    * @return true if successful, false if failed recoverably
3758    * @throws IOException if failed unrecoverably.
3759    */
3760   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths, boolean assignSeqId,
3761       BulkLoadListener bulkLoadListener) throws IOException {
3762     Preconditions.checkNotNull(familyPaths);
3763     // we need writeLock for multi-family bulk load
3764     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
3765     try {
3766       this.writeRequestsCount.increment();
3767 
3768       // There possibly was a split that happened between when the split keys
3769       // were gathered and before the HRegion's write lock was taken.  We need
3770       // to validate the HFile region before attempting to bulk load all of them.
3771       List<IOException> ioes = new ArrayList<IOException>();
3772       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
3773       for (Pair<byte[], String> p : familyPaths) {
3774         byte[] familyName = p.getFirst();
3775         String path = p.getSecond();
3776 
3777         Store store = getStore(familyName);
3778         if (store == null) {
3779           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
3780               "No such column family " + Bytes.toStringBinary(familyName));
3781           ioes.add(ioe);
3782         } else {
3783           try {
3784             store.assertBulkLoadHFileOk(new Path(path));
3785           } catch (WrongRegionException wre) {
3786             // recoverable (file doesn't fit in region)
3787             failures.add(p);
3788           } catch (IOException ioe) {
3789             // unrecoverable (hdfs problem)
3790             ioes.add(ioe);
3791           }
3792         }
3793       }
3794 
3795       // validation failed because of some sort of IO problem.
3796       if (ioes.size() != 0) {
3797         IOException e = MultipleIOException.createIOException(ioes);
3798         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
3799         throw e;
3800       }
3801 
3802       // validation failed, bail out before doing anything permanent.
3803       if (failures.size() != 0) {
3804         StringBuilder list = new StringBuilder();
3805         for (Pair<byte[], String> p : failures) {
3806           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
3807             .append(p.getSecond());
3808         }
3809         // problem when validating
3810         LOG.warn("There was a recoverable bulk load failure likely due to a" +
3811             " split.  These (family, HFile) pairs were not loaded: " + list);
3812         return false;
3813       }
3814 
3815       long seqId = -1;
3816       // We need to assign a sequential ID that's in between two memstores in order to preserve
3817       // the guarantee that all the edits lower than the highest sequential ID from all the
3818       // HFiles are flushed on disk. See HBASE-10958.
3819       if (assignSeqId) {
3820         FlushResult fs = this.flushcache();
3821         if (fs.isFlushSucceeded()) {
3822           seqId = fs.flushSequenceId;
3823         } else if (fs.result == FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY) {
3824           seqId = this.sequenceId.incrementAndGet();
3825         } else {
3826           throw new IOException("Could not bulk load with an assigned sequential ID because the " +
3827               "flush didn't run. Reason for not flushing: " + fs.failureReason);
3828         }
3829       }
3830 
3831       for (Pair<byte[], String> p : familyPaths) {
3832         byte[] familyName = p.getFirst();
3833         String path = p.getSecond();
3834         Store store = getStore(familyName);
3835         try {
3836           String finalPath = path;
3837           if(bulkLoadListener != null) {
3838             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
3839           }
3840           store.bulkLoadHFile(finalPath, seqId);
3841           if(bulkLoadListener != null) {
3842             bulkLoadListener.doneBulkLoad(familyName, path);
3843           }
3844         } catch (IOException ioe) {
3845           // A failure here can cause an atomicity violation that we currently
3846           // cannot recover from since it is likely a failed HDFS operation.
3847 
3848           // TODO Need a better story for reverting partial failures due to HDFS.
3849           LOG.error("There was a partial failure due to IO when attempting to" +
3850               " load " + Bytes.toString(p.getFirst()) + " : "+ p.getSecond(), ioe);
3851           if(bulkLoadListener != null) {
3852             try {
3853               bulkLoadListener.failedBulkLoad(familyName, path);
3854             } catch (Exception ex) {
3855               LOG.error("Error while calling failedBulkLoad for family "+
3856                   Bytes.toString(familyName)+" with path "+path, ex);
3857             }
3858           }
3859           throw ioe;
3860         }
3861       }
3862       return true;
3863     } finally {
3864       closeBulkRegionOperation();
3865     }
3866   }
3867 
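       /*
        * Illustrative sketch (not part of the original source): a direct bulk load call with a
        * hypothetical HFile path. Each pair names a column family and an HFile whose keys fall
        * inside this region.
        *
        *   List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
        *   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf"), hfilePath));
        *   boolean loaded = region.bulkLoadHFiles(familyPaths, true);
        *   // loaded == false signals a recoverable failure (e.g. a concurrent split)
        */
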
3868   @Override
3869   public boolean equals(Object o) {
3870     return o instanceof HRegion && Bytes.equals(this.getRegionName(),
3871                                                 ((HRegion) o).getRegionName());
3872   }
3873 
3874   @Override
3875   public int hashCode() {
3876     return Bytes.hashCode(this.getRegionName());
3877   }
3878 
3879   @Override
3880   public String toString() {
3881     return this.getRegionNameAsString();
3882   }
3883 
3884   /**
3885    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
3886    */
3887   class RegionScannerImpl implements RegionScanner {
3888     // Package local for testability
3889     KeyValueHeap storeHeap = null;
3890     /** Heap of key-values that are not essential for the provided filters and are thus read
3891      * on demand, if on-demand column family loading is enabled.*/
3892     KeyValueHeap joinedHeap = null;
3893     /**
3894      * If the joined heap data gathering is interrupted due to scan limits, this will
3895      * contain the row for which we are populating the values.*/
3896     protected KeyValue joinedContinuationRow = null;
3897     // KeyValue indicating that limit is reached when scanning
3898     private final KeyValue KV_LIMIT = new KeyValue();
3899     protected final byte[] stopRow;
3900     private final FilterWrapper filter;
3901     private int batch;
3902     protected int isScan;
3903     private boolean filterClosed = false;
3904     private long readPt;
3905     private long maxResultSize;
3906     protected HRegion region;
3907 
3908     @Override
3909     public HRegionInfo getRegionInfo() {
3910       return region.getRegionInfo();
3911     }
3912 
3913     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
3914         throws IOException {
3915 
3916       this.region = region;
3917       this.maxResultSize = scan.getMaxResultSize();
3918       if (scan.hasFilter()) {
3919         this.filter = new FilterWrapper(scan.getFilter());
3920       } else {
3921         this.filter = null;
3922       }
3923 
3924       this.batch = scan.getBatch();
3925       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
3926         this.stopRow = null;
3927       } else {
3928         this.stopRow = scan.getStopRow();
3929       }
3930       // If we are doing a get, we want to be [startRow,endRow] normally
3931       // it is [startRow,endRow) and if startRow=endRow we get nothing.
3932       this.isScan = scan.isGetScan() ? -1 : 0;
3933 
3934       // synchronize on scannerReadPoints so that nobody calculates
3935       // getSmallestReadPoint before scannerReadPoints is updated.
3936       IsolationLevel isolationLevel = scan.getIsolationLevel();
3937       synchronized(scannerReadPoints) {
3938         this.readPt = getReadpoint(isolationLevel);
3939         scannerReadPoints.put(this, this.readPt);
3940       }
3941 
3942       // Here we separate all scanners into two lists - scanner that provide data required
3943       // by the filter to operate (scanners list) and all others (joinedScanners list).
3944       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>();
3945       List<KeyValueScanner> joinedScanners = new ArrayList<KeyValueScanner>();
3946       if (additionalScanners != null) {
3947         scanners.addAll(additionalScanners);
3948       }
3949 
3950       for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
3951           scan.getFamilyMap().entrySet()) {
3952         Store store = stores.get(entry.getKey());
3953         KeyValueScanner scanner = store.getScanner(scan, entry.getValue(), this.readPt);
3954         if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
3955           || this.filter.isFamilyEssential(entry.getKey())) {
3956           scanners.add(scanner);
3957         } else {
3958           joinedScanners.add(scanner);
3959         }
3960       }
3961       initializeKVHeap(scanners, joinedScanners, region);
3962     }
3963 
3964     RegionScannerImpl(Scan scan, HRegion region) throws IOException {
3965       this(scan, null, region);
3966     }
3967 
3968     protected void initializeKVHeap(List<KeyValueScanner> scanners,
3969         List<KeyValueScanner> joinedScanners, HRegion region)
3970         throws IOException {
3971       this.storeHeap = new KeyValueHeap(scanners, region.comparator);
3972       if (!joinedScanners.isEmpty()) {
3973         this.joinedHeap = new KeyValueHeap(joinedScanners, region.comparator);
3974       }
3975     }
3976 
3977     @Override
3978     public long getMaxResultSize() {
3979       return maxResultSize;
3980     }
3981 
3982     @Override
3983     public long getMvccReadPoint() {
3984       return this.readPt;
3985     }
3986 
3987     /**
3988      * Reset the filter, if one is set.
3989      *
3990      * @throws IOException in case a filter raises an I/O exception.
3991      */
3992     protected void resetFilters() throws IOException {
3993       if (filter != null) {
3994         filter.reset();
3995       }
3996     }
3997 
3998     @Override
3999     public boolean next(List<Cell> outResults)
4000         throws IOException {
4001       // apply the batching limit by default
4002       return next(outResults, batch);
4003     }
4004 
4005     @Override
4006     public synchronized boolean next(List<Cell> outResults, int limit) throws IOException {
4007       if (this.filterClosed) {
4008         throw new UnknownScannerException("Scanner was closed (timed out?) " +
4009             "after we renewed it. Could be caused by a very slow scanner " +
4010             "or a lengthy garbage collection");
4011       }
4012       startRegionOperation(Operation.SCAN);
4013       readRequestsCount.increment();
4014       try {
4015         boolean returnResult = nextRaw(outResults, limit);
4016         if (region != null && region.metricsRegion != null) {
4017           long totalSize = 0;
4018           for (Cell cell: outResults) {
4019             KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
4020             totalSize += kv.getLength();
4021           }
4022           region.metricsRegion.updateScanNext(totalSize);
4023         }
4024         return returnResult;
4025       } finally {
4026         closeRegionOperation(Operation.SCAN);
4027       }
4028     }
4029 
4030     @Override
4031     public boolean nextRaw(List<Cell> outResults)
4032         throws IOException {
4033       return nextRaw(outResults, batch);
4034     }
4035 
4036     @Override
4037     public boolean nextRaw(List<Cell> outResults, int limit) throws IOException {
4038       if (storeHeap == null) {
4039         // scanner is closed
4040         throw new UnknownScannerException("Scanner was closed");
4041       }
4042       boolean returnResult;
4043       if (outResults.isEmpty()) {
4044         // Usually outResults is empty. This is true when next is called
4045         // to handle a scan or get operation.
4046         returnResult = nextInternal(outResults, limit);
4047       } else {
4048         List<Cell> tmpList = new ArrayList<Cell>();
4049         returnResult = nextInternal(tmpList, limit);
4050         outResults.addAll(tmpList);
4051       }
4052       resetFilters();
4053       if (isFilterDoneInternal()) {
4054         returnResult = false;
4055       }
4056       return returnResult;
4057     }
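
    // Illustrative usage sketch (added note, not part of the original source). A typical caller
    // obtains a RegionScanner from HRegion#getScanner(Scan) and drains it with next(List<Cell>);
    // the family name below is hypothetical.
    //
    //   Scan scan = new Scan();
    //   scan.addFamily(Bytes.toBytes("cf"));
    //   RegionScanner scanner = region.getScanner(scan);
    //   List<Cell> cells = new ArrayList<Cell>();
    //   boolean moreRows;
    //   do {
    //     cells.clear();
    //     moreRows = scanner.next(cells);   // applies the default batching limit
    //     // ... process the cells returned for this row ...
    //   } while (moreRows);
    //   scanner.close();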
4058 
4059     private void populateFromJoinedHeap(List<Cell> results, int limit)
4060         throws IOException {
4061       assert joinedContinuationRow != null;
4062       KeyValue kv = populateResult(results, this.joinedHeap, limit,
4063           joinedContinuationRow.getBuffer(), joinedContinuationRow.getRowOffset(),
4064           joinedContinuationRow.getRowLength());
4065       if (kv != KV_LIMIT) {
4066         // We are done with this row, reset the continuation.
4067         joinedContinuationRow = null;
4068       }
4069       // As the data is obtained from two independent heaps, we need to
4070       // ensure that result list is sorted, because Result relies on that.
4071       Collections.sort(results, comparator);
4072     }
4073 
4074     /**
4075      * Fetches records for the current row into the results list, until the next row or the limit (if not -1).
4076      * @param results list to add the fetched Cells to
4077      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
4078      * @param limit Maximum number of KVs to place in the result list; -1 means no limit.
4079      * @param currentRow Byte array with the row key we are fetching.
4080      * @param offset offset for currentRow
4081      * @param length length for currentRow
4082      * @return KV_LIMIT if the limit was reached, the next KeyValue otherwise.
4083      */
4084     private KeyValue populateResult(List<Cell> results, KeyValueHeap heap, int limit,
4085         byte[] currentRow, int offset, short length) throws IOException {
4086       KeyValue nextKv;
4087       do {
4088         heap.next(results, limit - results.size());
4089         if (limit > 0 && results.size() == limit) {
4090           return KV_LIMIT;
4091         }
4092         nextKv = heap.peek();
4093       } while (nextKv != null && nextKv.matchingRow(currentRow, offset, length));
4094 
4095       return nextKv;
4096     }
4097 
4098     /*
4099      * @return True if a filter has ruled that the scan is over (done).
4100      */
4101     @Override
4102     public synchronized boolean isFilterDone() throws IOException {
4103       return isFilterDoneInternal();
4104     }
4105 
4106     private boolean isFilterDoneInternal() throws IOException {
4107       return this.filter != null && this.filter.filterAllRemaining();
4108     }
4109 
4110     private boolean nextInternal(List<Cell> results, int limit)
4111     throws IOException {
4112       if (!results.isEmpty()) {
4113         throw new IllegalArgumentException("First parameter should be an empty list");
4114       }
4115       RpcCallContext rpcCall = RpcServer.getCurrentCall();
4116       // The loop here is used only when, at some point during this call to next(), we determine
4117       // that due to effects of filters or otherwise we have an empty row in the result.
4118       // Then we loop and try again. Otherwise, we must get out on the first iteration via return:
4119       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
4120       // and joinedHeap has no more data to read for the last row, joinedContinuationRow, if set).
4121       while (true) {
4122         if (rpcCall != null) {
4123           // If a user specifies a too-restrictive or too-slow scanner, the
4124           // client might time out and disconnect while the server side
4125           // is still processing the request. We should abort aggressively
4126           // in that case.
4127           long afterTime = rpcCall.disconnectSince();
4128           if (afterTime >= 0) {
4129             throw new CallerDisconnectedException(
4130                 "Aborting on region " + getRegionNameAsString() + ", call " +
4131                     this + " after " + afterTime + " ms, since " +
4132                     "caller disconnected");
4133           }
4134         }
4135 
4136         // Let's see what we have in the storeHeap.
4137         KeyValue current = this.storeHeap.peek();
4138 
4139         byte[] currentRow = null;
4140         int offset = 0;
4141         short length = 0;
4142         if (current != null) {
4143           currentRow = current.getBuffer();
4144           offset = current.getRowOffset();
4145           length = current.getRowLength();
4146         }
4147         boolean stopRow = isStopRow(currentRow, offset, length);
4148         // Check if we were getting data from the joinedHeap and hit the limit.
4149         // If not, then it's the main path - getting results from the storeHeap.
4150         if (joinedContinuationRow == null) {
4151           // First, check if we are at a stop row. If so, there are no more results.
4152           if (stopRow) {
4153             if (filter != null && filter.hasFilterRow()) {
4154               filter.filterRowCells(results);
4155             }
4156             return false;
4157           }
4158 
4159           // Check if rowkey filter wants to exclude this row. If so, loop to next.
4160           // Technically, if we hit limits before on this row, we don't need this call.
4161           if (filterRowKey(currentRow, offset, length)) {
4162             boolean moreRows = nextRow(currentRow, offset, length);
4163             if (!moreRows) return false;
4164             results.clear();
4165             continue;
4166           }
4167 
4168           KeyValue nextKv = populateResult(results, this.storeHeap, limit, currentRow, offset,
4169               length);
4170           // Ok, we are good, let's try to get some results from the main heap.
4171           if (nextKv == KV_LIMIT) {
4172             if (this.filter != null && filter.hasFilterRow()) {
4173               throw new IncompatibleFilterException(
4174                 "Filter whose hasFilterRow() returns true is incompatible with scan with limit!");
4175             }
4176             return true; // We hit the limit.
4177           }
4178 
4179           stopRow = nextKv == null ||
4180               isStopRow(nextKv.getBuffer(), nextKv.getRowOffset(), nextKv.getRowLength());
4181           // save whether the row was empty before filters were applied to it.
4182           final boolean isEmptyRow = results.isEmpty();
4183 
4184           // We have the part of the row necessary for filtering (all of it, usually).
4185           // First filter with the filterRow(List).
4186           FilterWrapper.FilterRowRetCode ret = FilterWrapper.FilterRowRetCode.NOT_CALLED;
4187           if (filter != null && filter.hasFilterRow()) {
4188             ret = filter.filterRowCellsWithRet(results);
4189           }
4190 
4191           if ((isEmptyRow || ret == FilterWrapper.FilterRowRetCode.EXCLUDE) || filterRow()) {
4192             results.clear();
4193             boolean moreRows = nextRow(currentRow, offset, length);
4194             if (!moreRows) return false;
4195 
4196             // This row was totally filtered out, if this is NOT the last row,
4197             // we should continue on. Otherwise, nothing else to do.
4198             if (!stopRow) continue;
4199             return false;
4200           }
4201 
4202           // Ok, we are done with storeHeap for this row.
4203           // Now we may need to fetch additional, non-essential data into row.
4204           // These values are not needed for the filter to work, so we postpone their
4205           // fetch to (possibly) reduce the amount of data loaded from disk.
4206           if (this.joinedHeap != null) {
4207             KeyValue nextJoinedKv = joinedHeap.peek();
4208             // If joinedHeap is pointing to some other row, try to seek to a correct one.
4209             boolean mayHaveData =
4210               (nextJoinedKv != null && nextJoinedKv.matchingRow(currentRow, offset, length))
4211               || (this.joinedHeap.requestSeek(KeyValue.createFirstOnRow(currentRow, offset, length),
4212                 true, true)
4213                 && joinedHeap.peek() != null
4214                 && joinedHeap.peek().matchingRow(currentRow, offset, length));
4215             if (mayHaveData) {
4216               joinedContinuationRow = current;
4217               populateFromJoinedHeap(results, limit);
4218             }
4219           }
4220         } else {
4221           // Populating from the joined heap was stopped by limits, populate some more.
4222           populateFromJoinedHeap(results, limit);
4223         }
4224 
4225         // We may have just called populateFromJoinedHeap and hit the limits. If that is
4226         // the case, we need to call it again on the next next() invocation.
4227         if (joinedContinuationRow != null) {
4228           return true;
4229         }
4230 
4231         // Finally, we are done with both joinedHeap and storeHeap.
4232         // Double check to prevent empty rows from appearing in result. It could be
4233         // the case when SingleColumnValueExcludeFilter is used.
4234         if (results.isEmpty()) {
4235           boolean moreRows = nextRow(currentRow, offset, length);
4236           if (!moreRows) return false;
4237           if (!stopRow) continue;
4238         }
4239 
4240         // We are done. Return the result.
4241         return !stopRow;
4242       }
4243     }
4244 
4245     /**
4246      * This function maintains backward compatibility for 0.94 filters. HBASE-6429 combines
4247      * the filterRow() and filterRow(List<KeyValue> kvs) functions. Code written for 0.94 or
4248      * older may not implement hasFilterRow() as HBASE-6429 expects, because in 0.94
4249      * hasFilterRow() only returns true when filterRow(List<KeyValue> kvs) is overridden, not
4250      * when filterRow() is; without this check that filterRow() would be skipped.
4251      */
4252     private boolean filterRow() throws IOException {
4253       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
4254       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
4255       return filter != null && (!filter.hasFilterRow())
4256           && filter.filterRow();
4257     }
4258 
4259     private boolean filterRowKey(byte[] row, int offset, short length) throws IOException {
4260       return filter != null
4261           && filter.filterRowKey(row, offset, length);
4262     }
4263 
4264     protected boolean nextRow(byte [] currentRow, int offset, short length) throws IOException {
4265       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
4266       KeyValue next;
4267       while ((next = this.storeHeap.peek()) != null &&
4268              next.matchingRow(currentRow, offset, length)) {
4269         this.storeHeap.next(MOCKED_LIST);
4270       }
4271       resetFilters();
4272       // Call the hook in the CP, which allows it to do a fast forward
4273       return this.region.getCoprocessorHost() == null
4274           || this.region.getCoprocessorHost()
4275               .postScannerFilterRow(this, currentRow, offset, length);
4276     }
4277 
4278     protected boolean isStopRow(byte[] currentRow, int offset, short length) {
4279       return currentRow == null ||
4280           (stopRow != null &&
4281           comparator.compareRows(stopRow, 0, stopRow.length,
4282             currentRow, offset, length) <= isScan);
4283     }
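
    // Added explanatory note (not in the original source): for a plain scan isScan is 0, so the
    // scan stops as soon as compareRows(stopRow, currentRow) <= 0, giving the usual half-open
    // range [startRow, stopRow). For a get-style scan isScan is -1, so the comparison must be
    // strictly negative before we stop; the row equal to stopRow is still returned, which yields
    // the inclusive [startRow, stopRow] behaviour noted in the constructor.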
4284 
4285     @Override
4286     public synchronized void close() {
4287       if (storeHeap != null) {
4288         storeHeap.close();
4289         storeHeap = null;
4290       }
4291       if (joinedHeap != null) {
4292         joinedHeap.close();
4293         joinedHeap = null;
4294       }
4295       // no need to synchronize here.
4296       scannerReadPoints.remove(this);
4297       this.filterClosed = true;
4298     }
4299 
4300     KeyValueHeap getStoreHeapForTesting() {
4301       return storeHeap;
4302     }
4303 
4304     @Override
4305     public synchronized boolean reseek(byte[] row) throws IOException {
4306       if (row == null) {
4307         throw new IllegalArgumentException("Row cannot be null.");
4308       }
4309       boolean result = false;
4310       startRegionOperation();
4311       try {
4312         KeyValue kv = KeyValue.createFirstOnRow(row);
4313         // use request seek to make use of the lazy seek option. See HBASE-5520
4314         result = this.storeHeap.requestSeek(kv, true, true);
4315         if (this.joinedHeap != null) {
4316           result = this.joinedHeap.requestSeek(kv, true, true) || result;
4317         }
4318       } finally {
4319         closeRegionOperation();
4320       }
4321       return result;
4322     }
4323   }
4324 
4325   // Utility methods
4326   /**
4327    * A utility method to create new instances of HRegion based on the
4328    * {@link HConstants#REGION_IMPL} configuration property.
4329    * @param tableDir qualified path of directory where region should be located,
4330    * usually the table directory.
4331    * @param log The HLog is the outbound log for any updates to the HRegion
4332    * (There's a single HLog for all the HRegions on a single HRegionServer.)
4333    * The log file is a logfile from the previous execution that's
4334    * custom-computed for this HRegion. The HRegionServer computes and sorts the
4335    * appropriate log info for this HRegion. If there is a previous log file
4336    * (implying that the HRegion has been written-to before), then read it from
4337    * the supplied path.
4338    * @param fs is the filesystem.
4339    * @param conf is global configuration settings.
4340    * @param regionInfo - HRegionInfo that describes the region
4342    * @param htd the table descriptor
4343    * @param rsServices
4344    * @return the new instance
4345    */
4346   static HRegion newHRegion(Path tableDir, HLog log, FileSystem fs,
4347       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
4348       RegionServerServices rsServices) {
4349     try {
4350       @SuppressWarnings("unchecked")
4351       Class<? extends HRegion> regionClass =
4352           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
4353 
4354       Constructor<? extends HRegion> c =
4355           regionClass.getConstructor(Path.class, HLog.class, FileSystem.class,
4356               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
4357               RegionServerServices.class);
4358 
4359       return c.newInstance(tableDir, log, fs, conf, regionInfo, htd, rsServices);
4360     } catch (Throwable e) {
4361       // todo: what should I throw here?
4362       throw new IllegalStateException("Could not instantiate a region instance.", e);
4363     }
4364   }
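
  // Illustrative sketch (added note, not part of the original source): the REGION_IMPL property
  // lets a deployment substitute its own HRegion subclass; MyCustomRegion below is hypothetical
  // and must expose the (Path, HLog, FileSystem, Configuration, HRegionInfo, HTableDescriptor,
  // RegionServerServices) constructor that newHRegion looks up reflectively.
  //
  //   Configuration conf = HBaseConfiguration.create();
  //   conf.setClass(HConstants.REGION_IMPL, MyCustomRegion.class, HRegion.class);
  //   // subsequent calls to HRegion.newHRegion(...) will construct MyCustomRegion instances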
4365 
4366   /**
4367    * Convenience method creating new HRegions. Used by createTable and by the
4368    * bootstrap code in the HMaster constructor.
4369    * Note, this method creates an {@link HLog} for the created region. It
4370    * needs to be closed explicitly.  Use {@link HRegion#getLog()} to get
4371    * access.  <b>When done with a region created using this method, you will
4372    * need to explicitly close the {@link HLog} it created too; it will not be
4373    * done for you.  Not closing the log will leave at least a daemon thread
4374    * running.</b>  Call {@link #closeHRegion(HRegion)} and it will do
4375    * necessary cleanup for you.
4376    * @param info Info for region to create.
4377    * @param rootDir Root directory for HBase instance
4378    * @param conf
4379    * @param hTableDescriptor
4380    * @return new HRegion
4381    *
4382    * @throws IOException
4383    */
4384   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4385       final Configuration conf, final HTableDescriptor hTableDescriptor)
4386   throws IOException {
4387     return createHRegion(info, rootDir, conf, hTableDescriptor, null);
4388   }
4389 
4390   /**
4391    * This will do the necessary cleanup a call to
4392    * {@link #createHRegion(HRegionInfo, Path, Configuration, HTableDescriptor)}
4393    * requires.  This method will close the region and then close its
4394    * associated {@link HLog} file.  Use it if you called the other createHRegion,
4395    * the one that takes an {@link HLog} instance, but don't be surprised by the
4396    * call to {@link HLog#closeAndDelete()} on the {@link HLog} the
4397    * HRegion was carrying.
4398    * @param r
4399    * @throws IOException
4400    */
4401   public static void closeHRegion(final HRegion r) throws IOException {
4402     if (r == null) return;
4403     r.close();
4404     if (r.getLog() == null) return;
4405     r.getLog().closeAndDelete();
4406   }
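
  // Illustrative sketch (added note, not part of the original source): the create/close pairing
  // described above, as it is typically used from tests and bootstrap code; the variables are
  // assumed to be initialized.
  //
  //   HRegion region = HRegion.createHRegion(info, rootDir, conf, tableDescriptor);
  //   try {
  //     // ... read from or write to the region ...
  //   } finally {
  //     HRegion.closeHRegion(region);   // closes the region and the HLog createHRegion made
  //   }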
4407 
4408   /**
4409    * Convenience method creating new HRegions. Used by createTable.
4410    * The {@link HLog} for the created region needs to be closed explicitly.
4411    * Use {@link HRegion#getLog()} to get access.
4412    *
4413    * @param info Info for region to create.
4414    * @param rootDir Root directory for HBase instance
4415    * @param conf
4416    * @param hTableDescriptor
4417    * @param hlog shared HLog
4418    * @param initialize - true to initialize the region
4419    * @return new HRegion
4420    *
4421    * @throws IOException
4422    */
4423   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4424                                       final Configuration conf,
4425                                       final HTableDescriptor hTableDescriptor,
4426                                       final HLog hlog,
4427                                       final boolean initialize)
4428       throws IOException {
4429     return createHRegion(info, rootDir, conf, hTableDescriptor,
4430         hlog, initialize, false);
4431   }
4432 
4433   /**
4434    * Convenience method creating new HRegions. Used by createTable.
4435    * The {@link HLog} for the created region needs to be closed
4436    * explicitly, if it is not null.
4437    * Use {@link HRegion#getLog()} to get access.
4438    *
4439    * @param info Info for region to create.
4440    * @param rootDir Root directory for HBase instance
4441    * @param conf
4442    * @param hTableDescriptor
4443    * @param hlog shared HLog
4444    * @param initialize - true to initialize the region
4445    * @param ignoreHLog - true to skip generating a new HLog when the passed hlog is null; mostly for createTable
4446    * @return new HRegion
4447    * @throws IOException
4448    */
4449   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4450                                       final Configuration conf,
4451                                       final HTableDescriptor hTableDescriptor,
4452                                       final HLog hlog,
4453                                       final boolean initialize, final boolean ignoreHLog)
4454       throws IOException {
4455       Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
4456       return createHRegion(info, rootDir, tableDir, conf, hTableDescriptor, hlog, initialize, ignoreHLog);
4457   }
4458 
4459   /**
4460    * Convenience method creating new HRegions. Used by createTable.
4461    * The {@link HLog} for the created region needs to be closed
4462    * explicitly, if it is not null.
4463    * Use {@link HRegion#getLog()} to get access.
4464    *
4465    * @param info Info for region to create.
4466    * @param rootDir Root directory for HBase instance
4467    * @param tableDir table directory
4468    * @param conf
4469    * @param hTableDescriptor
4470    * @param hlog shared HLog
4471    * @param initialize - true to initialize the region
4472    * @param ignoreHLog - true to skip generating a new HLog when the passed hlog is null; mostly for createTable
4473    * @return new HRegion
4474    * @throws IOException
4475    */
4476   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir, final Path tableDir,
4477                                       final Configuration conf,
4478                                       final HTableDescriptor hTableDescriptor,
4479                                       final HLog hlog,
4480                                       final boolean initialize, final boolean ignoreHLog)
4481       throws IOException {
4482     LOG.info("creating HRegion " + info.getTable().getNameAsString()
4483         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir +
4484         " Table name == " + info.getTable().getNameAsString());
4485     FileSystem fs = FileSystem.get(conf);
4486     HRegionFileSystem rfs = HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
4487     HLog effectiveHLog = hlog;
4488     if (hlog == null && !ignoreHLog) {
4489       effectiveHLog = HLogFactory.createHLog(fs, rfs.getRegionDir(),
4490                                              HConstants.HREGION_LOGDIR_NAME, conf);
4491     }
4492     HRegion region = HRegion.newHRegion(tableDir,
4493         effectiveHLog, fs, conf, info, hTableDescriptor, null);
4494     if (initialize) {
4495       // If initializing, set the sequenceId. It is also required by HLogPerformanceEvaluation when
4496       // verifying the WALEdits.
4497       region.setSequenceId(region.initialize());
4498     }
4499     return region;
4500   }
4501 
4502   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4503                                       final Configuration conf,
4504                                       final HTableDescriptor hTableDescriptor,
4505                                       final HLog hlog)
4506     throws IOException {
4507     return createHRegion(info, rootDir, conf, hTableDescriptor, hlog, true);
4508   }
4509 
4510 
4511   /**
4512    * Open a Region.
4513    * @param info Info for region to be opened.
4514    * @param wal HLog for region to use. This method will call
4515    * HLog#setSequenceNumber(long) passing the result of the call to
4516    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4517    * up.  The HRegionServer does this every time it opens a new region.
4518    * @param conf
4519    * @return new HRegion
4520    *
4521    * @throws IOException
4522    */
4523   public static HRegion openHRegion(final HRegionInfo info,
4524       final HTableDescriptor htd, final HLog wal,
4525       final Configuration conf)
4526   throws IOException {
4527     return openHRegion(info, htd, wal, conf, null, null);
4528   }
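
  // Illustrative sketch (added note, not part of the original source): opening an existing
  // region against a shared HLog, e.g. from a utility or a test; the variables are assumed to
  // be initialized.
  //
  //   HRegion region = HRegion.openHRegion(regionInfo, tableDescriptor, wal, conf);
  //   try {
  //     // ... issue gets, scans or mutations against the region ...
  //   } finally {
  //     region.close();
  //   }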
4529 
4530   /**
4531    * Open a Region.
4532    * @param info Info for region to be opened
4533    * @param htd the table descriptor
4534    * @param wal HLog for region to use. This method will call
4535    * HLog#setSequenceNumber(long) passing the result of the call to
4536    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4537    * up.  The HRegionServer does this every time it opens a new region.
4538    * @param conf The Configuration object to use.
4539    * @param rsServices An interface we can request flushes against.
4540    * @param reporter An interface we can report progress against.
4541    * @return new HRegion
4542    *
4543    * @throws IOException
4544    */
4545   public static HRegion openHRegion(final HRegionInfo info,
4546     final HTableDescriptor htd, final HLog wal, final Configuration conf,
4547     final RegionServerServices rsServices,
4548     final CancelableProgressable reporter)
4549   throws IOException {
4550     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
4551   }
4552 
4553   /**
4554    * Open a Region.
4555    * @param rootDir Root directory for HBase instance
4556    * @param info Info for region to be opened.
4557    * @param htd the table descriptor
4558    * @param wal HLog for region to use. This method will call
4559    * HLog#setSequenceNumber(long) passing the result of the call to
4560    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4561    * up.  The HRegionServer does this every time it opens a new region.
4562    * @param conf The Configuration object to use.
4563    * @return new HRegion
4564    * @throws IOException
4565    */
4566   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
4567       final HTableDescriptor htd, final HLog wal, final Configuration conf)
4568   throws IOException {
4569     return openHRegion(rootDir, info, htd, wal, conf, null, null);
4570   }
4571 
4572   /**
4573    * Open a Region.
4574    * @param rootDir Root directory for HBase instance
4575    * @param info Info for region to be opened.
4576    * @param htd the table descriptor
4577    * @param wal HLog for region to use. This method will call
4578    * HLog#setSequenceNumber(long) passing the result of the call to
4579    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4580    * up.  The HRegionServer does this every time it opens a new region.
4581    * @param conf The Configuration object to use.
4582    * @param rsServices An interface we can request flushes against.
4583    * @param reporter An interface we can report progress against.
4584    * @return new HRegion
4585    * @throws IOException
4586    */
4587   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
4588       final HTableDescriptor htd, final HLog wal, final Configuration conf,
4589       final RegionServerServices rsServices,
4590       final CancelableProgressable reporter)
4591   throws IOException {
4592     FileSystem fs = null;
4593     if (rsServices != null) {
4594       fs = rsServices.getFileSystem();
4595     }
4596     if (fs == null) {
4597       fs = FileSystem.get(conf);
4598     }
4599     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
4600   }
4601 
4602   /**
4603    * Open a Region.
4604    * @param conf The Configuration object to use.
4605    * @param fs Filesystem to use
4606    * @param rootDir Root directory for HBase instance
4607    * @param info Info for region to be opened.
4608    * @param htd the table descriptor
4609    * @param wal HLog for region to use. This method will call
4610    * HLog#setSequenceNumber(long) passing the result of the call to
4611    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4612    * up.  The HRegionServer does this every time it opens a new region.
4613    * @return new HRegion
4614    * @throws IOException
4615    */
4616   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4617       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal)
4618       throws IOException {
4619     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
4620   }
4621 
4622   /**
4623    * Open a Region.
4624    * @param conf The Configuration object to use.
4625    * @param fs Filesystem to use
4626    * @param rootDir Root directory for HBase instance
4627    * @param info Info for region to be opened.
4628    * @param htd the table descriptor
4629    * @param wal HLog for region to use. This method will call
4630    * HLog#setSequenceNumber(long) passing the result of the call to
4631    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4632    * up.  The HRegionServer does this every time it opens a new region.
4633    * @param rsServices An interface we can request flushes against.
4634    * @param reporter An interface we can report progress against.
4635    * @return new HRegion
4636    * @throws IOException
4637    */
4638   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4639       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal,
4640       final RegionServerServices rsServices, final CancelableProgressable reporter)
4641       throws IOException {
4642     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
4643     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
4644   }
4645 
4646   /**
4647    * Open a Region.
4648    * @param conf The Configuration object to use.
4649    * @param fs Filesystem to use
4650    * @param rootDir Root directory for HBase instance
4651    * @param info Info for region to be opened.
4652    * @param htd the table descriptor
4653    * @param wal HLog for region to use. This method will call
4654    * HLog#setSequenceNumber(long) passing the result of the call to
4655    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4656    * up.  The HRegionServer does this every time it opens a new region.
4657    * @param rsServices An interface we can request flushes against.
4658    * @param reporter An interface we can report progress against.
4659    * @return new HRegion
4660    * @throws IOException
4661    */
4662   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4663       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal,
4664       final RegionServerServices rsServices, final CancelableProgressable reporter)
4665       throws IOException {
4666     if (info == null) throw new NullPointerException("Passed region info is null");
4667     if (LOG.isDebugEnabled()) {
4668       LOG.debug("Opening region: " + info);
4669     }
4670     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
4671     return r.openHRegion(reporter);
4672   }
4673 
4674 
4675   /**
4676    * Useful when reopening a closed region (normally for unit tests)
4677    * @param other original object
4678    * @param reporter An interface we can report progress against.
4679    * @return new HRegion
4680    * @throws IOException
4681    */
4682   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
4683       throws IOException {
4684     HRegionFileSystem regionFs = other.getRegionFileSystem();
4685     HRegion r = newHRegion(regionFs.getTableDir(), other.getLog(), regionFs.getFileSystem(),
4686         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
4687     return r.openHRegion(reporter);
4688   }
4689 
4690   /**
4691    * Open HRegion.
4692    * Calls initialize and sets sequenceid.
4693    * @param reporter
4694    * @return Returns <code>this</code>
4695    * @throws IOException
4696    */
4697   protected HRegion openHRegion(final CancelableProgressable reporter)
4698   throws IOException {
4699     // Refuse to open the region if we are missing local compression support
4700     checkCompressionCodecs();
4701     // Refuse to open the region if encryption configuration is incorrect or
4702     // codec support is missing
4703     checkEncryption();
4704     // Refuse to open the region if a required class cannot be loaded
4705     checkClassLoading();
4706     this.openSeqNum = initialize(reporter);
4707     this.setSequenceId(openSeqNum);
4708     return this;
4709   }
4710 
4711   private void checkCompressionCodecs() throws IOException {
4712     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
4713       CompressionTest.testCompression(fam.getCompression());
4714       CompressionTest.testCompression(fam.getCompactionCompression());
4715     }
4716   }
4717 
4718   private void checkEncryption() throws IOException {
4719     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
4720       EncryptionTest.testEncryption(conf, fam.getEncryptionType(), fam.getEncryptionKey());
4721     }
4722   }
4723 
4724   private void checkClassLoading() throws IOException {
4725     RegionSplitPolicy.getSplitPolicyClass(this.htableDescriptor, conf);
4726     RegionCoprocessorHost.testTableCoprocessorAttrs(conf, this.htableDescriptor);
4727   }
4728 
4729   /**
4730    * Create a daughter region, given a temp directory with the region data.
4731    * @param hri Spec. for daughter region to open.
4732    * @throws IOException
4733    */
4734   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
4735     // Move the files from the temporary .splits to the final /table/region directory
4736     fs.commitDaughterRegion(hri);
4737 
4738     // Create the daughter HRegion instance
4739     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getLog(), fs.getFileSystem(),
4740         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
4741     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
4742     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
4743     return r;
4744   }
4745 
4746   /**
4747    * Create a merged region given a temp directory with the region data.
4748    * @param mergedRegionInfo
4749    * @param region_b another merging region
4750    * @return merged hregion
4751    * @throws IOException
4752    */
4753   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
4754       final HRegion region_b) throws IOException {
4755     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getLog(),
4756         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
4757         this.getTableDesc(), this.rsServices);
4758     r.readRequestsCount.set(this.getReadRequestsCount()
4759         + region_b.getReadRequestsCount());
4760     r.writeRequestsCount.set(this.getWriteRequestsCount()
4761         + region_b.getWriteRequestsCount());
4762 
4763     this.fs.commitMergedRegion(mergedRegionInfo);
4764     return r;
4765   }
4766 
4767   /**
4768    * Inserts a new region's meta information into the passed
4769    * <code>meta</code> region. Used by the HMaster bootstrap code adding
4770    * a new table to the hbase:meta table.
4771    *
4772    * @param meta hbase:meta HRegion to be updated
4773    * @param r HRegion to add to <code>meta</code>
4774    *
4775    * @throws IOException
4776    */
4777   // TODO remove since only test and merge use this
4778   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
4779     meta.checkResources();
4780     // The row key is the region name
4781     byte[] row = r.getRegionName();
4782     final long now = EnvironmentEdgeManager.currentTimeMillis();
4783     final List<Cell> cells = new ArrayList<Cell>(2);
4784     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
4785       HConstants.REGIONINFO_QUALIFIER, now,
4786       r.getRegionInfo().toByteArray()));
4787     // Set into the root table the version of the meta table.
4788     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
4789       HConstants.META_VERSION_QUALIFIER, now,
4790       Bytes.toBytes(HConstants.META_VERSION)));
4791     meta.put(row, HConstants.CATALOG_FAMILY, cells);
4792   }
4793 
4794   /**
4795    * Computes the Path of the HRegion
4796    *
4797    * @param tabledir qualified path for table
4798    * @param name ENCODED region name
4799    * @return Path of HRegion directory
4800    */
4801   @Deprecated
4802   public static Path getRegionDir(final Path tabledir, final String name) {
4803     return new Path(tabledir, name);
4804   }
4805 
4806   /**
4807    * Computes the Path of the HRegion
4808    *
4809    * @param rootdir qualified path of HBase root directory
4810    * @param info HRegionInfo for the region
4811    * @return qualified path of region directory
4812    */
4813   @Deprecated
4814   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
4815     return new Path(
4816       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
4817   }
4818 
4819   /**
4820    * Determines if the specified row is within the row range specified by the
4821    * specified HRegionInfo
4822    *
4823    * @param info HRegionInfo that specifies the row range
4824    * @param row row to be checked
4825    * @return true if the row is within the range specified by the HRegionInfo
4826    */
4827   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
4828     return ((info.getStartKey().length == 0) ||
4829         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
4830         ((info.getEndKey().length == 0) ||
4831             (Bytes.compareTo(info.getEndKey(), row) > 0));
4832   }
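
  // Added worked example (not in the original source): region key ranges are half-open. For a
  // region with startKey = "b" and endKey = "d", rowIsInRange returns true for "b", "bb" and
  // "c" but false for "a" and "d"; an empty startKey or endKey matches the first or last region
  // of the table respectively.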
4833 
4834   /**
4835    * Merge two HRegions.  The regions must be adjacent and must not overlap.
4836    *
4837    * @param srcA
4838    * @param srcB
4839    * @return new merged HRegion
4840    * @throws IOException
4841    */
4842   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
4843   throws IOException {
4844     HRegion a = srcA;
4845     HRegion b = srcB;
4846 
4847     // Make sure that srcA comes first; important for key-ordering during
4848     // write of the merged file.
4849     if (srcA.getStartKey() == null) {
4850       if (srcB.getStartKey() == null) {
4851         throw new IOException("Cannot merge two regions with null start key");
4852       }
4853       // A's start key is null but B's isn't. Assume A comes before B
4854     } else if ((srcB.getStartKey() == null) ||
4855       (Bytes.compareTo(srcA.getStartKey(), srcB.getStartKey()) > 0)) {
4856       a = srcB;
4857       b = srcA;
4858     }
4859 
4860     if (!(Bytes.compareTo(a.getEndKey(), b.getStartKey()) == 0)) {
4861       throw new IOException("Cannot merge non-adjacent regions");
4862     }
4863     return merge(a, b);
4864   }
4865 
4866   /**
4867    * Merge two regions whether they are adjacent or not.
4868    *
4869    * @param a region a
4870    * @param b region b
4871    * @return new merged region
4872    * @throws IOException
4873    */
4874   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
4875     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
4876       throw new IOException("Regions do not belong to the same table");
4877     }
4878 
4879     FileSystem fs = a.getRegionFileSystem().getFileSystem();
4880     // Make sure each region's cache is empty
4881     a.flushcache();
4882     b.flushcache();
4883 
4884     // Compact each region so we only have one store file per family
4885     a.compactStores(true);
4886     if (LOG.isDebugEnabled()) {
4887       LOG.debug("Files for region: " + a);
4888       a.getRegionFileSystem().logFileSystemState(LOG);
4889     }
4890     b.compactStores(true);
4891     if (LOG.isDebugEnabled()) {
4892       LOG.debug("Files for region: " + b);
4893       b.getRegionFileSystem().logFileSystemState(LOG);
4894     }
4895 
4896     RegionMergeTransaction rmt = new RegionMergeTransaction(a, b, true);
4897     if (!rmt.prepare(null)) {
4898       throw new IOException("Unable to merge regions " + a + " and " + b);
4899     }
4900     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
4901     LOG.info("starting merge of regions: " + a + " and " + b
4902         + " into new region " + mergedRegionInfo.getRegionNameAsString()
4903         + " with start key <"
4904         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
4905         + "> and end key <"
4906         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
4907     HRegion dstRegion;
4908     try {
4909       dstRegion = rmt.execute(null, null);
4910     } catch (IOException ioe) {
4911       rmt.rollback(null, null);
4912       throw new IOException("Failed merging region " + a + " and " + b
4913           + ", and successfully rolled back", ioe);
4914     }
4915     dstRegion.compactStores(true);
4916 
4917     if (LOG.isDebugEnabled()) {
4918       LOG.debug("Files for new region");
4919       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
4920     }
4921 
4922     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
4923       throw new IOException("Merged region " + dstRegion
4924           + " still has references after the compaction, is compaction canceled?");
4925     }
4926 
4927     // Archiving the 'A' region
4928     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
4929     // Archiving the 'B' region
4930     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
4931 
4932     LOG.info("merge completed. New region is " + dstRegion);
4933     return dstRegion;
4934   }
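
  // Illustrative sketch (added note, not part of the original source): merging two regions of
  // the same table offline, e.g. from a maintenance tool; both regions must already be open.
  //
  //   HRegion merged = HRegion.mergeAdjacent(regionA, regionB);   // or HRegion.merge(a, b)
  //   // the source regions have been compacted and archived; 'merged' is the new region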
4935 
4936   /**
4937    * @return True if needs a major compaction.
4938    * @throws IOException
4939    */
4940   boolean isMajorCompaction() throws IOException {
4941     for (Store store : this.stores.values()) {
4942       if (store.isMajorCompaction()) {
4943         return true;
4944       }
4945     }
4946     return false;
4947   }
4948 
4949   //
4950   // HBASE-880
4951   //
4952   /**
4953    * @param get get object
4954    * @return result
4955    * @throws IOException read exceptions
4956    */
4957   public Result get(final Get get) throws IOException {
4958     checkRow(get.getRow(), "Get");
4959     // Verify families are all valid
4960     if (get.hasFamilies()) {
4961       for (byte [] family: get.familySet()) {
4962         checkFamily(family);
4963       }
4964     } else { // Adding all families to scanner
4965       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
4966         get.addFamily(family);
4967       }
4968     }
4969     List<Cell> results = get(get, true);
4970     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null);
4971   }
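
  // Illustrative sketch (added note, not part of the original source): a direct server-side Get
  // against this region; the row, family and qualifier names are hypothetical.
  //
  //   Get get = new Get(Bytes.toBytes("row1"));
  //   get.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"));
  //   Result result = region.get(get);
  //   byte[] value = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("q"));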
4972 
4973   /*
4974    * Do a get based on the get parameter.
4975    * @param withCoprocessor invoke coprocessor or not. We don't want to
4976    * always invoke cp for this method.
4977    */
4978   public List<Cell> get(Get get, boolean withCoprocessor)
4979   throws IOException {
4980 
4981     List<Cell> results = new ArrayList<Cell>();
4982 
4983     // pre-get CP hook
4984     if (withCoprocessor && (coprocessorHost != null)) {
4985        if (coprocessorHost.preGet(get, results)) {
4986          return results;
4987        }
4988     }
4989 
4990     Scan scan = new Scan(get);
4991 
4992     RegionScanner scanner = null;
4993     try {
4994       scanner = getScanner(scan);
4995       scanner.next(results);
4996     } finally {
4997       if (scanner != null)
4998         scanner.close();
4999     }
5000 
5001     // post-get CP hook
5002     if (withCoprocessor && (coprocessorHost != null)) {
5003       coprocessorHost.postGet(get, results);
5004     }
5005 
5006     // do after lock
5007     if (this.metricsRegion != null) {
5008       long totalSize = 0L;
5009       if (results != null) {
5010         for (Cell kv:results) {
5011           totalSize += KeyValueUtil.ensureKeyValue(kv).getLength();
5012         }
5013       }
5014       this.metricsRegion.updateGet(totalSize);
5015     }
5016 
5017     return results;
5018   }
5019 
5020   public void mutateRow(RowMutations rm) throws IOException {
5021     // Don't need nonces here - RowMutations only supports puts and deletes
5022     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
5023   }
5024 
5025   /**
5026    * Perform atomic mutations within the region w/o nonces.
5027    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
5028    */
5029   public void mutateRowsWithLocks(Collection<Mutation> mutations,
5030       Collection<byte[]> rowsToLock) throws IOException {
5031     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
5032   }
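
  // Illustrative sketch (added note, not part of the original source): an atomic multi-row
  // mutation within this region; the rows and column names are hypothetical, all rows must
  // belong to this region, and rowsToLock should be sorted (see the overload below).
  //
  //   List<Mutation> mutations = new ArrayList<Mutation>();
  //   mutations.add(new Put(Bytes.toBytes("row1"))
  //       .add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("v1")));
  //   mutations.add(new Delete(Bytes.toBytes("row2")));
  //   region.mutateRowsWithLocks(mutations,
  //       Arrays.asList(Bytes.toBytes("row1"), Bytes.toBytes("row2")));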
5033 
5034   /**
5035    * Perform atomic mutations within the region.
5036    * @param mutations The list of mutations to perform.
5037    * <code>mutations</code> can contain operations for multiple rows.
5038    * Caller has to ensure that all rows are contained in this region.
5039    * @param rowsToLock Rows to lock. If multiple rows are locked, care should be taken
5040    * that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
5041    * @param nonceGroup Optional nonce group of the operation (client Id)
5042    * @param nonce Optional nonce of the operation (unique random id to ensure
5043    * "more idempotence")
5044    * @throws IOException
5045    */
5046   public void mutateRowsWithLocks(Collection<Mutation> mutations,
5047       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
5048     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
5049     processRowsWithLocks(proc, -1, nonceGroup, nonce);
5050   }
5051 
5052   /**
5053    * @return the current load statistics for the region
5054    */
5055   public ClientProtos.RegionLoadStats getRegionStats() {
5056     if (!regionStatsEnabled) {
5057       return null;
5058     }
5059     ClientProtos.RegionLoadStats.Builder stats = ClientProtos.RegionLoadStats.newBuilder();
5060     stats.setMemstoreLoad((int) (Math.min(100, (this.memstoreSize.get() * 100) / this
5061         .memstoreFlushSize)));
5062     stats.setHeapOccupancy((int) (rsServices.getHeapMemoryManager().getHeapOccupancyPercent() * 100));
5063     return stats.build();
5064   }
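
  // Added worked example (not in the original source): with a memstore flush size of 128 MB and
  // 64 MB currently in the memstore, memstoreLoad is min(100, 64 * 100 / 128) = 50, i.e. the
  // region is reported as half way to its next flush.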
5065 
5066   /**
5067    * Performs atomic multiple reads and writes on a given row.
5068    *
5069    * @param processor The object defines the reads and writes to a row.
5070    * @param nonceGroup Optional nonce group of the operation (client Id)
5071    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
5072    */
5073   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
5074       throws IOException {
5075     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
5076   }
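
  // Illustrative sketch (added note, not part of the original source): callers supply a
  // RowProcessor implementation (often via a coprocessor endpoint) that reads the row(s) under
  // lock and emits mutations; MyRowProcessor, MyRequest and MyResponse below are hypothetical.
  //
  //   RowProcessor<MyRequest, MyResponse> processor = new MyRowProcessor(rowKey);
  //   region.processRowsWithLocks(processor, HConstants.NO_NONCE, HConstants.NO_NONCE);
  //   MyResponse result = processor.getResult();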
5077 
5078   /**
5079    * Performs atomic multiple reads and writes on a given row.
5080    *
5081    * @param processor The object defines the reads and writes to a row.
5082    * @param timeout The timeout of the processor.process() execution
5083    *                Use a negative number to switch off the time bound
5084    * @param nonceGroup Optional nonce group of the operation (client Id)
5085    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
5086    */
5087   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
5088       long nonceGroup, long nonce) throws IOException {
5089 
5090     for (byte[] row : processor.getRowsToLock()) {
5091       checkRow(row, "processRowsWithLocks");
5092     }
5093     if (!processor.readOnly()) {
5094       checkReadOnly();
5095     }
5096     checkResources();
5097 
5098     startRegionOperation();
5099     WALEdit walEdit = new WALEdit();
5100 
5101     // 1. Run pre-process hook
5102     try {
5103       processor.preProcess(this, walEdit);
5104     } catch (IOException e) {
5105       closeRegionOperation();
5106       throw e;
5107     }
5108     // Short circuit the read only case
5109     if (processor.readOnly()) {
5110       try {
5111         long now = EnvironmentEdgeManager.currentTimeMillis();
5112         doProcessRowWithTimeout(
5113             processor, now, this, null, null, timeout);
5114         processor.postProcess(this, walEdit, true);
5115       } catch (IOException e) {
5116         throw e;
5117       } finally {
5118         closeRegionOperation();
5119       }
5120       return;
5121     }
5122 
5123     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
5124     boolean locked = false;
5125     boolean walSyncSuccessful = false;
5126     List<RowLock> acquiredRowLocks = null;
5127     long addedSize = 0;
5128     List<Mutation> mutations = new ArrayList<Mutation>();
5129     Collection<byte[]> rowsToLock = processor.getRowsToLock();
5130     try {
5131       // 2. Acquire the row lock(s)
5132       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
5133       for (byte[] row : rowsToLock) {
5134         // Attempt to lock all involved rows, throw if any lock times out
5135         acquiredRowLocks.add(getRowLock(row));
5136       }
5137       // 3. Region lock
5138       lock(this.updatesLock.readLock(), acquiredRowLocks.size() == 0 ? 1 : acquiredRowLocks.size());
5139       locked = true;
5140 
5141       long now = EnvironmentEdgeManager.currentTimeMillis();
5142       try {
5143         // 4. Let the processor scan the rows, generate mutations and add
5144         //    waledits
5145         doProcessRowWithTimeout(
5146             processor, now, this, mutations, walEdit, timeout);
5147 
5148         if (!mutations.isEmpty()) {
5149           // 5. Get a mvcc write number
5150           writeEntry = mvcc.beginMemstoreInsert();
5151           // 6. Call the preBatchMutate hook
5152           processor.preBatchMutate(this, walEdit);
5153           // 7. Apply to memstore
5154           for (Mutation m : mutations) {
5155             // Handle any tag based cell features
5156             rewriteCellTags(m.getFamilyCellMap(), m);
5157 
5158             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
5159               KeyValue kv = KeyValueUtil.ensureKeyValue(cellScanner.current());
5160               kv.setMvccVersion(writeEntry.getWriteNumber());
5161               byte[] family = kv.getFamily();
5162               checkFamily(family);
5163               addedSize += stores.get(family).add(kv);
5164             }
5165           }
5166 
5167           long txid = 0;
5168           // 8. Append no sync
5169           if (!walEdit.isEmpty()) {
5170             txid = this.log.appendNoSync(this.getRegionInfo(),
5171               this.htableDescriptor.getTableName(), walEdit, processor.getClusterIds(), now,
5172               this.htableDescriptor, this.sequenceId, true, nonceGroup, nonce);
5173           }
5174           // 9. Release region lock
5175           if (locked) {
5176             this.updatesLock.readLock().unlock();
5177             locked = false;
5178           }
5179 
5180           // 10. Release row lock(s)
5181           releaseRowLocks(acquiredRowLocks);
5182 
5183           // 11. Sync edit log
5184           if (txid != 0) {
5185             syncOrDefer(txid, getEffectiveDurability(processor.useDurability()));
5186           }
5187           walSyncSuccessful = true;
5188           // 12. call postBatchMutate hook
5189           processor.postBatchMutate(this);
5190         }
5191       } finally {
5192         if (!mutations.isEmpty() && !walSyncSuccessful) {
5193           LOG.warn("Wal sync failed. Roll back " + mutations.size() +
5194               " memstore keyvalues for row(s):" +
5195               Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) + "...");
5196           for (Mutation m : mutations) {
5197             for (CellScanner cellScanner = m.cellScanner(); cellScanner.advance();) {
5198               KeyValue kv = KeyValueUtil.ensureKeyValue(cellScanner.current());
5199               stores.get(kv.getFamily()).rollback(kv);
5200             }
5201           }
5202         }
5203         // 13. Roll mvcc forward
5204         if (writeEntry != null) {
5205           mvcc.completeMemstoreInsert(writeEntry);
5206           writeEntry = null;
5207         }
5208         if (locked) {
5209           this.updatesLock.readLock().unlock();
5210           locked = false;
5211         }
5212         // release locks if some were acquired but another timed out
5213         releaseRowLocks(acquiredRowLocks);
5214       }
5215 
5216       // 14. Run post-process hook
5217       processor.postProcess(this, walEdit, walSyncSuccessful);
5218 
5219     } catch (IOException e) {
5220       throw e;
5221     } finally {
5222       closeRegionOperation();
5223       if (!mutations.isEmpty() &&
5224           isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) {
5225         requestFlush();
5226       }
5227     }
5228   }
5229 
5230   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
5231                                        final long now,
5232                                        final HRegion region,
5233                                        final List<Mutation> mutations,
5234                                        final WALEdit walEdit,
5235                                        final long timeout) throws IOException {
5236     // Short circuit the no time bound case.
5237     if (timeout < 0) {
5238       try {
5239         processor.process(now, region, mutations, walEdit);
5240       } catch (IOException e) {
5241         LOG.warn("RowProcessor:" + processor.getClass().getName() +
5242             " throws Exception on row(s):" +
5243             Bytes.toStringBinary(
5244               processor.getRowsToLock().iterator().next()) + "...", e);
5245         throw e;
5246       }
5247       return;
5248     }
5249 
5250     // Case with time bound
5251     FutureTask<Void> task =
5252       new FutureTask<Void>(new Callable<Void>() {
5253         @Override
5254         public Void call() throws IOException {
5255           try {
5256             processor.process(now, region, mutations, walEdit);
5257             return null;
5258           } catch (IOException e) {
5259             LOG.warn("RowProcessor:" + processor.getClass().getName() +
5260                 " throws Exception on row(s):" +
5261                 Bytes.toStringBinary(
5262                     processor.getRowsToLock().iterator().next()) + "...", e);
5263             throw e;
5264           }
5265         }
5266       });
5267     rowProcessorExecutor.execute(task);
5268     try {
5269       task.get(timeout, TimeUnit.MILLISECONDS);
5270     } catch (TimeoutException te) {
5271       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
5272           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
5273           "...");
5274       throw new IOException(te);
5275     } catch (Exception e) {
5276       throw new IOException(e);
5277     }
5278   }
5279 
5280   public Result append(Append append) throws IOException {
5281     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
5282   }
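
  // Illustrative sketch (added note, not part of the original source): appending bytes to an
  // existing column value in this region; the row, family and qualifier names are hypothetical.
  //
  //   Append append = new Append(Bytes.toBytes("row1"));
  //   append.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("-suffix"));
  //   Result result = region.append(append);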
5283 
5284   // TODO: There's a lot of boiler plate code identical to increment.
5285   // We should refactor append and increment as local get-mutate-put
5286   // transactions, so all stores only go through one code path for puts.
5287   /**
5288    * Perform one or more append operations on a row.
5289    *
5290    * @param append
5291    * @return new keyvalues after the append operation
5292    * @throws IOException
5293    */
5294   public Result append(Append append, long nonceGroup, long nonce)
5295       throws IOException {
5296     byte[] row = append.getRow();
5297     checkRow(row, "append");
5298     boolean flush = false;
5299     Durability durability = getEffectiveDurability(append.getDurability());
5300     boolean writeToWAL = durability != Durability.SKIP_WAL;
5301     WALEdit walEdits = null;
5302     List<Cell> allKVs = new ArrayList<Cell>(append.size());
5303     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
5304 
5305     long size = 0;
5306     long txid = 0;
5307 
5308     checkReadOnly();
5309     checkResources();
5310     // Lock row
5311     startRegionOperation(Operation.APPEND);
5312     this.writeRequestsCount.increment();
5313     WriteEntry w = null;
5314     RowLock rowLock;
5315     try {
5316       rowLock = getRowLock(row);
5317       try {
5318         lock(this.updatesLock.readLock());
5319         try {
5320           // wait for all prior MVCC transactions to finish - while we hold the row lock
5321           // (so that we are guaranteed to see the latest state)
5322           mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
5323           if (this.coprocessorHost != null) {
5324             Result r = this.coprocessorHost.preAppendAfterRowLock(append);
5325             if (r != null) {
5326               return r;
5327             }
5328           }
5329           // now start my own transaction
5330           w = mvcc.beginMemstoreInsert();
5331           long now = EnvironmentEdgeManager.currentTimeMillis();
5332           // Process each family
5333           for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) {
5334 
5335             Store store = stores.get(family.getKey());
5336             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
5337 
5338             // Sort the cells so that they match the order that they
5339             // appear in the Get results. Otherwise, we won't be able to
5340             // find the existing values if the cells are not specified
5341             // in order by the client since cells are in an array list.
5342             Collections.sort(family.getValue(), store.getComparator());
5343             // Get previous values for all columns in this family
5344             Get get = new Get(row);
5345             for (Cell cell : family.getValue()) {
5346               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5347               get.addColumn(family.getKey(), kv.getQualifier());
5348             }
5349             List<Cell> results = get(get, false);
5350 
5351             // Iterate the input columns and update existing values if they were
5352             // found, otherwise add new column initialized to the append value
5353 
5354             // Avoid as much copying as possible. We may need to rewrite and
5355             // consolidate tags. Bytes are only copied once.
5356             // Would be nice if KeyValue had scatter/gather logic
5357             int idx = 0;
5358             for (Cell cell : family.getValue()) {
5359               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5360               KeyValue newKv;
5361               KeyValue oldKv = null;
5362               if (idx < results.size()
5363                   && CellUtil.matchingQualifier(results.get(idx), kv)) {
5364                 oldKv = KeyValueUtil.ensureKeyValue(results.get(idx));
5365                 long ts = Math.max(now, oldKv.getTimestamp());
5366 
5367                 // Process cell tags
5368                 List<Tag> newTags = new ArrayList<Tag>();
5369 
5370                 // Make a union of the set of tags in the old and new KVs
5371 
5372                 if (oldKv.getTagsLengthUnsigned() > 0) {
5373                   Iterator<Tag> i = CellUtil.tagsIterator(oldKv.getTagsArray(),
5374                     oldKv.getTagsOffset(), oldKv.getTagsLengthUnsigned());
5375                   while (i.hasNext()) {
5376                     newTags.add(i.next());
5377                   }
5378                 }
5379                 if (kv.getTagsLengthUnsigned() > 0) {
5380                   Iterator<Tag> i  = CellUtil.tagsIterator(kv.getTagsArray(), kv.getTagsOffset(),
5381                     kv.getTagsLengthUnsigned());
5382                   while (i.hasNext()) {
5383                     newTags.add(i.next());
5384                   }
5385                 }
5386 
5387                 // Cell TTL handling
5388 
5389                 if (append.getTTL() != Long.MAX_VALUE) {
5390                   // Add the new TTL tag
5391                   newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(append.getTTL())));
5392                 }
5393 
5394                 // Rebuild tags
5395                 byte[] tagBytes = Tag.fromList(newTags);
5396 
5397                 // allocate a new KeyValue sized to hold the merged old+new value and the rebuilt tags
5398                 newKv = new KeyValue(row.length, kv.getFamilyLength(),
5399                     kv.getQualifierLength(), ts, KeyValue.Type.Put,
5400                     oldKv.getValueLength() + kv.getValueLength(),
5401                     tagBytes.length);
5402                 // copy in row, family, and qualifier
5403                 System.arraycopy(kv.getRowArray(), kv.getRowOffset(),
5404                   newKv.getRowArray(), newKv.getRowOffset(), kv.getRowLength());
5405                 System.arraycopy(kv.getFamilyArray(), kv.getFamilyOffset(),
5406                   newKv.getFamilyArray(), newKv.getFamilyOffset(),
5407                   kv.getFamilyLength());
5408                 System.arraycopy(kv.getQualifierArray(), kv.getQualifierOffset(),
5409                   newKv.getQualifierArray(), newKv.getQualifierOffset(),
5410                   kv.getQualifierLength());
5411                 // copy in the value
5412                 System.arraycopy(oldKv.getValueArray(), oldKv.getValueOffset(),
5413                   newKv.getValueArray(), newKv.getValueOffset(),
5414                   oldKv.getValueLength());
5415                 System.arraycopy(kv.getValueArray(), kv.getValueOffset(),
5416                   newKv.getValueArray(),
5417                   newKv.getValueOffset() + oldKv.getValueLength(),
5418                   kv.getValueLength());
5419                 // Copy in tag data
5420                 System.arraycopy(tagBytes, 0, newKv.getTagsArray(), newKv.getTagsOffset(),
5421                   tagBytes.length);
5422                 idx++;
5423               } else {
5424                 // Append's KeyValue.Type==Put and ts==HConstants.LATEST_TIMESTAMP,
5425                 // so only need to update the timestamp to 'now'
5426                 kv.updateLatestStamp(Bytes.toBytes(now));
5427 
5428                 // Cell TTL handling
5429 
5430                 if (append.getTTL() != Long.MAX_VALUE) {
5431                   List<Tag> newTags = new ArrayList<Tag>(1);
5432                   newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(append.getTTL())));
5433                   // Add the new TTL tag
5434                   newKv = new KeyValue(kv.getRowArray(), kv.getRowOffset(), kv.getRowLength(),
5435                     kv.getFamilyArray(), kv.getFamilyOffset(), kv.getFamilyLength(),
5436                     kv.getQualifierArray(), kv.getQualifierOffset(), kv.getQualifierLength(),
5437                     kv.getTimestamp(), KeyValue.Type.codeToType(kv.getTypeByte()),
5438                     kv.getValueArray(), kv.getValueOffset(), kv.getValueLength(),
5439                     newTags);
5440                 } else {
5441                   newKv = kv;
5442                 }
5443               }
5444               newKv.setMvccVersion(w.getWriteNumber());
5445 
5446               // Give coprocessors a chance to update the new cell
5447               if (coprocessorHost != null) {
5448                 newKv = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL(
5449                     RegionObserver.MutationType.APPEND, append, oldKv, (Cell) newKv));
5450               }
5451               kvs.add(newKv);
5452 
5453               // Append update to WAL
5454               if (writeToWAL) {
5455                 if (walEdits == null) {
5456                   walEdits = new WALEdit();
5457                 }
5458                 walEdits.add(newKv);
5459               }
5460             }
5461 
5462             //store the kvs to the temporary memstore before writing HLog
5463             tempMemstore.put(store, kvs);
5464           }
5465 
5466           // Actually write to WAL now
5467           if (writeToWAL) {
5468             // Using default cluster id, as this can only happen in the originating
5469             // cluster. A slave cluster receives the final value (not the delta)
5470             // as a Put.
5471             txid = this.log.appendNoSync(this.getRegionInfo(),
5472               this.htableDescriptor.getTableName(), walEdits, new ArrayList<UUID>(),
5473               EnvironmentEdgeManager.currentTimeMillis(), this.htableDescriptor, this.sequenceId,
5474               true, nonceGroup, nonce);
5475           } else {
5476             recordMutationWithoutWal(append.getFamilyCellMap());
5477           }
5478 
5479           //Actually write to Memstore now
5480           for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
5481             Store store = entry.getKey();
5482             if (store.getFamily().getMaxVersions() == 1) {
5483               // upsert if VERSIONS for this CF == 1
5484               size += store.upsert(entry.getValue(), getSmallestReadPoint());
5485             } else {
5486               // otherwise keep older versions around
5487               for (Cell cell: entry.getValue()) {
5488                 KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5489                 size += store.add(kv);
5490               }
5491             }
5492             allKVs.addAll(entry.getValue());
5493           }
5494           size = this.addAndGetGlobalMemstoreSize(size);
5495           flush = isFlushSize(size);
5496         } finally {
5497           this.updatesLock.readLock().unlock();
5498         }
5499       } finally {
5500         rowLock.release();
5501       }
5502       if (writeToWAL) {
5503         // sync the transaction log outside the rowlock
5504         syncOrDefer(txid, durability);
5505       }
5506     } finally {
5507       if (w != null) {
5508         mvcc.completeMemstoreInsert(w);
5509       }
5510       closeRegionOperation(Operation.APPEND);
5511     }
5512 
5513     if (this.metricsRegion != null) {
5514       this.metricsRegion.updateAppend();
5515     }
5516 
5517     if (flush) {
5518       // Request a cache flush. Do it outside update lock.
5519       requestFlush();
5520     }
5521 
5522 
5523     return append.isReturnResults() ? Result.create(allKVs) : null;
5524   }
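       /*
        * A minimal client-side sketch of the round trip that ends up in append() above.
        * The table handle, row, family, qualifier, and value are assumptions for
        * illustration only.
        *
        *   HTableInterface table = ...;                         // obtained from the client connection
        *   Append append = new Append(Bytes.toBytes("row1"));   // hypothetical row key
        *   append.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("-suffix"));
        *   Result result = table.append(append);                // contains the new cell values
        */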
5525 
5526   public Result increment(Increment increment) throws IOException {
5527     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
5528   }
5529 
5530   // TODO: There's a lot of boilerplate code identical to append.
5531   // We should refactor append and increment as local get-mutate-put
5532   // transactions, so all stores only go through one code path for puts.
5533   /**
5534    * Perform one or more increment operations on a row.
5535    * @param increment the increment operation to apply
5536    * @return new keyvalues after increment
5537    * @throws IOException
5538    */
5539   public Result increment(Increment increment, long nonceGroup, long nonce)
5540   throws IOException {
5541     byte [] row = increment.getRow();
5542     checkRow(row, "increment");
5543     TimeRange tr = increment.getTimeRange();
5544     boolean flush = false;
5545     Durability durability = getEffectiveDurability(increment.getDurability());
5546     boolean writeToWAL = durability != Durability.SKIP_WAL;
5547     WALEdit walEdits = null;
5548     List<Cell> allKVs = new ArrayList<Cell>(increment.size());
5549     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
5550 
5551     long size = 0;
5552     long txid = 0;
5553 
5554     checkReadOnly();
5555     checkResources();
5556     // Lock row
5557     startRegionOperation(Operation.INCREMENT);
5558     this.writeRequestsCount.increment();
5559     WriteEntry w = null;
5560     try {
5561       RowLock rowLock = getRowLock(row);
5562       try {
5563         lock(this.updatesLock.readLock());
5564         try {
5565           // wait for all prior MVCC transactions to finish - while we hold the row lock
5566           // (so that we are guaranteed to see the latest state)
5567           mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
5568           if (this.coprocessorHost != null) {
5569             Result r = this.coprocessorHost.preIncrementAfterRowLock(increment);
5570             if (r != null) {
5571               return r;
5572             }
5573           }
5574           // now start my own transaction
5575           w = mvcc.beginMemstoreInsert();
5576           long now = EnvironmentEdgeManager.currentTimeMillis();
5577           // Process each family
5578           for (Map.Entry<byte [], List<Cell>> family:
5579               increment.getFamilyCellMap().entrySet()) {
5580 
5581             Store store = stores.get(family.getKey());
5582             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
5583 
5584             // Sort the cells so that they match the order that they
5585             // appear in the Get results. Otherwise, we won't be able to
5586             // find the existing values if the cells are not specified
5587             // in order by the client since cells are in an array list.
5588             Collections.sort(family.getValue(), store.getComparator());
5589             // Get previous values for all columns in this family
5590             Get get = new Get(row);
5591             for (Cell cell: family.getValue()) {
5592               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5593               get.addColumn(family.getKey(), kv.getQualifier());
5594             }
5595             get.setTimeRange(tr.getMin(), tr.getMax());
5596             List<Cell> results = get(get, false);
5597 
5598             // Iterate the input columns and update existing values if they were
5599             // found, otherwise add new column initialized to the increment amount
5600             int idx = 0;
5601             for (Cell cell: family.getValue()) {
5602               long amount = Bytes.toLong(CellUtil.cloneValue(cell));
5603               boolean noWriteBack = (amount == 0);
5604               List<Tag> newTags = new ArrayList<Tag>();
5605 
5606               // Carry forward any tags that might have been added by a coprocessor
5607               if (cell.getTagsLengthUnsigned() > 0) {
5608                 Iterator<Tag> i = CellUtil.tagsIterator(cell.getTagsArray(),
5609                   cell.getTagsOffset(), cell.getTagsLengthUnsigned());
5610                 while (i.hasNext()) {
5611                   newTags.add(i.next());
5612                 }
5613               }
5614 
5615               Cell c = null;
5616               long ts = now;
5617               if (idx < results.size() && CellUtil.matchingQualifier(results.get(idx), cell)) {
5618                 c = results.get(idx);
5619                 ts = Math.max(now, c.getTimestamp());
5620                 if (c.getValueLength() == Bytes.SIZEOF_LONG) {
5621                   amount += Bytes.toLong(c.getValueArray(), c.getValueOffset(), Bytes.SIZEOF_LONG);
5622                 } else {
5623                   // throw DoNotRetryIOException instead of IllegalArgumentException
5624                   throw new org.apache.hadoop.hbase.DoNotRetryIOException(
5625                       "Attempted to increment field that isn't 64 bits wide");
5626                 }
5627                 // Carry tags forward from previous version
5628                 if (c.getTagsLength() > 0) {
5629                   Iterator<Tag> i = CellUtil.tagsIterator(c.getTagsArray(),
5630                     c.getTagsOffset(), c.getTagsLength());
5631                   while (i.hasNext()) {
5632                     newTags.add(i.next());
5633                   }
5634                 }
5635                 idx++;
5636               }
5637 
5638               // Append new incremented KeyValue to list
5639               byte[] q = CellUtil.cloneQualifier(cell);
5640               byte[] val = Bytes.toBytes(amount);
5641 
5642               // Add the TTL tag if the mutation carried one
5643               if (increment.getTTL() != Long.MAX_VALUE) {
5644                 newTags.add(new Tag(TagType.TTL_TAG_TYPE, Bytes.toBytes(increment.getTTL())));
5645               }
5646 
5647               KeyValue newKv = new KeyValue(row, 0, row.length,
5648                 family.getKey(), 0, family.getKey().length,
5649                 q, 0, q.length,
5650                 ts,
5651                 KeyValue.Type.Put,
5652                 val, 0, val.length,
5653                 newTags);
5654 
5655               newKv.setMvccVersion(w.getWriteNumber());
5656 
5657               // Give coprocessors a chance to update the new cell
5658               if (coprocessorHost != null) {
5659                 newKv = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL(
5660                     RegionObserver.MutationType.INCREMENT, increment, c, (Cell) newKv));
5661               }
5662               allKVs.add(newKv);
5663 
5664               if (!noWriteBack) {
5665                 kvs.add(newKv);
5666 
5667                 // Prepare WAL updates
5668                 if (writeToWAL) {
5669                   if (walEdits == null) {
5670                     walEdits = new WALEdit();
5671                   }
5672                   walEdits.add(newKv);
5673                 }
5674               }
5675             }
5676 
5677             //store the kvs to the temporary memstore before writing HLog
5678             if (!kvs.isEmpty()) {
5679               tempMemstore.put(store, kvs);
5680             }
5681           }
5682 
5683           // Actually write to WAL now
5684           if (walEdits != null && !walEdits.isEmpty()) {
5685             if (writeToWAL) {
5686               // Using default cluster id, as this can only happen in the originating
5687               // cluster. A slave cluster receives the final value (not the delta)
5688               // as a Put.
5689               txid = this.log.appendNoSync(this.getRegionInfo(),
5690                   this.htableDescriptor.getTableName(), walEdits, new ArrayList<UUID>(),
5691                   EnvironmentEdgeManager.currentTimeMillis(), this.htableDescriptor, this.sequenceId,
5692                   true, nonceGroup, nonce);
5693             } else {
5694               recordMutationWithoutWal(increment.getFamilyCellMap());
5695             }
5696           }
5697           //Actually write to Memstore now
5698           if (!tempMemstore.isEmpty()) {
5699             for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
5700               Store store = entry.getKey();
5701               if (store.getFamily().getMaxVersions() == 1) {
5702                 // upsert if VERSIONS for this CF == 1
5703                 size += store.upsert(entry.getValue(), getSmallestReadPoint());
5704               } else {
5705                 // otherwise keep older versions around
5706                 for (Cell cell : entry.getValue()) {
5707                   KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5708                   size += store.add(kv);
5709                 }
5710               }
5711             }
5712             size = this.addAndGetGlobalMemstoreSize(size);
5713             flush = isFlushSize(size);
5714           }
5715         } finally {
5716           this.updatesLock.readLock().unlock();
5717         }
5718       } finally {
5719         rowLock.release();
5720       }
5721       if (writeToWAL && (walEdits != null) && !walEdits.isEmpty()) {
5722         // sync the transaction log outside the rowlock
5723         syncOrDefer(txid, durability);
5724       }
5725     } finally {
5726       if (w != null) {
5727         mvcc.completeMemstoreInsert(w);
5728       }
5729       closeRegionOperation(Operation.INCREMENT);
5730       if (this.metricsRegion != null) {
5731         this.metricsRegion.updateIncrement();
5732       }
5733     }
5734 
5735     if (flush) {
5736       // Request a cache flush.  Do it outside update lock.
5737       requestFlush();
5738     }
5739 
5740     return Result.create(allKVs);
5741   }
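       /*
        * A minimal client-side sketch of the round trip that ends up in increment() above.
        * The table handle, row, family, qualifier, and delta are assumptions for
        * illustration only.
        *
        *   HTableInterface table = ...;                              // obtained from the client connection
        *   Increment inc = new Increment(Bytes.toBytes("row1"));     // hypothetical row key
        *   inc.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("ctr"), 1L);
        *   Result result = table.increment(inc);
        *   long newValue = Bytes.toLong(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("ctr")));
        */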
5742 
5743   //
5744   // New HBASE-880 Helpers
5745   //
5746 
5747   private void checkFamily(final byte [] family)
5748   throws NoSuchColumnFamilyException {
5749     if (!this.htableDescriptor.hasFamily(family)) {
5750       throw new NoSuchColumnFamilyException("Column family " +
5751           Bytes.toString(family) + " does not exist in region " + this
5752           + " in table " + this.htableDescriptor);
5753     }
5754   }
5755 
5756   public static final long FIXED_OVERHEAD = ClassSize.align(
5757       ClassSize.OBJECT +
5758       ClassSize.ARRAY +
5759       42 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT +
5760       (12 * Bytes.SIZEOF_LONG) +
5761       4 * Bytes.SIZEOF_BOOLEAN);
5762 
5763   // woefully out of date - currently missing:
5764   // 1 x HashMap - coprocessorServiceHandlers
5765   // 6 org.cliffc.high_scale_lib.Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
5766   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
5767   //   writeRequestsCount, updatesBlockedMs
5768   // 1 x HRegion$WriteState - writestate
5769   // 1 x RegionCoprocessorHost - coprocessorHost
5770   // 1 x RegionSplitPolicy - splitPolicy
5771   // 1 x MetricsRegion - metricsRegion
5772   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
5773   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
5774       ClassSize.OBJECT + // closeLock
5775       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
5776       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
5777       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
5778       WriteState.HEAP_SIZE + // writestate
5779       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
5780       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
5781       ClassSize.ARRAYLIST + // recentFlushes
5782       MultiVersionConsistencyControl.FIXED_SIZE // mvcc
5783       + ClassSize.TREEMAP // maxSeqIdInStores
5784       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
5785       ;
5786 
5787   @Override
5788   public long heapSize() {
5789     long heapSize = DEEP_OVERHEAD;
5790     for (Store store : this.stores.values()) {
5791       heapSize += store.heapSize();
5792     }
5793     // this does not take into account row locks, recent flushes, mvcc entries, and more
5794     return heapSize;
5795   }
5796 
5797   /*
5798    * This method calls System.exit.
5799    * @param message Message to print out.  May be null.
5800    */
5801   private static void printUsageAndExit(final String message) {
5802     if (message != null && message.length() > 0) System.out.println(message);
5803     System.out.println("Usage: HRegion CATLALOG_TABLE_DIR [major_compact]");
5804     System.out.println("Options:");
5805     System.out.println(" major_compact  Pass this option to major compact " +
5806       "passed region.");
5807     System.out.println("Default outputs scan of passed region.");
5808     System.exit(1);
5809   }
5810 
5811   /**
5812    * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to
5813    * be available for handling
5814    * {@link HRegion#execService(com.google.protobuf.RpcController,
5815    *    org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall)} calls.
5816    *
5817    * <p>
5818    * Only a single instance may be registered per region for a given {@link Service} subclass (the
5819    * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}).
5820    * After the first registration, subsequent calls with the same service name will fail with
5821    * a return value of {@code false}.
5822    * </p>
5823    * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint
5824    * @return {@code true} if the registration was successful, {@code false}
5825    * otherwise
5826    */
5827   public boolean registerService(Service instance) {
5828     /*
5829      * No stacking of instances is allowed for a single service name
5830      */
5831     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
5832     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
5833       LOG.error("Coprocessor service "+serviceDesc.getFullName()+
5834           " already registered, rejecting request from "+instance
5835       );
5836       return false;
5837     }
5838 
5839     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
5840     if (LOG.isDebugEnabled()) {
5841       LOG.debug("Registered coprocessor service: region="+
5842           Bytes.toStringBinary(getRegionName())+" service="+serviceDesc.getFullName());
5843     }
5844     return true;
5845   }
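       /*
        * Illustrative sketch of the registration contract documented above: only the first
        * registration for a given service name succeeds. The endpoint below is a
        * hypothetical com.google.protobuf.Service implementation.
        *
        *   Service endpoint = ...;                        // e.g. a generated protobuf Service subclass
        *   boolean first  = registerService(endpoint);    // true: registered
        *   boolean second = registerService(endpoint);    // false: name already registered
        */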
5846 
5847   /**
5848    * Executes a single protocol buffer coprocessor endpoint {@link Service} method using
5849    * the registered protocol handlers.  {@link Service} implementations must be registered via the
5850    * {@link HRegion#registerService(com.google.protobuf.Service)}
5851    * method before they are available.
5852    *
5853    * @param controller an {@code RpcController} implementation to pass to the invoked service
5854    * @param call a {@code CoprocessorServiceCall} instance identifying the service, method,
5855    *     and parameters for the method invocation
5856    * @return a protocol buffer {@code Message} instance containing the method's result
5857    * @throws IOException if no registered service handler is found or an error
5858    *     occurs during the invocation
5859    * @see org.apache.hadoop.hbase.regionserver.HRegion#registerService(com.google.protobuf.Service)
5860    */
5861   public Message execService(RpcController controller, CoprocessorServiceCall call)
5862       throws IOException {
5863     String serviceName = call.getServiceName();
5864     String methodName = call.getMethodName();
5865     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
5866       throw new UnknownProtocolException(null,
5867           "No registered coprocessor service found for name "+serviceName+
5868           " in region "+Bytes.toStringBinary(getRegionName()));
5869     }
5870 
5871     Service service = coprocessorServiceHandlers.get(serviceName);
5872     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
5873     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
5874     if (methodDesc == null) {
5875       throw new UnknownProtocolException(service.getClass(),
5876           "Unknown method "+methodName+" called on service "+serviceName+
5877               " in region "+Bytes.toStringBinary(getRegionName()));
5878     }
5879 
5880     Message request = service.getRequestPrototype(methodDesc).newBuilderForType()
5881         .mergeFrom(call.getRequest()).build();
5882 
5883     if (coprocessorHost != null) {
5884       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
5885     }
5886 
5887     final Message.Builder responseBuilder =
5888         service.getResponsePrototype(methodDesc).newBuilderForType();
5889     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
5890       @Override
5891       public void run(Message message) {
5892         if (message != null) {
5893           responseBuilder.mergeFrom(message);
5894         }
5895       }
5896     });
5897 
5898     if (coprocessorHost != null) {
5899       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
5900     }
5901 
5902     return responseBuilder.build();
5903   }
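       /*
        * An illustrative client-side sketch of invoking an endpoint that execService()
        * dispatches to. PingService, PingRequest, and PingResponse are hypothetical
        * generated protobuf classes; the row key only selects the region the call is routed to.
        *
        *   CoprocessorRpcChannel channel = table.coprocessorService(Bytes.toBytes("row1"));
        *   PingService.BlockingInterface stub = PingService.newBlockingStub(channel);
        *   PingResponse response = stub.ping(null, PingRequest.getDefaultInstance());
        */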
5904 
5905   /*
5906    * Process table.
5907    * Do major compaction or list content.
5908    * @param fs
5909    * @param p
5910    * @param log
5911    * @param c
5912    * @param majorCompact
5913    * @throws IOException
5914    */
5915   private static void processTable(final FileSystem fs, final Path p,
5916       final HLog log, final Configuration c,
5917       final boolean majorCompact)
5918   throws IOException {
5919     HRegion region = null;
5920     FSTableDescriptors fst = new FSTableDescriptors(c);
5921     // Currently expects tables have one region only.
5922     if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) {
5923       region = HRegion.newHRegion(p, log, fs, c,
5924         HRegionInfo.FIRST_META_REGIONINFO, fst.get(TableName.META_TABLE_NAME), null);
5925     } else {
5926       throw new IOException("Not a known catalog table: " + p.toString());
5927     }
5928     try {
5929       region.initialize();
5930       if (majorCompact) {
5931         region.compactStores(true);
5932       } else {
5933         // Default behavior
5934         Scan scan = new Scan();
5935         // scan.addFamily(HConstants.CATALOG_FAMILY);
5936         RegionScanner scanner = region.getScanner(scan);
5937         try {
5938           List<Cell> kvs = new ArrayList<Cell>();
5939           boolean done;
5940           do {
5941             kvs.clear();
5942             done = scanner.next(kvs);
5943             if (kvs.size() > 0) LOG.info(kvs);
5944           } while (done);
5945         } finally {
5946           scanner.close();
5947         }
5948       }
5949     } finally {
5950       region.close();
5951     }
5952   }
5953 
5954   boolean shouldForceSplit() {
5955     return this.splitRequest;
5956   }
5957 
5958   byte[] getExplicitSplitPoint() {
5959     return this.explicitSplitPoint;
5960   }
5961 
5962   void forceSplit(byte[] sp) {
5963     // This HRegion will go away after the forced split is successful
5964     // But if a forced split fails, we need to clear forced split.
5965     this.splitRequest = true;
5966     if (sp != null) {
5967       this.explicitSplitPoint = sp;
5968     }
5969   }
5970 
5971   void clearSplit() {
5972     this.splitRequest = false;
5973     this.explicitSplitPoint = null;
5974   }
5975 
5976   /**
5977    * Give the region a chance to prepare before it is split.
5978    */
5979   protected void prepareToSplit() {
5980     // nothing
5981   }
5982 
5983   /**
5984    * Return the split point. A null return value indicates the region isn't splittable.
5985    * If the split point isn't explicitly specified, this method goes over the stores
5986    * to find the best split point. Currently the criterion for the best split point
5987    * is based on the size of the store.
5988    */
5989   public byte[] checkSplit() {
5990     // Can't split META
5991     if (this.getRegionInfo().isMetaTable() ||
5992         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
5993       if (shouldForceSplit()) {
5994         LOG.warn("Cannot split meta region in HBase 0.20 and above");
5995       }
5996       return null;
5997     }
5998 
5999     // Can't split region which is in recovering state
6000     if (this.isRecovering()) {
6001       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
6002       return null;
6003     }
6004 
6005     if (!splitPolicy.shouldSplit()) {
6006       return null;
6007     }
6008 
6009     byte[] ret = splitPolicy.getSplitPoint();
6010 
6011     if (ret != null) {
6012       try {
6013         checkRow(ret, "calculated split");
6014       } catch (IOException e) {
6015         LOG.error("Ignoring invalid split", e);
6016         return null;
6017       }
6018     }
6019     return ret;
6020   }
6021 
6022   /**
6023    * @return The priority that this region should have in the compaction queue
6024    */
6025   public int getCompactPriority() {
6026     int count = Integer.MAX_VALUE;
6027     for (Store store : stores.values()) {
6028       count = Math.min(count, store.getCompactPriority());
6029     }
6030     return count;
6031   }
6032 
6033   /**
6034    * Checks every store to see if one has too many
6035    * store files
6036    * @return true if any store has too many store files
6037    */
6038   public boolean needsCompaction() {
6039     for (Store store : stores.values()) {
6040       if(store.needsCompaction()) {
6041         return true;
6042       }
6043     }
6044     return false;
6045   }
6046 
6047   /** @return the coprocessor host */
6048   public RegionCoprocessorHost getCoprocessorHost() {
6049     return coprocessorHost;
6050   }
6051 
6052   /** @param coprocessorHost the new coprocessor host */
6053   public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
6054     this.coprocessorHost = coprocessorHost;
6055   }
6056 
6057   /**
6058    * This method needs to be called before any public call that reads or
6059    * modifies data. It has to be called just before a try.
6060    * #closeRegionOperation needs to be called in the try's finally block.
6061    * Acquires a read lock and checks if the region is closing or closed.
6062    * @throws IOException
6063    */
6064   public void startRegionOperation() throws IOException {
6065     startRegionOperation(Operation.ANY);
6066   }
6067 
6068   /**
6069    * @param op The operation that is about to be performed on the region
6070    * @throws IOException
6071    */
6072   protected void startRegionOperation(Operation op) throws IOException {
6073     switch (op) {
6074     case INCREMENT:
6075     case APPEND:
6076     case GET:
6077     case SCAN:
6078     case SPLIT_REGION:
6079     case MERGE_REGION:
6080     case PUT:
6081     case DELETE:
6082     case BATCH_MUTATE:
6083     case COMPACT_REGION:
6084       // when a region is in recovering state, no read, split or merge is allowed
6085       if (isRecovering() && (this.disallowWritesInRecovering ||
6086               (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
6087         throw new RegionInRecoveryException(this.getRegionNameAsString() +
6088           " is recovering; cannot take reads");
6089       }
6090       break;
6091     default:
6092       break;
6093     }
6094     if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
6095         || op == Operation.COMPACT_REGION) {
6096       // split, merge or compact region doesn't need to check the closing/closed state or lock the
6097       // region
6098       return;
6099     }
6100     if (this.closing.get()) {
6101       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
6102     }
6103     lock(lock.readLock());
6104     if (this.closed.get()) {
6105       lock.readLock().unlock();
6106       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
6107     }
6108     try {
6109       if (coprocessorHost != null) {
6110         coprocessorHost.postStartRegionOperation(op);
6111       }
6112     } catch (Exception e) {
6113       lock.readLock().unlock();
6114       throw new IOException(e);
6115     }
6116   }
6117 
6118   /**
6119    * Closes the lock. This needs to be called in the finally block corresponding
6120    * to the try block of #startRegionOperation
6121    * @throws IOException
6122    */
6123   public void closeRegionOperation() throws IOException {
6124     closeRegionOperation(Operation.ANY);
6125   }
6126 
6127   /**
6128    * Closes the lock. This needs to be called in the finally block corresponding
6129    * to the try block of {@link #startRegionOperation(Operation)}
6130    * @param operation
6131    * @throws IOException
6132    */
6133   public void closeRegionOperation(Operation operation) throws IOException {
6134     lock.readLock().unlock();
6135     if (coprocessorHost != null) {
6136       coprocessorHost.postCloseRegionOperation(operation);
6137     }
6138   }
6139 
6140   /**
6141    * This method needs to be called before any public call that reads or
6142    * modifies stores in bulk. It has to be called just before a try.
6143    * #closeBulkRegionOperation needs to be called in the try's finally block.
6144    * Acquires a write lock and checks if the region is closing or closed.
6145    * @throws NotServingRegionException when the region is closing or closed
6146    * @throws RegionTooBusyException if failed to get the lock in time
6147    * @throws InterruptedIOException if interrupted while waiting for a lock
6148    */
6149   private void startBulkRegionOperation(boolean writeLockNeeded)
6150       throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
6151     if (this.closing.get()) {
6152       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
6153     }
6154     if (writeLockNeeded) lock(lock.writeLock());
6155     else lock(lock.readLock());
6156     if (this.closed.get()) {
6157       if (writeLockNeeded) lock.writeLock().unlock();
6158       else lock.readLock().unlock();
6159       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
6160     }
6161   }
6162 
6163   /**
6164    * Closes the lock. This needs to be called in the finally block corresponding
6165    * to the try block of #startBulkRegionOperation
6166    */
6167   private void closeBulkRegionOperation(){
6168     if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
6169     else lock.readLock().unlock();
6170   }
6171 
6172   /**
6173    * Update counters for the number of puts without WAL and the size of possible data loss.
6174    * This information is exposed by the region server metrics.
6175    */
6176   private void recordMutationWithoutWal(final Map<byte [], List<Cell>> familyMap) {
6177     numMutationsWithoutWAL.increment();
6178     if (numMutationsWithoutWAL.get() <= 1) {
6179       LOG.info("writing data to region " + this +
6180                " with WAL disabled. Data may be lost in the event of a crash.");
6181     }
6182 
6183     long mutationSize = 0;
6184     for (List<Cell> cells: familyMap.values()) {
6185       assert cells instanceof RandomAccess;
6186       int listSize = cells.size();
6187       for (int i=0; i < listSize; i++) {
6188         Cell cell = cells.get(i);
6189         // TODO we need include tags length also here.
6190         mutationSize += KeyValueUtil.keyLength(cell) + cell.getValueLength();
6191       }
6192     }
6193 
6194     dataInMemoryWithoutWAL.add(mutationSize);
6195   }
6196 
6197   private void lock(final Lock lock)
6198       throws RegionTooBusyException, InterruptedIOException {
6199     lock(lock, 1);
6200   }
6201 
6202   /**
6203    * Try to acquire a lock.  Throw RegionTooBusyException
6204    * if failed to get the lock in time. Throw InterruptedIOException
6205    * if interrupted while waiting for the lock.
6206    */
6207   private void lock(final Lock lock, final int multiplier)
6208       throws RegionTooBusyException, InterruptedIOException {
6209     try {
6210       final long waitTime = Math.min(maxBusyWaitDuration,
6211           busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
6212       if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
6213         throw new RegionTooBusyException(
6214             "failed to get a lock in " + waitTime + " ms. " +
6215                 "regionName=" + (this.getRegionInfo() == null ? "unknown" :
6216                 this.getRegionInfo().getRegionNameAsString()) +
6217                 ", server=" + (this.getRegionServerServices() == null ? "unknown" :
6218                 this.getRegionServerServices().getServerName()));
6219       }
6220     } catch (InterruptedException ie) {
6221       LOG.info("Interrupted while waiting for a lock");
6222       InterruptedIOException iie = new InterruptedIOException();
6223       iie.initCause(ie);
6224       throw iie;
6225     }
6226   }
6227 
6228   /**
6229    * Calls sync with the given transaction ID if the region's table is not
6230    * deferring it.
6231    * @param txid should sync up to which transaction
6232    * @throws IOException If anything goes wrong with DFS
6233    */
6234   private void syncOrDefer(long txid, Durability durability) throws IOException {
6235     if (this.getRegionInfo().isMetaRegion()) {
6236       this.log.sync(txid);
6237     } else {
6238       switch(durability) {
6239       case USE_DEFAULT:
6240         // do what table defaults to
6241         if (shouldSyncLog()) {
6242           this.log.sync(txid);
6243         }
6244         break;
6245       case SKIP_WAL:
6246         // nothing to do
6247         break;
6248       case ASYNC_WAL:
6249         // nothing to do
6250         break;
6251       case SYNC_WAL:
6252       case FSYNC_WAL:
6253         // sync the WAL edit (SYNC and FSYNC treated the same for now)
6254         this.log.sync(txid);
6255         break;
6256       }
6257     }
6258   }
6259 
6260   /**
6261    * Check whether we should sync the log from the table's durability settings
6262    */
6263   private boolean shouldSyncLog() {
6264     return durability.ordinal() > Durability.ASYNC_WAL.ordinal();
6265   }
6266 
6267   /**
6268    * A mocked list implementation - discards all updates.
6269    */
6270   private static final List<Cell> MOCKED_LIST = new AbstractList<Cell>() {
6271 
6272     @Override
6273     public void add(int index, Cell element) {
6274       // do nothing
6275     }
6276 
6277     @Override
6278     public boolean addAll(int index, Collection<? extends Cell> c) {
6279       return false; // this list is never changed as a result of an update
6280     }
6281 
6282     @Override
6283     public KeyValue get(int index) {
6284       throw new UnsupportedOperationException();
6285     }
6286 
6287     @Override
6288     public int size() {
6289       return 0;
6290     }
6291   };
6292 
6293   /**
6294    * Facility for dumping and compacting catalog tables.
6295    * Only does catalog tables since these are the only tables whose schema
6296    * we know for sure.  For usage run:
6297    * <pre>
6298    *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
6299    * </pre>
6300    * @param args
6301    * @throws IOException
6302    */
6303   public static void main(String[] args) throws IOException {
6304     if (args.length < 1) {
6305       printUsageAndExit(null);
6306     }
6307     boolean majorCompact = false;
6308     if (args.length > 1) {
6309       if (!args[1].toLowerCase().startsWith("major")) {
6310         printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">");
6311       }
6312       majorCompact = true;
6313     }
6314     final Path tableDir = new Path(args[0]);
6315     final Configuration c = HBaseConfiguration.create();
6316     final FileSystem fs = FileSystem.get(c);
6317     final Path logdir = new Path(c.get("hbase.tmp.dir"));
6318     final String logname = "hlog" + FSUtils.getTableName(tableDir) + System.currentTimeMillis();
6319 
6320     final HLog log = HLogFactory.createHLog(fs, logdir, logname, c);
6321     try {
6322       processTable(fs, tableDir, log, c, majorCompact);
6323     } finally {
6324        log.close();
6325        // TODO: is this still right?
6326        BlockCache bc = new CacheConfig(c).getBlockCache();
6327        if (bc != null) bc.shutdown();
6328     }
6329   }
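       /*
        * Example invocations of the tool above. The meta table path is a hypothetical
        * layout under hbase.rootdir; adjust it to the actual deployment.
        *
        *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion /hbase/data/hbase/meta
        *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion /hbase/data/hbase/meta major_compact
        */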
6330 
6331   /**
6332    * Gets the latest sequence number that was read from storage when this region was opened.
6333    */
6334   public long getOpenSeqNum() {
6335     return this.openSeqNum;
6336   }
6337 
6338   /**
6339    * Gets the max sequence ids of stores that were read from storage when this region was opened.
6340    * WAL edits with a smaller or equal sequence number will be skipped during replay.
6341    */
6342   public Map<byte[], Long> getMaxStoreSeqIdForLogReplay() {
6343     return this.maxSeqIdInStores;
6344   }
6345 
6346   /**
6347    * @return the current compaction state of this region.
6348    */
6349   public CompactionState getCompactionState() {
6350     boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
6351     return (hasMajor ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
6352         : (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
6353   }
6354 
6355   public void reportCompactionRequestStart(boolean isMajor){
6356     (isMajor ? majorInProgress : minorInProgress).incrementAndGet();
6357   }
6358 
6359   public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted){
6360     int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();
6361 
6362     // metrics
6363     compactionsFinished.incrementAndGet();
6364     compactionNumFilesCompacted.addAndGet(numFiles);
6365     compactionNumBytesCompacted.addAndGet(filesSizeCompacted);
6366 
6367     assert newValue >= 0;
6368   }
6369 
6370   /**
6371    * @return sequenceId.
6372    */
6373   public AtomicLong getSequenceId() {
6374     return this.sequenceId;
6375   }
6376 
6377   /**
6378    * sets this region's sequenceId.
6379    * @param value new value
6380    */
6381   private void setSequenceId(long value) {
6382     this.sequenceId.set(value);
6383   }
6384 
6385   /**
6386    * Listener class to enable callers of
6387    * bulkLoadHFile() to perform any necessary
6388    * pre/post processing of a given bulkload call.
6389    */
6390   public interface BulkLoadListener {
6391 
6392     /**
6393      * Called before an HFile is actually loaded
6394      * @param family family being loaded to
6395      * @param srcPath path of HFile
6396      * @return final path to be used for actual loading
6397      * @throws IOException
6398      */
6399     String prepareBulkLoad(byte[] family, String srcPath) throws IOException;
6400 
6401     /**
6402      * Called after a successful HFile load
6403      * @param family family being loaded to
6404      * @param srcPath path of HFile
6405      * @throws IOException
6406      */
6407     void doneBulkLoad(byte[] family, String srcPath) throws IOException;
6408 
6409     /**
6410      * Called after a failed HFile load
6411      * @param family family being loaded to
6412      * @param srcPath path of HFile
6413      * @throws IOException
6414      */
6415     void failedBulkLoad(byte[] family, String srcPath) throws IOException;
6416   }
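       /*
        * A minimal sketch of a BulkLoadListener implementation (illustrative only; a real
        * listener would typically stage or verify the HFile in prepareBulkLoad).
        *
        *   public static class LoggingBulkLoadListener implements BulkLoadListener {
        *     @Override
        *     public String prepareBulkLoad(byte[] family, String srcPath) throws IOException {
        *       LOG.info("Preparing bulk load of " + srcPath + " into " + Bytes.toString(family));
        *       return srcPath;   // load directly from the source path
        *     }
        *     @Override
        *     public void doneBulkLoad(byte[] family, String srcPath) throws IOException {
        *       LOG.info("Finished bulk load of " + srcPath);
        *     }
        *     @Override
        *     public void failedBulkLoad(byte[] family, String srcPath) throws IOException {
        *       LOG.warn("Failed bulk load of " + srcPath);
        *     }
        *   }
        */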
6417 
6418   @VisibleForTesting class RowLockContext {
6419     private final HashedBytes row;
6420     private final CountDownLatch latch = new CountDownLatch(1);
6421     private final Thread thread;
6422     private int lockCount = 0;
6423 
6424     RowLockContext(HashedBytes row) {
6425       this.row = row;
6426       this.thread = Thread.currentThread();
6427     }
6428 
6429     boolean ownedByCurrentThread() {
6430       return thread == Thread.currentThread();
6431     }
6432 
6433     RowLock newLock() {
6434       lockCount++;
6435       return new RowLock(this);
6436     }
6437 
6438     void releaseLock() {
6439       if (!ownedByCurrentThread()) {
6440         throw new IllegalArgumentException("Lock held by thread: " + thread
6441           + " cannot be released by different thread: " + Thread.currentThread());
6442       }
6443       lockCount--;
6444       if (lockCount == 0) {
6445         // no remaining locks by the thread, unlock and allow other threads to access
6446         RowLockContext existingContext = lockedRows.remove(row);
6447         if (existingContext != this) {
6448           throw new RuntimeException(
6449               "Internal row lock state inconsistent, should not happen, row: " + row);
6450         }
6451         latch.countDown();
6452       }
6453     }
6454   }
6455 
6456   /**
6457    * Row lock held by a given thread.
6458    * One thread may acquire multiple locks on the same row simultaneously.
6459    * The locks must be released by calling release() from the same thread.
6460    */
6461   public static class RowLock {
6462     @VisibleForTesting final RowLockContext context;
6463     private boolean released = false;
6464 
6465     @VisibleForTesting RowLock(RowLockContext context) {
6466       this.context = context;
6467     }
6468 
6469     /**
6470      * Release the given lock.  If there are no remaining locks held by the current thread
6471      * then unlock the row and allow other threads to acquire the lock.
6472      * @throws IllegalArgumentException if called by a different thread than the lock owning thread
6473      */
6474     public void release() {
6475       if (!released) {
6476         context.releaseLock();
6477         released = true;
6478       }
6479     }
6480   }
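       /*
        * Typical in-class usage of RowLock, the same pattern append() and increment()
        * follow above: acquire the lock, mutate, and always release in a finally block.
        *
        *   RowLock rowLock = getRowLock(row);
        *   try {
        *     // read-modify-write the row while holding the lock
        *   } finally {
        *     rowLock.release();
        *   }
        */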
6481 
6482   /**
6483    * Lock the updates' readLock first, so that we can safely append logs in coprocessors.
6484    * @throws RegionTooBusyException
6485    * @throws InterruptedIOException
6486    */
6487   public void updatesLock() throws RegionTooBusyException, InterruptedIOException {
6488     lock(updatesLock.readLock());
6489   }
6490 
6491   /**
6492    * Unlock the updates' readLock after appending logs in coprocessors.
6493    * @throws InterruptedIOException
6494    */
6495   public void updatesUnlock() throws InterruptedIOException {
6496     updatesLock.readLock().unlock();
6497   }
6498 }