1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.IOException;
23  import java.io.InterruptedIOException;
24  import java.io.UnsupportedEncodingException;
25  import java.lang.reflect.Constructor;
26  import java.text.ParseException;
27  import java.util.AbstractList;
28  import java.util.ArrayList;
29  import java.util.Arrays;
30  import java.util.Collection;
31  import java.util.Collections;
32  import java.util.HashMap;
33  import java.util.List;
34  import java.util.Map;
35  import java.util.NavigableMap;
36  import java.util.NavigableSet;
37  import java.util.Set;
38  import java.util.TreeMap;
39  import java.util.UUID;
40  import java.util.concurrent.Callable;
41  import java.util.concurrent.CompletionService;
42  import java.util.concurrent.ConcurrentHashMap;
43  import java.util.concurrent.ConcurrentSkipListMap;
44  import java.util.concurrent.CountDownLatch;
45  import java.util.concurrent.ExecutionException;
46  import java.util.concurrent.ExecutorCompletionService;
47  import java.util.concurrent.ExecutorService;
48  import java.util.concurrent.Executors;
49  import java.util.concurrent.Future;
50  import java.util.concurrent.FutureTask;
51  import java.util.concurrent.ThreadFactory;
52  import java.util.concurrent.ThreadPoolExecutor;
53  import java.util.concurrent.TimeUnit;
54  import java.util.concurrent.TimeoutException;
55  import java.util.concurrent.atomic.AtomicBoolean;
56  import java.util.concurrent.atomic.AtomicInteger;
57  import java.util.concurrent.atomic.AtomicLong;
58  import java.util.concurrent.locks.Lock;
59  import java.util.concurrent.locks.ReentrantReadWriteLock;
60  
61  import org.apache.commons.logging.Log;
62  import org.apache.commons.logging.LogFactory;
63  import org.apache.hadoop.classification.InterfaceAudience;
64  import org.apache.hadoop.conf.Configuration;
65  import org.apache.hadoop.fs.FileStatus;
66  import org.apache.hadoop.fs.FileSystem;
67  import org.apache.hadoop.fs.Path;
68  import org.apache.hadoop.hbase.Cell;
69  import org.apache.hadoop.hbase.CellUtil;
70  import org.apache.hadoop.hbase.CompoundConfiguration;
71  import org.apache.hadoop.hbase.DoNotRetryIOException;
72  import org.apache.hadoop.hbase.DroppedSnapshotException;
73  import org.apache.hadoop.hbase.HBaseConfiguration;
74  import org.apache.hadoop.hbase.HColumnDescriptor;
75  import org.apache.hadoop.hbase.HConstants;
76  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
77  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
78  import org.apache.hadoop.hbase.HRegionInfo;
79  import org.apache.hadoop.hbase.HTableDescriptor;
80  import org.apache.hadoop.hbase.KeyValue;
81  import org.apache.hadoop.hbase.KeyValueUtil;
82  import org.apache.hadoop.hbase.NotServingRegionException;
83  import org.apache.hadoop.hbase.RegionTooBusyException;
84  import org.apache.hadoop.hbase.TableName;
85  import org.apache.hadoop.hbase.UnknownScannerException;
86  import org.apache.hadoop.hbase.backup.HFileArchiver;
87  import org.apache.hadoop.hbase.client.Append;
88  import org.apache.hadoop.hbase.client.Delete;
89  import org.apache.hadoop.hbase.client.Durability;
90  import org.apache.hadoop.hbase.client.Get;
91  import org.apache.hadoop.hbase.client.Increment;
92  import org.apache.hadoop.hbase.client.IsolationLevel;
93  import org.apache.hadoop.hbase.client.Mutation;
94  import org.apache.hadoop.hbase.client.Put;
95  import org.apache.hadoop.hbase.client.Result;
96  import org.apache.hadoop.hbase.client.RowMutations;
97  import org.apache.hadoop.hbase.client.Scan;
98  import org.apache.hadoop.hbase.coprocessor.RegionObserver;
99  import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
100 import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
101 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
102 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
103 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
104 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
105 import org.apache.hadoop.hbase.filter.Filter;
106 import org.apache.hadoop.hbase.filter.FilterWrapper;
107 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
108 import org.apache.hadoop.hbase.io.HeapSize;
109 import org.apache.hadoop.hbase.io.TimeRange;
110 import org.apache.hadoop.hbase.io.hfile.BlockCache;
111 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
112 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
113 import org.apache.hadoop.hbase.ipc.RpcCallContext;
114 import org.apache.hadoop.hbase.ipc.RpcServer;
115 import org.apache.hadoop.hbase.master.AssignmentManager;
116 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
117 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
118 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
119 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
120 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
121 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
122 import org.apache.hadoop.hbase.regionserver.MultiVersionConsistencyControl.WriteEntry;
123 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
124 import org.apache.hadoop.hbase.regionserver.wal.HLog;
125 import org.apache.hadoop.hbase.regionserver.wal.HLogFactory;
126 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
127 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
128 import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter.MutationReplay;
129 import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
130 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
131 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
132 import org.apache.hadoop.hbase.util.Bytes;
133 import org.apache.hadoop.hbase.util.CancelableProgressable;
134 import org.apache.hadoop.hbase.util.ClassSize;
135 import org.apache.hadoop.hbase.util.CompressionTest;
136 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
137 import org.apache.hadoop.hbase.util.FSUtils;
138 import org.apache.hadoop.hbase.util.HashedBytes;
139 import org.apache.hadoop.hbase.util.Pair;
140 import org.apache.hadoop.hbase.util.Threads;
141 import org.apache.hadoop.io.MultipleIOException;
142 import org.apache.hadoop.util.StringUtils;
143 import org.cliffc.high_scale_lib.Counter;
144 
145 import com.google.common.annotations.VisibleForTesting;
146 import com.google.common.base.Preconditions;
147 import com.google.common.collect.Lists;
148 import com.google.common.collect.Maps;
149 import com.google.common.io.Closeables;
150 import com.google.protobuf.Descriptors;
151 import com.google.protobuf.Message;
152 import com.google.protobuf.RpcCallback;
153 import com.google.protobuf.RpcController;
154 import com.google.protobuf.Service;
155 
156 /**
157  * HRegion stores data for a certain region of a table.  It stores all columns
158  * for each row. A given table consists of one or more HRegions.
159  *
160  * <p>We maintain multiple HStores for a single HRegion.
161  *
162  * <p>A Store is a set of rows with some column data; together,
163  * they make up all the data for the rows.
164  *
165  * <p>Each HRegion has a 'startKey' and 'endKey'.
166  * <p>The first is inclusive, the second is exclusive (except for
167  * the final region).  The endKey of region 0 is the same as
168  * startKey for region 1 (if it exists).  The startKey for the
169  * first region is null. The endKey for the final region is null.
170  *
171  * <p>Locking at the HRegion level serves only one purpose: preventing the
172  * region from being closed (and consequently split) while other operations
173  * are ongoing. Each row level operation obtains both a row lock and a region
174  * read lock for the duration of the operation. While a scanner is being
175  * constructed, getScanner holds a read lock. If the scanner is successfully
176  * constructed, it holds a read lock until it is closed. A close takes out a
177  * write lock and consequently will block for ongoing operations and will block
178  * new operations from starting while the close is in progress.
179  *
180  * <p>An HRegion is defined by its table and its key extent.
181  *
182  * <p>It consists of at least one Store.  The number of Stores should be
183  * configurable, so that data which is accessed together is stored in the same
184  * Store.  Right now, we approximate that by building a single Store for
185  * each column family.  (This config info will be communicated via the
186  * tabledesc.)
187  *
188  * <p>The HTableDescriptor contains metainfo about the HRegion's table.
189  * regionName is a unique identifier for this HRegion. [startKey, endKey)
190  * defines the keyspace for this HRegion.
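 *
 * <p>As a minimal, illustrative sketch only (regions are normally created and opened by
 * the HRegionServer itself; the particular openHRegion overload and the variable names
 * below are assumptions made for the example):
 * <pre>
 *   // Open an existing region, read one row, then close the region.
 *   HRegion region = HRegion.openHRegion(conf, fs, rootDir, regionInfo, tableDesc, hlog);
 *   try {
 *     Result r = region.get(new Get(rowKey));
 *     // ... consume r ...
 *   } finally {
 *     region.close();
 *   }
 * </pre>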
191  */
192 @InterfaceAudience.Private
193 public class HRegion implements HeapSize { // , Writable{
194   public static final Log LOG = LogFactory.getLog(HRegion.class);
195 
196   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
197       "hbase.hregion.scan.loadColumnFamiliesOnDemand";
198       
199   /**
200    * This is the global default value for durability. All tables/mutations not
201    * defining a durability or using USE_DEFAULT will default to this value.
202    */
203   private static final Durability DEFAULT_DURABLITY = Durability.SYNC_WAL;
204 
205   final AtomicBoolean closed = new AtomicBoolean(false);
206   /* Closing can take some time; use the closing flag if there is stuff we don't
207    * want to do while in closing state; e.g. offer this region up to the
208    * master as a region to close if the carrying regionserver is overloaded.
209    * Once set, it is never cleared.
210    */
211   final AtomicBoolean closing = new AtomicBoolean(false);
212 
213   protected volatile long completeSequenceId = -1L;
214 
215   /**
216    * Region level sequence Id. It is used for appending WALEdits in HLog. Its default value is -1,
217    * as a marker that the region hasn't opened yet. Once it is opened, it is set to
218    * {@link #openSeqNum}.
219    */
220   private final AtomicLong sequenceId = new AtomicLong(-1L);
221 
222   /**
223    * Operation enum is used in {@link HRegion#startRegionOperation} to provide operation context for
224    * startRegionOperation to possibly invoke different checks before any region operations. Not all
225    * operations have to be defined here; an entry is only needed when a special check is
226    * required in startRegionOperation.
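   *
   * <p>A simplified, illustrative sketch of the pattern used inside HRegion (assuming the
   * no-argument closeRegionOperation releases the guard taken by startRegionOperation):
   * <pre>
   *   startRegionOperation(Operation.PUT);
   *   try {
   *     // ... perform the guarded region-level work ...
   *   } finally {
   *     closeRegionOperation();
   *   }
   * </pre>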
227    */
228   public enum Operation {
229     ANY, GET, PUT, DELETE, SCAN, APPEND, INCREMENT, SPLIT_REGION, MERGE_REGION, BATCH_MUTATE,
230     REPLAY_BATCH_MUTATE, COMPACT_REGION
231   }
232 
233   //////////////////////////////////////////////////////////////////////////////
234   // Members
235   //////////////////////////////////////////////////////////////////////////////
236 
237   // map from a locked row to the context for that lock including:
238   // - CountDownLatch for threads waiting on that row
239   // - the thread that owns the lock (allow reentrancy)
240   // - reference count of (reentrant) locks held by the thread
241   // - the row itself
242   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
243       new ConcurrentHashMap<HashedBytes, RowLockContext>();
244 
245   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
246       Bytes.BYTES_RAWCOMPARATOR);
247 
248   // TODO: account for each registered handler in HeapSize computation
249   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
250 
251   public final AtomicLong memstoreSize = new AtomicLong(0);
252 
253   // Debug possible data loss due to WAL off
254   final Counter numMutationsWithoutWAL = new Counter();
255   final Counter dataInMemoryWithoutWAL = new Counter();
256 
257   // Debug why CAS operations are taking a while.
258   final Counter checkAndMutateChecksPassed = new Counter();
259   final Counter checkAndMutateChecksFailed = new Counter();
260 
261   //Number of requests
262   final Counter readRequestsCount = new Counter();
263   final Counter writeRequestsCount = new Counter();
264 
265   // Compaction counters
266   final AtomicLong compactionsFinished = new AtomicLong(0L);
267   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
268   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
269 
270 
271   private final HLog log;
272   private final HRegionFileSystem fs;
273   protected final Configuration conf;
274   private final Configuration baseConf;
275   private final KeyValue.KVComparator comparator;
276   private final int rowLockWaitDuration;
277   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
278 
279   // The internal wait duration to acquire a lock before read/update
280   // from the region. It is not per row. The purpose of this wait time
281   // is to avoid waiting a long time while the region is busy, so that
282   // we can release the IPC handler soon enough to improve the
283   // availability of the region server. It can be adjusted by
284   // tuning configuration "hbase.busy.wait.duration".
285   final long busyWaitDuration;
286   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
287 
288   // If updating multiple rows in one call, wait longer,
289   // i.e. waiting for busyWaitDuration * # of rows. However,
290   // we can limit the max multiplier.
291   final int maxBusyWaitMultiplier;
292 
293   // Max busy wait duration. There is no point to wait longer than the RPC
294   // purge timeout, when a RPC call will be terminated by the RPC engine.
295   final long maxBusyWaitDuration;
296 
297   // negative number indicates infinite timeout
298   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
299   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
300 
301   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
302 
303   /**
304    * The sequence ID that was encountered when this region was opened.
305    */
306   private long openSeqNum = HConstants.NO_SEQNUM;
307 
308   /**
309    * The default setting for whether to enable on-demand CF loading for
310    * scan requests to this region. Requests can override it.
311    */
312   private boolean isLoadingCfsOnDemandDefault = false;
313 
314   private final AtomicInteger majorInProgress = new AtomicInteger(0);
315   private final AtomicInteger minorInProgress = new AtomicInteger(0);
316 
317   //
318   // Context: During replay we want to ensure that we do not lose any data. So, we
319   // have to be conservative in how we replay logs. For each store, we calculate
320   // the maxSeqId up to which the store was flushed, and skip the edits which
321   // are equal to or lower than maxSeqId for each store.
322   // The following map is populated when opening the region
323   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
324 
325   /**
326    * Config setting for whether to allow writes while a region is in the recovering state.
327    */
328   private boolean disallowWritesInRecovering = false;
329 
330   // when a region is in recovering state, it can only accept writes not reads
331   private volatile boolean isRecovering = false;
332 
333   /**
334    * @return The smallest mvcc readPoint across all the scanners in this
335    * region. Writes older than this readPoint are included in every
336    * read operation.
337    */
338   public long getSmallestReadPoint() {
339     long minimumReadPoint;
340     // We need to ensure that while we are calculating the smallestReadPoint
341     // no new RegionScanners can grab a readPoint that we are unaware of.
342     // We achieve this by synchronizing on the scannerReadPoints object.
343     synchronized(scannerReadPoints) {
344       minimumReadPoint = mvcc.memstoreReadPoint();
345 
346       for (Long readPoint: this.scannerReadPoints.values()) {
347         if (readPoint < minimumReadPoint) {
348           minimumReadPoint = readPoint;
349         }
350       }
351     }
352     return minimumReadPoint;
353   }
354   /*
355    * Data structure of write state flags used for coordinating flushes,
356    * compactions and closes.
357    */
358   static class WriteState {
359     // Set while a memstore flush is happening.
360     volatile boolean flushing = false;
361     // Set when a flush has been requested.
362     volatile boolean flushRequested = false;
363     // Number of compactions running.
364     volatile int compacting = 0;
365     // Gets set in close. If set, cannot compact or flush again.
366     volatile boolean writesEnabled = true;
367     // Set if region is read-only
368     volatile boolean readOnly = false;
369 
370     /**
371      * Set flags that make this region read-only.
372      *
373      * @param onOff flip value for region r/o setting
374      */
375     synchronized void setReadOnly(final boolean onOff) {
376       this.writesEnabled = !onOff;
377       this.readOnly = onOff;
378     }
379 
380     boolean isReadOnly() {
381       return this.readOnly;
382     }
383 
384     boolean isFlushRequested() {
385       return this.flushRequested;
386     }
387 
388     static final long HEAP_SIZE = ClassSize.align(
389         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
390   }
391 
392   final WriteState writestate = new WriteState();
393 
394   long memstoreFlushSize;
395   final long timestampSlop;
396   final long rowProcessorTimeout;
397   private volatile long lastFlushTime;
398   final RegionServerServices rsServices;
399   private RegionServerAccounting rsAccounting;
400   private List<Pair<Long, Long>> recentFlushes = new ArrayList<Pair<Long,Long>>();
401   private long flushCheckInterval;
402   // flushPerChanges limits how many changes can accumulate in the memstore before a flush is forced
403   private long flushPerChanges;
404   private long blockingMemStoreSize;
405   final long threadWakeFrequency;
406   // Used to guard closes
407   final ReentrantReadWriteLock lock =
408     new ReentrantReadWriteLock();
409 
410   // Stop updates lock
411   private final ReentrantReadWriteLock updatesLock =
412     new ReentrantReadWriteLock();
413   private boolean splitRequest;
414   private byte[] explicitSplitPoint = null;
415 
416   private final MultiVersionConsistencyControl mvcc =
417       new MultiVersionConsistencyControl();
418 
419   // Coprocessor host
420   private RegionCoprocessorHost coprocessorHost;
421 
422   private HTableDescriptor htableDescriptor = null;
423   private RegionSplitPolicy splitPolicy;
424 
425   private final MetricsRegion metricsRegion;
426   private final MetricsRegionWrapperImpl metricsRegionWrapper;
427   private final Durability durability;
428 
429   /**
430    * HRegion constructor. This constructor should only be used for testing and
431    * extensions.  Instances of HRegion should be instantiated with the
432    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
433    *
434    * @param tableDir qualified path of directory where region should be located,
435    * usually the table directory.
436    * @param log The HLog is the outbound log for any updates to the HRegion
437    * (There's a single HLog for all the HRegions on a single HRegionServer.)
438    * The log file is a logfile from the previous execution that's
439    * custom-computed for this HRegion. The HRegionServer computes and sorts the
440    * appropriate log info for this HRegion. If there is a previous log file
441    * (implying that the HRegion has been written-to before), then read it from
442    * the supplied path.
443    * @param fs is the filesystem.
444    * @param confParam is global configuration settings.
445    * @param regionInfo HRegionInfo that describes the region
447    * @param htd the table descriptor
448    * @param rsServices reference to {@link RegionServerServices} or null
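   *
   * <p>Illustrative only: outside of tests, obtain a region through one of the static
   * factory methods instead of this constructor, e.g. (one of several overloads, shown
   * here purely as an example):
   * <pre>
   *   HRegion region = HRegion.createHRegion(regionInfo, rootDir, conf, htd);
   * </pre>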
449    */
450   @Deprecated
451   public HRegion(final Path tableDir, final HLog log, final FileSystem fs,
452       final Configuration confParam, final HRegionInfo regionInfo,
453       final HTableDescriptor htd, final RegionServerServices rsServices) {
454     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
455       log, confParam, htd, rsServices);
456   }
457 
458   /**
459    * HRegion constructor. This constructor should only be used for testing and
460    * extensions.  Instances of HRegion should be instantiated with the
461    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
462    *
463    * @param fs is the filesystem.
464    * @param log The HLog is the outbound log for any updates to the HRegion
465    * (There's a single HLog for all the HRegions on a single HRegionServer.)
466    * The log file is a logfile from the previous execution that's
467    * custom-computed for this HRegion. The HRegionServer computes and sorts the
468    * appropriate log info for this HRegion. If there is a previous log file
469    * (implying that the HRegion has been written-to before), then read it from
470    * the supplied path.
471    * @param confParam is global configuration settings.
472    * @param htd the table descriptor
473    * @param rsServices reference to {@link RegionServerServices} or null
474    */
475   public HRegion(final HRegionFileSystem fs, final HLog log, final Configuration confParam,
476       final HTableDescriptor htd, final RegionServerServices rsServices) {
477     if (htd == null) {
478       throw new IllegalArgumentException("Need table descriptor");
479     }
480 
481     if (confParam instanceof CompoundConfiguration) {
482       throw new IllegalArgumentException("Need original base configuration");
483     }
484 
485     this.comparator = fs.getRegionInfo().getComparator();
486     this.log = log;
487     this.fs = fs;
488 
489     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
490     this.baseConf = confParam;
491     this.conf = new CompoundConfiguration()
492       .add(confParam)
493       .addStringMap(htd.getConfiguration())
494       .addWritableMap(htd.getValues());
495     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
496         DEFAULT_CACHE_FLUSH_INTERVAL);
497     this.flushPerChanges = conf.getLong(MEMSTORE_FLUSH_PER_CHANGES, DEFAULT_FLUSH_PER_CHANGES);
498     if (this.flushPerChanges > MAX_FLUSH_PER_CHANGES) {
499       throw new IllegalArgumentException(MEMSTORE_FLUSH_PER_CHANGES + " can not exceed "
500           + MAX_FLUSH_PER_CHANGES);
501     }
502     
503     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
504                     DEFAULT_ROWLOCK_WAIT_DURATION);
505 
506     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
507     this.htableDescriptor = htd;
508     this.rsServices = rsServices;
509     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
510     setHTableSpecificConf();
511     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
512 
513     this.busyWaitDuration = conf.getLong(
514       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
515     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
516     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
517       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
518         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
519         + maxBusyWaitMultiplier + "). Their product should be positive");
520     }
521     this.maxBusyWaitDuration = conf.getLong("ipc.client.call.purge.timeout",
522       2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
523 
524     /*
525      * timestamp.slop provides a server-side constraint on the timestamp. This
526      * assumes that you base your TS around currentTimeMillis(). In this case,
527      * throw an error to the user if the user-specified TS is newer than now +
528      * slop. LATEST_TIMESTAMP == don't use this functionality
529      */
530     this.timestampSlop = conf.getLong(
531         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
532         HConstants.LATEST_TIMESTAMP);
533 
534     /**
535      * Timeout for the process time in processRowsWithLocks().
536      * Use -1 to switch off time bound.
537      */
538     this.rowProcessorTimeout = conf.getLong(
539         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
540     this.durability = htd.getDurability() == Durability.USE_DEFAULT
541         ? DEFAULT_DURABLITY
542         : htd.getDurability();
543     if (rsServices != null) {
544       this.rsAccounting = this.rsServices.getRegionServerAccounting();
545       // don't initialize coprocessors if not running within a regionserver
546       // TODO: revisit if coprocessors should load in other cases
547       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
548       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
549       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
550 
551       Map<String, HRegion> recoveringRegions = rsServices.getRecoveringRegions();
552       String encodedName = getRegionInfo().getEncodedName();
553       if (recoveringRegions != null && recoveringRegions.containsKey(encodedName)) {
554         this.isRecovering = true;
555         recoveringRegions.put(encodedName, this);
556       }
557     } else {
558       this.metricsRegionWrapper = null;
559       this.metricsRegion = null;
560     }
561     if (LOG.isDebugEnabled()) {
562       // Write out region name as string and its encoded name.
563       LOG.debug("Instantiated " + this);
564     }
565 
566     // by default, we allow writes against a region when it's in recovering
567     this.disallowWritesInRecovering =
568         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
569           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
570   }
571 
572   void setHTableSpecificConf() {
573     if (this.htableDescriptor == null) return;
574     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
575 
576     if (flushSize <= 0) {
577       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
578         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
579     }
580     this.memstoreFlushSize = flushSize;
581     this.blockingMemStoreSize = this.memstoreFlushSize *
582         conf.getLong("hbase.hregion.memstore.block.multiplier", 2);
583   }
584 
585   /**
586    * Initialize this region.
587    * Used only by tests and SplitTransaction to reopen the region.
588    * You should use createHRegion() or openHRegion() instead.
589    * @return What the next sequence (edit) id should be.
590    * @throws IOException e
591    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
592    */
593   @Deprecated
594   public long initialize() throws IOException {
595     return initialize(null);
596   }
597 
598   /**
599    * Initialize this region.
600    *
601    * @param reporter Tickle every so often if initialize is taking a while.
602    * @return What the next sequence (edit) id should be.
603    * @throws IOException e
604    */
605   private long initialize(final CancelableProgressable reporter) throws IOException {
606     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
607     long nextSeqId = -1;
608     try {
609       nextSeqId = initializeRegionInternals(reporter, status);
610       return nextSeqId;
611     } finally {
612       // nextSeqid will be -1 if the initialization fails.
613       // Otherwise, it will be at least 0.
614       if (nextSeqId == -1) {
615         status
616             .abort("Exception during region " + this.getRegionNameAsString() + " initialization.");
617       }
618     }
619   }
620 
621   private long initializeRegionInternals(final CancelableProgressable reporter,
622       final MonitoredTask status) throws IOException, UnsupportedEncodingException {
623     if (coprocessorHost != null) {
624       status.setStatus("Running coprocessor pre-open hook");
625       coprocessorHost.preOpen();
626     }
627 
628     // Write HRI to a file in case we need to recover hbase:meta
629     status.setStatus("Writing region info on filesystem");
630     fs.checkRegionInfoOnFilesystem();
631 
632     // Remove temporary data left over from old regions
633     status.setStatus("Cleaning up temporary data from old regions");
634     fs.cleanupTempDir();
635 
636     // Initialize all the HStores
637     status.setStatus("Initializing all the Stores");
638     long maxSeqId = initializeRegionStores(reporter, status);
639 
640     status.setStatus("Cleaning up detritus from prior splits");
641     // Get rid of any splits or merges that were lost in-progress.  Clean out
642     // these directories here on open.  We may be opening a region that was
643     // being split but we crashed in the middle of it all.
644     fs.cleanupAnySplitDetritus();
645     fs.cleanupMergesDir();
646 
647     this.writestate.setReadOnly(this.htableDescriptor.isReadOnly());
648     this.writestate.flushRequested = false;
649     this.writestate.compacting = 0;
650 
651     // Initialize split policy
652     this.splitPolicy = RegionSplitPolicy.create(this, conf);
653 
654     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
655     // Use maximum of log sequenceid or that which was found in stores
656     // (particularly if no recovered edits, seqid will be -1).
657     long nextSeqid = maxSeqId + 1;
658     if (this.isRecovering) {
659       // In distributedLogReplay mode, we don't know the last change sequence number because region
660       // is opened before recovery completes. So we add a safety bumper to avoid new sequence numbers
661       // overlapping sequence numbers that are already in use.
662       nextSeqid += this.flushPerChanges + 10000000; // add another extra 10million
663     }
664     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
665       "; next sequenceid=" + nextSeqid);
666 
667     // A region can be reopened if failed a split; reset flags
668     this.closing.set(false);
669     this.closed.set(false);
670 
671     this.completeSequenceId = nextSeqid;
672     if (coprocessorHost != null) {
673       status.setStatus("Running coprocessor post-open hooks");
674       coprocessorHost.postOpen();
675     }
676 
677     status.markComplete("Region opened successfully");
678     return nextSeqid;
679   }
680 
681   private long initializeRegionStores(final CancelableProgressable reporter, MonitoredTask status)
682       throws IOException, UnsupportedEncodingException {
683     // Load in all the HStores.
684 
685     long maxSeqId = -1;
686     // initialized to -1 so that we pick up MemstoreTS from column families
687     long maxMemstoreTS = -1;
688 
689     if (!htableDescriptor.getFamilies().isEmpty()) {
690       // initialize the thread pool for opening stores in parallel.
691       ThreadPoolExecutor storeOpenerThreadPool =
692         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
693       CompletionService<HStore> completionService =
694         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
695 
696       // initialize each store in parallel
697       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
698         status.setStatus("Instantiating store for column family " + family);
699         completionService.submit(new Callable<HStore>() {
700           @Override
701           public HStore call() throws IOException {
702             return instantiateHStore(family);
703           }
704         });
705       }
706       boolean allStoresOpened = false;
707       try {
708         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
709           Future<HStore> future = completionService.take();
710           HStore store = future.get();
711 
712           this.stores.put(store.getColumnFamilyName().getBytes(), store);
713           // Do not include bulk loaded files when determining seqIdForReplay
714           long storeSeqIdForReplay = store.getMaxSequenceId(false);
715           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
716               storeSeqIdForReplay);
717           // Include bulk loaded files when determining seqIdForAssignment
718           long storeSeqIdForAssignment = store.getMaxSequenceId(true);
719           if (maxSeqId == -1 || storeSeqIdForAssignment > maxSeqId) {
720             maxSeqId = storeSeqIdForAssignment;
721           }
722           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
723           if (maxStoreMemstoreTS > maxMemstoreTS) {
724             maxMemstoreTS = maxStoreMemstoreTS;
725           }
726         }
727         allStoresOpened = true;
728       } catch (InterruptedException e) {
729         throw (InterruptedIOException)new InterruptedIOException().initCause(e);
730       } catch (ExecutionException e) {
731         throw new IOException(e.getCause());
732       } finally {
733         storeOpenerThreadPool.shutdownNow();
734         if (!allStoresOpened) {
735           // something went wrong, close all opened stores
736           LOG.error("Could not initialize all stores for the region=" + this);
737           for (Store store : this.stores.values()) {
738             try {
739               store.close();
740             } catch (IOException e) { 
741               LOG.warn(e.getMessage());
742             }
743           }
744         }
745       }
746     }
747     mvcc.initialize(maxMemstoreTS + 1);
748     // Recover any edits if available.
749     maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(
750         this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
751     return maxSeqId;
752   }
753 
754   /**
755    * @return True if this region has references.
756    */
757   public boolean hasReferences() {
758     for (Store store : this.stores.values()) {
759       if (store.hasReferences()) return true;
760     }
761     return false;
762   }
763 
764   /**
765    * This function will return the HDFS blocks distribution based on the data
766    * captured when the HFiles were created.
767    * @return The HDFS blocks distribution for the region.
768    */
769   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
770     HDFSBlocksDistribution hdfsBlocksDistribution =
771       new HDFSBlocksDistribution();
772     synchronized (this.stores) {
773       for (Store store : this.stores.values()) {
774         for (StoreFile sf : store.getStorefiles()) {
775           HDFSBlocksDistribution storeFileBlocksDistribution =
776             sf.getHDFSBlockDistribution();
777           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
778         }
779       }
780     }
781     return hdfsBlocksDistribution;
782   }
783 
784   /**
785    * This is a helper function to compute HDFS block distribution on demand
786    * @param conf configuration
787    * @param tableDescriptor HTableDescriptor of the table
788    * @param regionInfo the HRegionInfo describing the region
789    * @return The HDFS blocks distribution for the given region.
790    * @throws IOException
791    */
792   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
793       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
794     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
795     return computeHDFSBlocksDistribution(conf, tableDescriptor, regionInfo, tablePath);
796   }
797 
798   /**
799    * This is a helper function to compute HDFS block distribution on demand
800    * @param conf configuration
801    * @param tableDescriptor HTableDescriptor of the table
802    * @param regionInfo the HRegionInfo describing the region
803    * @param tablePath the table directory
804    * @return The HDFS blocks distribution for the given region.
805    * @throws IOException
806    */
807   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
808       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo,  Path tablePath)
809       throws IOException {
810     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
811     FileSystem fs = tablePath.getFileSystem(conf);
812 
813     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
814     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
815       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
816       if (storeFiles == null) continue;
817 
818       for (StoreFileInfo storeFileInfo : storeFiles) {
819         hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
820       }
821     }
822     return hdfsBlocksDistribution;
823   }
824 
825   public AtomicLong getMemstoreSize() {
826     return memstoreSize;
827   }
828 
829   /**
830    * Increase the size of mem store in this region and the size of global mem
831    * store
832    * @param memStoreSize the delta by which to change the memstore size
833    * @return the memstore size in this region before the delta was applied
834    */
835   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
836     if (this.rsAccounting != null) {
837       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
838     }
839     return this.memstoreSize.getAndAdd(memStoreSize);
840   }
841 
842   /** @return a HRegionInfo object for this region */
843   public HRegionInfo getRegionInfo() {
844     return this.fs.getRegionInfo();
845   }
846 
847   /**
848    * @return Instance of {@link RegionServerServices} used by this HRegion.
849    * Can be null.
850    */
851   RegionServerServices getRegionServerServices() {
852     return this.rsServices;
853   }
854 
855   /** @return readRequestsCount for this region */
856   long getReadRequestsCount() {
857     return this.readRequestsCount.get();
858   }
859 
860   /** @return writeRequestsCount for this region */
861   long getWriteRequestsCount() {
862     return this.writeRequestsCount.get();
863   }
864 
865   MetricsRegion getMetrics() {
866     return metricsRegion;
867   }
868 
869   /** @return true if region is closed */
870   public boolean isClosed() {
871     return this.closed.get();
872   }
873 
874   /**
875    * @return True if closing process has started.
876    */
877   public boolean isClosing() {
878     return this.closing.get();
879   }
880 
881   /**
882    * Reset recovering state of current region
883    * @param newState
884    */
885   public void setRecovering(boolean newState) {
886     boolean wasRecovering = this.isRecovering;
887     this.isRecovering = newState;
888     if (wasRecovering && !isRecovering) {
889       // Call only when log replay is over.
890       coprocessorHost.postLogReplay();
891     }
892   }
893 
894   /**
895    * @return True if current region is in recovering
896    */
897   public boolean isRecovering() {
898     return this.isRecovering;
899   }
900 
901   /** @return true if region is available (not closed and not closing) */
902   public boolean isAvailable() {
903     return !isClosed() && !isClosing();
904   }
905 
906   /** @return true if region is splittable */
907   public boolean isSplittable() {
908     return isAvailable() && !hasReferences();
909   }
910 
911   /**
912    * @return true if region is mergeable
913    */
914   public boolean isMergeable() {
915     if (!isAvailable()) {
916       LOG.debug("Region " + this.getRegionNameAsString()
917           + " is not mergeable because it is closing or closed");
918       return false;
919     }
920     if (hasReferences()) {
921       LOG.debug("Region " + this.getRegionNameAsString()
922           + " is not mergeable because it has references");
923       return false;
924     }
925 
926     return true;
927   }
928 
929   public boolean areWritesEnabled() {
930     synchronized(this.writestate) {
931       return this.writestate.writesEnabled;
932     }
933   }
934 
935    public MultiVersionConsistencyControl getMVCC() {
936      return mvcc;
937    }
938 
939    /*
940     * Returns readpoint considering given IsolationLevel
941     */
942    public long getReadpoint(IsolationLevel isolationLevel) {
943      if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
944        // This scan can read even uncommitted transactions
945        return Long.MAX_VALUE;
946      }
947      return mvcc.memstoreReadPoint();
948    }
949 
950    public boolean isLoadingCfsOnDemandDefault() {
951      return this.isLoadingCfsOnDemandDefault;
952    }
953 
954   /**
955    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
956    * service any more calls.
957    *
958    * <p>This method could take some time to execute, so don't call it from a
959    * time-sensitive thread.
960    *
961    * @return Map of family name to the list of storage files that the HRegion's component
962    * HStores make use of.  Can be null if the region is already closed or if it is
963    * judged that the region should not close at this time.
964    *
965    * @throws IOException e
966    */
967   public Map<byte[], List<StoreFile>> close() throws IOException {
968     return close(false);
969   }
970 
971   private final Object closeLock = new Object();
972 
973   /** Conf key for the periodic flush interval */
974   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
975       "hbase.regionserver.optionalcacheflushinterval";
976   /** Default interval for the memstore flush */
977   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
978 
979   /** Conf key to force a flush if there are already enough changes for one region in memstore */
980   public static final String MEMSTORE_FLUSH_PER_CHANGES =
981       "hbase.regionserver.flush.per.changes";
982   public static final long DEFAULT_FLUSH_PER_CHANGES = 30000000; // 30 million
983   /**
984    * The following MAX_FLUSH_PER_CHANGES is large enough because each KeyValue has 20+ bytes
985    * overhead. Therefore, even 1G empty KVs occupy at least 20GB memstore size for a single region
986    */
987   public static final long MAX_FLUSH_PER_CHANGES = 1000000000; // 1G
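  /*
   * Illustrative sketch of how these flush triggers might be tuned; the keys are the
   * constants defined above, while the values chosen here are only examples:
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   conf.setInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL, 2 * 3600000); // flush at least every 2 hours
   *   conf.setLong(MEMSTORE_FLUSH_PER_CHANGES, 10000000L);        // or once ~10M changes accumulate
   */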
988 
989   /**
990    * Close down this HRegion.  Flush the cache unless the abort parameter is true,
991    * shut down each HStore, and don't service any more calls.
992    *
993    * This method could take some time to execute, so don't call it from a
994    * time-sensitive thread.
995    *
996    * @param abort true if server is aborting (only during testing)
997    * @return Vector of all the storage files that the HRegion's component
998    * HStores make use of.  It's a list of HStoreFile objects.  Can be null if
999    * we are not to close at this time or we are already closed.
1000    *
1001    * @throws IOException e
1002    */
1003   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
1004     // Only allow one thread to close at a time. Serialize them so dual
1005     // threads attempting to close will run up against each other.
1006     MonitoredTask status = TaskMonitor.get().createStatus(
1007         "Closing region " + this +
1008         (abort ? " due to abort" : ""));
1009 
1010     status.setStatus("Waiting for close lock");
1011     try {
1012       synchronized (closeLock) {
1013         return doClose(abort, status);
1014       }
1015     } finally {
1016       status.cleanup();
1017     }
1018   }
1019 
1020   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
1021       throws IOException {
1022     if (isClosed()) {
1023       LOG.warn("Region " + this + " already closed");
1024       return null;
1025     }
1026 
1027     if (coprocessorHost != null) {
1028       status.setStatus("Running coprocessor pre-close hooks");
1029       this.coprocessorHost.preClose(abort);
1030     }
1031 
1032     status.setStatus("Disabling compacts and flushes for region");
1033     boolean wasFlushing;
1034     synchronized (writestate) {
1035       // Disable compacting and flushing by background threads for this
1036       // region.
1037       writestate.writesEnabled = false;
1038       wasFlushing = writestate.flushing;
1039       LOG.debug("Closing " + this + ": disabling compactions & flushes");
1040       waitForFlushesAndCompactions();
1041     }
1042     // If we were not just flushing, is it worth doing a preflush...one
1043     // that will clear out the bulk of the memstore before we put up
1044     // the close flag?
1045     if (!abort && !wasFlushing && worthPreFlushing()) {
1046       status.setStatus("Pre-flushing region before close");
1047       LOG.info("Running close preflush of " + this.getRegionNameAsString());
1048       internalFlushcache(status);
1049     }
1050 
1051     this.closing.set(true);
1052     status.setStatus("Disabling writes for close");
1053     // block waiting for the lock for closing
1054     lock.writeLock().lock();
1055     try {
1056       if (this.isClosed()) {
1057         status.abort("Already got closed by another process");
1058         // SplitTransaction handles the null
1059         return null;
1060       }
1061       LOG.debug("Updates disabled for region " + this);
1062       // Don't flush the cache if we are aborting
1063       if (!abort) {
1064         internalFlushcache(status);
1065       }
1066 
1067       Map<byte[], List<StoreFile>> result =
1068         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1069       if (!stores.isEmpty()) {
1070         // initialize the thread pool for closing stores in parallel.
1071         ThreadPoolExecutor storeCloserThreadPool =
1072           getStoreOpenAndCloseThreadPool("StoreCloserThread-" + this.getRegionNameAsString());
1073         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1074           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1075 
1076         // close each store in parallel
1077         for (final Store store : stores.values()) {
1078           completionService
1079               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1080                 @Override
1081                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1082                   return new Pair<byte[], Collection<StoreFile>>(
1083                     store.getFamily().getName(), store.close());
1084                 }
1085               });
1086         }
1087         try {
1088           for (int i = 0; i < stores.size(); i++) {
1089             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1090             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1091             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1092             if (familyFiles == null) {
1093               familyFiles = new ArrayList<StoreFile>();
1094               result.put(storeFiles.getFirst(), familyFiles);
1095             }
1096             familyFiles.addAll(storeFiles.getSecond());
1097           }
1098         } catch (InterruptedException e) {
1099           throw (InterruptedIOException)new InterruptedIOException().initCause(e);
1100         } catch (ExecutionException e) {
1101           throw new IOException(e.getCause());
1102         } finally {
1103           storeCloserThreadPool.shutdownNow();
1104         }
1105       }
1106       this.closed.set(true);
1107 
1108       if (coprocessorHost != null) {
1109         status.setStatus("Running coprocessor post-close hooks");
1110         this.coprocessorHost.postClose(abort);
1111       }
1112       if ( this.metricsRegion != null) {
1113         this.metricsRegion.close();
1114       }
1115       if ( this.metricsRegionWrapper != null) {
1116         Closeables.closeQuietly(this.metricsRegionWrapper);
1117       }
1118       status.markComplete("Closed");
1119       LOG.info("Closed " + this);
1120       return result;
1121     } finally {
1122       lock.writeLock().unlock();
1123     }
1124   }
1125 
1126   /**
1127    * Wait for all current flushes and compactions of the region to complete.
1128    * <p>
1129    * Exposed for TESTING.
1130    */
1131   public void waitForFlushesAndCompactions() {
1132     synchronized (writestate) {
1133       while (writestate.compacting > 0 || writestate.flushing) {
1134         LOG.debug("waiting for " + writestate.compacting + " compactions"
1135             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1136         try {
1137           writestate.wait();
1138         } catch (InterruptedException iex) {
1139           // essentially ignore and propagate the interrupt back up
1140           Thread.currentThread().interrupt();
1141         }
1142       }
1143     }
1144   }
1145 
1146   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1147       final String threadNamePrefix) {
1148     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1149     int maxThreads = Math.min(numStores,
1150         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1151             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1152     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1153   }
1154 
1155   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1156       final String threadNamePrefix) {
1157     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1158     int maxThreads = Math.max(1,
1159         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1160             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1161             / numStores);
1162     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1163   }
1164 
1165   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1166       final String threadNamePrefix) {
1167     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1168       new ThreadFactory() {
1169         private int count = 1;
1170 
1171         @Override
1172         public Thread newThread(Runnable r) {
1173           return new Thread(r, threadNamePrefix + "-" + count++);
1174         }
1175       });
1176   }
1177 
1178    /**
1179     * @return True if it's worth doing a flush before we put up the close flag.
1180     */
1181   private boolean worthPreFlushing() {
1182     return this.memstoreSize.get() >
1183       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1184   }
1185 
1186   //////////////////////////////////////////////////////////////////////////////
1187   // HRegion accessors
1188   //////////////////////////////////////////////////////////////////////////////
1189 
1190   /** @return start key for region */
1191   public byte [] getStartKey() {
1192     return this.getRegionInfo().getStartKey();
1193   }
1194 
1195   /** @return end key for region */
1196   public byte [] getEndKey() {
1197     return this.getRegionInfo().getEndKey();
1198   }
1199 
1200   /** @return region id */
1201   public long getRegionId() {
1202     return this.getRegionInfo().getRegionId();
1203   }
1204 
1205   /** @return region name */
1206   public byte [] getRegionName() {
1207     return this.getRegionInfo().getRegionName();
1208   }
1209 
1210   /** @return region name as string for logging */
1211   public String getRegionNameAsString() {
1212     return this.getRegionInfo().getRegionNameAsString();
1213   }
1214 
1215   /** @return HTableDescriptor for this region */
1216   public HTableDescriptor getTableDesc() {
1217     return this.htableDescriptor;
1218   }
1219 
1220   /** @return HLog in use for this region */
1221   public HLog getLog() {
1222     return this.log;
1223   }
1224 
1225   /**
1226    * A split takes the config from the parent region & passes it to the daughter
1227    * region's constructor. If 'conf' was passed, you would end up using the HTD
1228    * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1229    * to the daughter regions to avoid this tricky dedupe problem.
1230    * @return Configuration object
1231    */
1232   Configuration getBaseConf() {
1233     return this.baseConf;
1234   }
1235 
1236   /** @return {@link FileSystem} being used by this region */
1237   public FileSystem getFilesystem() {
1238     return fs.getFileSystem();
1239   }
1240 
1241   /** @return the {@link HRegionFileSystem} used by this region */
1242   public HRegionFileSystem getRegionFileSystem() {
1243     return this.fs;
1244   }
1245 
1246   /** @return the last time the region was flushed */
1247   public long getLastFlushTime() {
1248     return this.lastFlushTime;
1249   }
1250 
1251   //////////////////////////////////////////////////////////////////////////////
1252   // HRegion maintenance.
1253   //
1254   // These methods are meant to be called periodically by the HRegionServer for
1255   // upkeep.
1256   //////////////////////////////////////////////////////////////////////////////
1257 
1258   /** @return the size of the largest HStore. */
1259   public long getLargestHStoreSize() {
1260     long size = 0;
1261     for (Store h : stores.values()) {
1262       long storeSize = h.getSize();
1263       if (storeSize > size) {
1264         size = storeSize;
1265       }
1266     }
1267     return size;
1268   }
1269 
1270   /**
1271    * @return KeyValue Comparator
1272    */
1273   public KeyValue.KVComparator getComparator() {
1274     return this.comparator;
1275   }
1276 
1277   /*
1278    * Do preparation for pending compaction.
1279    * @throws IOException
1280    */
1281   protected void doRegionCompactionPrep() throws IOException {
1282   }
1283 
1284   void triggerMajorCompaction() {
1285     for (Store h : stores.values()) {
1286       h.triggerMajorCompaction();
1287     }
1288   }
1289 
1290   /**
1291    * This is a helper function that compacts all the stores synchronously.
1292    * It is used by utilities and testing.
1293    *
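   * <p>For example (illustrative only; typically invoked from tests or maintenance
   * utilities rather than normal server code):
   * <pre>
   *   // Force a major compaction of every store in this region.
   *   region.compactStores(true);
   * </pre>
   *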
1294    * @param majorCompaction True to force a major compaction regardless of thresholds
1295    * @throws IOException e
1296    */
1297   public void compactStores(final boolean majorCompaction)
1298   throws IOException {
1299     if (majorCompaction) {
1300       this.triggerMajorCompaction();
1301     }
1302     compactStores();
1303   }
1304 
1305   /**
1306    * This is a helper function that compacts all the stores synchronously.
1307    * It is used by utilities and testing.
1308    *
1309    * @throws IOException e
1310    */
1311   public void compactStores() throws IOException {
1312     for (Store s : getStores().values()) {
1313       CompactionContext compaction = s.requestCompaction();
1314       if (compaction != null) {
1315         compact(compaction, s);
1316       }
1317     }
1318   }
1319 
1320   /*
1321    * Called by compaction thread and after region is opened to compact the
1322    * HStores if necessary.
1323    *
1324    * <p>This operation could block for a long time, so don't call it from a
1325    * time-sensitive thread.
1326    *
1327    * Note that no locking is necessary at this level because compaction only
1328    * conflicts with a region split, and that cannot happen because the region
1329    * server does them sequentially and not in parallel.
1330    *
1331    * @param compaction Compaction details, obtained by requestCompaction()
1332    * @return whether the compaction completed
1333    * @throws IOException e
1334    */
1335   public boolean compact(CompactionContext compaction, Store store) throws IOException {
1336     assert compaction != null && compaction.hasSelection();
1337     assert !compaction.getRequest().getFiles().isEmpty();
1338     if (this.closing.get() || this.closed.get()) {
1339       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1340       store.cancelRequestedCompaction(compaction);
1341       return false;
1342     }
1343     MonitoredTask status = null;
1344     boolean didPerformCompaction = false;
1345     // block waiting for the lock for compaction
1346     lock.readLock().lock();
1347     try {
1348       byte[] cf = Bytes.toBytes(store.getColumnFamilyName());
1349       if (stores.get(cf) != store) {
1350         LOG.warn("Store " + store.getColumnFamilyName() + " on region " + this
1351             + " has been re-instantiated, cancel this compaction request. "
1352             + " It may be caused by the roll back of split transaction");
1353         return false;
1354       }
1355 
1356       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1357       if (this.closed.get()) {
1358         String msg = "Skipping compaction on " + this + " because closed";
1359         LOG.debug(msg);
1360         status.abort(msg);
1361         return false;
1362       }
1363       boolean wasStateSet = false;
1364       try {
1365         synchronized (writestate) {
1366           if (writestate.writesEnabled) {
1367             wasStateSet = true;
1368             ++writestate.compacting;
1369           } else {
1370             String msg = "NOT compacting region " + this + ". Writes disabled.";
1371             LOG.info(msg);
1372             status.abort(msg);
1373             return false;
1374           }
1375         }
1376         LOG.info("Starting compaction on " + store + " in region " + this
1377             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1378         doRegionCompactionPrep();
1379         try {
1380           status.setStatus("Compacting store " + store);
1381           didPerformCompaction = true;
1382           store.compact(compaction);
1383         } catch (InterruptedIOException iioe) {
1384           String msg = "compaction interrupted";
1385           LOG.info(msg, iioe);
1386           status.abort(msg);
1387           return false;
1388         }
1389       } finally {
1390         if (wasStateSet) {
1391           synchronized (writestate) {
1392             --writestate.compacting;
1393             if (writestate.compacting <= 0) {
1394               writestate.notifyAll();
1395             }
1396           }
1397         }
1398       }
1399       status.markComplete("Compaction complete");
1400       return true;
1401     } finally {
1402       try {
1403         if (!didPerformCompaction) store.cancelRequestedCompaction(compaction);
1404         if (status != null) status.cleanup();
1405       } finally {
1406         lock.readLock().unlock();
1407       }
1408     }
1409   }
1410 
1411   /**
1412    * Flush the cache.
1413    *
1414    * When this method is called the cache will be flushed unless:
1415    * <ol>
1416    *   <li>the cache is empty</li>
1417    *   <li>the region is closed.</li>
1418    *   <li>a flush is already in progress</li>
1419    *   <li>writes are disabled</li>
1420    * </ol>
1421    *
1422    * <p>This method may block for some time, so it should not be called from a
1423    * time-sensitive thread.
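   *
   * <p>A minimal caller sketch (e.g. from a test utility; assumes an open
   * {@code HRegion} named {@code region}):
   * <pre>
   *   boolean shouldCompact = region.flushcache();
   *   if (shouldCompact) {
   *     region.compactStores();
   *   }
   * </pre>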
1424    *
1425    * @return true if the region needs compacting
1426    *
1427    * @throws IOException general io exceptions
1428    * @throws DroppedSnapshotException Thrown when replay of hlog is required
1429    * because a Snapshot was not properly persisted.
1430    */
1431   public boolean flushcache() throws IOException {
1432     // fail-fast instead of waiting on the lock
1433     if (this.closing.get()) {
1434       LOG.debug("Skipping flush on " + this + " because closing");
1435       return false;
1436     }
1437     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
1438     status.setStatus("Acquiring readlock on region");
1439     // block waiting for the lock for flushing cache
1440     lock.readLock().lock();
1441     try {
1442       if (this.closed.get()) {
1443         LOG.debug("Skipping flush on " + this + " because closed");
1444         status.abort("Skipped: closed");
1445         return false;
1446       }
1447       if (coprocessorHost != null) {
1448         status.setStatus("Running coprocessor pre-flush hooks");
1449         coprocessorHost.preFlush();
1450       }
1451       if (numMutationsWithoutWAL.get() > 0) {
1452         numMutationsWithoutWAL.set(0);
1453         dataInMemoryWithoutWAL.set(0);
1454       }
1455       synchronized (writestate) {
1456         if (!writestate.flushing && writestate.writesEnabled) {
1457           this.writestate.flushing = true;
1458         } else {
1459           if (LOG.isDebugEnabled()) {
1460             LOG.debug("NOT flushing memstore for region " + this
1461                 + ", flushing=" + writestate.flushing + ", writesEnabled="
1462                 + writestate.writesEnabled);
1463           }
1464           status.abort("Not flushing since "
1465               + (writestate.flushing ? "already flushing"
1466                   : "writes not enabled"));
1467           return false;
1468         }
1469       }
1470       try {
1471         boolean result = internalFlushcache(status);
1472 
1473         if (coprocessorHost != null) {
1474           status.setStatus("Running post-flush coprocessor hooks");
1475           coprocessorHost.postFlush();
1476         }
1477 
1478         status.markComplete("Flush successful");
1479         return result;
1480       } finally {
1481         synchronized (writestate) {
1482           writestate.flushing = false;
1483           this.writestate.flushRequested = false;
1484           writestate.notifyAll();
1485         }
1486       }
1487     } finally {
1488       lock.readLock().unlock();
1489       status.cleanup();
1490     }
1491   }
1492 
1493   /**
1494    * Should the memstore be flushed now?
1495    */
1496   boolean shouldFlush() {
1497     if(this.completeSequenceId + this.flushPerChanges < this.sequenceId.get()) {
1498       return true;
1499     }
1500     if (flushCheckInterval <= 0) { //disabled
1501       return false;
1502     }
1503     long now = EnvironmentEdgeManager.currentTimeMillis();
1504     //if we flushed in the recent past, we don't need to do again now
1505     if ((now - getLastFlushTime() < flushCheckInterval)) {
1506       return false;
1507     }
1508     //since we didn't flush in the recent past, flush now if certain conditions
1509     //are met. Return true on first such memstore hit.
1510     for (Store s : this.getStores().values()) {
1511       if (s.timeOfOldestEdit() < now - flushCheckInterval) {
1512         // we have an old enough edit in the memstore, flush
1513         return true;
1514       }
1515     }
1516     return false;
1517   }
1518 
1519   /**
1520    * Flush the memstore.
1521    *
1522    * Flushing the memstore is a little tricky. We have a lot of updates in the
1523    * memstore, all of which have also been written to the log. We need to
1524    * write those updates in the memstore out to disk, while being able to
1525    * process reads/writes as much as possible during the flush operation. Also,
1526    * the log has to state clearly the point in time at which the memstore was
1527    * flushed. (That way, during recovery, we know when we can rely on the
1528    * on-disk flushed structures and when we have to recover the memstore from
1529    * the log.)
1530    *
1531    * <p>So, we have a three-step process:
1532    *
1533    * <ul><li>A. Flush the memstore to the on-disk stores, noting the current
1534    * sequence ID for the log.</li>
1535    *
1536    * <li>B. Write a FLUSHCACHE-COMPLETE message to the log, using the sequence
1537    * ID that was current at the time of memstore-flush.</li>
1538    *
1539    * <li>C. Get rid of the memstore structures that are now redundant, as
1540    * they've been flushed to the on-disk HStores.</li>
1541    * </ul>
1542    * <p>This method is protected, but can be accessed via several public
1543    * routes.
1544    *
1545    * <p> This method may block for some time.
1546    * @param status the task monitor to update with flush progress
1547    *
1548    * @return true if the region needs compacting
1549    *
1550    * @throws IOException general io exceptions
1551    * @throws DroppedSnapshotException Thrown when replay of hlog is required
1552    * because a Snapshot was not properly persisted.
1553    */
1554   protected boolean internalFlushcache(MonitoredTask status)
1555       throws IOException {
1556     return internalFlushcache(this.log, -1, status);
1557   }
1558 
1559   /**
1560    * @param wal Null if we're NOT to go via hlog/wal.
1561    * @param myseqid The seqid to use if <code>wal</code> is null writing out
1562    * flush file.
1563    * @param status the task monitor to update with flush progress
1564    * @return true if the region needs compacting
1565    * @throws IOException
1566    * @see #internalFlushcache(MonitoredTask)
1567    */
1568   protected boolean internalFlushcache(
1569       final HLog wal, final long myseqid, MonitoredTask status)
1570   throws IOException {
1571     if (this.rsServices != null && this.rsServices.isAborted()) {
1572       // Don't flush when server aborting, it's unsafe
1573       throw new IOException("Aborting flush because server is abortted...");
1574     }
1575     final long startTime = EnvironmentEdgeManager.currentTimeMillis();
1576     // Clear flush flag.
1577     // If nothing to flush, return and avoid logging start/stop flush.
1578     if (this.memstoreSize.get() <= 0) {
1579       return false;
1580     }
1581     if (LOG.isDebugEnabled()) {
1582       LOG.debug("Started memstore flush for " + this +
1583         ", current region memstore size " +
1584         StringUtils.humanReadableInt(this.memstoreSize.get()) +
1585         ((wal != null)? "": "; wal is null, using passed sequenceid=" + myseqid));
1586     }
1587 
1588     // Stop updates while we snapshot the memstore of all stores. We only have
1589     // to do this for a moment.  It's quick.  The subsequent sequence id that
1590     // goes into the HLog after we've flushed all these snapshots also goes
1591     // into the info file that sits beside the flushed files.
1592     // We also set the memstore size to zero here before we allow updates
1593     // again so its value will represent the size of the updates received
1594     // during the flush
1595     MultiVersionConsistencyControl.WriteEntry w = null;
1596 
1597     // We have to take a write lock during snapshot, or else a write could
1598     // end up in both snapshot and memstore (makes it difficult to do atomic
1599     // rows then)
1600     status.setStatus("Obtaining lock to block concurrent updates");
1601     // block waiting for the lock for internal flush
1602     this.updatesLock.writeLock().lock();
1603     long flushsize = this.memstoreSize.get();
1604     status.setStatus("Preparing to flush by snapshotting stores");
1605     List<StoreFlushContext> storeFlushCtxs = new ArrayList<StoreFlushContext>(stores.size());
1606     long flushSeqId = -1L;
1607     try {
1608       // Record the mvcc for all transactions in progress.
1609       w = mvcc.beginMemstoreInsert();
1610       mvcc.advanceMemstore(w);
1611       // check if it is not closing.
1612       if (wal != null) {
1613         if (!wal.startCacheFlush(this.getRegionInfo().getEncodedNameAsBytes())) {
1614           status.setStatus("Flush will not be started for ["
1615               + this.getRegionInfo().getEncodedName() + "] - because the WAL is closing.");
1616           return false;
1617         }
1618         flushSeqId = this.sequenceId.incrementAndGet();
1619       } else {
1620         // use the provided sequence Id as WAL is not being used for this flush.
1621         flushSeqId = myseqid;
1622       }
1623 
1624       for (Store s : stores.values()) {
1625         storeFlushCtxs.add(s.createFlushContext(flushSeqId));
1626       }
1627 
1628       // prepare flush (take a snapshot)
1629       for (StoreFlushContext flush : storeFlushCtxs) {
1630         flush.prepare();
1631       }
1632     } finally {
1633       this.updatesLock.writeLock().unlock();
1634     }
1635     String s = "Finished memstore snapshotting " + this +
1636       ", syncing WAL and waiting on mvcc, flushsize=" + flushsize;
1637     status.setStatus(s);
1638     if (LOG.isTraceEnabled()) LOG.trace(s);
1639 
1640     // sync unflushed WAL changes when deferred log sync is enabled
1641     // see HBASE-8208 for details
1642     if (wal != null && !shouldSyncLog()) {
1643       wal.sync();
1644     }
1645 
1646     // wait for all in-progress transactions to commit to HLog before
1647     // we can start the flush. This prevents
1648     // uncommitted transactions from being written into HFiles.
1649     // We have to block before we start the flush, otherwise keys that
1650     // were removed via a rollbackMemstore could be written to HFiles.
1651     mvcc.waitForRead(w);
1652 
1653     s = "Flushing stores of " + this;
1654     status.setStatus(s);
1655     if (LOG.isTraceEnabled()) LOG.trace(s);
1656 
1657     // Any failure from here on out will be catastrophic requiring server
1658     // restart so hlog content can be replayed and put back into the memstore.
1659     // Otherwise, the snapshot content, while backed up in the hlog, will not
1660     // be part of the current running server's state.
1661     boolean compactionRequested = false;
1662     try {
1663       // A.  Flush memstore to all the HStores.
1664       // Keep running vector of all store files that includes both old and the
1665       // just-made new flush store file. The new flushed file is still in the
1666       // tmp directory.
1667 
1668       for (StoreFlushContext flush : storeFlushCtxs) {
1669         flush.flushCache(status);
1670       }
1671 
1672       // Switch snapshot (in memstore) -> new hfile (thus causing
1673       // all the store scanners to reset/reseek).
1674       for (StoreFlushContext flush : storeFlushCtxs) {
1675         boolean needsCompaction = flush.commit(status);
1676         if (needsCompaction) {
1677           compactionRequested = true;
1678         }
1679       }
1680       storeFlushCtxs.clear();
1681 
1682       // Set down the memstore size by amount of flush.
1683       this.addAndGetGlobalMemstoreSize(-flushsize);
1684     } catch (Throwable t) {
1685       // An exception here means that the snapshot was not persisted.
1686       // The hlog needs to be replayed so its content is restored to memstore.
1687       // Currently, only a server restart will do this.
1688       // We used to only catch IOEs but its possible that we'd get other
1689       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
1690       // all and sundry.
1691       if (wal != null) {
1692         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1693       }
1694       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
1695           Bytes.toStringBinary(getRegionName()));
1696       dse.initCause(t);
1697       status.abort("Flush failed: " + StringUtils.stringifyException(t));
1698       throw dse;
1699     }
1700 
1701     // If we get to here, the HStores have been written.
1702     if (wal != null) {
1703       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1704     }
1705 
1706     // Record latest flush time
1707     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
1708 
1709     // Update the last flushed sequence id for region
1710     completeSequenceId = flushSeqId;
1711 
1712     // C. Finally notify anyone waiting on memstore to clear:
1713     // e.g. checkResources().
1714     synchronized (this) {
1715       notifyAll(); // FindBugs NN_NAKED_NOTIFY
1716     }
1717 
1718     long time = EnvironmentEdgeManager.currentTimeMillis() - startTime;
1719     long memstoresize = this.memstoreSize.get();
1720     String msg = "Finished memstore flush of ~" +
1721       StringUtils.humanReadableInt(flushsize) + "/" + flushsize +
1722       ", currentsize=" +
1723       StringUtils.humanReadableInt(memstoresize) + "/" + memstoresize +
1724       " for region " + this + " in " + time + "ms, sequenceid=" + flushSeqId +
1725       ", compaction requested=" + compactionRequested +
1726       ((wal == null)? "; wal=null": "");
1727     LOG.info(msg);
1728     status.setStatus(msg);
1729     this.recentFlushes.add(new Pair<Long,Long>(time/1000, flushsize));
1730 
1731     return compactionRequested;
1732   }
1733 
1734   //////////////////////////////////////////////////////////////////////////////
1735   // get() methods for client use.
1736   //////////////////////////////////////////////////////////////////////////////
1737   /**
1738    * Return all the data for the row that matches <i>row</i> exactly,
1739    * or the one that immediately precedes it, at or immediately before
1740    * <i>ts</i>.
1741    *
1742    * @param row row key
1743    * @return map of values
1744    * @throws IOException
1745    */
1746   Result getClosestRowBefore(final byte [] row)
1747   throws IOException{
1748     return getClosestRowBefore(row, HConstants.CATALOG_FAMILY);
1749   }
1750 
1751   /**
1752    * Return all the data for the row that matches <i>row</i> exactly,
1753    * or the one that immediately precedes it, at or immediately before
1754    * <i>ts</i>.
1755    *
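   * <p>Illustrative use (a sketch; assumes an open {@code HRegion} named
   * {@code region} and an existing family {@code fam}):
   * <pre>
   *   Result r = region.getClosestRowBefore(Bytes.toBytes("row-42"), fam);
   *   // r may be null when no row at or before the requested one exists
   * </pre>
   *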
1756    * @param row row key
1757    * @param family column family to find on
1758    * @return map of values
1759    * @throws IOException read exceptions
1760    */
1761   public Result getClosestRowBefore(final byte [] row, final byte [] family)
1762   throws IOException {
1763     if (coprocessorHost != null) {
1764       Result result = new Result();
1765       if (coprocessorHost.preGetClosestRowBefore(row, family, result)) {
1766         return result;
1767       }
1768     }
1769     // look across all the HStores for this region and determine what the
1770     // closest key is across all column families, since the data may be sparse
1771     checkRow(row, "getClosestRowBefore");
1772     startRegionOperation(Operation.GET);
1773     this.readRequestsCount.increment();
1774     try {
1775       Store store = getStore(family);
1776       // get the closest key. (HStore.getRowKeyAtOrBefore can return null)
1777       KeyValue key = store.getRowKeyAtOrBefore(row);
1778       Result result = null;
1779       if (key != null) {
1780         Get get = new Get(key.getRow());
1781         get.addFamily(family);
1782         result = get(get);
1783       }
1784       if (coprocessorHost != null) {
1785         coprocessorHost.postGetClosestRowBefore(row, family, result);
1786       }
1787       return result;
1788     } finally {
1789       closeRegionOperation(Operation.GET);
1790     }
1791   }
1792 
1793   /**
1794    * Return a scanner that iterates over the HRegion, returning the indicated
1795    * columns and rows specified by the {@link Scan}.
1796    * <p>
1797    * The returned scanner must be closed by the caller.
1798    *
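   * <p>Illustrative use (a sketch; assumes an open {@code HRegion} named
   * {@code region}):
   * <pre>
   *   RegionScanner scanner = region.getScanner(new Scan());
   *   try {
   *     List&lt;Cell&gt; cells = new ArrayList&lt;Cell&gt;();
   *     boolean moreRows;
   *     do {
   *       moreRows = scanner.next(cells);
   *       // process the cells of the current row ...
   *       cells.clear();
   *     } while (moreRows);
   *   } finally {
   *     scanner.close();
   *   }
   * </pre>
   *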
1799    * @param scan configured {@link Scan}
1800    * @return RegionScanner
1801    * @throws IOException read exceptions
1802    */
1803   public RegionScanner getScanner(Scan scan) throws IOException {
1804    return getScanner(scan, null);
1805   }
1806 
1807   void prepareScanner(Scan scan) throws IOException {
1808     if(!scan.hasFamilies()) {
1809       // Adding all families to scanner
1810       for(byte[] family: this.htableDescriptor.getFamiliesKeys()){
1811         scan.addFamily(family);
1812       }
1813     }
1814   }
1815 
1816   protected RegionScanner getScanner(Scan scan,
1817       List<KeyValueScanner> additionalScanners) throws IOException {
1818     startRegionOperation(Operation.SCAN);
1819     try {
1820       // Verify families are all valid
1821       prepareScanner(scan);
1822       if(scan.hasFamilies()) {
1823         for(byte [] family : scan.getFamilyMap().keySet()) {
1824           checkFamily(family);
1825         }
1826       }
1827       return instantiateRegionScanner(scan, additionalScanners);
1828     } finally {
1829       closeRegionOperation(Operation.SCAN);
1830     }
1831   }
1832 
1833   protected RegionScanner instantiateRegionScanner(Scan scan,
1834       List<KeyValueScanner> additionalScanners) throws IOException {
1835     if (scan.isReversed()) {
1836       if (scan.getFilter() != null) {
1837         scan.getFilter().setReversed(true);
1838       }
1839       return new ReversedRegionScannerImpl(scan, additionalScanners, this);
1840     }
1841     return new RegionScannerImpl(scan, additionalScanners, this);
1842   }
1843 
1844   /*
1845    * @param delete The passed delete is modified by this method. WARNING!
1846    */
1847   void prepareDelete(Delete delete) throws IOException {
1848     // Check to see if this is a deleteRow insert
1849     if(delete.getFamilyCellMap().isEmpty()){
1850       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
1851         // Don't eat the timestamp
1852         delete.deleteFamily(family, delete.getTimeStamp());
1853       }
1854     } else {
1855       for(byte [] family : delete.getFamilyCellMap().keySet()) {
1856         if(family == null) {
1857           throw new NoSuchColumnFamilyException("Empty family is invalid");
1858         }
1859         checkFamily(family);
1860       }
1861     }
1862   }
1863 
1864   //////////////////////////////////////////////////////////////////////////////
1865   // set() methods for client use.
1866   //////////////////////////////////////////////////////////////////////////////
1867   /**
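   * Apply the given Delete to this region. All edits for the row are applied
   * atomically across its column families.
   * <p>Illustrative use (a sketch; assumes an open {@code HRegion} named
   * {@code region}):
   * <pre>
   *   Delete d = new Delete(Bytes.toBytes("row-42"));  // no families added: deletes the whole row
   *   region.delete(d);
   * </pre>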
1868    * @param delete delete object
1869    * @throws IOException read exceptions
1870    */
1871   public void delete(Delete delete)
1872   throws IOException {
1873     checkReadOnly();
1874     checkResources();
1875     startRegionOperation(Operation.DELETE);
1876     this.writeRequestsCount.increment();
1877     try {
1878       delete.getRow();
1879       // All edits for the given row (across all column families) must happen atomically.
1880       doBatchMutate(delete);
1881     } finally {
1882       closeRegionOperation(Operation.DELETE);
1883     }
1884   }
1885 
1886   /**
1887    * Row needed by below method.
1888    */
1889   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
1890   /**
1891    * This is used only by unit tests. Not required to be a public API.
1892    * @param familyMap map of family to edits for the given family.
1893    * @param durability
1894    * @throws IOException
1895    */
1896   void delete(NavigableMap<byte[], List<Cell>> familyMap,
1897       Durability durability) throws IOException {
1898     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
1899     delete.setFamilyCellMap(familyMap);
1900     delete.setDurability(durability);
1901     doBatchMutate(delete);
1902   }
1903 
1904   /**
1905    * Set up correct timestamps in the KVs in a Delete object.
1906    * Caller should have the row and region locks.
1907    * @param familyMap map of column family to the list of cells to delete
1908    * @param byteNow the current timestamp, as bytes
1909    * @throws IOException
1910    */
1911   void prepareDeleteTimestamps(Map<byte[], List<Cell>> familyMap, byte[] byteNow)
1912       throws IOException {
1913     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
1914 
1915       byte[] family = e.getKey();
1916       List<Cell> cells = e.getValue();
1917       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
1918 
1919       for (Cell cell: cells) {
1920         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
1921         //  Check if time is LATEST, change to time of most recent addition if so
1922         //  This is expensive.
1923         if (kv.isLatestTimestamp() && kv.isDeleteType()) {
1924           byte[] qual = kv.getQualifier();
1925           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
1926 
1927           Integer count = kvCount.get(qual);
1928           if (count == null) {
1929             kvCount.put(qual, 1);
1930           } else {
1931             kvCount.put(qual, count + 1);
1932           }
1933           count = kvCount.get(qual);
1934 
1935           Get get = new Get(kv.getRow());
1936           get.setMaxVersions(count);
1937           get.addColumn(family, qual);
1938 
1939           List<Cell> result = get(get, false);
1940 
1941           if (result.size() < count) {
1942             // Nothing to delete
1943             kv.updateLatestStamp(byteNow);
1944             continue;
1945           }
1946           if (result.size() > count) {
1947             throw new RuntimeException("Unexpected size: " + result.size());
1948           }
1949           KeyValue getkv = KeyValueUtil.ensureKeyValue(result.get(count - 1));
1950           Bytes.putBytes(kv.getBuffer(), kv.getTimestampOffset(),
1951               getkv.getBuffer(), getkv.getTimestampOffset(), Bytes.SIZEOF_LONG);
1952         } else {
1953           kv.updateLatestStamp(byteNow);
1954         }
1955       }
1956     }
1957   }
1958 
1959   /**
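   * Apply the given Put to this region. All edits for the row are applied
   * atomically across its column families.
   * <p>Illustrative use (a sketch; assumes an open {@code HRegion} named
   * {@code region} and an existing family {@code fam}):
   * <pre>
   *   Put p = new Put(Bytes.toBytes("row-42"));
   *   p.add(fam, Bytes.toBytes("qual"), Bytes.toBytes("value"));
   *   region.put(p);
   * </pre>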
1960    * @param put the Put to apply
1961    * @throws IOException
1962    */
1963   public void put(Put put)
1964   throws IOException {
1965     checkReadOnly();
1966 
1967     // Do a rough check that we have resources to accept a write.  The check is
1968     // 'rough' in that between the resource check and the call to obtain a
1969     // read lock, resources may run out.  For now, the thought is that this
1970     // will be extremely rare; we'll deal with it when it happens.
1971     checkResources();
1972     startRegionOperation(Operation.PUT);
1973     this.writeRequestsCount.increment();
1974     try {
1975       // All edits for the given row (across all column families) must happen atomically.
1976       doBatchMutate(put);
1977     } finally {
1978       closeRegionOperation(Operation.PUT);
1979     }
1980   }
1981 
1982   /**
1983    * Struct-like class that tracks the progress of a batch operation,
1984    * accumulating status codes and tracking the index at which processing
1985    * is proceeding.
1986    */
1987   private abstract static class BatchOperationInProgress<T> {
1988     T[] operations;
1989     int nextIndexToProcess = 0;
1990     OperationStatus[] retCodeDetails;
1991     WALEdit[] walEditsFromCoprocessors;
1992 
1993     public BatchOperationInProgress(T[] operations) {
1994       this.operations = operations;
1995       this.retCodeDetails = new OperationStatus[operations.length];
1996       this.walEditsFromCoprocessors = new WALEdit[operations.length];
1997       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
1998     }
1999 
2000     public abstract Mutation getMutation(int index);
2001     public abstract long getNonceGroup(int index);
2002     public abstract long getNonce(int index);
2003     /** This method is potentially expensive and should only be used for non-replay CP path. */
2004     public abstract Mutation[] getMutationsForCoprocs();
2005     public abstract boolean isInReplay();
2006 
2007     public boolean isDone() {
2008       return nextIndexToProcess == operations.length;
2009     }
2010   }
2011 
2012   private static class MutationBatch extends BatchOperationInProgress<Mutation> {
2013     private long nonceGroup;
2014     private long nonce;
2015     public MutationBatch(Mutation[] operations, long nonceGroup, long nonce) {
2016       super(operations);
2017       this.nonceGroup = nonceGroup;
2018       this.nonce = nonce;
2019     }
2020 
2021     public Mutation getMutation(int index) {
2022       return this.operations[index];
2023     }
2024 
2025     @Override
2026     public long getNonceGroup(int index) {
2027       return nonceGroup;
2028     }
2029 
2030     @Override
2031     public long getNonce(int index) {
2032       return nonce;
2033     }
2034 
2035     @Override
2036     public Mutation[] getMutationsForCoprocs() {
2037       return this.operations;
2038     }
2039 
2040     @Override
2041     public boolean isInReplay() {
2042       return false;
2043     }
2044   }
2045 
2046   private static class ReplayBatch extends BatchOperationInProgress<HLogSplitter.MutationReplay> {
2047     public ReplayBatch(MutationReplay[] operations) {
2048       super(operations);
2049     }
2050 
2051     @Override
2052     public Mutation getMutation(int index) {
2053       return this.operations[index].mutation;
2054     }
2055 
2056     @Override
2057     public long getNonceGroup(int index) {
2058       return this.operations[index].nonceGroup;
2059     }
2060 
2061     @Override
2062     public long getNonce(int index) {
2063       return this.operations[index].nonce;
2064     }
2065 
2066     @Override
2067     public Mutation[] getMutationsForCoprocs() {
2068       assert false;
2069       throw new RuntimeException("Should not be called for replay batch");
2070     }
2071 
2072     @Override
2073     public boolean isInReplay() {
2074       return true;
2075     }
2076   }
2077 
2078   /**
2079    * Perform a batch of mutations.
2080    * It supports only Put and Delete mutations; other mutation types are flagged as failures.
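   * <p>Illustrative use (a sketch; assumes an open {@code HRegion} named
   * {@code region} and an existing family {@code fam}):
   * <pre>
   *   Mutation[] batch = new Mutation[] {
   *       new Put(Bytes.toBytes("row-1")).add(fam, Bytes.toBytes("q"), Bytes.toBytes("v")),
   *       new Delete(Bytes.toBytes("row-2")) };
   *   OperationStatus[] statuses = region.batchMutate(batch);
   *   for (OperationStatus status : statuses) {
   *     if (status.getOperationStatusCode() != OperationStatusCode.SUCCESS) {
   *       // handle the failed mutation
   *     }
   *   }
   * </pre>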
2081    * @param mutations the list of mutations
2082    * @return an array of OperationStatus which internally contains the
2083    *         OperationStatusCode and the exceptionMessage if any.
2084    * @throws IOException
2085    */
2086   public OperationStatus[] batchMutate(
2087       Mutation[] mutations, long nonceGroup, long nonce) throws IOException {
2088     // As it stands, this is used for two things:
2089     //  * batchMutate with single mutation - put/delete, separate or from checkAndMutate.
2090     //  * coprocessor calls (see ex. BulkDeleteEndpoint).
2091     // So nonces are not really ever used by HBase. They could be by coprocs, and checkAnd...
2092     return batchMutate(new MutationBatch(mutations, nonceGroup, nonce));
2093   }
2094 
2095   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
2096     return batchMutate(mutations, HConstants.NO_NONCE, HConstants.NO_NONCE);
2097   }
2098 
2099   /**
2100    * Replay a batch of mutations.
2101    * @param mutations mutations to replay.
2102    * @return an array of OperationStatus which internally contains the
2103    *         OperationStatusCode and the exceptionMessage if any.
2104    * @throws IOException
2105    */
2106   public OperationStatus[] batchReplay(HLogSplitter.MutationReplay[] mutations)
2107       throws IOException {
2108     return batchMutate(new ReplayBatch(mutations));
2109   }
2110 
2111   /**
2112    * Perform a batch of mutations.
2113    * It supports only Put and Delete mutations; other mutation types are flagged as failures.
2114    * @param mutations the list of mutations
2115    * @return an array of OperationStatus which internally contains the
2116    *         OperationStatusCode and the exceptionMessage if any.
2117    * @throws IOException
2118    */
2119   OperationStatus[] batchMutate(BatchOperationInProgress<?> batchOp) throws IOException {
2120     boolean initialized = false;
2121     while (!batchOp.isDone()) {
2122       if (!batchOp.isInReplay()) {
2123         checkReadOnly();
2124       }
2125       checkResources();
2126 
2127       long newSize;
2128       Operation op = Operation.BATCH_MUTATE;
2129       if (batchOp.isInReplay()) op = Operation.REPLAY_BATCH_MUTATE;
2130       startRegionOperation(op);
2131 
2132       try {
2133         if (!initialized) {
2134           if (!batchOp.isInReplay()) {
2135             this.writeRequestsCount.increment();
2136             doPreMutationHook(batchOp);
2137           }
2138           initialized = true;
2139         }
2140         long addedSize = doMiniBatchMutation(batchOp);
2141         newSize = this.addAndGetGlobalMemstoreSize(addedSize);
2142       } finally {
2143         closeRegionOperation(op);
2144       }
2145       if (isFlushSize(newSize)) {
2146         requestFlush();
2147       }
2148     }
2149     return batchOp.retCodeDetails;
2150   }
2151 
2152 
2153   private void doPreMutationHook(BatchOperationInProgress<?> batchOp)
2154       throws IOException {
2155     /* Run coprocessor pre hook outside of locks to avoid deadlock */
2156     WALEdit walEdit = new WALEdit();
2157     if (coprocessorHost != null) {
2158       for (int i = 0 ; i < batchOp.operations.length; i++) {
2159         Mutation m = batchOp.getMutation(i);
2160         if (m instanceof Put) {
2161           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
2162             // pre hook says skip this Put
2163             // mark as success and skip in doMiniBatchMutation
2164             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2165           }
2166         } else if (m instanceof Delete) {
2167           Delete curDel = (Delete) m;
2168           if (curDel.getFamilyCellMap().isEmpty()) {
2169             // handle deleting a row case
2170             prepareDelete(curDel);
2171           }
2172           if (coprocessorHost.preDelete(curDel, walEdit, m.getDurability())) {
2173             // pre hook says skip this Delete
2174             // mark as success and skip in doMiniBatchMutation
2175             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2176           }
2177         } else {
2178           // In case of passing Append mutations along with the Puts and Deletes in batchMutate
2179           // mark the operation return code as failure so that it will not be considered in
2180           // the doMiniBatchMutation
2181           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2182               "Put/Delete mutations only supported in batchMutate() now");
2183         }
2184         if (!walEdit.isEmpty()) {
2185           batchOp.walEditsFromCoprocessors[i] = walEdit;
2186           walEdit = new WALEdit();
2187         }
2188       }
2189     }
2190   }
2191 
2192   @SuppressWarnings("unchecked")
2193   private long doMiniBatchMutation(BatchOperationInProgress<?> batchOp) throws IOException {
2194     boolean isInReplay = batchOp.isInReplay();
2195     // variable to note if all Put items are for the same CF -- metrics related
2196     boolean putsCfSetConsistent = true;
2197     //The set of columnFamilies first seen for Put.
2198     Set<byte[]> putsCfSet = null;
2199     // variable to note if all Delete items are for the same CF -- metrics related
2200     boolean deletesCfSetConsistent = true;
2201     //The set of columnFamilies first seen for Delete.
2202     Set<byte[]> deletesCfSet = null;
2203 
2204     long currentNonceGroup = HConstants.NO_NONCE, currentNonce = HConstants.NO_NONCE;
2205     WALEdit walEdit = new WALEdit(isInReplay);
2206     MultiVersionConsistencyControl.WriteEntry w = null;
2207     long txid = 0;
2208     boolean doRollBackMemstore = false;
2209     boolean locked = false;
2210 
2211     /** Keep track of the locks we hold so we can release them in finally clause */
2212     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
2213     // reference family maps directly so coprocessors can mutate them if desired
2214     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
2215     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
2216     int firstIndex = batchOp.nextIndexToProcess;
2217     int lastIndexExclusive = firstIndex;
2218     boolean success = false;
2219     int noOfPuts = 0, noOfDeletes = 0;
2220     try {
2221       // ------------------------------------
2222       // STEP 1. Try to acquire as many locks as we can, and ensure
2223       // we acquire at least one.
2224       // ----------------------------------
2225       int numReadyToWrite = 0;
2226       long now = EnvironmentEdgeManager.currentTimeMillis();
2227       while (lastIndexExclusive < batchOp.operations.length) {
2228         Mutation mutation = batchOp.getMutation(lastIndexExclusive);
2229         boolean isPutMutation = mutation instanceof Put;
2230 
2231         Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
2232         // store the family map reference to allow for mutations
2233         familyMaps[lastIndexExclusive] = familyMap;
2234 
2235         // skip anything that "ran" already
2236         if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
2237             != OperationStatusCode.NOT_RUN) {
2238           lastIndexExclusive++;
2239           continue;
2240         }
2241 
2242         try {
2243           if (isPutMutation) {
2244             // Check the families in the put. If bad, skip this one.
2245             if (isInReplay) {
2246               removeNonExistentColumnFamilyForReplay(familyMap);
2247             } else {
2248               checkFamilies(familyMap.keySet());
2249             }
2250             checkTimestamps(mutation.getFamilyCellMap(), now);
2251           } else {
2252             prepareDelete((Delete) mutation);
2253           }
2254         } catch (NoSuchColumnFamilyException nscf) {
2255           LOG.warn("No such column family in batch mutation", nscf);
2256           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2257               OperationStatusCode.BAD_FAMILY, nscf.getMessage());
2258           lastIndexExclusive++;
2259           continue;
2260         } catch (FailedSanityCheckException fsce) {
2261           LOG.warn("Batch Mutation did not pass sanity check", fsce);
2262           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2263               OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
2264           lastIndexExclusive++;
2265           continue;
2266         }
2267 
2268         // If we haven't got any rows in our batch, we should block to
2269         // get the next one.
2270         boolean shouldBlock = numReadyToWrite == 0;
2271         RowLock rowLock = null;
2272         try {
2273           rowLock = getRowLock(mutation.getRow(), shouldBlock);
2274         } catch (IOException ioe) {
2275           LOG.warn("Failed getting lock in batch put, row="
2276             + Bytes.toStringBinary(mutation.getRow()), ioe);
2277         }
2278         if (rowLock == null) {
2279           // We failed to grab another lock
2280           assert !shouldBlock : "Should never fail to get lock when blocking";
2281           break; // stop acquiring more rows for this batch
2282         } else {
2283           acquiredRowLocks.add(rowLock);
2284         }
2285 
2286         lastIndexExclusive++;
2287         numReadyToWrite++;
2288 
2289         if (isPutMutation) {
2290           // If Column Families stay consistent through out all of the
2291           // individual puts then metrics can be reported as a multiput across
2292           // column families in the first put.
2293           if (putsCfSet == null) {
2294             putsCfSet = mutation.getFamilyCellMap().keySet();
2295           } else {
2296             putsCfSetConsistent = putsCfSetConsistent
2297                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
2298           }
2299         } else {
2300           if (deletesCfSet == null) {
2301             deletesCfSet = mutation.getFamilyCellMap().keySet();
2302           } else {
2303             deletesCfSetConsistent = deletesCfSetConsistent
2304                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
2305           }
2306         }
2307       }
2308 
2309       // we should record the timestamp only after we have acquired the rowLock,
2310       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
2311       now = EnvironmentEdgeManager.currentTimeMillis();
2312       byte[] byteNow = Bytes.toBytes(now);
2313 
2314       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
2315       if (numReadyToWrite <= 0) return 0L;
2316 
2317       // We've now grabbed as many mutations off the list as we can
2318 
2319       // ------------------------------------
2320       // STEP 2. Update any LATEST_TIMESTAMP timestamps
2321       // ----------------------------------
2322       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2323         // skip invalid
2324         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2325             != OperationStatusCode.NOT_RUN) continue;
2326 
2327         Mutation mutation = batchOp.getMutation(i);
2328         if (mutation instanceof Put) {
2329           updateKVTimestamps(familyMaps[i].values(), byteNow);
2330           noOfPuts++;
2331         } else {
2332           prepareDeleteTimestamps(familyMaps[i], byteNow);
2333           noOfDeletes++;
2334         }
2335       }
2336 
2337       lock(this.updatesLock.readLock(), numReadyToWrite);
2338       locked = true;
2339 
2340       //
2341       // ------------------------------------
2342       // Acquire the latest mvcc number
2343       // ----------------------------------
2344       w = mvcc.beginMemstoreInsert();
2345 
2346       // calling the pre CP hook for batch mutation
2347       if (!isInReplay && coprocessorHost != null) {
2348         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2349           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2350           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2351         if (coprocessorHost.preBatchMutate(miniBatchOp)) return 0L;
2352       }
2353 
2354       // ------------------------------------
2355       // STEP 3. Write back to memstore
2356       // Write to memstore. It is ok to write to memstore
2357       // first without updating the HLog because we do not roll
2358       // forward the memstore MVCC. The MVCC will be moved up when
2359       // the complete operation is done. These changes are not yet
2360       // visible to scanners till we update the MVCC. The MVCC is
2361       // moved only when the sync is complete.
2362       // ----------------------------------
2363       long addedSize = 0;
2364       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2365         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2366             != OperationStatusCode.NOT_RUN) {
2367           continue;
2368         }
2369         doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
2370         addedSize += applyFamilyMapToMemstore(familyMaps[i], w);
2371       }
2372 
2373       // ------------------------------------
2374       // STEP 4. Build WAL edit
2375       // ----------------------------------
2376       boolean hasWalAppends = false;
2377       Durability durability = Durability.USE_DEFAULT;
2378       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2379         // Skip puts that were determined to be invalid during preprocessing
2380         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2381             != OperationStatusCode.NOT_RUN) {
2382           continue;
2383         }
2384         batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2385 
2386         Mutation m = batchOp.getMutation(i);
2387         Durability tmpDur = getEffectiveDurability(m.getDurability());
2388         if (tmpDur.ordinal() > durability.ordinal()) {
2389           durability = tmpDur;
2390         }
2391         if (tmpDur == Durability.SKIP_WAL) {
2392           recordMutationWithoutWal(m.getFamilyCellMap());
2393           continue;
2394         }
2395 
2396         long nonceGroup = batchOp.getNonceGroup(i), nonce = batchOp.getNonce(i);
2397         // In replay, the batch may contain multiple nonces. If so, write WALEdit for each.
2398         // Given how nonces are originally written, these should be contiguous.
2399         // They don't have to be, it will still work, just write more WALEdits than needed.
2400         if (nonceGroup != currentNonceGroup || nonce != currentNonce) {
2401           if (walEdit.size() > 0) {
2402             assert isInReplay;
2403             if (!isInReplay) {
2404               throw new IOException("Multiple nonces per batch and not in replay");
2405             }
2406             // txid should always increase, so having the one from the last call is ok.
2407             txid = this.log.appendNoSync(this.getRegionInfo(), htableDescriptor.getTableName(),
2408                   walEdit, m.getClusterIds(), now, htableDescriptor, this.sequenceId, true,
2409                   currentNonceGroup, currentNonce);
2410             hasWalAppends = true;
2411             walEdit = new WALEdit(isInReplay);
2412           }
2413           currentNonceGroup = nonceGroup;
2414           currentNonce = nonce;
2415         }
2416 
2417         // Add WAL edits by CP
2418         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
2419         if (fromCP != null) {
2420           for (KeyValue kv : fromCP.getKeyValues()) {
2421             walEdit.add(kv);
2422           }
2423         }
2424         addFamilyMapToWALEdit(familyMaps[i], walEdit);
2425       }
2426 
2427       // -------------------------
2428       // STEP 5. Append the final edit to WAL. Do not sync wal.
2429       // -------------------------
2430       Mutation mutation = batchOp.getMutation(firstIndex);
2431       if (walEdit.size() > 0) {
2432         txid = this.log.appendNoSync(this.getRegionInfo(), this.htableDescriptor.getTableName(),
2433               walEdit, mutation.getClusterIds(), now, this.htableDescriptor, this.sequenceId,
2434               true, currentNonceGroup, currentNonce);
2435         hasWalAppends = true;
2436       }
2437 
2438       // -------------------------------
2439       // STEP 6. Release row locks, etc.
2440       // -------------------------------
2441       if (locked) {
2442         this.updatesLock.readLock().unlock();
2443         locked = false;
2444       }
2445       releaseRowLocks(acquiredRowLocks);
2446 
2447       // -------------------------
2448       // STEP 7. Sync wal.
2449       // -------------------------
2450       if (hasWalAppends) {
2451         syncOrDefer(txid, durability);
2452       }
2453       doRollBackMemstore = false;
2454       // calling the post CP hook for batch mutation
2455       if (!isInReplay && coprocessorHost != null) {
2456         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2457           new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2458           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2459         coprocessorHost.postBatchMutate(miniBatchOp);
2460       }
2461 
2462       // ------------------------------------------------------------------
2463       // STEP 8. Advance mvcc. This will make this put visible to scanners and getters.
2464       // ------------------------------------------------------------------
2465       if (w != null) {
2466         mvcc.completeMemstoreInsert(w);
2467         w = null;
2468       }
2469 
2470       // ------------------------------------
2471       // STEP 9. Run coprocessor post hooks. This should be done after the wal is
2472       // synced so that the coprocessor contract is adhered to.
2473       // ------------------------------------
2474       if (!isInReplay && coprocessorHost != null) {
2475         for (int i = firstIndex; i < lastIndexExclusive; i++) {
2476           // only for successful puts
2477           if (batchOp.retCodeDetails[i].getOperationStatusCode()
2478               != OperationStatusCode.SUCCESS) {
2479             continue;
2480           }
2481           Mutation m = batchOp.getMutation(i);
2482           if (m instanceof Put) {
2483             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
2484           } else {
2485             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
2486           }
2487         }
2488       }
2489 
2490       success = true;
2491       return addedSize;
2492     } finally {
2493 
2494       // if the wal sync was unsuccessful, remove keys from memstore
2495       if (doRollBackMemstore) {
2496         rollbackMemstore(batchOp, familyMaps, firstIndex, lastIndexExclusive);
2497       }
2498       if (w != null) mvcc.completeMemstoreInsert(w);
2499 
2500       if (locked) {
2501         this.updatesLock.readLock().unlock();
2502       }
2503       releaseRowLocks(acquiredRowLocks);
2504 
2505       // See if the column families were consistent through the whole thing.
2506       // if they were then keep them. If they were not then pass a null.
2507       // null will be treated as unknown.
2508       // Total time taken might be involving Puts and Deletes.
2509       // Split the time for puts and deletes based on the total number of Puts and Deletes.
2510 
2511       if (noOfPuts > 0) {
2512         // There were some Puts in the batch.
2513         if (this.metricsRegion != null) {
2514           this.metricsRegion.updatePut();
2515         }
2516       }
2517       if (noOfDeletes > 0) {
2518         // There were some Deletes in the batch.
2519         if (this.metricsRegion != null) {
2520           this.metricsRegion.updateDelete();
2521         }
2522       }
2523       if (!success) {
2524         for (int i = firstIndex; i < lastIndexExclusive; i++) {
2525           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
2526             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
2527           }
2528         }
2529       }
2530       if (coprocessorHost != null && !batchOp.isInReplay()) {
2531         // call the coprocessor hook to do any finalization steps
2532         // after the put is done
2533         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2534             new MiniBatchOperationInProgress<Mutation>(batchOp.getMutationsForCoprocs(),
2535                 batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex,
2536                 lastIndexExclusive);
2537         coprocessorHost.postBatchMutateIndispensably(miniBatchOp, success);
2538       }
2539 
2540       batchOp.nextIndexToProcess = lastIndexExclusive;
2541     }
2542   }
2543 
2544   /**
2545    * Returns effective durability from the passed durability and
2546    * the table descriptor.
2547    */
2548   protected Durability getEffectiveDurability(Durability d) {
2549     return d == Durability.USE_DEFAULT ? this.durability : d;
2550   }
2551 
2552   //TODO, Think that gets/puts and deletes should be refactored a bit so that
2553   //the getting of the lock happens before, so that you would just pass it into
2554   //the methods. So in the case of checkAndMutate you could just do lockRow,
2555   //get, put, unlockRow or something
2556   /**
2557    *
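   * Checks the current value of the given row/family/qualifier against the
   * passed comparator under a row lock and, if the comparison succeeds,
   * atomically applies the passed Put or Delete.
   * <p>Illustrative use (a sketch; assumes an open {@code HRegion} named
   * {@code region}, a row {@code row}, a family {@code fam} and a qualifier
   * {@code qual}):
   * <pre>
   *   Put p = new Put(row);
   *   p.add(fam, qual, Bytes.toBytes("new-value"));
   *   boolean applied = region.checkAndMutate(row, fam, qual, CompareOp.EQUAL,
   *       new BinaryComparator(Bytes.toBytes("expected")), p, true);
   * </pre>
   *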
2558    * @param row the row to check
2559    * @param family the column family of the cell to check
2560    * @param qualifier the column qualifier of the cell to check
2561    * @param compareOp the comparison operator to apply
2562    * @param comparator comparator supplying the expected value
2563    * @param w the Put or Delete to apply if the check passes
2564    * @param writeToWAL
2565    * @throws IOException
2566    * @return true if the passed mutation was applied, false otherwise
2567    */
2568   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
2569       CompareOp compareOp, ByteArrayComparable comparator, Mutation w,
2570       boolean writeToWAL)
2571   throws IOException{
2572     checkReadOnly();
2573     //TODO, add check for value length or maybe even better move this to the
2574     //client if this becomes a global setting
2575     checkResources();
2576     boolean isPut = w instanceof Put;
2577     if (!isPut && !(w instanceof Delete))
2578       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " +
2579           "be Put or Delete");
2580     if (!Bytes.equals(row, w.getRow())) {
2581       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " +
2582           "getRow must match the passed row");
2583     }
2584 
2585     startRegionOperation();
2586     try {
2587       Get get = new Get(row);
2588       checkFamily(family);
2589       get.addColumn(family, qualifier);
2590 
2591       // Lock row - note that doBatchMutate will relock this row if called
2592       RowLock rowLock = getRowLock(get.getRow());
2593       // wait for all previous transactions to complete (with lock held)
2594       mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
2595       List<Cell> result;
2596       try {
2597         result = get(get, false);
2598 
2599         boolean valueIsNull = comparator.getValue() == null ||
2600           comparator.getValue().length == 0;
2601         boolean matches = false;
2602         if (result.size() == 0 && valueIsNull) {
2603           matches = true;
2604         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
2605             valueIsNull) {
2606           matches = true;
2607         } else if (result.size() == 1 && !valueIsNull) {
2608           Cell kv = result.get(0);
2609           int compareResult = comparator.compareTo(kv.getValueArray(),
2610               kv.getValueOffset(), kv.getValueLength());
2611           switch (compareOp) {
2612           case LESS:
2613             matches = compareResult < 0;
2614             break;
2615           case LESS_OR_EQUAL:
2616             matches = compareResult <= 0;
2617             break;
2618           case EQUAL:
2619             matches = compareResult == 0;
2620             break;
2621           case NOT_EQUAL:
2622             matches = compareResult != 0;
2623             break;
2624           case GREATER_OR_EQUAL:
2625             matches = compareResult >= 0;
2626             break;
2627           case GREATER:
2628             matches = compareResult > 0;
2629             break;
2630           default:
2631             throw new RuntimeException("Unknown Compare op " + compareOp.name());
2632           }
2633         }
2634         //If matches put the new put or delete the new delete
2635         if (matches) {
2636           // All edits for the given row (across all column families) must
2637           // happen atomically.
2638           doBatchMutate((Mutation)w);
2639           this.checkAndMutateChecksPassed.increment();
2640           return true;
2641         }
2642         this.checkAndMutateChecksFailed.increment();
2643         return false;
2644       } finally {
2645         rowLock.release();
2646       }
2647     } finally {
2648       closeRegionOperation();
2649     }
2650   }
2651 
2652   private void doBatchMutate(Mutation mutation) throws IOException, DoNotRetryIOException {
2653     // Currently this is only called for puts and deletes, so no nonces.
2654     OperationStatus[] batchMutate = this.batchMutate(new Mutation[] { mutation },
2655         HConstants.NO_NONCE, HConstants.NO_NONCE);
2656     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
2657       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
2658     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
2659       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
2660     }
2661   }
2662 
2663   /**
2664    * Complete taking the snapshot on the region. Writes the region info and adds references to the
2665    * working snapshot directory.
2666    *
2667    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
2668    * arg.  (In the future other cancellable HRegion methods could eventually add a
2669    * {@link ForeignExceptionSnare}, or we could do something fancier).
2670    *
2671    * @param desc snapshot description object
2672    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
2673    *   bail out.  This is allowed to be null and will just be ignored in that case.
2674    * @throws IOException if there is an external or internal error causing the snapshot to fail
2675    */
2676   public void addRegionToSnapshot(SnapshotDescription desc,
2677       ForeignExceptionSnare exnSnare) throws IOException {
2678     // This should be "fast" since we don't rewrite store files but instead
2679     // back up the store files by creating a reference
2680     Path rootDir = FSUtils.getRootDir(this.rsServices.getConfiguration());
2681     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
2682 
2683     // 1. dump region meta info into the snapshot directory
2684     LOG.debug("Storing region-info for snapshot.");
2685     HRegionFileSystem snapshotRegionFs = HRegionFileSystem.createRegionOnFileSystem(conf,
2686         this.fs.getFileSystem(), snapshotDir, getRegionInfo());
2687 
2688     // 2. iterate through all the stores in the region
2689     LOG.debug("Creating references for hfiles");
2690 
2691     // This ensures that we have an atomic view of the directory as long as we have < ls limit
2692     // (batch size of the files in a directory) on the namenode. Otherwise, we get back the files in
2693     // batches and may miss files being added/deleted. This could be more robust (iteratively
2694     // checking to see if we have all the files until we are sure), but the limit is currently 1000
2695     // files/batch, far more than the number of store files under a single column family.
2696     for (Store store : stores.values()) {
2697       // 2.1. build the snapshot reference directory for the store
2698       Path dstStoreDir = snapshotRegionFs.getStoreDir(store.getFamily().getNameAsString());
2699       List<StoreFile> storeFiles = new ArrayList<StoreFile>(store.getStorefiles());
2700       if (LOG.isDebugEnabled()) {
2701         LOG.debug("Adding snapshot references for " + storeFiles  + " hfiles");
2702       }
2703 
2704       // 2.2. iterate through all the store's files and create "references".
2705       int sz = storeFiles.size();
2706       for (int i = 0; i < sz; i++) {
2707         if (exnSnare != null) {
2708           exnSnare.rethrowException();
2709         }
2710         StoreFile storeFile = storeFiles.get(i);
2711         Path file = storeFile.getPath();
2712 
2713         LOG.debug("Creating reference for file (" + (i+1) + "/" + sz + ") : " + file);
2714         Path referenceFile = new Path(dstStoreDir, file.getName());
2715         boolean success = true;
2716         if (storeFile.isReference()) {
2717           // write the Reference object to the snapshot
2718           storeFile.getFileInfo().getReference().write(fs.getFileSystem(), referenceFile);
2719         } else {
2720           // create "reference" to this store file.  It is intentionally an empty file -- all
2721           // necessary information is captured by its fs location and filename.  This allows us to
2722           // only figure out what needs to be done via a single nn operation (instead of having to
2723           // open and read the files as well).
2724           success = fs.getFileSystem().createNewFile(referenceFile);
2725         }
2726         if (!success) {
2727           throw new IOException("Failed to create reference file:" + referenceFile);
2728         }
2729       }
2730     }
2731   }
2732 
2733   /**
2734    * Replaces any KV timestamps set to {@link HConstants#LATEST_TIMESTAMP} with the
2735    * provided current timestamp.
2736    */
2737   void updateKVTimestamps(final Iterable<List<Cell>> keyLists, final byte[] now) {
2738     for (List<Cell> cells: keyLists) {
2739       if (cells == null) continue;
2740       for (Cell cell : cells) {
2741         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2742         kv.updateLatestStamp(now);
2743       }
2744     }
2745   }
2746 
2747   /*
2748    * Check whether we have the resources to support an update.
2749    *
2750    * We throw RegionTooBusyException if above memstore limit
2751    * and expect client to retry using some kind of backoff
2752   */
2753   private void checkResources()
2754     throws RegionTooBusyException {
2755     // If catalog region, do not impose resource constraints or block updates.
2756     if (this.getRegionInfo().isMetaRegion()) return;
2757 
2758     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
2759       requestFlush();
2760       throw new RegionTooBusyException("Above memstore limit, " +
2761           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
2762           this.getRegionInfo().getRegionNameAsString()) +
2763           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
2764           this.getRegionServerServices().getServerName()) +
2765           ", memstoreSize=" + memstoreSize.get() +
2766           ", blockingMemStoreSize=" + blockingMemStoreSize);
2767     }
2768   }
2769 
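  // A minimal sketch of the retry a client is expected to perform when this region
  // throws RegionTooBusyException; "table", "put" and "backoffMs" are assumed
  // caller-side names (the standard HBase client applies this backoff automatically).
  //
  //   try {
  //     table.put(put);
  //   } catch (RegionTooBusyException e) {
  //     Thread.sleep(backoffMs);   // back off, then retry (or give up after N attempts)
  //     table.put(put);
  //   }
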
2770   /**
2771    * @throws IOException Throws exception if region is in read-only mode.
2772    */
2773   protected void checkReadOnly() throws IOException {
2774     if (this.writestate.isReadOnly()) {
2775       throw new IOException("region is read only");
2776     }
2777   }
2778 
2779   /**
2780    * Add updates first to the hlog and then add values to memstore.
2781    * Warning: Assumption is caller has lock on passed in row.
2782    * @param row The row to update
2783    * @param family The column family
2784    * @param edits Cell updates by column
2785    * @throws IOException
2786    */
2787   private void put(final byte [] row, byte [] family, List<Cell> edits)
2788   throws IOException {
2789     NavigableMap<byte[], List<Cell>> familyMap;
2790     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
2791 
2792     familyMap.put(family, edits);
2793     Put p = new Put(row);
2794     p.setFamilyCellMap(familyMap);
2795     doBatchMutate(p);
2796   }
2797 
2798   /**
2799    * Atomically apply the given map of family->edits to the memstore.
2800    * This handles the consistency control on its own, but the caller
2801    * should already have locked updatesLock.readLock(). This also does
2802    * <b>not</b> check the families for validity.
2803    *
2804    * @param familyMap Map of kvs per family
2805    * @param localizedWriteEntry The WriteEntry of the MVCC for this transaction.
2806    *        If null, then this method internally creates a mvcc transaction.
2807    * @return the additional memory usage of the memstore caused by the
2808    * new entries.
2809    */
2810   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
2811     MultiVersionConsistencyControl.WriteEntry localizedWriteEntry) {
2812     long size = 0;
2813     boolean freemvcc = false;
2814 
2815     try {
2816       if (localizedWriteEntry == null) {
2817         localizedWriteEntry = mvcc.beginMemstoreInsert();
2818         freemvcc = true;
2819       }
2820 
2821       for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2822         byte[] family = e.getKey();
2823         List<Cell> cells = e.getValue();
2824 
2825         Store store = getStore(family);
2826         for (Cell cell: cells) {
2827           KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2828           kv.setMvccVersion(localizedWriteEntry.getWriteNumber());
2829           size += store.add(kv);
2830         }
2831       }
2832     } finally {
2833       if (freemvcc) {
2834         mvcc.completeMemstoreInsert(localizedWriteEntry);
2835       }
2836     }
2837 
2838     return size;
2839   }
2840 
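  // A condensed sketch of the MVCC pattern used above when no WriteEntry is passed in:
  // begin an insert, tag each KeyValue ("kv") with the write number before adding it to
  // its store, and complete the insert in a finally block so an abandoned transaction
  // never blocks readers.
  //
  //   MultiVersionConsistencyControl.WriteEntry w = mvcc.beginMemstoreInsert();
  //   try {
  //     kv.setMvccVersion(w.getWriteNumber());
  //     store.add(kv);
  //   } finally {
  //     mvcc.completeMemstoreInsert(w);
  //   }
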
2841   /**
2842    * Remove all the keys listed in the map from the memstore. This method is
2843    * called when a Put/Delete has updated memstore but subsequently fails to update
2844    * the wal. This method is then invoked to rollback the memstore.
2845    */
2846   private void rollbackMemstore(BatchOperationInProgress<?> batchOp,
2847                                 Map<byte[], List<Cell>>[] familyMaps,
2848                                 int start, int end) {
2849     int kvsRolledback = 0;
2850     for (int i = start; i < end; i++) {
2851       // skip over request that never succeeded in the first place.
2852       if (batchOp.retCodeDetails[i].getOperationStatusCode()
2853             != OperationStatusCode.SUCCESS) {
2854         continue;
2855       }
2856 
2857       // Rollback all the kvs for this row.
2858       Map<byte[], List<Cell>> familyMap  = familyMaps[i];
2859       for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2860         byte[] family = e.getKey();
2861         List<Cell> cells = e.getValue();
2862 
2863         // Remove those keys from the memstore that match our
2864         // key's (row, cf, cq, timestamp, memstoreTS). The interesting part is
2865         // that even the memstoreTS has to match for keys that will be rolled back.
2866         Store store = getStore(family);
2867         for (Cell cell: cells) {
2868           store.rollback(KeyValueUtil.ensureKeyValue(cell));
2869           kvsRolledback++;
2870         }
2871       }
2872     }
2873     LOG.debug("rollbackMemstore rolled back " + kvsRolledback +
2874         " keyvalues from start:" + start + " to end:" + end);
2875   }
2876 
2877   /**
2878    * Check the collection of families for validity.
2879    * @throws NoSuchColumnFamilyException if a family does not exist.
2880    */
2881   void checkFamilies(Collection<byte[]> families)
2882   throws NoSuchColumnFamilyException {
2883     for (byte[] family : families) {
2884       checkFamily(family);
2885     }
2886   }
2887 
2888   /**
2889    * During replay, there may be column families that were removed between the region server
2890    * failure and the replay.
2891    */
2892   private void removeNonExistentColumnFamilyForReplay(
2893       final Map<byte[], List<Cell>> familyMap) {
2894     List<byte[]> nonExistentList = null;
2895     for (byte[] family : familyMap.keySet()) {
2896       if (!this.htableDescriptor.hasFamily(family)) {
2897         if (nonExistentList == null) {
2898           nonExistentList = new ArrayList<byte[]>();
2899         }
2900         nonExistentList.add(family);
2901       }
2902     }
2903     if (nonExistentList != null) {
2904       for (byte[] family : nonExistentList) {
2905         // Perhaps schema was changed between crash and replay
2906         LOG.info("No family for " + Bytes.toString(family) + ", omitting from replay.");
2907         familyMap.remove(family);
2908       }
2909     }
2910   }
2911 
2912   void checkTimestamps(final Map<byte[], List<Cell>> familyMap,
2913       long now) throws FailedSanityCheckException {
2914     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
2915       return;
2916     }
2917     long maxTs = now + timestampSlop;
2918     for (List<Cell> kvs : familyMap.values()) {
2919       for (Cell cell : kvs) {
2920         // see if the user-side TS is out of range. latest = server-side
2921         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2922         if (!kv.isLatestTimestamp() && kv.getTimestamp() > maxTs) {
2923           throw new FailedSanityCheckException("Timestamp for KV out of range "
2924               + cell + " (too.new=" + timestampSlop + ")");
2925         }
2926       }
2927     }
2928   }
2929 
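  // A worked example of the slop check above, with illustrative numbers: if
  // timestampSlop = 2000 and now = 1000000, then maxTs = 1002000, so a cell carrying a
  // user-supplied timestamp of 1002500 fails the sanity check, while one stamped
  // 1001500 (or left at LATEST_TIMESTAMP) passes.
  //
  //   long maxTs = 1000000L + 2000L;        // 1002000
  //   boolean rejected = 1002500L > maxTs;  // true -> FailedSanityCheckException
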
2930   /**
2931    * Append the given map of family->edits to a WALEdit data structure.
2932    * This does not write to the HLog itself.
2933    * @param familyMap map of family->edits
2934    * @param walEdit the destination entry to append into
2935    */
2936   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
2937       WALEdit walEdit) {
2938     for (List<Cell> edits : familyMap.values()) {
2939       for (Cell cell : edits) {
2940         walEdit.add(KeyValueUtil.ensureKeyValue(cell));
2941       }
2942     }
2943   }
2944 
2945   private void requestFlush() {
2946     if (this.rsServices == null) {
2947       return;
2948     }
2949     synchronized (writestate) {
2950       if (this.writestate.isFlushRequested()) {
2951         return;
2952       }
2953       writestate.flushRequested = true;
2954     }
2955     // Make request outside of synchronize block; HBASE-818.
2956     this.rsServices.getFlushRequester().requestFlush(this);
2957     if (LOG.isDebugEnabled()) {
2958       LOG.debug("Flush requested on " + this);
2959     }
2960   }
2961 
2962   /*
2963    * @param size
2964    * @return True if size is over the flush threshold
2965    */
2966   private boolean isFlushSize(final long size) {
2967     return size > this.memstoreFlushSize;
2968   }
2969 
2970   /**
2971    * Read the edits log put under this region by wal log splitting process.  Put
2972    * the recovered edits back up into this region.
2973    *
2974    * <p>We can ignore any log message that has a sequence ID that's equal to or
2975    * lower than minSeqId.  (Because we know such log messages are already
2976    * reflected in the HFiles.)
2977    *
2978    * <p>While this is running we are putting pressure on memory yet we are
2979    * outside of our usual accounting because we are not yet an onlined region
2980    * (this stuff is being run as part of Region initialization).  This means
2981    * that if we're up against global memory limits, we'll not be flagged to flush
2982    * because we are not online. We can't be flushed by usual mechanisms anyways;
2983    * we're not yet online so our relative sequenceids are not yet aligned with
2984    * HLog sequenceids -- not till we come up online, post processing of split
2985    * edits.
2986    *
2987    * <p>But to help relieve memory pressure, we at least manage our own heap size by
2988    * flushing if we are in excess of per-region limits.  When flushing, though, we have
2989    * to be careful to avoid using the regionserver/hlog sequenceid.  It runs on a
2990    * different track from what is going on here in this region context, so if we
2991    * crashed replaying these edits, but in the midst had a flush that used the
2992    * regionserver log with a sequenceid in excess of what is going on in here
2993    * in this region and with its split editlogs, then we could miss edits the
2994    * next time we go to recover. So, we have to flush inline, using seqids that
2995    * make sense in this single region context only -- until we come online.
2996    *
2997    * @param regiondir
2998    * @param maxSeqIdInStores Any edit found in split editlogs needs to be in excess of
2999    * the maxSeqId for the store to be applied; otherwise it is skipped.
3000    * @param reporter
3001    * @return the sequence id of the last edit added to this region out of the
3002    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3003    * @throws UnsupportedEncodingException
3004    * @throws IOException
3005    */
3006   protected long replayRecoveredEditsIfAny(final Path regiondir,
3007       Map<byte[], Long> maxSeqIdInStores,
3008       final CancelableProgressable reporter, final MonitoredTask status)
3009       throws UnsupportedEncodingException, IOException {
3010     long minSeqIdForTheRegion = -1;
3011     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
3012       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
3013         minSeqIdForTheRegion = maxSeqIdInStore;
3014       }
3015     }
3016     long seqid = minSeqIdForTheRegion;
3017 
3018     FileSystem fs = this.fs.getFileSystem();
3019     NavigableSet<Path> files = HLogUtil.getSplitEditFilesSorted(fs, regiondir);
3020     if (LOG.isDebugEnabled()) {
3021       LOG.debug("Found " + (files == null ? 0 : files.size())
3022         + " recovered edits file(s) under " + regiondir);
3023     }
3024 
3025     if (files == null || files.isEmpty()) return seqid;
3026 
3027     for (Path edits: files) {
3028       if (edits == null || !fs.exists(edits)) {
3029         LOG.warn("Null or non-existent edits file: " + edits);
3030         continue;
3031       }
3032       if (isZeroLengthThenDelete(fs, edits)) continue;
3033 
3034       long maxSeqId;
3035       String fileName = edits.getName();
3036       maxSeqId = Math.abs(Long.parseLong(fileName));
3037       if (maxSeqId <= minSeqIdForTheRegion) {
3038         if (LOG.isDebugEnabled()) {
3039           String msg = "Maximum sequenceid for this log is " + maxSeqId
3040             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
3041             + ", skipped the whole file, path=" + edits;
3042           LOG.debug(msg);
3043         }
3044         continue;
3045       }
3046 
3047       try {
3048         seqid = replayRecoveredEdits(edits, maxSeqIdInStores, reporter);
3049       } catch (IOException e) {
3050         boolean skipErrors = conf.getBoolean(
3051             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
3052             conf.getBoolean(
3053                 "hbase.skip.errors",
3054                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
3055         if (conf.get("hbase.skip.errors") != null) {
3056           LOG.warn(
3057               "The property 'hbase.skip.errors' has been deprecated. Please use " +
3058               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
3059         }
3060         if (skipErrors) {
3061           Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3062           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
3063               + "=true so continuing. Renamed " + edits +
3064               " as " + p, e);
3065         } else {
3066           throw e;
3067         }
3068       }
3069     }
3070     // The edits size added into rsAccounting during this replaying will not
3071     // be required any more. So just clear it.
3072     if (this.rsAccounting != null) {
3073       this.rsAccounting.clearRegionReplayEditsSize(this.getRegionName());
3074     }
3075     if (seqid > minSeqIdForTheRegion) {
3076       // Then we added some edits to memory. Flush and cleanup split edit files.
3077       internalFlushcache(null, seqid, status);
3078     }
3079     // Now delete the content of recovered edits.  We're done w/ them.
3080     for (Path file: files) {
3081       if (!fs.delete(file, false)) {
3082         LOG.error("Failed delete of " + file);
3083       } else {
3084         LOG.debug("Deleted recovered.edits file=" + file);
3085       }
3086     }
3087     return seqid;
3088   }
3089 
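  // A small configuration sketch for the error handling above: with the skip-errors
  // flag set, a corrupt recovered-edits file is moved aside and replay continues
  // instead of aborting the region open ("conf" is any Configuration handed to the
  // region server).
  //
  //   conf.setBoolean(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS, true);
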
3090   /*
3091    * @param edits File of recovered edits.
3092    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in log
3093    * must be larger than this to be replayed for each store.
3094    * @param reporter
3095    * @return the sequence id of the last edit added to this region out of the
3096    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
3097    * @throws IOException
3098    */
3099   private long replayRecoveredEdits(final Path edits,
3100       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
3101     throws IOException {
3102     String msg = "Replaying edits from " + edits;
3103     LOG.info(msg);
3104     MonitoredTask status = TaskMonitor.get().createStatus(msg);
3105     FileSystem fs = this.fs.getFileSystem();
3106 
3107     status.setStatus("Opening logs");
3108     HLog.Reader reader = null;
3109     try {
3110       reader = HLogFactory.createReader(fs, edits, conf);
3111       long currentEditSeqId = -1;
3112       long firstSeqIdInLog = -1;
3113       long skippedEdits = 0;
3114       long editsCount = 0;
3115       long intervalEdits = 0;
3116       HLog.Entry entry;
3117       Store store = null;
3118       boolean reported_once = false;
3119       ServerNonceManager ng = this.rsServices == null ? null : this.rsServices.getNonceManager();
3120 
3121       try {
3122         // How many edits seen before we check elapsed time
3123         int interval = this.conf.getInt("hbase.hstore.report.interval.edits",
3124             2000);
3125         // How often to send a progress report (default 1/2 master timeout)
3126         int period = this.conf.getInt("hbase.hstore.report.period",
3127           this.conf.getInt(AssignmentManager.ASSIGNMENT_TIMEOUT,
3128             AssignmentManager.DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT) / 2);
3129         long lastReport = EnvironmentEdgeManager.currentTimeMillis();
3130 
3131         while ((entry = reader.next()) != null) {
3132           HLogKey key = entry.getKey();
3133           WALEdit val = entry.getEdit();
3134 
3135           if (ng != null) { // null during some tests, or when nonces are disabled
3136             ng.reportOperationFromWal(key.getNonceGroup(), key.getNonce(), key.getWriteTime());
3137           }
3138 
3139           if (reporter != null) {
3140             intervalEdits += val.size();
3141             if (intervalEdits >= interval) {
3142               // Number of edits interval reached
3143               intervalEdits = 0;
3144               long cur = EnvironmentEdgeManager.currentTimeMillis();
3145               if (lastReport + period <= cur) {
3146                 status.setStatus("Replaying edits..." +
3147                     " skipped=" + skippedEdits +
3148                     " edits=" + editsCount);
3149                 // Timeout reached
3150                 if(!reporter.progress()) {
3151                   msg = "Progressable reporter failed, stopping replay";
3152                   LOG.warn(msg);
3153                   status.abort(msg);
3154                   throw new IOException(msg);
3155                 }
3156                 reported_once = true;
3157                 lastReport = cur;
3158               }
3159             }
3160           }
3161 
3162           // Start coprocessor replay here. The coprocessor is for each WALEdit
3163           // instead of a KeyValue.
3164           if (coprocessorHost != null) {
3165             status.setStatus("Running pre-WAL-restore hook in coprocessors");
3166             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
3167               // if bypass this log entry, ignore it ...
3168               continue;
3169             }
3170           }
3171 
3172           if (firstSeqIdInLog == -1) {
3173             firstSeqIdInLog = key.getLogSeqNum();
3174           }
3175           boolean flush = false;
3176           for (KeyValue kv: val.getKeyValues()) {
3177             // Check this edit is for me. Also, guard against writing the special
3178             // METACOLUMN info such as HBASE::CACHEFLUSH entries
3179             if (kv.matchingFamily(WALEdit.METAFAMILY) ||
3180                 !Bytes.equals(key.getEncodedRegionName(),
3181                   this.getRegionInfo().getEncodedNameAsBytes())) {
3182               //this is a special edit, we should handle it
3183               CompactionDescriptor compaction = WALEdit.getCompaction(kv);
3184               if (compaction != null) {
3185                 //replay the compaction
3186                 completeCompactionMarker(compaction);
3187               }
3188 
3189               skippedEdits++;
3190               continue;
3191             }
3192             // Figure which store the edit is meant for.
3193             if (store == null || !kv.matchingFamily(store.getFamily().getName())) {
3194               store = this.stores.get(kv.getFamily());
3195             }
3196             if (store == null) {
3197               // This should never happen.  Perhaps schema was changed between
3198               // crash and redeploy?
3199               LOG.warn("No family for " + kv);
3200               skippedEdits++;
3201               continue;
3202             }
3203             // Now, figure if we should skip this edit.
3204             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
3205                 .getName())) {
3206               skippedEdits++;
3207               continue;
3208             }
3209             currentEditSeqId = key.getLogSeqNum();
3210             // Once we are over the limit, restoreEdit will keep returning true to
3211             // flush -- but don't flush until we've played all the kvs that make up
3212             // the WALEdit.
3213             flush = restoreEdit(store, kv);
3214             editsCount++;
3215           }
3216           if (flush) internalFlushcache(null, currentEditSeqId, status);
3217 
3218           if (coprocessorHost != null) {
3219             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
3220           }
3221         }
3222       } catch (EOFException eof) {
3223         Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3224         msg = "Encountered EOF. Most likely due to Master failure during " +
3225             "log splitting, so we have this data in another edit.  " +
3226             "Continuing, but renaming " + edits + " as " + p;
3227         LOG.warn(msg, eof);
3228         status.abort(msg);
3229       } catch (IOException ioe) {
3230         // If the IOE resulted from bad file format,
3231         // then this problem is idempotent and retrying won't help
3232         if (ioe.getCause() instanceof ParseException) {
3233           Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3234           msg = "File corruption encountered!  " +
3235               "Continuing, but renaming " + edits + " as " + p;
3236           LOG.warn(msg, ioe);
3237           status.setStatus(msg);
3238         } else {
3239           status.abort(StringUtils.stringifyException(ioe));
3240           // other IO errors may be transient (bad network connection,
3241           // checksum exception on one datanode, etc).  throw & retry
3242           throw ioe;
3243         }
3244       }
3245       if (reporter != null && !reported_once) {
3246         reporter.progress();
3247       }
3248       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
3249         ", firstSequenceidInLog=" + firstSeqIdInLog +
3250         ", maxSequenceidInLog=" + currentEditSeqId + ", path=" + edits;
3251       status.markComplete(msg);
3252       LOG.debug(msg);
3253       return currentEditSeqId;
3254     } finally {
3255       status.cleanup();
3256       if (reader != null) {
3257          reader.close();
3258       }
3259     }
3260   }
3261 
3262   /**
3263    * Call to complete a compaction. It is for the case where we find in the WAL a compaction
3264    * that was not finished.  We could find one when recovering a WAL after a regionserver crash.
3265    * See HBASE-2331.
3266    * @param compaction
3267    */
3268   void completeCompactionMarker(CompactionDescriptor compaction)
3269       throws IOException {
3270     Store store = this.getStore(compaction.getFamilyName().toByteArray());
3271     if (store == null) {
3272       LOG.warn("Found Compaction WAL edit for deleted family:" +
3273           Bytes.toString(compaction.getFamilyName().toByteArray()));
3274       return;
3275     }
3276     store.completeCompactionMarker(compaction);
3277   }
3278 
3279   /**
3280    * Used by tests
3281    * @param s Store to add edit to.
3282    * @param kv KeyValue to add.
3283    * @return True if we should flush.
3284    */
3285   protected boolean restoreEdit(final Store s, final KeyValue kv) {
3286     long kvSize = s.add(kv);
3287     if (this.rsAccounting != null) {
3288       rsAccounting.addAndGetRegionReplayEditsSize(this.getRegionName(), kvSize);
3289     }
3290     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
3291   }
3292 
3293   /*
3294    * @param fs
3295    * @param p File to check.
3296    * @return True if file was zero-length (and if so, we'll delete it in here).
3297    * @throws IOException
3298    */
3299   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
3300       throws IOException {
3301     FileStatus stat = fs.getFileStatus(p);
3302     if (stat.getLen() > 0) return false;
3303     LOG.warn("File " + p + " is zero-length, deleting.");
3304     fs.delete(p, false);
3305     return true;
3306   }
3307 
3308   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
3309     return new HStore(this, family, this.conf);
3310   }
3311 
3312   /**
3313    * Return HStore instance.
3314    * Use with caution.  Exposed for use of fixup utilities.
3315    * @param column Name of column family hosted by this region.
3316    * @return Store that goes with the family on passed <code>column</code>.
3317    * TODO: Make this lookup faster.
3318    */
3319   public Store getStore(final byte[] column) {
3320     return this.stores.get(column);
3321   }
3322 
3323   public Map<byte[], Store> getStores() {
3324     return this.stores;
3325   }
3326 
3327   /**
3328    * Return list of storeFiles for the set of CFs.
3329    * Uses closeLock to prevent the race condition where the region closes while
3330    * we iterate over the stores: if the stores were being closed one by one, some
3331    * of them would return 0 files.
3332    * @return List of storeFiles.
3333    */
3334   public List<String> getStoreFileList(final byte [][] columns)
3335     throws IllegalArgumentException {
3336     List<String> storeFileNames = new ArrayList<String>();
3337     synchronized(closeLock) {
3338       for(byte[] column : columns) {
3339         Store store = this.stores.get(column);
3340         if (store == null) {
3341           throw new IllegalArgumentException("No column family : " +
3342               new String(column) + " available");
3343         }
3344         for (StoreFile storeFile: store.getStorefiles()) {
3345           storeFileNames.add(storeFile.getPath().toString());
3346         }
3347       }
3348     }
3349     return storeFileNames;
3350   }
3351   //////////////////////////////////////////////////////////////////////////////
3352   // Support code
3353   //////////////////////////////////////////////////////////////////////////////
3354 
3355   /** Make sure this is a valid row for the HRegion */
3356   void checkRow(final byte [] row, String op) throws IOException {
3357     if (!rowIsInRange(getRegionInfo(), row)) {
3358       throw new WrongRegionException("Requested row out of range for " +
3359           op + " on HRegion " + this + ", startKey='" +
3360           Bytes.toStringBinary(getStartKey()) + "', getEndKey()='" +
3361           Bytes.toStringBinary(getEndKey()) + "', row='" +
3362           Bytes.toStringBinary(row) + "'");
3363     }
3364   }
3365 
3366   /**
3367    * Tries to acquire a lock on the given row.
3368    * @param waitForLock if true, will block until the lock is available.
3369    *        Otherwise, just tries to obtain the lock and returns
3370    *        false if unavailable.
3371    * @return the row lock if acquired,
3372    *   null if waitForLock was false and the lock was not acquired
3373    * @throws IOException if waitForLock was true and the lock could not be acquired after waiting
3374    */
3375   public RowLock getRowLock(byte[] row, boolean waitForLock) throws IOException {
3376     checkRow(row, "row lock");
3377     startRegionOperation();
3378     try {
3379       HashedBytes rowKey = new HashedBytes(row);
3380       RowLockContext rowLockContext = new RowLockContext(rowKey);
3381 
3382       // loop until we acquire the row lock (unless !waitForLock)
3383       while (true) {
3384         RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
3385         if (existingContext == null) {
3386           // Row is not already locked by any thread, use newly created context.
3387           break;
3388         } else if (existingContext.ownedByCurrentThread()) {
3389           // Row is already locked by current thread, reuse existing context instead.
3390           rowLockContext = existingContext;
3391           break;
3392         } else {
3393           // Row is already locked by some other thread, give up or wait for it
3394           if (!waitForLock) {
3395             return null;
3396           }
3397           try {
3398             if (!existingContext.latch.await(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
3399               throw new IOException("Timed out waiting for lock for row: " + rowKey);
3400             }
3401           } catch (InterruptedException ie) {
3402             LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
3403             InterruptedIOException iie = new InterruptedIOException();
3404             iie.initCause(ie);
3405             throw iie;
3406           }
3407         }
3408       }
3409 
3410       // allocate new lock for this thread
3411       return rowLockContext.newLock();
3412     } finally {
3413       closeRegionOperation();
3414     }
3415   }
3416 
3417   /**
3418    * Acquires a lock on the given row.
3419    * The same thread may acquire multiple locks on the same row.
3420    * @return the acquired row lock
3421    * @throws IOException if the lock could not be acquired after waiting
3422    */
3423   public RowLock getRowLock(byte[] row) throws IOException {
3424     return getRowLock(row, true);
3425   }
3426 
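  // A minimal usage sketch for the row-lock API above, assuming server-side code that
  // already holds a reference to this region ("region") and a row key ("row"):
  //
  //   RowLock lock = region.getRowLock(row);   // blocks until the lock is acquired
  //   try {
  //     // ... perform the row-scoped work ...
  //   } finally {
  //     lock.release();
  //   }
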
3427   /**
3428    * If the given list of row locks is not null, releases all locks.
3429    */
3430   public void releaseRowLocks(List<RowLock> rowLocks) {
3431     if (rowLocks != null) {
3432       for (RowLock rowLock : rowLocks) {
3433         rowLock.release();
3434       }
3435       rowLocks.clear();
3436     }
3437   }
3438 
3439   /**
3440    * Determines whether multiple column families are present.
3441    * Precondition: familyPaths is not null
3442    *
3443    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
3444    */
3445   private static boolean hasMultipleColumnFamilies(
3446       List<Pair<byte[], String>> familyPaths) {
3447     boolean multipleFamilies = false;
3448     byte[] family = null;
3449     for (Pair<byte[], String> pair : familyPaths) {
3450       byte[] fam = pair.getFirst();
3451       if (family == null) {
3452         family = fam;
3453       } else if (!Bytes.equals(family, fam)) {
3454         multipleFamilies = true;
3455         break;
3456       }
3457     }
3458     return multipleFamilies;
3459   }
3460 
3461 
3462   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths,
3463                                 boolean assignSeqId) throws IOException {
3464     return bulkLoadHFiles(familyPaths, assignSeqId, null);
3465   }
3466 
3467   /**
3468    * Attempts to atomically load a group of hfiles.  This is critical for loading
3469    * rows with multiple column families atomically.
3470    *
3471    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
3472    * @param bulkLoadListener Internal hooks enabling massaging/preparation of a
3473    * file about to be bulk loaded
3474    * @param assignSeqId
3475    * @return true if successful, false if failed recoverably
3476    * @throws IOException if failed unrecoverably.
3477    */
3478   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths, boolean assignSeqId,
3479       BulkLoadListener bulkLoadListener) throws IOException {
3480     Preconditions.checkNotNull(familyPaths);
3481     // we need writeLock for multi-family bulk load
3482     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
3483     try {
3484       this.writeRequestsCount.increment();
3485 
3486       // There possibly was a split that happened between when the split keys
3487       // were gathered and when the HRegion's write lock was taken.  We need
3488       // to validate the HFile region before attempting to bulk load all of them.
3489       List<IOException> ioes = new ArrayList<IOException>();
3490       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
3491       for (Pair<byte[], String> p : familyPaths) {
3492         byte[] familyName = p.getFirst();
3493         String path = p.getSecond();
3494 
3495         Store store = getStore(familyName);
3496         if (store == null) {
3497           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
3498               "No such column family " + Bytes.toStringBinary(familyName));
3499           ioes.add(ioe);
3500         } else {
3501           try {
3502             store.assertBulkLoadHFileOk(new Path(path));
3503           } catch (WrongRegionException wre) {
3504             // recoverable (file doesn't fit in region)
3505             failures.add(p);
3506           } catch (IOException ioe) {
3507             // unrecoverable (hdfs problem)
3508             ioes.add(ioe);
3509           }
3510         }
3511       }
3512 
3513       // validation failed because of some sort of IO problem.
3514       if (ioes.size() != 0) {
3515         IOException e = MultipleIOException.createIOException(ioes);
3516         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
3517         throw e;
3518       }
3519 
3520       // validation failed, bail out before doing anything permanent.
3521       if (failures.size() != 0) {
3522         StringBuilder list = new StringBuilder();
3523         for (Pair<byte[], String> p : failures) {
3524           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
3525             .append(p.getSecond());
3526         }
3527         // problem when validating
3528         LOG.warn("There was a recoverable bulk load failure likely due to a" +
3529             " split.  These (family, HFile) pairs were not loaded: " + list);
3530         return false;
3531       }
3532 
3533       for (Pair<byte[], String> p : familyPaths) {
3534         byte[] familyName = p.getFirst();
3535         String path = p.getSecond();
3536         Store store = getStore(familyName);
3537         try {
3538           String finalPath = path;
3539           if(bulkLoadListener != null) {
3540             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
3541           }
3542           store.bulkLoadHFile(finalPath, assignSeqId ? this.sequenceId.incrementAndGet() : -1);
3543           if(bulkLoadListener != null) {
3544             bulkLoadListener.doneBulkLoad(familyName, path);
3545           }
3546         } catch (IOException ioe) {
3547           // A failure here can cause an atomicity violation that we currently
3548           // cannot recover from since it is likely a failed HDFS operation.
3549 
3550           // TODO Need a better story for reverting partial failures due to HDFS.
3551           LOG.error("There was a partial failure due to IO when attempting to" +
3552               " load " + Bytes.toString(p.getFirst()) + " : "+ p.getSecond(), ioe);
3553           if(bulkLoadListener != null) {
3554             try {
3555               bulkLoadListener.failedBulkLoad(familyName, path);
3556             } catch (Exception ex) {
3557               LOG.error("Error while calling failedBulkLoad for family "+
3558                   Bytes.toString(familyName)+" with path "+path, ex);
3559             }
3560           }
3561           throw ioe;
3562         }
3563       }
3564       return true;
3565     } finally {
3566       closeBulkRegionOperation();
3567     }
3568   }
3569 
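  // A minimal usage sketch for the bulk load API above; the column family name and the
  // HFile path are illustrative and the file must already exist on the region's filesystem.
  //
  //   List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
  //   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf"), "/staging/cf/hfile1"));
  //   boolean loaded = region.bulkLoadHFiles(familyPaths, true);   // true = assign a sequence id
  //   if (!loaded) {
  //     // recoverable failure (e.g. a concurrent split); re-split the HFiles and retry
  //   }
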
3570   @Override
3571   public boolean equals(Object o) {
3572     return o instanceof HRegion && Bytes.equals(this.getRegionName(),
3573                                                 ((HRegion) o).getRegionName());
3574   }
3575 
3576   @Override
3577   public int hashCode() {
3578     return Bytes.hashCode(this.getRegionName());
3579   }
3580 
3581   @Override
3582   public String toString() {
3583     return this.getRegionNameAsString();
3584   }
3585 
3586   /**
3587    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
3588    */
3589   class RegionScannerImpl implements RegionScanner {
3590     // Package local for testability
3591     KeyValueHeap storeHeap = null;
3592     /** Heap of key-values that are not essential for the provided filters and are thus read
3593      * on demand, if on-demand column family loading is enabled.*/
3594     KeyValueHeap joinedHeap = null;
3595     /**
3596      * If the joined heap data gathering is interrupted due to scan limits, this will
3597      * contain the row for which we are populating the values.*/
3598     protected KeyValue joinedContinuationRow = null;
3599     // KeyValue indicating that limit is reached when scanning
3600     private final KeyValue KV_LIMIT = new KeyValue();
3601     protected final byte[] stopRow;
3602     private final Filter filter;
3603     private int batch;
3604     protected int isScan;
3605     private boolean filterClosed = false;
3606     private long readPt;
3607     private long maxResultSize;
3608     protected HRegion region;
3609 
3610     @Override
3611     public HRegionInfo getRegionInfo() {
3612       return region.getRegionInfo();
3613     }
3614 
3615     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
3616         throws IOException {
3617 
3618       this.region = region;
3619       this.maxResultSize = scan.getMaxResultSize();
3620       if (scan.hasFilter()) {
3621         this.filter = new FilterWrapper(scan.getFilter());
3622       } else {
3623         this.filter = null;
3624       }
3625 
3626       this.batch = scan.getBatch();
3627       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
3628         this.stopRow = null;
3629       } else {
3630         this.stopRow = scan.getStopRow();
3631       }
3632       // If we are doing a get, we want to be [startRow,endRow]; normally
3633       // it is [startRow,endRow) and if startRow=endRow we get nothing.
3634       this.isScan = scan.isGetScan() ? -1 : 0;
3635 
3636       // synchronize on scannerReadPoints so that nobody calculates
3637       // getSmallestReadPoint, before scannerReadPoints is updated.
3638       IsolationLevel isolationLevel = scan.getIsolationLevel();
3639       synchronized(scannerReadPoints) {
3640         this.readPt = getReadpoint(isolationLevel);
3641         scannerReadPoints.put(this, this.readPt);
3642       }
3643 
3644       // Here we separate all scanners into two lists - scanner that provide data required
3645       // by the filter to operate (scanners list) and all others (joinedScanners list).
3646       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>();
3647       List<KeyValueScanner> joinedScanners = new ArrayList<KeyValueScanner>();
3648       if (additionalScanners != null) {
3649         scanners.addAll(additionalScanners);
3650       }
3651 
3652       for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
3653           scan.getFamilyMap().entrySet()) {
3654         Store store = stores.get(entry.getKey());
3655         KeyValueScanner scanner = store.getScanner(scan, entry.getValue(), this.readPt);
3656         if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
3657           || this.filter.isFamilyEssential(entry.getKey())) {
3658           scanners.add(scanner);
3659         } else {
3660           joinedScanners.add(scanner);
3661         }
3662       }
3663       initializeKVHeap(scanners, joinedScanners, region);
3664     }
3665 
3666     RegionScannerImpl(Scan scan, HRegion region) throws IOException {
3667       this(scan, null, region);
3668     }
3669 
3670     protected void initializeKVHeap(List<KeyValueScanner> scanners,
3671         List<KeyValueScanner> joinedScanners, HRegion region)
3672         throws IOException {
3673       this.storeHeap = new KeyValueHeap(scanners, region.comparator);
3674       if (!joinedScanners.isEmpty()) {
3675         this.joinedHeap = new KeyValueHeap(joinedScanners, region.comparator);
3676       }
3677     }
3678 
3679     @Override
3680     public long getMaxResultSize() {
3681       return maxResultSize;
3682     }
3683 
3684     @Override
3685     public long getMvccReadPoint() {
3686       return this.readPt;
3687     }
3688 
3689     /**
3690      * Reset both the filter and the old filter.
3691      *
3692      * @throws IOException in case a filter raises an I/O exception.
3693      */
3694     protected void resetFilters() throws IOException {
3695       if (filter != null) {
3696         filter.reset();
3697       }
3698     }
3699 
3700     @Override
3701     public boolean next(List<Cell> outResults)
3702         throws IOException {
3703       // apply the batching limit by default
3704       return next(outResults, batch);
3705     }
3706 
3707     @Override
3708     public synchronized boolean next(List<Cell> outResults, int limit) throws IOException {
3709       if (this.filterClosed) {
3710         throw new UnknownScannerException("Scanner was closed (timed out?) " +
3711             "after we renewed it. Could be caused by a very slow scanner " +
3712             "or a lengthy garbage collection");
3713       }
3714       startRegionOperation(Operation.SCAN);
3715       readRequestsCount.increment();
3716       try {
3717         return nextRaw(outResults, limit);
3718       } finally {
3719         closeRegionOperation(Operation.SCAN);
3720       }
3721     }
3722 
3723     @Override
3724     public boolean nextRaw(List<Cell> outResults)
3725         throws IOException {
3726       return nextRaw(outResults, batch);
3727     }
3728 
3729     @Override
3730     public boolean nextRaw(List<Cell> outResults, int limit) throws IOException {
3731       boolean returnResult;
3732       if (outResults.isEmpty()) {
3733         // Usually outResults is empty. This is true when next is called
3734         // to handle scan or get operation.
3735         returnResult = nextInternal(outResults, limit);
3736       } else {
3737         List<Cell> tmpList = new ArrayList<Cell>();
3738         returnResult = nextInternal(tmpList, limit);
3739         outResults.addAll(tmpList);
3740       }
3741       resetFilters();
3742       if (isFilterDoneInternal()) {
3743         return false;
3744       }
3745       if (region != null && region.metricsRegion != null) {
3746         long totalSize = 0;
3747         for(Cell c:outResults) {
3748           // TODO clean up
3749           KeyValue kv = KeyValueUtil.ensureKeyValue(c);
3750           totalSize += kv.getLength();
3751         }
3752         region.metricsRegion.updateScanNext(totalSize);
3753       }
3754       return returnResult;
3755     }
3756 
3757 
3758     private void populateFromJoinedHeap(List<Cell> results, int limit)
3759         throws IOException {
3760       assert joinedContinuationRow != null;
3761       KeyValue kv = populateResult(results, this.joinedHeap, limit,
3762           joinedContinuationRow.getBuffer(), joinedContinuationRow.getRowOffset(),
3763           joinedContinuationRow.getRowLength());
3764       if (kv != KV_LIMIT) {
3765         // We are done with this row, reset the continuation.
3766         joinedContinuationRow = null;
3767       }
3768       // As the data is obtained from two independent heaps, we need to
3769       // ensure that result list is sorted, because Result relies on that.
3770       Collections.sort(results, comparator);
3771     }
3772 
3773     /**
3774      * Fetches records for the current row into the results list, until the next row or the limit (if not -1) is reached.
3775      * @param results
3776      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
3777      * @param limit Max amount of KVs to place in result list, -1 means no limit.
3778      * @param currentRow Byte array with key we are fetching.
3779      * @param offset offset for currentRow
3780      * @param length length for currentRow
3781      * @return KV_LIMIT if limit reached, next KeyValue otherwise.
3782      */
3783     private KeyValue populateResult(List<Cell> results, KeyValueHeap heap, int limit,
3784         byte[] currentRow, int offset, short length) throws IOException {
3785       KeyValue nextKv;
3786       do {
3787         heap.next(results, limit - results.size());
3788         if (limit > 0 && results.size() == limit) {
3789           return KV_LIMIT;
3790         }
3791         nextKv = heap.peek();
3792       } while (nextKv != null && nextKv.matchingRow(currentRow, offset, length));
3793 
3794       return nextKv;
3795     }
3796 
3797     /*
3798      * @return True if a filter has ruled that the scanner is done.
3799      */
3800     @Override
3801     public synchronized boolean isFilterDone() throws IOException {
3802       return isFilterDoneInternal();
3803     }
3804 
3805     private boolean isFilterDoneInternal() throws IOException {
3806       return this.filter != null && this.filter.filterAllRemaining();
3807     }
3808 
3809     private boolean nextInternal(List<Cell> results, int limit)
3810     throws IOException {
3811       if (!results.isEmpty()) {
3812         throw new IllegalArgumentException("First parameter should be an empty list");
3813       }
3814       RpcCallContext rpcCall = RpcServer.getCurrentCall();
3815       // The loop here is used only when at some point during the next we determine
3816       // that due to effects of filters or otherwise, we have an empty row in the result.
3817       // Then we loop and try again. Otherwise, we must get out on the first iteration via return,
3818       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
3819       // and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow).
3820       while (true) {
3821         if (rpcCall != null) {
3822           // If a user specifies a too-restrictive or too-slow scanner, the
3823           // client might time out and disconnect while the server side
3824           // is still processing the request. We should abort aggressively
3825           // in that case.
3826           long afterTime = rpcCall.disconnectSince();
3827           if (afterTime >= 0) {
3828             throw new CallerDisconnectedException(
3829                 "Aborting on region " + getRegionNameAsString() + ", call " +
3830                     this + " after " + afterTime + " ms, since " +
3831                     "caller disconnected");
3832           }
3833         }
3834 
3835         // Let's see what we have in the storeHeap.
3836         KeyValue current = this.storeHeap.peek();
3837 
3838         byte[] currentRow = null;
3839         int offset = 0;
3840         short length = 0;
3841         if (current != null) {
3842           currentRow = current.getBuffer();
3843           offset = current.getRowOffset();
3844           length = current.getRowLength();
3845         }
3846         boolean stopRow = isStopRow(currentRow, offset, length);
3847         // Check if we were getting data from the joinedHeap and hit the limit.
3848         // If not, then it's main path - getting results from storeHeap.
3849         if (joinedContinuationRow == null) {
3850           // First, check if we are at a stop row. If so, there are no more results.
3851           if (stopRow) {
3852             if (filter != null && filter.hasFilterRow()) {
3853               filter.filterRowCells(results);
3854             }
3855             return false;
3856           }
3857 
3858           // Check if rowkey filter wants to exclude this row. If so, loop to next.
3859           // Technically, if we hit limits before on this row, we don't need this call.
3860           if (filterRowKey(currentRow, offset, length)) {
3861             boolean moreRows = nextRow(currentRow, offset, length);
3862             if (!moreRows) return false;
3863             results.clear();
3864             continue;
3865           }
3866 
3867           KeyValue nextKv = populateResult(results, this.storeHeap, limit, currentRow, offset,
3868               length);
3869           // Ok, we are good, let's try to get some results from the main heap.
3870           if (nextKv == KV_LIMIT) {
3871             if (this.filter != null && filter.hasFilterRow()) {
3872               throw new IncompatibleFilterException(
3873                 "Filter whose hasFilterRow() returns true is incompatible with scan with limit!");
3874             }
3875             return true; // We hit the limit.
3876           }
3877 
3878           stopRow = nextKv == null ||
3879               isStopRow(nextKv.getBuffer(), nextKv.getRowOffset(), nextKv.getRowLength());
3880           // save that the row was empty before filters applied to it.
3881           final boolean isEmptyRow = results.isEmpty();
3882 
3883           // We have the part of the row necessary for filtering (all of it, usually).
3884           // First filter with the filterRow(List).
3885           if (filter != null && filter.hasFilterRow()) {
3886             filter.filterRowCells(results);
3887           }
3888           
3889           if (isEmptyRow || filterRow()) {
3890             results.clear();
3891             boolean moreRows = nextRow(currentRow, offset, length);
3892             if (!moreRows) return false;
3893 
3894             // This row was totally filtered out, if this is NOT the last row,
3895             // we should continue on. Otherwise, nothing else to do.
3896             if (!stopRow) continue;
3897             return false;
3898           }
3899 
3900           // Ok, we are done with storeHeap for this row.
3901           // Now we may need to fetch additional, non-essential data into row.
3902           // These values are not needed for filter to work, so we postpone their
3903           // fetch to (possibly) reduce amount of data loads from disk.
3904           if (this.joinedHeap != null) {
3905             KeyValue nextJoinedKv = joinedHeap.peek();
3906             // If joinedHeap is pointing to some other row, try to seek to a correct one.
3907             boolean mayHaveData =
3908               (nextJoinedKv != null && nextJoinedKv.matchingRow(currentRow, offset, length))
3909               || (this.joinedHeap.requestSeek(KeyValue.createFirstOnRow(currentRow, offset, length),
3910                 true, true)
3911                 && joinedHeap.peek() != null
3912                 && joinedHeap.peek().matchingRow(currentRow, offset, length));
3913             if (mayHaveData) {
3914               joinedContinuationRow = current;
3915               populateFromJoinedHeap(results, limit);
3916             }
3917           }
3918         } else {
3919           // Populating from the joined heap was stopped by limits, populate some more.
3920           populateFromJoinedHeap(results, limit);
3921         }
3922 
3923         // We may have just called populateFromJoinedHeap and hit the limits. If that is
3924         // the case, we need to call it again on the next next() invocation.
3925         if (joinedContinuationRow != null) {
3926           return true;
3927         }
3928 
3929         // Finally, we are done with both joinedHeap and storeHeap.
3930         // Double check to prevent empty rows from appearing in result. It could be
3931         // the case when SingleColumnValueExcludeFilter is used.
3932         if (results.isEmpty()) {
3933           boolean moreRows = nextRow(currentRow, offset, length);
3934           if (!moreRows) return false;
3935           if (!stopRow) continue;
3936         }
3937 
3938         // We are done. Return the result.
3939         return !stopRow;
3940       }
3941     }
3942 
3943     /**
3944      * This function is to maintain backward compatibility for 0.94 filters. HBASE-6429 combines
3945      * both the filterRow() and filterRow(List<KeyValue> kvs) functions. Code written for 0.94 or
3946      * older may not implement hasFilterRow() as HBASE-6429 expects, because 0.94 hasFilterRow()
3947      * only returns true when filterRow(List<KeyValue> kvs) is overridden, not filterRow().
3948      * Therefore, filterRow() would otherwise be skipped, so we invoke it explicitly here.
3949      */
3950     private boolean filterRow() throws IOException {
3951       // when hasFilterRow returns true, filter.filterRow() will be called automatically inside
3952       // filterRowCells(List<Cell> kvs) so we skip that scenario here.
3953       return filter != null && (!filter.hasFilterRow())
3954           && filter.filterRow();
3955     }
3956     
3957     private boolean filterRowKey(byte[] row, int offset, short length) throws IOException {
3958       return filter != null
3959           && filter.filterRowKey(row, offset, length);
3960     }
3961 
3962     protected boolean nextRow(byte [] currentRow, int offset, short length) throws IOException {
3963       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
3964       KeyValue next;
3965       while ((next = this.storeHeap.peek()) != null &&
3966              next.matchingRow(currentRow, offset, length)) {
3967         this.storeHeap.next(MOCKED_LIST);
3968       }
3969       resetFilters();
3970       // Calling the hook in CP which allows it to do a fast forward
3971       return this.region.getCoprocessorHost() == null
3972           || this.region.getCoprocessorHost()
3973               .postScannerFilterRow(this, currentRow, offset, length);
3974     }
3975 
3976     protected boolean isStopRow(byte[] currentRow, int offset, short length) {
3977       return currentRow == null ||
3978           (stopRow != null &&
3979           comparator.compareRows(stopRow, 0, stopRow.length,
3980             currentRow, offset, length) <= isScan);
3981     }
3982 
3983     @Override
3984     public synchronized void close() {
3985       if (storeHeap != null) {
3986         storeHeap.close();
3987         storeHeap = null;
3988       }
3989       if (joinedHeap != null) {
3990         joinedHeap.close();
3991         joinedHeap = null;
3992       }
3993       // no need to synchronize here.
3994       scannerReadPoints.remove(this);
3995       this.filterClosed = true;
3996     }
3997 
3998     KeyValueHeap getStoreHeapForTesting() {
3999       return storeHeap;
4000     }
4001 
4002     @Override
4003     public synchronized boolean reseek(byte[] row) throws IOException {
4004       if (row == null) {
4005         throw new IllegalArgumentException("Row cannot be null.");
4006       }
4007       boolean result = false;
4008       startRegionOperation();
4009       try {
4010         KeyValue kv = KeyValue.createFirstOnRow(row);
4011         // use request seek to make use of the lazy seek option. See HBASE-5520
4012         result = this.storeHeap.requestSeek(kv, true, true);
4013         if (this.joinedHeap != null) {
4014           result = this.joinedHeap.requestSeek(kv, true, true) || result;
4015         }
4016       } finally {
4017         closeRegionOperation();
4018       }
4019       return result;
4020     }
4021   }
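
  /*
   * A minimal caller-side sketch of driving the scanner implemented above (illustrative
   * only, not taken from this class): obtain a RegionScanner from an open HRegion,
   * call next(List) until it reports no more rows, then close it. The names "region"
   * and "scan" are assumptions.
   *
   *   RegionScanner scanner = region.getScanner(scan);
   *   try {
   *     List<Cell> cells = new ArrayList<Cell>();
   *     boolean moreRows;
   *     do {
   *       moreRows = scanner.next(cells);   // fills one row's cells, honoring filters
   *       // ... process cells ...
   *       cells.clear();
   *     } while (moreRows);
   *   } finally {
   *     scanner.close();                    // releases the store heap and joined heap
   *   }
   */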
4022 
4023   // Utility methods
4024   /**
4025    * A utility method to create new instances of HRegion based on the
4026    * {@link HConstants#REGION_IMPL} configuration property.
4027    * @param tableDir qualified path of directory where region should be located,
4028    * usually the table directory.
4029    * @param log The HLog is the outbound log for any updates to the HRegion
4030    * (There's a single HLog for all the HRegions on a single HRegionServer.)
4031    * The log file is a logfile from the previous execution that's
4032    * custom-computed for this HRegion. The HRegionServer computes and sorts the
4033    * appropriate log info for this HRegion. If there is a previous log file
4034    * (implying that the HRegion has been written-to before), then read it from
4035    * the supplied path.
4036    * @param fs is the filesystem.
4037    * @param conf is global configuration settings.
4038    * @param regionInfo - HRegionInfo that describes the region
4039    * to be instantiated
4040    * @param htd the table descriptor
4041    * @param rsServices
4042    * @return the new instance
4043    */
4044   static HRegion newHRegion(Path tableDir, HLog log, FileSystem fs,
4045       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
4046       RegionServerServices rsServices) {
4047     try {
4048       @SuppressWarnings("unchecked")
4049       Class<? extends HRegion> regionClass =
4050           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
4051 
4052       Constructor<? extends HRegion> c =
4053           regionClass.getConstructor(Path.class, HLog.class, FileSystem.class,
4054               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
4055               RegionServerServices.class);
4056 
4057       return c.newInstance(tableDir, log, fs, conf, regionInfo, htd, rsServices);
4058     } catch (Throwable e) {
4059       // todo: what should I throw here?
4060       throw new IllegalStateException("Could not instantiate a region instance.", e);
4061     }
4062   }
4063 
4064   /**
4065    * Convenience method creating new HRegions. Used by createTable and by the
4066    * bootstrap code in the HMaster constructor.
4067    * Note, this method creates an {@link HLog} for the created region. It
4068    * needs to be closed explicitly.  Use {@link HRegion#getLog()} to get
4069    * access.  <b>When done with a region created using this method, you will
4070    * need to explicitly close the {@link HLog} it created too; it will not be
4071    * done for you.  Not closing the log will leave at least a daemon thread
4072    * running.</b>  Call {@link #closeHRegion(HRegion)} and it will do
4073    * necessary cleanup for you.
4074    * @param info Info for region to create.
4075    * @param rootDir Root directory for HBase instance
4076    * @param conf
4077    * @param hTableDescriptor
4078    * @return new HRegion
4079    *
4080    * @throws IOException
4081    */
4082   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4083       final Configuration conf, final HTableDescriptor hTableDescriptor)
4084   throws IOException {
4085     return createHRegion(info, rootDir, conf, hTableDescriptor, null);
4086   }
4087 
4088   /**
4089    * This will do the necessary cleanup a call to
4090    * {@link #createHRegion(HRegionInfo, Path, Configuration, HTableDescriptor)}
4091    * requires.  This method will close the region and then close its
4092    * associated {@link HLog} file.  If you called the other createHRegion,
4093    * the one that takes an {@link HLog} instance, be aware that this method
4094    * still calls {@link HLog#closeAndDelete()} on the {@link HLog} the
4095    * HRegion was carrying.
4096    * @param r
4097    * @throws IOException
4098    */
4099   public static void closeHRegion(final HRegion r) throws IOException {
4100     if (r == null) return;
4101     r.close();
4102     if (r.getLog() == null) return;
4103     r.getLog().closeAndDelete();
4104   }
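
  /*
   * A minimal sketch of the create/close lifecycle described above (illustrative only):
   * the createHRegion overload that is given no HLog creates one of its own, so the
   * region should be passed to closeHRegion so both the region and its log are closed.
   * The table name, family name and "rootDir" below are assumptions.
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   HTableDescriptor htd = new HTableDescriptor(TableName.valueOf("example"));
   *   htd.addFamily(new HColumnDescriptor("f"));
   *   HRegionInfo hri = new HRegionInfo(htd.getTableName(), null, null);
   *   HRegion region = HRegion.createHRegion(hri, rootDir, conf, htd);
   *   try {
   *     // ... use the region ...
   *   } finally {
   *     HRegion.closeHRegion(region);   // closes the region and the HLog it created
   *   }
   */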
4105 
4106   /**
4107    * Convenience method creating new HRegions. Used by createTable.
4108    * The {@link HLog} for the created region needs to be closed explicitly.
4109    * Use {@link HRegion#getLog()} to get access.
4110    *
4111    * @param info Info for region to create.
4112    * @param rootDir Root directory for HBase instance
4113    * @param conf
4114    * @param hTableDescriptor
4115    * @param hlog shared HLog
4116    * @param initialize - true to initialize the region
4117    * @return new HRegion
4118    *
4119    * @throws IOException
4120    */
4121   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4122                                       final Configuration conf,
4123                                       final HTableDescriptor hTableDescriptor,
4124                                       final HLog hlog,
4125                                       final boolean initialize)
4126       throws IOException {
4127     return createHRegion(info, rootDir, conf, hTableDescriptor,
4128         hlog, initialize, false);
4129   }
4130 
4131   /**
4132    * Convenience method creating new HRegions. Used by createTable.
4133    * The {@link HLog} for the created region needs to be closed
4134    * explicitly, if it is not null.
4135    * Use {@link HRegion#getLog()} to get access.
4136    *
4137    * @param info Info for region to create.
4138    * @param rootDir Root directory for HBase instance
4139    * @param conf
4140    * @param hTableDescriptor
4141    * @param hlog shared HLog
4142    * @param initialize - true to initialize the region
4143    * @param ignoreHLog - true to skip generating a new hlog if it is null; mostly used by createTable
4144    * @return new HRegion
4145    * @throws IOException
4146    */
4147   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4148                                       final Configuration conf,
4149                                       final HTableDescriptor hTableDescriptor,
4150                                       final HLog hlog,
4151                                       final boolean initialize, final boolean ignoreHLog)
4152       throws IOException {
4153     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
4154     return createHRegion(info, rootDir, tableDir, conf, hTableDescriptor, hlog, initialize, ignoreHLog);
4155   }
4156 
4157   /**
4158    * Convenience method creating new HRegions. Used by createTable.
4159    * The {@link HLog} for the created region needs to be closed
4160    * explicitly, if it is not null.
4161    * Use {@link HRegion#getLog()} to get access.
4162    *
4163    * @param info Info for region to create.
4164    * @param rootDir Root directory for HBase instance
4165    * @param tableDir table directory
4166    * @param conf
4167    * @param hTableDescriptor
4168    * @param hlog shared HLog
4169    * @param initialize - true to initialize the region
4170    * @param ignoreHLog - true to skip generating a new hlog if it is null; mostly used by createTable
4171    * @return new HRegion
4172    * @throws IOException
4173    */
4174   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir, final Path tableDir,
4175                                       final Configuration conf,
4176                                       final HTableDescriptor hTableDescriptor,
4177                                       final HLog hlog,
4178                                       final boolean initialize, final boolean ignoreHLog)
4179       throws IOException {
4180     LOG.info("creating HRegion " + info.getTable().getNameAsString()
4181         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir +
4182         " Table name == " + info.getTable().getNameAsString());
4183     FileSystem fs = FileSystem.get(conf);
4184     HRegionFileSystem rfs = HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
4185     HLog effectiveHLog = hlog;
4186     if (hlog == null && !ignoreHLog) {
4187       effectiveHLog = HLogFactory.createHLog(fs, rfs.getRegionDir(),
4188                                              HConstants.HREGION_LOGDIR_NAME, conf);
4189     }
4190     HRegion region = HRegion.newHRegion(tableDir,
4191         effectiveHLog, fs, conf, info, hTableDescriptor, null);
4192     if (initialize) {
4193       // If initializing, set the sequenceId. It is also required by HLogPerformanceEvaluation when
4194       // verifying the WALEdits.
4195       region.setSequenceId(region.initialize());
4196     }
4197     return region;
4198   }
4199 
4200   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
4201                                       final Configuration conf,
4202                                       final HTableDescriptor hTableDescriptor,
4203                                       final HLog hlog)
4204     throws IOException {
4205     return createHRegion(info, rootDir, conf, hTableDescriptor, hlog, true);
4206   }
4207 
4208 
4209   /**
4210    * Open a Region.
4211    * @param info Info for region to be opened.
4212    * @param wal HLog for region to use. This method will call
4213    * HLog#setSequenceNumber(long) passing the result of the call to
4214    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4215    * up.  HRegionStore does this every time it opens a new region.
4216    * @param conf
4217    * @return new HRegion
4218    *
4219    * @throws IOException
4220    */
4221   public static HRegion openHRegion(final HRegionInfo info,
4222       final HTableDescriptor htd, final HLog wal,
4223       final Configuration conf)
4224   throws IOException {
4225     return openHRegion(info, htd, wal, conf, null, null);
4226   }
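
  /*
   * A minimal sketch of opening a region with the simplest overload above (illustrative
   * only). "info", "htd", "wal" and "conf" stand for an existing HRegionInfo, its table
   * descriptor, a shared HLog and a Configuration; all four names are assumptions.
   *
   *   HRegion region = HRegion.openHRegion(info, htd, wal, conf);
   *   try {
   *     // ... serve reads and writes ...
   *   } finally {
   *     region.close();   // note: the shared HLog is not closed here
   *   }
   */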
4227 
4228   /**
4229    * Open a Region.
4230    * @param info Info for region to be opened
4231    * @param htd the table descriptor
4232    * @param wal HLog for region to use. This method will call
4233    * HLog#setSequenceNumber(long) passing the result of the call to
4234    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4235    * up.  HRegionStore does this every time it opens a new region.
4236    * @param conf The Configuration object to use.
4237    * @param rsServices An interface we can request flushes against.
4238    * @param reporter An interface we can report progress against.
4239    * @return new HRegion
4240    *
4241    * @throws IOException
4242    */
4243   public static HRegion openHRegion(final HRegionInfo info,
4244     final HTableDescriptor htd, final HLog wal, final Configuration conf,
4245     final RegionServerServices rsServices,
4246     final CancelableProgressable reporter)
4247   throws IOException {
4248     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
4249   }
4250 
4251   /**
4252    * Open a Region.
4253    * @param rootDir Root directory for HBase instance
4254    * @param info Info for region to be opened.
4255    * @param htd the table descriptor
4256    * @param wal HLog for region to use. This method will call
4257    * HLog#setSequenceNumber(long) passing the result of the call to
4258    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4259    * up.  HRegionStore does this every time it opens a new region.
4260    * @param conf The Configuration object to use.
4261    * @return new HRegion
4262    * @throws IOException
4263    */
4264   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
4265       final HTableDescriptor htd, final HLog wal, final Configuration conf)
4266   throws IOException {
4267     return openHRegion(rootDir, info, htd, wal, conf, null, null);
4268   }
4269 
4270   /**
4271    * Open a Region.
4272    * @param rootDir Root directory for HBase instance
4273    * @param info Info for region to be opened.
4274    * @param htd the table descriptor
4275    * @param wal HLog for region to use. This method will call
4276    * HLog#setSequenceNumber(long) passing the result of the call to
4277    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4278    * up.  HRegionStore does this every time it opens a new region.
4279    * @param conf The Configuration object to use.
4280    * @param rsServices An interface we can request flushes against.
4281    * @param reporter An interface we can report progress against.
4282    * @return new HRegion
4283    * @throws IOException
4284    */
4285   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
4286       final HTableDescriptor htd, final HLog wal, final Configuration conf,
4287       final RegionServerServices rsServices,
4288       final CancelableProgressable reporter)
4289   throws IOException {
4290     FileSystem fs = null;
4291     if (rsServices != null) {
4292       fs = rsServices.getFileSystem();
4293     }
4294     if (fs == null) {
4295       fs = FileSystem.get(conf);
4296     }
4297     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
4298   }
4299 
4300   /**
4301    * Open a Region.
4302    * @param conf The Configuration object to use.
4303    * @param fs Filesystem to use
4304    * @param rootDir Root directory for HBase instance
4305    * @param info Info for region to be opened.
4306    * @param htd the table descriptor
4307    * @param wal HLog for region to use. This method will call
4308    * HLog#setSequenceNumber(long) passing the result of the call to
4309    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4310    * up.  HRegionStore does this every time it opens a new region.
4311    * @return new HRegion
4312    * @throws IOException
4313    */
4314   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4315       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal)
4316       throws IOException {
4317     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
4318   }
4319 
4320   /**
4321    * Open a Region.
4322    * @param conf The Configuration object to use.
4323    * @param fs Filesystem to use
4324    * @param rootDir Root directory for HBase instance
4325    * @param info Info for region to be opened.
4326    * @param htd the table descriptor
4327    * @param wal HLog for region to use. This method will call
4328    * HLog#setSequenceNumber(long) passing the result of the call to
4329    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4330    * up.  HRegionStore does this every time it opens a new region.
4331    * @param rsServices An interface we can request flushes against.
4332    * @param reporter An interface we can report progress against.
4333    * @return new HRegion
4334    * @throws IOException
4335    */
4336   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4337       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal,
4338       final RegionServerServices rsServices, final CancelableProgressable reporter)
4339       throws IOException {
4340     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
4341     return openHRegion(conf, fs, rootDir, tableDir, info, htd, wal, rsServices, reporter);
4342   }
4343 
4344   /**
4345    * Open a Region.
4346    * @param conf The Configuration object to use.
4347    * @param fs Filesystem to use
4348    * @param rootDir Root directory for HBase instance
4349    * @param info Info for region to be opened.
4350    * @param htd the table descriptor
4351    * @param wal HLog for region to use. This method will call
4352    * HLog#setSequenceNumber(long) passing the result of the call to
4353    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4354    * up.  HRegionStore does this every time it opens a new region.
4355    * @param rsServices An interface we can request flushes against.
4356    * @param reporter An interface we can report progress against.
4357    * @return new HRegion
4358    * @throws IOException
4359    */
4360   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4361       final Path rootDir, final Path tableDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal,
4362       final RegionServerServices rsServices, final CancelableProgressable reporter)
4363       throws IOException {
4364     if (info == null) throw new NullPointerException("Passed region info is null");
4365     if (LOG.isDebugEnabled()) {
4366       LOG.debug("Opening region: " + info);
4367     }
4368     HRegion r = HRegion.newHRegion(tableDir, wal, fs, conf, info, htd, rsServices);
4369     return r.openHRegion(reporter);
4370   }
4371 
4372 
4373   /**
4374    * Useful when reopening a closed region (normally for unit tests)
4375    * @param other original object
4376    * @param reporter An interface we can report progress against.
4377    * @return new HRegion
4378    * @throws IOException
4379    */
4380   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
4381       throws IOException {
4382     HRegionFileSystem regionFs = other.getRegionFileSystem();
4383     HRegion r = newHRegion(regionFs.getTableDir(), other.getLog(), regionFs.getFileSystem(),
4384         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
4385     return r.openHRegion(reporter);
4386   }
4387 
4388   /**
4389    * Open HRegion.
4390    * Calls initialize and sets sequenceid.
4391    * @param reporter
4392    * @return Returns <code>this</code>
4393    * @throws IOException
4394    */
4395   protected HRegion openHRegion(final CancelableProgressable reporter)
4396   throws IOException {
4397     checkCompressionCodecs();
4398 
4399     this.openSeqNum = initialize(reporter);
4400     this.setSequenceId(openSeqNum);
4401     return this;
4402   }
4403 
4404   private void checkCompressionCodecs() throws IOException {
4405     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
4406       CompressionTest.testCompression(fam.getCompression());
4407       CompressionTest.testCompression(fam.getCompactionCompression());
4408     }
4409   }
4410 
4411   /**
4412    * Create a daughter region given a temp directory with the region data.
4413    * @param hri Spec. for daughter region to open.
4414    * @throws IOException
4415    */
4416   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
4417     // Move the files from the temporary .splits to the final /table/region directory
4418     fs.commitDaughterRegion(hri);
4419 
4420     // Create the daughter HRegion instance
4421     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getLog(), fs.getFileSystem(),
4422         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
4423     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
4424     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
4425     return r;
4426   }
4427 
4428   /**
4429    * Create a merged region given a temp directory with the region data.
4430    * @param mergedRegionInfo the HRegionInfo for the merged region
4431    * @param region_b another merging region
4432    * @return merged hregion
4433    * @throws IOException
4434    */
4435   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
4436       final HRegion region_b) throws IOException {
4437     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getLog(),
4438         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
4439         this.getTableDesc(), this.rsServices);
4440     r.readRequestsCount.set(this.getReadRequestsCount()
4441         + region_b.getReadRequestsCount());
4442     r.writeRequestsCount.set(this.getWriteRequestsCount()
4443         + region_b.getWriteRequestsCount());
4444     this.fs.commitMergedRegion(mergedRegionInfo);
4445     return r;
4446   }
4447 
4448   /**
4449    * Inserts a new region's meta information into the passed
4450    * <code>meta</code> region. Used by the HMaster bootstrap code when adding
4451    * a new table to the hbase:meta table.
4452    *
4453    * @param meta hbase:meta HRegion to be updated
4454    * @param r HRegion to add to <code>meta</code>
4455    *
4456    * @throws IOException
4457    */
4458   // TODO remove since only test and merge use this
4459   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
4460     meta.checkResources();
4461     // The row key is the region name
4462     byte[] row = r.getRegionName();
4463     final long now = EnvironmentEdgeManager.currentTimeMillis();
4464     final List<Cell> cells = new ArrayList<Cell>(2);
4465     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
4466       HConstants.REGIONINFO_QUALIFIER, now,
4467       r.getRegionInfo().toByteArray()));
4468     // Set into the root table the version of the meta table.
4469     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
4470       HConstants.META_VERSION_QUALIFIER, now,
4471       Bytes.toBytes(HConstants.META_VERSION)));
4472     meta.put(row, HConstants.CATALOG_FAMILY, cells);
4473   }
4474 
4475   /**
4476    * Computes the Path of the HRegion
4477    *
4478    * @param tabledir qualified path for table
4479    * @param name ENCODED region name
4480    * @return Path of HRegion directory
4481    */
4482   @Deprecated
4483   public static Path getRegionDir(final Path tabledir, final String name) {
4484     return new Path(tabledir, name);
4485   }
4486 
4487   /**
4488    * Computes the Path of the HRegion
4489    *
4490    * @param rootdir qualified path of HBase root directory
4491    * @param info HRegionInfo for the region
4492    * @return qualified path of region directory
4493    */
4494   @Deprecated
4495   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
4496     return new Path(
4497       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
4498   }
4499 
4500   /**
4501    * Determines if the specified row is within the row range of the
4502    * given HRegionInfo.
4503    *
4504    * @param info HRegionInfo that specifies the row range
4505    * @param row row to be checked
4506    * @return true if the row is within the range specified by the HRegionInfo
4507    */
4508   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
4509     return ((info.getStartKey().length == 0) ||
4510         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
4511         ((info.getEndKey().length == 0) ||
4512             (Bytes.compareTo(info.getEndKey(), row) > 0));
4513   }
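
  /*
   * A small illustrative check of the range test above (not from the original source):
   * the start key is inclusive, the end key is exclusive, and an empty key means
   * "unbounded". The keys used are assumptions.
   *
   *   HRegionInfo hri = new HRegionInfo(TableName.valueOf("t"),
   *       Bytes.toBytes("b"), Bytes.toBytes("m"));
   *   boolean inRange  = HRegion.rowIsInRange(hri, Bytes.toBytes("b"));  // true, start is inclusive
   *   boolean outRange = HRegion.rowIsInRange(hri, Bytes.toBytes("m"));  // false, end is exclusive
   */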
4514 
4515   /**
4516    * Merge two HRegions.  The regions must be adjacent and must not overlap.
4517    *
4518    * @param srcA
4519    * @param srcB
4520    * @return new merged HRegion
4521    * @throws IOException
4522    */
4523   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
4524   throws IOException {
4525     HRegion a = srcA;
4526     HRegion b = srcB;
4527 
4528     // Make sure that srcA comes first; important for key-ordering during
4529     // write of the merged file.
4530     if (srcA.getStartKey() == null) {
4531       if (srcB.getStartKey() == null) {
4532         throw new IOException("Cannot merge two regions with null start key");
4533       }
4534       // A's start key is null but B's isn't. Assume A comes before B
4535     } else if ((srcB.getStartKey() == null) ||
4536       (Bytes.compareTo(srcA.getStartKey(), srcB.getStartKey()) > 0)) {
4537       a = srcB;
4538       b = srcA;
4539     }
4540 
4541     if (Bytes.compareTo(a.getEndKey(), b.getStartKey()) != 0) {
4542       throw new IOException("Cannot merge non-adjacent regions");
4543     }
4544     return merge(a, b);
4545   }
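
  /*
   * Illustrative call (not from the original source): merging two adjacent regions of
   * the same table. "regionA" and "regionB" are assumed to be open HRegions whose key
   * ranges meet; argument order does not matter since mergeAdjacent sorts by start key.
   *
   *   HRegion merged = HRegion.mergeAdjacent(regionA, regionB);
   *   // merge() flushes, compacts and archives the source regions, so only the
   *   // returned region should be used afterwards.
   */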
4546 
4547   /**
4548    * Merge two regions whether they are adjacent or not.
4549    *
4550    * @param a region a
4551    * @param b region b
4552    * @return new merged region
4553    * @throws IOException
4554    */
4555   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
4556     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
4557       throw new IOException("Regions do not belong to the same table");
4558     }
4559 
4560     FileSystem fs = a.getRegionFileSystem().getFileSystem();
4561     // Make sure each region's cache is empty
4562     a.flushcache();
4563     b.flushcache();
4564 
4565     // Compact each region so we only have one store file per family
4566     a.compactStores(true);
4567     if (LOG.isDebugEnabled()) {
4568       LOG.debug("Files for region: " + a);
4569       a.getRegionFileSystem().logFileSystemState(LOG);
4570     }
4571     b.compactStores(true);
4572     if (LOG.isDebugEnabled()) {
4573       LOG.debug("Files for region: " + b);
4574       b.getRegionFileSystem().logFileSystemState(LOG);
4575     }
4576 
4577     RegionMergeTransaction rmt = new RegionMergeTransaction(a, b, true);
4578     if (!rmt.prepare(null)) {
4579       throw new IOException("Unable to merge regions " + a + " and " + b);
4580     }
4581     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
4582     LOG.info("starting merge of regions: " + a + " and " + b
4583         + " into new region " + mergedRegionInfo.getRegionNameAsString()
4584         + " with start key <"
4585         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
4586         + "> and end key <"
4587         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
4588     HRegion dstRegion;
4589     try {
4590       dstRegion = rmt.execute(null, null);
4591     } catch (IOException ioe) {
4592       rmt.rollback(null, null);
4593       throw new IOException("Failed merging region " + a + " and " + b
4594           + ", and successfully rolled back", ioe);
4595     }
4596     dstRegion.compactStores(true);
4597 
4598     if (LOG.isDebugEnabled()) {
4599       LOG.debug("Files for new region");
4600       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
4601     }
4602 
4603     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
4604       throw new IOException("Merged region " + dstRegion
4605           + " still has references after the compaction, is compaction canceled?");
4606     }
4607 
4608     // Archiving the 'A' region
4609     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
4610     // Archiving the 'B' region
4611     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
4612 
4613     LOG.info("merge completed. New region is " + dstRegion);
4614     return dstRegion;
4615   }
4616 
4617   /**
4618    * @return True if needs a major compaction.
4619    * @throws IOException
4620    */
4621   boolean isMajorCompaction() throws IOException {
4622     for (Store store : this.stores.values()) {
4623       if (store.isMajorCompaction()) {
4624         return true;
4625       }
4626     }
4627     return false;
4628   }
4629 
4630   //
4631   // HBASE-880
4632   //
4633   /**
4634    * @param get get object
4635    * @return result
4636    * @throws IOException read exceptions
4637    */
4638   public Result get(final Get get) throws IOException {
4639     checkRow(get.getRow(), "Get");
4640     // Verify families are all valid
4641     if (get.hasFamilies()) {
4642       for (byte [] family: get.familySet()) {
4643         checkFamily(family);
4644       }
4645     } else { // Adding all families to scanner
4646       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
4647         get.addFamily(family);
4648       }
4649     }
4650     List<Cell> results = get(get, true);
4651     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null);
4652   }
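
  /*
   * A minimal usage sketch of the Get path above (illustrative only). The family and
   * qualifier names are assumptions; "region" is an open HRegion.
   *
   *   Get get = new Get(Bytes.toBytes("row1"));
   *   get.addColumn(Bytes.toBytes("f"), Bytes.toBytes("q"));
   *   Result result = region.get(get);
   *   byte[] value = result.getValue(Bytes.toBytes("f"), Bytes.toBytes("q"));
   */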
4653 
4654   /*
4655    * Do a get based on the get parameter.
4656    * @param withCoprocessor invoke coprocessor or not. We don't want to
4657    * always invoke cp for this private method.
4658    */
4659   private List<Cell> get(Get get, boolean withCoprocessor)
4660   throws IOException {
4661 
4662     List<Cell> results = new ArrayList<Cell>();
4663 
4664     // pre-get CP hook
4665     if (withCoprocessor && (coprocessorHost != null)) {
4666        if (coprocessorHost.preGet(get, results)) {
4667          return results;
4668        }
4669     }
4670 
4671     Scan scan = new Scan(get);
4672 
4673     RegionScanner scanner = null;
4674     try {
4675       scanner = getScanner(scan);
4676       scanner.next(results);
4677     } finally {
4678       if (scanner != null)
4679         scanner.close();
4680     }
4681 
4682     // post-get CP hook
4683     if (withCoprocessor && (coprocessorHost != null)) {
4684       coprocessorHost.postGet(get, results);
4685     }
4686 
4687     // do after lock
4688     if (this.metricsRegion != null) {
4689       long totalSize = 0L;
4690       if (results != null) {
4691         for (Cell kv:results) {
4692           totalSize += KeyValueUtil.ensureKeyValue(kv).getLength();
4693         }
4694       }
4695       this.metricsRegion.updateGet(totalSize);
4696     }
4697 
4698     return results;
4699   }
4700 
4701   public void mutateRow(RowMutations rm) throws IOException {
4702     // Don't need nonces here - RowMutations only supports puts and deletes
4703     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
4704   }
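
  /*
   * A minimal sketch of an atomic multi-mutation on one row via mutateRow above
   * (illustrative only). The family and qualifier names are assumptions; "region" is
   * an open HRegion.
   *
   *   byte[] row = Bytes.toBytes("row1");
   *   Put put = new Put(row);
   *   put.add(Bytes.toBytes("f"), Bytes.toBytes("a"), Bytes.toBytes("v"));
   *   Delete delete = new Delete(row);
   *   delete.deleteColumns(Bytes.toBytes("f"), Bytes.toBytes("b"));
   *   RowMutations rm = new RowMutations(row);
   *   rm.add(put);
   *   rm.add(delete);
   *   region.mutateRow(rm);   // both mutations are applied atomically
   */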
4705 
4706   /**
4707    * Perform atomic mutations within the region w/o nonces.
4708    * See {@link #mutateRowsWithLocks(Collection, Collection, long, long)}
4709    */
4710   public void mutateRowsWithLocks(Collection<Mutation> mutations,
4711       Collection<byte[]> rowsToLock) throws IOException {
4712     mutateRowsWithLocks(mutations, rowsToLock, HConstants.NO_NONCE, HConstants.NO_NONCE);
4713   }
4714 
4715   /**
4716    * Perform atomic mutations within the region.
4717    * @param mutations The list of mutations to perform.
4718    * <code>mutations</code> can contain operations for multiple rows.
4719    * Caller has to ensure that all rows are contained in this region.
4720    * @param rowsToLock Rows to lock. If multiple rows are locked, care should be taken
4721    * that <code>rowsToLock</code> is sorted in order to avoid deadlocks.
4722    * @param nonceGroup Optional nonce group of the operation (client Id)
4723    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
4725    * @throws IOException
4726    */
4727   public void mutateRowsWithLocks(Collection<Mutation> mutations,
4728       Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
4729     MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
4730     processRowsWithLocks(proc, -1, nonceGroup, nonce);
4731   }
4732 
4733   /**
4734    * Performs atomic multiple reads and writes on a given row.
4735    *
4736    * @param processor The object that defines the reads and writes to perform on a row.
4737    * @param nonceGroup Optional nonce group of the operation (client Id)
4738    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
4739    */
4740   public void processRowsWithLocks(RowProcessor<?,?> processor, long nonceGroup, long nonce)
4741       throws IOException {
4742     processRowsWithLocks(processor, rowProcessorTimeout, nonceGroup, nonce);
4743   }
4744 
4745   /**
4746    * Performs atomic multiple reads and writes on a given row.
4747    *
4748    * @param processor The object that defines the reads and writes to perform on a row.
4749    * @param timeout The timeout of the processor.process() execution
4750    *                Use a negative number to switch off the time bound
4751    * @param nonceGroup Optional nonce group of the operation (client Id)
4752    * @param nonce Optional nonce of the operation (unique random id to ensure "more idempotence")
4753    */
4754   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout,
4755       long nonceGroup, long nonce) throws IOException {
4756 
4757     for (byte[] row : processor.getRowsToLock()) {
4758       checkRow(row, "processRowsWithLocks");
4759     }
4760     if (!processor.readOnly()) {
4761       checkReadOnly();
4762     }
4763     checkResources();
4764 
4765     startRegionOperation();
4766     WALEdit walEdit = new WALEdit();
4767 
4768     // 1. Run pre-process hook
4769     processor.preProcess(this, walEdit);
4770 
4771     // Short circuit the read only case
4772     if (processor.readOnly()) {
4773       try {
4774         long now = EnvironmentEdgeManager.currentTimeMillis();
4775         doProcessRowWithTimeout(
4776             processor, now, this, null, null, timeout);
4777         processor.postProcess(this, walEdit);
4778       } catch (IOException e) {
4779         throw e;
4780       } finally {
4781         closeRegionOperation();
4782       }
4783       return;
4784     }
4785 
4786     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
4787     boolean locked = false;
4788     boolean walSyncSuccessful = false;
4789     List<RowLock> acquiredRowLocks = null;
4790     long addedSize = 0;
4791     List<KeyValue> mutations = new ArrayList<KeyValue>();
4792     Collection<byte[]> rowsToLock = processor.getRowsToLock();
4793     try {
4794       // 2. Acquire the row lock(s)
4795       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
4796       for (byte[] row : rowsToLock) {
4797         // Attempt to lock all involved rows, throw if any lock times out
4798         acquiredRowLocks.add(getRowLock(row));
4799       }
4800       // 3. Region lock
4801       lock(this.updatesLock.readLock(), acquiredRowLocks.size());
4802       locked = true;
4803 
4804       long now = EnvironmentEdgeManager.currentTimeMillis();
4805       try {
4806         // 4. Let the processor scan the rows, generate mutations and add
4807         //    waledits
4808         doProcessRowWithTimeout(
4809             processor, now, this, mutations, walEdit, timeout);
4810 
4811         if (!mutations.isEmpty()) {
4812           // 5. Get a mvcc write number
4813           writeEntry = mvcc.beginMemstoreInsert();
4814           // 6. Apply to memstore
4815           for (KeyValue kv : mutations) {
4816             kv.setMvccVersion(writeEntry.getWriteNumber());
4817             byte[] family = kv.getFamily();
4818             checkFamily(family);
4819             addedSize += stores.get(family).add(kv);
4820           }
4821 
4822           long txid = 0;
4823           // 7. Append no sync
4824           if (!walEdit.isEmpty()) {
4825             txid = this.log.appendNoSync(this.getRegionInfo(),
4826               this.htableDescriptor.getTableName(), walEdit, processor.getClusterIds(), now,
4827               this.htableDescriptor, this.sequenceId, true, nonceGroup, nonce);
4828           }
4829           // 8. Release region lock
4830           if (locked) {
4831             this.updatesLock.readLock().unlock();
4832             locked = false;
4833           }
4834 
4835           // 9. Release row lock(s)
4836           releaseRowLocks(acquiredRowLocks);
4837 
4838           // 10. Sync edit log
4839           if (txid != 0) {
4840             syncOrDefer(txid, getEffectiveDurability(processor.useDurability()));
4841           }
4842           walSyncSuccessful = true;
4843         }
4844       } finally {
4845         if (!mutations.isEmpty() && !walSyncSuccessful) {
4846           LOG.warn("Wal sync failed. Roll back " + mutations.size() +
4847               " memstore keyvalues for row(s):" +
4848               processor.getRowsToLock().iterator().next() + "...");
4849           for (KeyValue kv : mutations) {
4850             stores.get(kv.getFamily()).rollback(kv);
4851           }
4852         }
4853         // 11. Roll mvcc forward
4854         if (writeEntry != null) {
4855           mvcc.completeMemstoreInsert(writeEntry);
4856           writeEntry = null;
4857         }
4858         if (locked) {
4859           this.updatesLock.readLock().unlock();
4860           locked = false;
4861         }
4862         // release locks if some were acquired but another timed out
4863         releaseRowLocks(acquiredRowLocks);
4864       }
4865 
4866       // 12. Run post-process hook
4867       processor.postProcess(this, walEdit);
4868 
4869     } catch (IOException e) {
4870       throw e;
4871     } finally {
4872       closeRegionOperation();
4873       if (!mutations.isEmpty() &&
4874           isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) {
4875         requestFlush();
4876       }
4877     }
4878   }
4879 
4880   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
4881                                        final long now,
4882                                        final HRegion region,
4883                                        final List<KeyValue> mutations,
4884                                        final WALEdit walEdit,
4885                                        final long timeout) throws IOException {
4886     // Short circuit the no time bound case.
4887     if (timeout < 0) {
4888       try {
4889         processor.process(now, region, mutations, walEdit);
4890       } catch (IOException e) {
4891         LOG.warn("RowProcessor:" + processor.getClass().getName() +
4892             " throws Exception on row(s):" +
4893             Bytes.toStringBinary(
4894               processor.getRowsToLock().iterator().next()) + "...", e);
4895         throw e;
4896       }
4897       return;
4898     }
4899 
4900     // Case with time bound
4901     FutureTask<Void> task =
4902       new FutureTask<Void>(new Callable<Void>() {
4903         @Override
4904         public Void call() throws IOException {
4905           try {
4906             processor.process(now, region, mutations, walEdit);
4907             return null;
4908           } catch (IOException e) {
4909             LOG.warn("RowProcessor:" + processor.getClass().getName() +
4910                 " throws Exception on row(s):" +
4911                 Bytes.toStringBinary(
4912                     processor.getRowsToLock().iterator().next()) + "...", e);
4913             throw e;
4914           }
4915         }
4916       });
4917     rowProcessorExecutor.execute(task);
4918     try {
4919       task.get(timeout, TimeUnit.MILLISECONDS);
4920     } catch (TimeoutException te) {
4921       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
4922           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
4923           "...");
4924       throw new IOException(te);
4925     } catch (Exception e) {
4926       throw new IOException(e);
4927     }
4928   }
4929 
4930   public Result append(Append append) throws IOException {
4931     return append(append, HConstants.NO_NONCE, HConstants.NO_NONCE);
4932   }
4933 
4934   // TODO: There's a lot of boilerplate code identical
4935   // to increment... See how to better unify that.
4936   /**
4937    * Perform one or more append operations on a row.
4938    *
4939    * @param append
4940    * @return new keyvalues after the append operation
4941    * @throws IOException
4942    */
4943   public Result append(Append append, long nonceGroup, long nonce)
4944       throws IOException {
4945     byte[] row = append.getRow();
4946     checkRow(row, "append");
4947     boolean flush = false;
4948     Durability durability = getEffectiveDurability(append.getDurability());
4949     boolean writeToWAL = durability != Durability.SKIP_WAL;
4950     WALEdit walEdits = null;
4951     List<Cell> allKVs = new ArrayList<Cell>(append.size());
4952     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
4953 
4954     long size = 0;
4955     long txid = 0;
4956 
4957     checkReadOnly();
4958     checkResources();
4959     // Lock row
4960     startRegionOperation(Operation.APPEND);
4961     this.writeRequestsCount.increment();
4962     WriteEntry w = null;
4963     RowLock rowLock;
4964     try {
4965       rowLock = getRowLock(row);
4966       try {
4967         lock(this.updatesLock.readLock());
4968         // wait for all prior MVCC transactions to finish - while we hold the row lock
4969         // (so that we are guaranteed to see the latest state)
4970         mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
4971         // now start my own transaction
4972         w = mvcc.beginMemstoreInsert();
4973         try {
4974           long now = EnvironmentEdgeManager.currentTimeMillis();
4975           // Process each family
4976           for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) {
4977 
4978             Store store = stores.get(family.getKey());
4979             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
4980 
4981             // Sort the cells so that they match the order that they
4982             // appear in the Get results. Otherwise, we won't be able to
4983             // find the existing values if the cells are not specified
4984             // in order by the client since cells are in an array list.
4985             Collections.sort(family.getValue(), store.getComparator());
4986             // Get previous values for all columns in this family
4987             Get get = new Get(row);
4988             for (Cell cell : family.getValue()) {
4989               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
4990               get.addColumn(family.getKey(), kv.getQualifier());
4991             }
4992             List<Cell> results = get(get, false);
4993 
4994             // Iterate the input columns and update existing values if they were
4995             // found, otherwise add new column initialized to the append value
4996 
4997             // Avoid as much copying as possible. Every byte is copied at most
4998             // once.
4999             // Would be nice if KeyValue had scatter/gather logic
5000             int idx = 0;
5001             for (Cell cell : family.getValue()) {
5002               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5003               KeyValue newKV;
5004               KeyValue oldKv = null;
5005               if (idx < results.size()
5006                   && CellUtil.matchingQualifier(results.get(idx),kv)) {
5007                 oldKv = KeyValueUtil.ensureKeyValue(results.get(idx));
5008                 // allocate an empty kv once
5009                 newKV = new KeyValue(row.length, kv.getFamilyLength(),
5010                     kv.getQualifierLength(), now, KeyValue.Type.Put,
5011                     oldKv.getValueLength() + kv.getValueLength(),
5012                     oldKv.getTagsLength() + kv.getTagsLength());
5013                 // copy in the value
5014                 System.arraycopy(oldKv.getBuffer(), oldKv.getValueOffset(),
5015                     newKV.getBuffer(), newKV.getValueOffset(),
5016                     oldKv.getValueLength());
5017                 System.arraycopy(kv.getBuffer(), kv.getValueOffset(),
5018                     newKV.getBuffer(),
5019                     newKV.getValueOffset() + oldKv.getValueLength(),
5020                     kv.getValueLength());
5021                 // copy in the tags
5022                 System.arraycopy(oldKv.getBuffer(), oldKv.getTagsOffset(), newKV.getBuffer(),
5023                     newKV.getTagsOffset(), oldKv.getTagsLength());
5024                 System.arraycopy(kv.getBuffer(), kv.getTagsOffset(), newKV.getBuffer(),
5025                     newKV.getTagsOffset() + oldKv.getTagsLength(), kv.getTagsLength());
5026                 idx++;
5027               } else {
5028                 // allocate an empty kv once
5029                 newKV = new KeyValue(row.length, kv.getFamilyLength(),
5030                     kv.getQualifierLength(), now, KeyValue.Type.Put,
5031                     kv.getValueLength(), kv.getTagsLength());
5032                 // copy in the value
5033                 System.arraycopy(kv.getBuffer(), kv.getValueOffset(),
5034                     newKV.getBuffer(), newKV.getValueOffset(),
5035                     kv.getValueLength());
5036                 // copy in tags
5037                 System.arraycopy(kv.getBuffer(), kv.getTagsOffset(), newKV.getBuffer(),
5038                     newKV.getTagsOffset(), kv.getTagsLength());
5039               }
5040               // copy in row, family, and qualifier
5041               System.arraycopy(kv.getBuffer(), kv.getRowOffset(),
5042                   newKV.getBuffer(), newKV.getRowOffset(), kv.getRowLength());
5043               System.arraycopy(kv.getBuffer(), kv.getFamilyOffset(),
5044                   newKV.getBuffer(), newKV.getFamilyOffset(),
5045                   kv.getFamilyLength());
5046               System.arraycopy(kv.getBuffer(), kv.getQualifierOffset(),
5047                   newKV.getBuffer(), newKV.getQualifierOffset(),
5048                   kv.getQualifierLength());
5049 
5050               newKV.setMvccVersion(w.getWriteNumber());
5051               // Give coprocessors a chance to update the new cell
5052               if (coprocessorHost != null) {
5053                 newKV = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL(
5054                     RegionObserver.MutationType.APPEND, append, oldKv, (Cell) newKV));
5055               }
5056               kvs.add(newKV);
5057 
5058               // Append update to WAL
5059               if (writeToWAL) {
5060                 if (walEdits == null) {
5061                   walEdits = new WALEdit();
5062                 }
5063                 walEdits.add(newKV);
5064               }
5065             }
5066 
5067             //store the kvs to the temporary memstore before writing HLog
5068             tempMemstore.put(store, kvs);
5069           }
5070 
5071           // Actually write to WAL now
5072           if (writeToWAL) {
5073             // Using default cluster id, as this can only happen in the originating
5074             // cluster. A slave cluster receives the final value (not the delta)
5075             // as a Put.
5076             txid = this.log.appendNoSync(this.getRegionInfo(),
5077               this.htableDescriptor.getTableName(), walEdits, new ArrayList<UUID>(),
5078               EnvironmentEdgeManager.currentTimeMillis(), this.htableDescriptor, this.sequenceId,
5079               true, nonceGroup, nonce);
5080           } else {
5081             recordMutationWithoutWal(append.getFamilyCellMap());
5082           }
5083 
5084           //Actually write to Memstore now
5085           for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
5086             Store store = entry.getKey();
5087             if (store.getFamily().getMaxVersions() == 1) {
5088               // upsert if VERSIONS for this CF == 1
5089               size += store.upsert(entry.getValue(), getSmallestReadPoint());
5090             } else {
5091               // otherwise keep older versions around
5092               for (Cell cell: entry.getValue()) {
5093                 KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5094                 size += store.add(kv);
5095               }
5096             }
5097             allKVs.addAll(entry.getValue());
5098           }
5099           size = this.addAndGetGlobalMemstoreSize(size);
5100           flush = isFlushSize(size);
5101         } finally {
5102           this.updatesLock.readLock().unlock();
5103         }
5104       } finally {
5105         rowLock.release();
5106       }
5107       if (writeToWAL) {
5108         // sync the transaction log outside the rowlock
5109         syncOrDefer(txid, durability);
5110       }
5111     } finally {
5112       if (w != null) {
5113         mvcc.completeMemstoreInsert(w);
5114       }
5115       closeRegionOperation(Operation.APPEND);
5116     }
5117 
5118     if (this.metricsRegion != null) {
5119       this.metricsRegion.updateAppend();
5120     }
5121 
5122     if (flush) {
5123       // Request a cache flush. Do it outside update lock.
5124       requestFlush();
5125     }
5126 
5127 
5128     return append.isReturnResults() ? Result.create(allKVs) : null;
5129   }
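
  /*
   * A minimal usage sketch of the append path above (illustrative only): the supplied
   * value is concatenated to the current cell value under the row lock and MVCC. The
   * row, family and qualifier names are assumptions; "region" is an open HRegion.
   *
   *   Append append = new Append(Bytes.toBytes("row1"));
   *   append.add(Bytes.toBytes("f"), Bytes.toBytes("q"), Bytes.toBytes("-suffix"));
   *   Result result = region.append(append);   // returns the new cell values by default
   */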
5130 
5131   public Result increment(Increment increment) throws IOException {
5132     return increment(increment, HConstants.NO_NONCE, HConstants.NO_NONCE);
5133   }
5134 
5135   /**
5136    * Perform one or more increment operations on a row.
5137    * @param increment
5138    * @return new keyvalues after increment
5139    * @throws IOException
5140    */
5141   public Result increment(Increment increment, long nonceGroup, long nonce)
5142   throws IOException {
5143     byte [] row = increment.getRow();
5144     checkRow(row, "increment");
5145     TimeRange tr = increment.getTimeRange();
5146     boolean flush = false;
5147     Durability durability = getEffectiveDurability(increment.getDurability());
5148     boolean writeToWAL = durability != Durability.SKIP_WAL;
5149     WALEdit walEdits = null;
5150     List<Cell> allKVs = new ArrayList<Cell>(increment.size());
5151     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
5152 
5153     long size = 0;
5154     long txid = 0;
5155 
5156     checkReadOnly();
5157     checkResources();
5158     // Lock row
5159     startRegionOperation(Operation.INCREMENT);
5160     this.writeRequestsCount.increment();
5161     WriteEntry w = null;
5162     try {
5163       RowLock rowLock = getRowLock(row);
5164       try {
5165         lock(this.updatesLock.readLock());
5166         // wait for all prior MVCC transactions to finish - while we hold the row lock
5167         // (so that we are guaranteed to see the latest state)
5168         mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
5169         // now start my own transaction
5170         w = mvcc.beginMemstoreInsert();
5171         try {
5172           long now = EnvironmentEdgeManager.currentTimeMillis();
5173           // Process each family
5174           for (Map.Entry<byte [], List<Cell>> family:
5175               increment.getFamilyCellMap().entrySet()) {
5176 
5177             Store store = stores.get(family.getKey());
5178             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
5179 
5180             // Sort the cells so that they match the order that they
5181             // appear in the Get results. Otherwise, we won't be able to
5182             // find the existing values if the cells are not specified
5183             // in order by the client since cells are in an array list.
5184             Collections.sort(family.getValue(), store.getComparator());
5185             // Get previous values for all columns in this family
5186             Get get = new Get(row);
5187             for (Cell cell: family.getValue()) {
5188               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5189               get.addColumn(family.getKey(), kv.getQualifier());
5190             }
5191             get.setTimeRange(tr.getMin(), tr.getMax());
5192             List<Cell> results = get(get, false);
5193 
5194             // Iterate the input columns and update existing values if they were
5195             // found, otherwise add new column initialized to the increment amount
5196             int idx = 0;
5197             for (Cell kv: family.getValue()) {
5198               long amount = Bytes.toLong(CellUtil.cloneValue(kv));
5199               boolean noWriteBack = (amount == 0);
5200 
5201               Cell c = null;
5202               if (idx < results.size() && CellUtil.matchingQualifier(results.get(idx), kv)) {
5203                 c = results.get(idx);
5204                 if(c.getValueLength() == Bytes.SIZEOF_LONG) {
5205                   amount += Bytes.toLong(c.getValueArray(), c.getValueOffset(), Bytes.SIZEOF_LONG);
5206                 } else {
5207                   // throw DoNotRetryIOException instead of IllegalArgumentException
5208                   throw new org.apache.hadoop.hbase.DoNotRetryIOException(
5209                       "Attempted to increment field that isn't 64 bits wide");
5210                 }
5211                 idx++;
5212               }
5213 
5214               // Append new incremented KeyValue to list
5215               byte[] q = CellUtil.cloneQualifier(kv);
5216               byte[] val = Bytes.toBytes(amount);
5217               int oldCellTagsLen = (c == null) ? 0 : c.getTagsLength();
5218               int incCellTagsLen = kv.getTagsLength();
5219               KeyValue newKV = new KeyValue(row.length, family.getKey().length, q.length, now,
5220                   KeyValue.Type.Put, val.length, oldCellTagsLen + incCellTagsLen);
5221               System.arraycopy(row, 0, newKV.getBuffer(), newKV.getRowOffset(), row.length);
5222               System.arraycopy(family.getKey(), 0, newKV.getBuffer(), newKV.getFamilyOffset(),
5223                   family.getKey().length);
5224               System.arraycopy(q, 0, newKV.getBuffer(), newKV.getQualifierOffset(), q.length);
5225               // copy in the value
5226               System.arraycopy(val, 0, newKV.getBuffer(), newKV.getValueOffset(), val.length);
5227               // copy tags
5228               if (oldCellTagsLen > 0) {
5229                 System.arraycopy(c.getTagsArray(), c.getTagsOffset(), newKV.getBuffer(),
5230                     newKV.getTagsOffset(), oldCellTagsLen);
5231               }
5232               if (incCellTagsLen > 0) {
5233                 System.arraycopy(kv.getTagsArray(), kv.getTagsOffset(), newKV.getBuffer(),
5234                     newKV.getTagsOffset() + oldCellTagsLen, incCellTagsLen);
5235               }
5236               newKV.setMvccVersion(w.getWriteNumber());
5237               // Give coprocessors a chance to update the new cell
5238               if (coprocessorHost != null) {
5239                 newKV = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL(
5240                     RegionObserver.MutationType.INCREMENT, increment, c, (Cell) newKV));
5241               }
5242               allKVs.add(newKV);
5243 
5244               if (!noWriteBack) {
5245                 kvs.add(newKV);
5246 
5247                 // Prepare WAL updates
5248                 if (writeToWAL) {
5249                   if (walEdits == null) {
5250                     walEdits = new WALEdit();
5251                   }
5252                   walEdits.add(newKV);
5253                 }
5254               }
5255             }
5256 
5257             //store the kvs to the temporary memstore before writing HLog
5258             if (!kvs.isEmpty()) {
5259               tempMemstore.put(store, kvs);
5260             }
5261           }
5262 
5263           // Actually write to WAL now
5264           if (walEdits != null && !walEdits.isEmpty()) {
5265             if (writeToWAL) {
5266               // Using default cluster id, as this can only happen in the originating
5267               // cluster. A slave cluster receives the final value (not the delta)
5268               // as a Put.
5269               txid = this.log.appendNoSync(this.getRegionInfo(),
5270                   this.htableDescriptor.getTableName(), walEdits, new ArrayList<UUID>(),
5271                   EnvironmentEdgeManager.currentTimeMillis(), this.htableDescriptor, this.sequenceId,
5272                   true, nonceGroup, nonce);
5273             } else {
5274               recordMutationWithoutWal(increment.getFamilyCellMap());
5275             }
5276           }
5277           //Actually write to Memstore now
5278           if (!tempMemstore.isEmpty()) {
5279             for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
5280               Store store = entry.getKey();
5281               if (store.getFamily().getMaxVersions() == 1) {
5282                 // upsert if VERSIONS for this CF == 1
5283                 size += store.upsert(entry.getValue(), getSmallestReadPoint());
5284               } else {
5285                 // otherwise keep older versions around
5286                 for (Cell cell : entry.getValue()) {
5287                   KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5288                   size += store.add(kv);
5289                 }
5290               }
5291             }
5292             size = this.addAndGetGlobalMemstoreSize(size);
5293             flush = isFlushSize(size);
5294           }
5295         } finally {
5296           this.updatesLock.readLock().unlock();
5297         }
5298       } finally {
5299         rowLock.release();
5300       }
5301       if (writeToWAL && (walEdits != null) && !walEdits.isEmpty()) {
5302         // sync the transaction log outside the rowlock
5303         syncOrDefer(txid, durability);
5304       }
5305     } finally {
5306       if (w != null) {
5307         mvcc.completeMemstoreInsert(w);
5308       }
5309       closeRegionOperation(Operation.INCREMENT);
5310       if (this.metricsRegion != null) {
5311         this.metricsRegion.updateIncrement();
5312       }
5313     }
5314 
5315     if (flush) {
5316       // Request a cache flush.  Do it outside update lock.
5317       requestFlush();
5318     }
5319 
5320     return Result.create(allKVs);
5321   }
5322 
5323   //
5324   // New HBASE-880 Helpers
5325   //
5326 
5327   private void checkFamily(final byte [] family)
5328   throws NoSuchColumnFamilyException {
5329     if (!this.htableDescriptor.hasFamily(family)) {
5330       throw new NoSuchColumnFamilyException("Column family " +
5331           Bytes.toString(family) + " does not exist in region " + this
5332           + " in table " + this.htableDescriptor);
5333     }
5334   }
5335 
5336   public static final long FIXED_OVERHEAD = ClassSize.align(
5337       ClassSize.OBJECT +
5338       ClassSize.ARRAY +
5339       41 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT +
5340       (12 * Bytes.SIZEOF_LONG) +
5341       4 * Bytes.SIZEOF_BOOLEAN);
5342 
5343   // woefully out of date - currently missing:
5344   // 1 x HashMap - coprocessorServiceHandlers
5345   // 6 org.cliffc.high_scale_lib.Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
5346   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
5347   //   writeRequestsCount, updatesBlockedMs
5348   // 1 x HRegion$WriteState - writestate
5349   // 1 x RegionCoprocessorHost - coprocessorHost
5350   // 1 x RegionSplitPolicy - splitPolicy
5351   // 1 x MetricsRegion - metricsRegion
5352   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
5353   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
5354       ClassSize.OBJECT + // closeLock
5355       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
5356       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
5357       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
5358       WriteState.HEAP_SIZE + // writestate
5359       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
5360       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
5361       ClassSize.ARRAYLIST + // recentFlushes
5362       MultiVersionConsistencyControl.FIXED_SIZE // mvcc
5363       + ClassSize.TREEMAP // maxSeqIdInStores
5364       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
5365       ;
5366 
5367   @Override
5368   public long heapSize() {
5369     long heapSize = DEEP_OVERHEAD;
5370     for (Store store : this.stores.values()) {
5371       heapSize += store.heapSize();
5372     }
5373     // this does not take into account row locks, recent flushes, mvcc entries, and more
5374     return heapSize;
5375   }
5376 
5377   /*
5378    * This method calls System.exit.
5379    * @param message Message to print out.  May be null.
5380    */
5381   private static void printUsageAndExit(final String message) {
5382     if (message != null && message.length() > 0) System.out.println(message);
5383     System.out.println("Usage: HRegion CATLALOG_TABLE_DIR [major_compact]");
5384     System.out.println("Options:");
5385     System.out.println(" major_compact  Pass this option to major compact " +
5386       "passed region.");
5387     System.out.println("Default outputs scan of passed region.");
5388     System.exit(1);
5389   }
5390 
5391   /**
5392    * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to
5393    * be available for handling
5394    * {@link HRegion#execService(com.google.protobuf.RpcController,
5395    *    org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall)} calls.
5396    *
5397    * <p>
5398    * Only a single instance may be registered per region for a given {@link Service} subclass (the
5399    * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}).
5400    * After the first registration, subsequent calls with the same service name will fail with
5401    * a return value of {@code false}.
5402    * </p>
5403    * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint
5404    * @return {@code true} if the registration was successful, {@code false}
5405    * otherwise
5406    */
5407   public boolean registerService(Service instance) {
5408     /*
5409      * No stacking of instances is allowed for a single service name
5410      */
5411     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
5412     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
5413       LOG.error("Coprocessor service "+serviceDesc.getFullName()+
5414           " already registered, rejecting request from "+instance
5415       );
5416       return false;
5417     }
5418 
5419     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
5420     if (LOG.isDebugEnabled()) {
5421       LOG.debug("Registered coprocessor service: region="+
5422           Bytes.toStringBinary(getRegionName())+" service="+serviceDesc.getFullName());
5423     }
5424     return true;
5425   }
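  /*
   * Usage sketch: an endpoint coprocessor typically exposes its Service through
   * CoprocessorService#getService(), and the region coprocessor host hands that instance to
   * registerService(). The PingProtos classes below stand in for a hypothetical generated
   * protobuf service; substitute your own generated Service subclass.
   *
   *   import com.google.protobuf.RpcCallback;
   *   import com.google.protobuf.RpcController;
   *   import com.google.protobuf.Service;
   *   import org.apache.hadoop.hbase.Coprocessor;
   *   import org.apache.hadoop.hbase.CoprocessorEnvironment;
   *   import org.apache.hadoop.hbase.coprocessor.CoprocessorService;
   *
   *   public class PingEndpoint extends PingProtos.PingService
   *       implements Coprocessor, CoprocessorService {
   *     @Override
   *     public Service getService() {
   *       return this;   // one instance registered per region, keyed on the service full name
   *     }
   *     @Override
   *     public void ping(RpcController controller, PingProtos.PingRequest request,
   *         RpcCallback<PingProtos.PingResponse> done) {
   *       done.run(PingProtos.PingResponse.newBuilder().setPong("pong").build());
   *     }
   *     @Override
   *     public void start(CoprocessorEnvironment env) { }
   *     @Override
   *     public void stop(CoprocessorEnvironment env) { }
   *   }
   */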
5426 
5427   /**
5428    * Executes a single protocol buffer coprocessor endpoint {@link Service} method using
5429    * the registered protocol handlers.  {@link Service} implementations must be registered via the
5430    * {@link HRegion#registerService(com.google.protobuf.Service)}
5431    * method before they are available.
5432    *
5433    * @param controller an {@code RpcController} implementation to pass to the invoked service
5434    * @param call a {@code CoprocessorServiceCall} instance identifying the service, method,
5435    *     and parameters for the method invocation
5436    * @return a protocol buffer {@code Message} instance containing the method's result
5437    * @throws IOException if no registered service handler is found or an error
5438    *     occurs during the invocation
5439    * @see org.apache.hadoop.hbase.regionserver.HRegion#registerService(com.google.protobuf.Service)
5440    */
5441   public Message execService(RpcController controller, CoprocessorServiceCall call)
5442       throws IOException {
5443     String serviceName = call.getServiceName();
5444     String methodName = call.getMethodName();
5445     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
5446       throw new UnknownProtocolException(null,
5447           "No registered coprocessor service found for name "+serviceName+
5448           " in region "+Bytes.toStringBinary(getRegionName()));
5449     }
5450 
5451     Service service = coprocessorServiceHandlers.get(serviceName);
5452     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
5453     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
5454     if (methodDesc == null) {
5455       throw new UnknownProtocolException(service.getClass(),
5456           "Unknown method "+methodName+" called on service "+serviceName+
5457               " in region "+Bytes.toStringBinary(getRegionName()));
5458     }
5459 
5460     Message request = service.getRequestPrototype(methodDesc).newBuilderForType()
5461         .mergeFrom(call.getRequest()).build();
5462 
5463     if (coprocessorHost != null) {
5464       request = coprocessorHost.preEndpointInvocation(service, methodName, request);
5465     }
5466 
5467     final Message.Builder responseBuilder =
5468         service.getResponsePrototype(methodDesc).newBuilderForType();
5469     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
5470       @Override
5471       public void run(Message message) {
5472         if (message != null) {
5473           responseBuilder.mergeFrom(message);
5474         }
5475       }
5476     });
5477 
5478     if (coprocessorHost != null) {
5479       coprocessorHost.postEndpointInvocation(service, methodName, request, responseBuilder);
5480     }
5481 
5482     return responseBuilder.build();
5483   }
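  /*
   * Client-side sketch of how a request reaches execService(): the call arrives as a
   * CoprocessorServiceCall naming the service and method, and is dispatched to the handler
   * registered above. PingProtos is the same hypothetical generated service used in the
   * registerService() sketch; the table and row names are placeholders.
   *
   *   HTable table = new HTable(conf, "mytable");
   *   CoprocessorRpcChannel channel = table.coprocessorService(Bytes.toBytes("row1"));
   *   PingProtos.PingService.BlockingInterface stub =
   *       PingProtos.PingService.newBlockingStub(channel);
   *   PingProtos.PingResponse response =
   *       stub.ping(null, PingProtos.PingRequest.getDefaultInstance());
   */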
5484 
5485   /*
5486    * Process table.
5487    * Do major compaction or list content.
5488    * @param fs
5489    * @param p
5490    * @param log
5491    * @param c
5492    * @param majorCompact
5493    * @throws IOException
5494    */
5495   private static void processTable(final FileSystem fs, final Path p,
5496       final HLog log, final Configuration c,
5497       final boolean majorCompact)
5498   throws IOException {
5499     HRegion region = null;
5500     // Currently expects tables have one region only.
5501     if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) {
5502       region = HRegion.newHRegion(p, log, fs, c,
5503         HRegionInfo.FIRST_META_REGIONINFO, HTableDescriptor.META_TABLEDESC, null);
5504     } else {
5505       throw new IOException("Not a known catalog table: " + p.toString());
5506     }
5507     try {
5508       region.initialize();
5509       if (majorCompact) {
5510         region.compactStores(true);
5511       } else {
5512         // Default behavior
5513         Scan scan = new Scan();
5514         // scan.addFamily(HConstants.CATALOG_FAMILY);
5515         RegionScanner scanner = region.getScanner(scan);
5516         try {
5517           List<Cell> kvs = new ArrayList<Cell>();
5518           boolean done;
5519           do {
5520             kvs.clear();
5521             done = scanner.next(kvs);
5522             if (kvs.size() > 0) LOG.info(kvs);
5523           } while (done);
5524         } finally {
5525           scanner.close();
5526         }
5527       }
5528     } finally {
5529       region.close();
5530     }
5531   }
5532 
5533   boolean shouldForceSplit() {
5534     return this.splitRequest;
5535   }
5536 
5537   byte[] getExplicitSplitPoint() {
5538     return this.explicitSplitPoint;
5539   }
5540 
5541   void forceSplit(byte[] sp) {
5542     // NOTE: this HRegion will go away after the forced split is successful
5543     //        therefore, no reason to clear this value
5544     this.splitRequest = true;
5545     if (sp != null) {
5546       this.explicitSplitPoint = sp;
5547     }
5548   }
5549 
5550   void clearSplit_TESTS_ONLY() {
5551     this.splitRequest = false;
5552   }
5553 
5554   /**
5555    * Give the region a chance to prepare before it is split.
5556    */
5557   protected void prepareToSplit() {
5558     // nothing
5559   }
5560 
5561   /**
5562    * Return the split point. A null return value indicates the region isn't splittable.
5563    * If the split point isn't explicitly specified, this method goes over the stores
5564    * to find the best split point. Currently the criterion for the best split point
5565    * is the size of the store.
5566    */
5567   public byte[] checkSplit() {
5568     // Can't split META
5569     if (this.getRegionInfo().isMetaTable() ||
5570         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
5571       if (shouldForceSplit()) {
5572         LOG.warn("Cannot split meta region in HBase 0.20 and above");
5573       }
5574       return null;
5575     }
5576 
5577     // Can't split region which is in recovering state
5578     if (this.isRecovering()) {
5579       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
5580       return null;
5581     }
5582 
5583     if (!splitPolicy.shouldSplit()) {
5584       return null;
5585     }
5586 
5587     byte[] ret = splitPolicy.getSplitPoint();
5588 
5589     if (ret != null) {
5590       try {
5591         checkRow(ret, "calculated split");
5592       } catch (IOException e) {
5593         LOG.error("Ignoring invalid split", e);
5594         return null;
5595       }
5596     }
5597     return ret;
5598   }
5599 
5600   /**
5601    * @return The priority that this region should have in the compaction queue
5602    */
5603   public int getCompactPriority() {
5604     int count = Integer.MAX_VALUE;
5605     for (Store store : stores.values()) {
5606       count = Math.min(count, store.getCompactPriority());
5607     }
5608     return count;
5609   }
5610 
5611   /**
5612    * Checks every store to see if one has too many
5613    * store files.
5614    * @return true if any store has too many store files
5615    */
5616   public boolean needsCompaction() {
5617     for (Store store : stores.values()) {
5618       if(store.needsCompaction()) {
5619         return true;
5620       }
5621     }
5622     return false;
5623   }
5624 
5625   /** @return the coprocessor host */
5626   public RegionCoprocessorHost getCoprocessorHost() {
5627     return coprocessorHost;
5628   }
5629 
5630   /** @param coprocessorHost the new coprocessor host */
5631   public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
5632     this.coprocessorHost = coprocessorHost;
5633   }
5634 
5635   /**
5636    * This method needs to be called before any public call that reads or
5637    * modifies data. It has to be called just before a try;
5638    * {@link #closeRegionOperation()} needs to be called in that try's finally block.
5639    * Acquires a read lock and checks if the region is closing or closed.
5640    * @throws IOException
5641    */
5642   public void startRegionOperation() throws IOException {
5643     startRegionOperation(Operation.ANY);
5644   }
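  /*
   * Calling pattern described in the javadoc above:
   *
   *   startRegionOperation();
   *   try {
   *     // read or modify data here
   *   } finally {
   *     closeRegionOperation();
   *   }
   */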
5645 
5646   /**
5647    * @param op The operation about to be taken on the region
5648    * @throws IOException
5649    */
5650   protected void startRegionOperation(Operation op) throws IOException {
5651     switch (op) {
5652     case INCREMENT:
5653     case APPEND:
5654     case GET:
5655     case SCAN:
5656     case SPLIT_REGION:
5657     case MERGE_REGION:
5658     case PUT:
5659     case DELETE:
5660     case BATCH_MUTATE:
5661     case COMPACT_REGION:
5662       // when a region is in recovering state, no read, split or merge is allowed
5663       if (this.isRecovering() && (this.disallowWritesInRecovering ||
5664               (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
5665         throw new RegionInRecoveryException(this.getRegionNameAsString() + " is recovering");
5666       }
5667       break;
5668     default:
5669       break;
5670     }
5671     if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
5672         || op == Operation.COMPACT_REGION) {
5673       // split, merge or compact region doesn't need to check the closing/closed state or lock the
5674       // region
5675       return;
5676     }
5677     if (this.closing.get()) {
5678       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
5679     }
5680     lock(lock.readLock());
5681     if (this.closed.get()) {
5682       lock.readLock().unlock();
5683       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
5684     }
5685     try {
5686       if (coprocessorHost != null) {
5687         coprocessorHost.postStartRegionOperation(op);
5688       }
5689     } catch (Exception e) {
5690       lock.readLock().unlock();
5691       throw new IOException(e);
5692     }
5693   }
5694 
5695   /**
5696    * Closes the lock. This needs to be called in the finally block corresponding
5697    * to the try block of {@link #startRegionOperation()}.
5698    * @throws IOException
5699    */
5700   public void closeRegionOperation() throws IOException {
5701     closeRegionOperation(Operation.ANY);
5702   }
5703 
5704   /**
5705    * Closes the lock. This needs to be called in the finally block corresponding
5706    * to the try block of {@link #startRegionOperation(Operation)}
5707    * @param operation
5708    * @throws IOException
5709    */
5710   public void closeRegionOperation(Operation operation) throws IOException {
5711     lock.readLock().unlock();
5712     if (coprocessorHost != null) {
5713       coprocessorHost.postCloseRegionOperation(operation);
5714     }
5715   }
5716 
5717   /**
5718    * This method needs to be called before any public call that reads or
5719    * modifies stores in bulk. It has to be called just before a try;
5720    * #closeBulkRegionOperation needs to be called in that try's finally block.
5721    * Acquires a write lock and checks if the region is closing or closed.
5722    * @throws NotServingRegionException when the region is closing or closed
5723    * @throws RegionTooBusyException if failed to get the lock in time
5724    * @throws InterruptedIOException if interrupted while waiting for a lock
5725    */
5726   private void startBulkRegionOperation(boolean writeLockNeeded)
5727       throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
5728     if (this.closing.get()) {
5729       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
5730     }
5731     if (writeLockNeeded) lock(lock.writeLock());
5732     else lock(lock.readLock());
5733     if (this.closed.get()) {
5734       if (writeLockNeeded) lock.writeLock().unlock();
5735       else lock.readLock().unlock();
5736       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
5737     }
5738   }
5739 
5740   /**
5741    * Closes the lock. This needs to be called in the finally block corresponding
5742    * to the try block of #startBulkRegionOperation.
5743    */
5744   private void closeBulkRegionOperation(){
5745     if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
5746     else lock.readLock().unlock();
5747   }
5748 
5749   /**
5750    * Update counters for the number of mutations without WAL and the size of possible data loss.
5751    * This information is exposed by the region server metrics.
5752    */
5753   private void recordMutationWithoutWal(final Map<byte [], List<Cell>> familyMap) {
5754     numMutationsWithoutWAL.increment();
5755     if (numMutationsWithoutWAL.get() <= 1) {
5756       LOG.info("writing data to region " + this +
5757                " with WAL disabled. Data may be lost in the event of a crash.");
5758     }
5759 
5760     long mutationSize = 0;
5761     for (List<Cell> cells: familyMap.values()) {
5762       for (Cell cell : cells) {
5763         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5764         mutationSize += kv.getKeyLength() + kv.getValueLength();
5765       }
5766     }
5767 
5768     dataInMemoryWithoutWAL.add(mutationSize);
5769   }
5770 
5771   private void lock(final Lock lock)
5772       throws RegionTooBusyException, InterruptedIOException {
5773     lock(lock, 1);
5774   }
5775 
5776   /**
5777    * Try to acquire a lock.  Throw RegionTooBusyException
5778    * if failed to get the lock in time. Throw InterruptedIOException
5779    * if interrupted while waiting for the lock.
5780    */
5781   private void lock(final Lock lock, final int multiplier)
5782       throws RegionTooBusyException, InterruptedIOException {
5783     try {
5784       final long waitTime = Math.min(maxBusyWaitDuration,
5785           busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
5786       if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
5787         throw new RegionTooBusyException(
5788             "failed to get a lock in " + waitTime + " ms. " +
5789                 "regionName=" + (this.getRegionInfo() == null ? "unknown" :
5790                 this.getRegionInfo().getRegionNameAsString()) +
5791                 ", server=" + (this.getRegionServerServices() == null ? "unknown" :
5792                 this.getRegionServerServices().getServerName()));
5793       }
5794     } catch (InterruptedException ie) {
5795       LOG.info("Interrupted while waiting for a lock");
5796       InterruptedIOException iie = new InterruptedIOException();
5797       iie.initCause(ie);
5798       throw iie;
5799     }
5800   }
5801 
5802   /**
5803    * Calls sync with the given transaction ID if the region's table is not
5804    * deferring it.
5805    * @param txid the transaction ID to sync up to
5806    * @throws IOException If anything goes wrong with DFS
5807    */
5808   private void syncOrDefer(long txid, Durability durability) throws IOException {
5809     if (this.getRegionInfo().isMetaRegion()) {
5810       this.log.sync(txid);
5811     } else {
5812       switch(durability) {
5813       case USE_DEFAULT:
5814         // do what table defaults to
5815         if (shouldSyncLog()) {
5816           this.log.sync(txid);
5817         }
5818         break;
5819       case SKIP_WAL:
5820         // nothing to do
5821         break;
5822       case ASYNC_WAL:
5823         // nothing to do
5824         break;
5825       case SYNC_WAL:
5826       case FSYNC_WAL:
5827         // sync the WAL edit (SYNC and FSYNC treated the same for now)
5828         this.log.sync(txid);
5829         break;
5830       }
5831     }
5832   }
5833 
5834   /**
5835    * Check whether we should sync the log from the table's durability settings
5836    */
5837   private boolean shouldSyncLog() {
5838     return durability.ordinal() >  Durability.ASYNC_WAL.ordinal();
5839   }
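  /*
   * A minimal sketch of where the effective durability comes from: the table default is set on
   * the descriptor and individual mutations may override it (USE_DEFAULT falls back to the
   * table setting, which is what shouldSyncLog() consults). Table, row, and column names are
   * placeholders.
   *
   *   HTableDescriptor htd = new HTableDescriptor(TableName.valueOf("mytable"));
   *   htd.setDurability(Durability.ASYNC_WAL);      // table-level default
   *
   *   Put put = new Put(Bytes.toBytes("row1"));
   *   put.add(Bytes.toBytes("f"), Bytes.toBytes("q"), Bytes.toBytes("v"));
   *   put.setDurability(Durability.SYNC_WAL);       // per-mutation override
   */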
5840 
5841   /**
5842    * A mocked list implementation that discards all updates.
5843    */
5844   private static final List<Cell> MOCKED_LIST = new AbstractList<Cell>() {
5845 
5846     @Override
5847     public void add(int index, Cell element) {
5848       // do nothing
5849     }
5850 
5851     @Override
5852     public boolean addAll(int index, Collection<? extends Cell> c) {
5853       return false; // this list is never changed as a result of an update
5854     }
5855 
5856     @Override
5857     public KeyValue get(int index) {
5858       throw new UnsupportedOperationException();
5859     }
5860 
5861     @Override
5862     public int size() {
5863       return 0;
5864     }
5865   };
5866 
5867   /**
5868    * Facility for dumping and compacting catalog tables.
5869    * Only does catalog tables since these are the only tables whose schema
5870    * we know for sure.  For usage run:
5871    * <pre>
5872    *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
5873    * </pre>
5874    * @param args
5875    * @throws IOException
5876    */
5877   public static void main(String[] args) throws IOException {
5878     if (args.length < 1) {
5879       printUsageAndExit(null);
5880     }
5881     boolean majorCompact = false;
5882     if (args.length > 1) {
5883       if (!args[1].toLowerCase().startsWith("major")) {
5884         printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">");
5885       }
5886       majorCompact = true;
5887     }
5888     final Path tableDir = new Path(args[0]);
5889     final Configuration c = HBaseConfiguration.create();
5890     final FileSystem fs = FileSystem.get(c);
5891     final Path logdir = new Path(c.get("hbase.tmp.dir"));
5892     final String logname = "hlog" + FSUtils.getTableName(tableDir) + System.currentTimeMillis();
5893 
5894     final HLog log = HLogFactory.createHLog(fs, logdir, logname, c);
5895     try {
5896       processTable(fs, tableDir, log, c, majorCompact);
5897     } finally {
5898        log.close();
5899        // TODO: is this still right?
5900        BlockCache bc = new CacheConfig(c).getBlockCache();
5901        if (bc != null) bc.shutdown();
5902     }
5903   }
5904 
5905   /**
5906    * Gets the latest sequence number that was read from storage when this region was opened.
5907    */
5908   public long getOpenSeqNum() {
5909     return this.openSeqNum;
5910   }
5911 
5912   /**
5913    * Gets the max sequence ids of stores that were read from storage when this region was opened.
5914    * WAL edits with a smaller or equal sequence number will be skipped during replay.
5915    */
5916   public Map<byte[], Long> getMaxStoreSeqIdForLogReplay() {
5917     return this.maxSeqIdInStores;
5918   }
5919 
5920   /**
5921    * @return the current compaction state of this region.
5922    */
5923   public CompactionState getCompactionState() {
5924     boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
5925     return (hasMajor ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
5926         : (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
5927   }
5928 
5929   public void reportCompactionRequestStart(boolean isMajor){
5930     (isMajor ? majorInProgress : minorInProgress).incrementAndGet();
5931   }
5932 
5933   public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted){
5934     int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();
5935 
5936     // metrics
5937     compactionsFinished.incrementAndGet();
5938     compactionNumFilesCompacted.addAndGet(numFiles);
5939     compactionNumBytesCompacted.addAndGet(filesSizeCompacted);
5940 
5941     assert newValue >= 0;
5942   }
5943 
5944   /**
5945    * @return sequenceId.
5946    */
5947   public AtomicLong getSequenceId() {
5948     return this.sequenceId;
5949   }
5950 
5951   /**
5952    * sets this region's sequenceId.
5953    * @param value new value
5954    */
5955   private void setSequenceId(long value) {
5956     this.sequenceId.set(value);
5957   }
5958 
5959   /**
5960    * Listener class to enable callers of
5961    * bulkLoadHFile() to perform any necessary
5962    * pre/post processing of a given bulk load call.
5963    */
5964   public interface BulkLoadListener {
5965 
5966     /**
5967      * Called before an HFile is actually loaded
5968      * @param family family being loaded to
5969      * @param srcPath path of HFile
5970      * @return final path to be used for actual loading
5971      * @throws IOException
5972      */
5973     String prepareBulkLoad(byte[] family, String srcPath) throws IOException;
5974 
5975     /**
5976      * Called after a successful HFile load
5977      * @param family family being loaded to
5978      * @param srcPath path of HFile
5979      * @throws IOException
5980      */
5981     void doneBulkLoad(byte[] family, String srcPath) throws IOException;
5982 
5983     /**
5984      * Called after a failed HFile load
5985      * @param family family being loaded to
5986      * @param srcPath path of HFile
5987      * @throws IOException
5988      */
5989     void failedBulkLoad(byte[] family, String srcPath) throws IOException;
5990   }
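  /*
   * A minimal no-op sketch of a BulkLoadListener; real implementations (e.g. the secure bulk
   * load endpoint) use these hooks to stage an HFile before loading and to commit or clean up
   * the staged copy afterwards:
   *
   *   BulkLoadListener listener = new BulkLoadListener() {
   *     @Override
   *     public String prepareBulkLoad(byte[] family, String srcPath) throws IOException {
   *       return srcPath;   // load the file from where it already sits
   *     }
   *     @Override
   *     public void doneBulkLoad(byte[] family, String srcPath) throws IOException {
   *       // e.g. delete a staged copy
   *     }
   *     @Override
   *     public void failedBulkLoad(byte[] family, String srcPath) throws IOException {
   *       // e.g. move the file back to its original location
   *     }
   *   };
   */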
5991 
5992   @VisibleForTesting class RowLockContext {
5993     private final HashedBytes row;
5994     private final CountDownLatch latch = new CountDownLatch(1);
5995     private final Thread thread;
5996     private int lockCount = 0;
5997 
5998     RowLockContext(HashedBytes row) {
5999       this.row = row;
6000       this.thread = Thread.currentThread();
6001     }
6002 
6003     boolean ownedByCurrentThread() {
6004       return thread == Thread.currentThread();
6005     }
6006 
6007     RowLock newLock() {
6008       lockCount++;
6009       return new RowLock(this);
6010     }
6011 
6012     void releaseLock() {
6013       if (!ownedByCurrentThread()) {
6014         throw new IllegalArgumentException("Lock held by thread: " + thread
6015           + " cannot be released by different thread: " + Thread.currentThread());
6016       }
6017       lockCount--;
6018       if (lockCount == 0) {
6019         // no remaining locks by the thread, unlock and allow other threads to access
6020         RowLockContext existingContext = lockedRows.remove(row);
6021         if (existingContext != this) {
6022           throw new RuntimeException(
6023               "Internal row lock state inconsistent, should not happen, row: " + row);
6024         }
6025         latch.countDown();
6026       }
6027     }
6028   }
6029 
6030   /**
6031    * Row lock held by a given thread.
6032    * One thread may acquire multiple locks on the same row simultaneously.
6033    * The locks must be released by calling release() from the same thread.
6034    */
6035   public static class RowLock {
6036     @VisibleForTesting final RowLockContext context;
6037     private boolean released = false;
6038 
6039     @VisibleForTesting RowLock(RowLockContext context) {
6040       this.context = context;
6041     }
6042 
6043     /**
6044      * Release the given lock.  If there are no remaining locks held by the current thread
6045      * then unlock the row and allow other threads to acquire the lock.
6046      * @throws IllegalArgumentException if called by a different thread than the lock owning thread
6047      */
6048     public void release() {
6049       if (!released) {
6050         context.releaseLock();
6051         released = true;
6052       }
6053     }
6054   }
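  /*
   * Row lock pattern described above, using this region's getRowLock(byte[]):
   *
   *   RowLock rowLock = getRowLock(rowKey);   // blocks until the lock is acquired
   *   try {
   *     // read-modify-write against the row
   *   } finally {
   *     rowLock.release();                    // must be called from the acquiring thread
   *   }
   */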
6055 
6056   /**
6057    * Lock the updates' readLock first, so that we can safely append logs in coprocessors.
6058    * @throws RegionTooBusyException
6059    * @throws InterruptedIOException
6060    */
6061   public void updatesLock() throws RegionTooBusyException, InterruptedIOException {
6062     lock(updatesLock.readLock());
6063   }
6064 
6065   /**
6066    * Unlock the updates' readLock after appending logs in coprocessors.
6067    * @throws InterruptedIOException
6068    */
6069   public void updatesUnlock() throws InterruptedIOException {
6070     updatesLock.readLock().unlock();
6071   }
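  /*
   * Pairing expected by the two methods above, e.g. from a coprocessor that appends its own
   * WAL entries:
   *
   *   region.updatesLock();
   *   try {
   *     // append coprocessor edits to the WAL here
   *   } finally {
   *     region.updatesUnlock();
   *   }
   */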
6072 }