1   /*
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.io.EOFException;
22  import java.io.IOException;
23  import java.io.InterruptedIOException;
24  import java.io.UnsupportedEncodingException;
25  import java.lang.reflect.Constructor;
26  import java.text.ParseException;
27  import java.util.AbstractList;
28  import java.util.ArrayList;
29  import java.util.Arrays;
30  import java.util.Collection;
31  import java.util.Collections;
32  import java.util.HashMap;
33  import java.util.List;
34  import java.util.Map;
35  import java.util.NavigableMap;
36  import java.util.NavigableSet;
37  import java.util.Set;
38  import java.util.TreeMap;
39  import java.util.UUID;
40  import java.util.concurrent.Callable;
41  import java.util.concurrent.CompletionService;
42  import java.util.concurrent.ConcurrentHashMap;
43  import java.util.concurrent.ConcurrentSkipListMap;
44  import java.util.concurrent.CountDownLatch;
45  import java.util.concurrent.ExecutionException;
46  import java.util.concurrent.ExecutorCompletionService;
47  import java.util.concurrent.ExecutorService;
48  import java.util.concurrent.Executors;
49  import java.util.concurrent.Future;
50  import java.util.concurrent.FutureTask;
51  import java.util.concurrent.ThreadFactory;
52  import java.util.concurrent.ThreadPoolExecutor;
53  import java.util.concurrent.TimeUnit;
54  import java.util.concurrent.TimeoutException;
55  import java.util.concurrent.atomic.AtomicBoolean;
56  import java.util.concurrent.atomic.AtomicInteger;
57  import java.util.concurrent.atomic.AtomicLong;
58  import java.util.concurrent.locks.Lock;
59  import java.util.concurrent.locks.ReentrantReadWriteLock;
60  
61  import org.apache.commons.logging.Log;
62  import org.apache.commons.logging.LogFactory;
63  import org.apache.hadoop.classification.InterfaceAudience;
64  import org.apache.hadoop.conf.Configuration;
65  import org.apache.hadoop.fs.FileStatus;
66  import org.apache.hadoop.fs.FileSystem;
67  import org.apache.hadoop.fs.Path;
68  import org.apache.hadoop.hbase.Cell;
69  import org.apache.hadoop.hbase.CellUtil;
70  import org.apache.hadoop.hbase.CompoundConfiguration;
71  import org.apache.hadoop.hbase.DroppedSnapshotException;
72  import org.apache.hadoop.hbase.HBaseConfiguration;
73  import org.apache.hadoop.hbase.HColumnDescriptor;
74  import org.apache.hadoop.hbase.HConstants;
75  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
76  import org.apache.hadoop.hbase.HDFSBlocksDistribution;
77  import org.apache.hadoop.hbase.HRegionInfo;
78  import org.apache.hadoop.hbase.HTableDescriptor;
79  import org.apache.hadoop.hbase.KeyValue;
80  import org.apache.hadoop.hbase.KeyValueUtil;
81  import org.apache.hadoop.hbase.NotServingRegionException;
82  import org.apache.hadoop.hbase.RegionTooBusyException;
83  import org.apache.hadoop.hbase.TableName;
84  import org.apache.hadoop.hbase.UnknownScannerException;
85  import org.apache.hadoop.hbase.backup.HFileArchiver;
86  import org.apache.hadoop.hbase.client.Append;
87  import org.apache.hadoop.hbase.client.Delete;
88  import org.apache.hadoop.hbase.client.Durability;
89  import org.apache.hadoop.hbase.client.Get;
90  import org.apache.hadoop.hbase.client.Increment;
91  import org.apache.hadoop.hbase.client.IsolationLevel;
92  import org.apache.hadoop.hbase.client.Mutation;
93  import org.apache.hadoop.hbase.client.Put;
94  import org.apache.hadoop.hbase.client.Result;
95  import org.apache.hadoop.hbase.client.RowMutations;
96  import org.apache.hadoop.hbase.client.Row;
97  import org.apache.hadoop.hbase.client.Scan;
98  import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
99  import org.apache.hadoop.hbase.exceptions.FailedSanityCheckException;
100 import org.apache.hadoop.hbase.exceptions.RegionInRecoveryException;
101 import org.apache.hadoop.hbase.exceptions.UnknownProtocolException;
102 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
103 import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
104 import org.apache.hadoop.hbase.filter.Filter;
105 import org.apache.hadoop.hbase.filter.FilterWrapper;
106 import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
107 import org.apache.hadoop.hbase.io.HeapSize;
108 import org.apache.hadoop.hbase.io.TimeRange;
109 import org.apache.hadoop.hbase.io.hfile.BlockCache;
110 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
111 import org.apache.hadoop.hbase.ipc.CallerDisconnectedException;
112 import org.apache.hadoop.hbase.ipc.RpcCallContext;
113 import org.apache.hadoop.hbase.ipc.RpcServer;
114 import org.apache.hadoop.hbase.master.AssignmentManager;
115 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
116 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
117 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.GetRegionInfoResponse.CompactionState;
118 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall;
119 import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.SnapshotDescription;
120 import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor;
121 import org.apache.hadoop.hbase.regionserver.MultiVersionConsistencyControl.WriteEntry;
122 import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
123 import org.apache.hadoop.hbase.regionserver.wal.HLog;
124 import org.apache.hadoop.hbase.regionserver.wal.HLogFactory;
125 import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
126 import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
127 import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
128 import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
129 import org.apache.hadoop.hbase.util.Bytes;
130 import org.apache.hadoop.hbase.util.CancelableProgressable;
131 import org.apache.hadoop.hbase.util.ClassSize;
132 import org.apache.hadoop.hbase.util.CompressionTest;
133 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
134 import org.apache.hadoop.hbase.util.FSUtils;
135 import org.apache.hadoop.hbase.util.HashedBytes;
136 import org.apache.hadoop.hbase.util.Pair;
137 import org.apache.hadoop.hbase.util.Threads;
138 import org.apache.hadoop.io.MultipleIOException;
139 import org.apache.hadoop.util.StringUtils;
140 import org.cliffc.high_scale_lib.Counter;
141 
142 import com.google.common.annotations.VisibleForTesting;
143 import com.google.common.base.Preconditions;
144 import com.google.common.collect.Lists;
145 import com.google.common.collect.Maps;
146 import com.google.common.io.Closeables;
147 import com.google.protobuf.Descriptors;
148 import com.google.protobuf.Message;
149 import com.google.protobuf.RpcCallback;
150 import com.google.protobuf.RpcController;
151 import com.google.protobuf.Service;
152 
153 /**
154  * HRegion stores data for a certain region of a table.  It stores all columns
155  * for each row. A given table consists of one or more HRegions.
156  *
157  * <p>We maintain multiple HStores for a single HRegion.
158  *
159  * <p>A Store is a set of rows with some column data; together,
160  * they make up all the data for the rows.
161  *
162  * <p>Each HRegion has a 'startKey' and 'endKey'.
163  * <p>The first is inclusive, the second is exclusive (except for
164  * the final region).  The endKey of region 0 is the same as
165  * startKey for region 1 (if it exists).  The startKey for the
166  * first region is null. The endKey for the final region is null.
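 *
 * <p>For example (hypothetical split keys), a table with three regions might
 * partition the row keyspace as follows:
 * <pre>
 *   region 0: startKey = null,  endKey = "bbb"   // all rows up to, but not including, "bbb"
 *   region 1: startKey = "bbb", endKey = "ppp"   // rows from "bbb" (inclusive) to "ppp" (exclusive)
 *   region 2: startKey = "ppp", endKey = null    // rows from "ppp" (inclusive) onwards
 * </pre>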
167  *
168  * <p>Locking at the HRegion level serves only one purpose: preventing the
169  * region from being closed (and consequently split) while other operations
170  * are ongoing. Each row level operation obtains both a row lock and a region
171  * read lock for the duration of the operation. While a scanner is being
172  * constructed, getScanner holds a read lock. If the scanner is successfully
173  * constructed, it holds a read lock until it is closed. A close takes out a
174  * write lock and consequently will block for ongoing operations and will block
175  * new operations from starting while the close is in progress.
176  *
177  * <p>An HRegion is defined by its table and its key extent.
178  *
179  * <p>It consists of at least one Store.  The number of Stores should be
180  * configurable, so that data which is accessed together is stored in the same
181  * Store.  Right now, we approximate that by building a single Store for
182  * each column family.  (This config info will be communicated via the
183  * tabledesc.)
184  *
185  * <p>The HTableDescriptor contains metainfo about the HRegion's table.
186  * regionName is a unique identifier for this HRegion. [startKey, endKey)
187  * defines the keyspace for this HRegion.
188  */
189 @InterfaceAudience.Private
190 public class HRegion implements HeapSize { // , Writable{
191   public static final Log LOG = LogFactory.getLog(HRegion.class);
192 
193   public static final String LOAD_CFS_ON_DEMAND_CONFIG_KEY =
194       "hbase.hregion.scan.loadColumnFamiliesOnDemand";
195 
196   /**
197    * This is the global default value for durability. All tables/mutations not
198    * defining a durability or using USE_DEFAULT will default to this value.
199    */
200   private static final Durability DEFAULT_DURABLITY = Durability.SYNC_WAL;
201 
202   final AtomicBoolean closed = new AtomicBoolean(false);
203   /* Closing can take some time; use the closing flag if there is stuff we don't
204   * want to do while in closing state, e.g. offer this region up to the
205    * master as a region to close if the carrying regionserver is overloaded.
206    * Once set, it is never cleared.
207    */
208   final AtomicBoolean closing = new AtomicBoolean(false);
209 
210   protected long completeSequenceId = -1L;
211 
212   /**
213   * Operation enum is used in {@link HRegion#startRegionOperation} to provide operation context so
214   * that startRegionOperation can invoke different checks before any region operation. Not all
215   * operations have to be defined here. An entry is only needed when a special check is
216   * required in startRegionOperation.
217    */
218   protected enum Operation {
219     ANY, GET, PUT, DELETE, SCAN, APPEND, INCREMENT, SPLIT_REGION, MERGE_REGION, BATCH_MUTATE,
220     REPLAY_BATCH_MUTATE, COMPACT_REGION
221   }
222 
223   //////////////////////////////////////////////////////////////////////////////
224   // Members
225   //////////////////////////////////////////////////////////////////////////////
226 
227   // map from a locked row to the context for that lock including:
228   // - CountDownLatch for threads waiting on that row
229   // - the thread that owns the lock (allow reentrancy)
230   // - reference count of (reentrant) locks held by the thread
231   // - the row itself
232   private final ConcurrentHashMap<HashedBytes, RowLockContext> lockedRows =
233       new ConcurrentHashMap<HashedBytes, RowLockContext>();
234 
235   protected final Map<byte[], Store> stores = new ConcurrentSkipListMap<byte[], Store>(
236       Bytes.BYTES_RAWCOMPARATOR);
237 
238   // TODO: account for each registered handler in HeapSize computation
239   private Map<String, Service> coprocessorServiceHandlers = Maps.newHashMap();
240 
241   public final AtomicLong memstoreSize = new AtomicLong(0);
242 
243   // Debug possible data loss due to WAL off
244   final Counter numMutationsWithoutWAL = new Counter();
245   final Counter dataInMemoryWithoutWAL = new Counter();
246 
247   // Debug why CAS operations are taking a while.
248   final Counter checkAndMutateChecksPassed = new Counter();
249   final Counter checkAndMutateChecksFailed = new Counter();
250 
251   //Number of requests
252   final Counter readRequestsCount = new Counter();
253   final Counter writeRequestsCount = new Counter();
254 
255   // Compaction counters
256   final AtomicLong compactionsFinished = new AtomicLong(0L);
257   final AtomicLong compactionNumFilesCompacted = new AtomicLong(0L);
258   final AtomicLong compactionNumBytesCompacted = new AtomicLong(0L);
259 
260 
261   private final HLog log;
262   private final HRegionFileSystem fs;
263   protected final Configuration conf;
264   private final Configuration baseConf;
265   private final KeyValue.KVComparator comparator;
266   private final int rowLockWaitDuration;
267   static final int DEFAULT_ROWLOCK_WAIT_DURATION = 30000;
268 
269   // The internal wait duration to acquire a lock before read/update
270   // from the region. It is not per row. The purpose of this wait time
271   // is to avoid waiting a long time while the region is busy, so that
272   // we can release the IPC handler soon enough to improve the
273   // availability of the region server. It can be adjusted by
274   // tuning configuration "hbase.busy.wait.duration".
275   final long busyWaitDuration;
276   static final long DEFAULT_BUSY_WAIT_DURATION = HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
277 
278   // If updating multiple rows in one call, wait longer,
279   // i.e. waiting for busyWaitDuration * # of rows. However,
280   // we can limit the max multiplier.
281   final int maxBusyWaitMultiplier;
282 
283   // Max busy wait duration. There is no point in waiting longer than the RPC
284   // purge timeout, after which an RPC call will be terminated by the RPC engine.
285   final long maxBusyWaitDuration;
286 
287   // negative number indicates infinite timeout
288   static final long DEFAULT_ROW_PROCESSOR_TIMEOUT = 60 * 1000L;
289   final ExecutorService rowProcessorExecutor = Executors.newCachedThreadPool();
290 
291   private final ConcurrentHashMap<RegionScanner, Long> scannerReadPoints;
292 
293   /**
294    * The sequence ID that was encountered when this region was opened.
295    */
296   private long openSeqNum = HConstants.NO_SEQNUM;
297 
298   /**
299    * The default setting for whether to enable on-demand CF loading for
300    * scan requests to this region. Requests can override it.
301    */
302   private boolean isLoadingCfsOnDemandDefault = false;
303 
304   private final AtomicInteger majorInProgress = new AtomicInteger(0);
305   private final AtomicInteger minorInProgress = new AtomicInteger(0);
306 
307   //
308   // Context: During replay we want to ensure that we do not lose any data. So, we
309   // have to be conservative in how we replay logs. For each store, we calculate
310   // the maxSeqId up to which the store was flushed. And, skip the edits which
311   // are equal to or lower than maxSeqId for each store.
312   // The following map is populated when opening the region
313   Map<byte[], Long> maxSeqIdInStores = new TreeMap<byte[], Long>(Bytes.BYTES_COMPARATOR);
314 
315   /**
316    * Config setting for whether to allow writes when a region is in recovering or not.
317    */
318   private boolean disallowWritesInRecovering = false;
319 
320   // when a region is in recovering state, it can only accept writes, not reads
321   private volatile boolean isRecovering = false;
322 
323   /**
324    * @return The smallest mvcc readPoint across all the scanners in this
325    * region. Writes older than this readPoint are included in every
326    * read operation.
327    */
328   public long getSmallestReadPoint() {
329     long minimumReadPoint;
330     // We need to ensure that while we are calculating the smallestReadPoint
331     // no new RegionScanners can grab a readPoint that we are unaware of.
332     // We achieve this by synchronizing on the scannerReadPoints object.
333     synchronized(scannerReadPoints) {
334       minimumReadPoint = mvcc.memstoreReadPoint();
335 
336       for (Long readPoint: this.scannerReadPoints.values()) {
337         if (readPoint < minimumReadPoint) {
338           minimumReadPoint = readPoint;
339         }
340       }
341     }
342     return minimumReadPoint;
343   }
344   /*
345    * Data structure of write state flags used to coordinate flushes,
346    * compactions and closes.
347    */
348   static class WriteState {
349     // Set while a memstore flush is happening.
350     volatile boolean flushing = false;
351     // Set when a flush has been requested.
352     volatile boolean flushRequested = false;
353     // Number of compactions running.
354     volatile int compacting = 0;
355     // Cleared in close. Once cleared, the region cannot compact or flush again.
356     volatile boolean writesEnabled = true;
357     // Set if region is read-only
358     volatile boolean readOnly = false;
359 
360     /**
361      * Set flags that make this region read-only.
362      *
363      * @param onOff true to make the region read-only, false to allow writes
364      */
365     synchronized void setReadOnly(final boolean onOff) {
366       this.writesEnabled = !onOff;
367       this.readOnly = onOff;
368     }
369 
370     boolean isReadOnly() {
371       return this.readOnly;
372     }
373 
374     boolean isFlushRequested() {
375       return this.flushRequested;
376     }
377 
378     static final long HEAP_SIZE = ClassSize.align(
379         ClassSize.OBJECT + 5 * Bytes.SIZEOF_BOOLEAN);
380   }
381 
382   final WriteState writestate = new WriteState();
383 
384   long memstoreFlushSize;
385   final long timestampSlop;
386   final long rowProcessorTimeout;
387   private volatile long lastFlushTime;
388   final RegionServerServices rsServices;
389   private RegionServerAccounting rsAccounting;
390   private List<Pair<Long, Long>> recentFlushes = new ArrayList<Pair<Long,Long>>();
391   private long flushCheckInterval;
392   private long blockingMemStoreSize;
393   final long threadWakeFrequency;
394   // Used to guard closes
395   final ReentrantReadWriteLock lock =
396     new ReentrantReadWriteLock();
397 
398   // Stop updates lock
399   private final ReentrantReadWriteLock updatesLock =
400     new ReentrantReadWriteLock();
401   private boolean splitRequest;
402   private byte[] explicitSplitPoint = null;
403 
404   private final MultiVersionConsistencyControl mvcc =
405       new MultiVersionConsistencyControl();
406 
407   // Coprocessor host
408   private RegionCoprocessorHost coprocessorHost;
409 
410   private HTableDescriptor htableDescriptor = null;
411   private RegionSplitPolicy splitPolicy;
412 
413   private final MetricsRegion metricsRegion;
414   private final MetricsRegionWrapperImpl metricsRegionWrapper;
415   private final boolean deferredLogSyncDisabled;
416   private final Durability durability;
417 
418   /**
419    * HRegion constructor. This constructor should only be used for testing and
420    * extensions.  Instances of HRegion should be instantiated with the
421    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
422    *
423    * @param tableDir qualified path of directory where region should be located,
424    * usually the table directory.
425    * @param log The HLog is the outbound log for any updates to the HRegion
426    * (There's a single HLog for all the HRegions on a single HRegionServer.)
427    * The log file is a logfile from the previous execution that's
428    * custom-computed for this HRegion. The HRegionServer computes and sorts the
429    * appropriate log info for this HRegion. If there is a previous log file
430    * (implying that the HRegion has been written-to before), then read it from
431    * the supplied path.
432    * @param fs is the filesystem.
433    * @param confParam is global configuration settings.
434    * @param regionInfo - HRegionInfo that describes the region
435    *
436    * @param htd the table descriptor
437    * @param rsServices reference to {@link RegionServerServices} or null
438    */
439   @Deprecated
440   public HRegion(final Path tableDir, final HLog log, final FileSystem fs,
441       final Configuration confParam, final HRegionInfo regionInfo,
442       final HTableDescriptor htd, final RegionServerServices rsServices) {
443     this(new HRegionFileSystem(confParam, fs, tableDir, regionInfo),
444       log, confParam, htd, rsServices);
445   }
446 
447   /**
448    * HRegion constructor. This constructor should only be used for testing and
449    * extensions.  Instances of HRegion should be instantiated with the
450    * {@link HRegion#createHRegion} or {@link HRegion#openHRegion} method.
451    *
452    * @param fs is the filesystem.
453    * @param log The HLog is the outbound log for any updates to the HRegion
454    * (There's a single HLog for all the HRegions on a single HRegionServer.)
455    * The log file is a logfile from the previous execution that's
456    * custom-computed for this HRegion. The HRegionServer computes and sorts the
457    * appropriate log info for this HRegion. If there is a previous log file
458    * (implying that the HRegion has been written-to before), then read it from
459    * the supplied path.
460    * @param confParam is global configuration settings.
461    * @param htd the table descriptor
462    * @param rsServices reference to {@link RegionServerServices} or null
463    */
464   public HRegion(final HRegionFileSystem fs, final HLog log, final Configuration confParam,
465       final HTableDescriptor htd, final RegionServerServices rsServices) {
466     if (htd == null) {
467       throw new IllegalArgumentException("Need table descriptor");
468     }
469 
470     if (confParam instanceof CompoundConfiguration) {
471       throw new IllegalArgumentException("Need original base configuration");
472     }
473 
474     this.comparator = fs.getRegionInfo().getComparator();
475     this.log = log;
476     this.fs = fs;
477 
478     // 'conf' renamed to 'confParam' b/c we use this.conf in the constructor
479     this.baseConf = confParam;
480     this.conf = new CompoundConfiguration()
481       .add(confParam)
482       .addStringMap(htd.getConfiguration())
483       .addWritableMap(htd.getValues());
484     this.flushCheckInterval = conf.getInt(MEMSTORE_PERIODIC_FLUSH_INTERVAL,
485         DEFAULT_CACHE_FLUSH_INTERVAL);
486     this.rowLockWaitDuration = conf.getInt("hbase.rowlock.wait.duration",
487                     DEFAULT_ROWLOCK_WAIT_DURATION);
488 
489     this.isLoadingCfsOnDemandDefault = conf.getBoolean(LOAD_CFS_ON_DEMAND_CONFIG_KEY, true);
490     this.htableDescriptor = htd;
491     this.rsServices = rsServices;
492     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
493     setHTableSpecificConf();
494     this.scannerReadPoints = new ConcurrentHashMap<RegionScanner, Long>();
495 
496     this.busyWaitDuration = conf.getLong(
497       "hbase.busy.wait.duration", DEFAULT_BUSY_WAIT_DURATION);
498     this.maxBusyWaitMultiplier = conf.getInt("hbase.busy.wait.multiplier.max", 2);
499     if (busyWaitDuration * maxBusyWaitMultiplier <= 0L) {
500       throw new IllegalArgumentException("Invalid hbase.busy.wait.duration ("
501         + busyWaitDuration + ") or hbase.busy.wait.multiplier.max ("
502         + maxBusyWaitMultiplier + "). Their product should be positive");
503     }
504     this.maxBusyWaitDuration = conf.getLong("ipc.client.call.purge.timeout",
505       2 * HConstants.DEFAULT_HBASE_RPC_TIMEOUT);
506 
507     /*
508      * timestamp.slop provides a server-side constraint on the timestamp. This
509      * assumes that you base your TS around currentTimeMillis(). In this case,
510      * throw an error to the user if the user-specified TS is newer than now +
511      * slop. LATEST_TIMESTAMP == don't use this functionality
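     * For example (hypothetical value): with a slop of 2000 ms, a mutation whose
     * timestamp is more than 2 seconds ahead of the server clock would be rejected;
     * the default, LATEST_TIMESTAMP, leaves the check disabled.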
512      */
513     this.timestampSlop = conf.getLong(
514         "hbase.hregion.keyvalue.timestamp.slop.millisecs",
515         HConstants.LATEST_TIMESTAMP);
516 
517     /**
518      * Timeout for processing in processRowsWithLocks().
519      * Use -1 to switch off the time bound.
520      */
521     this.rowProcessorTimeout = conf.getLong(
522         "hbase.hregion.row.processor.timeout", DEFAULT_ROW_PROCESSOR_TIMEOUT);
523     // When hbase.regionserver.optionallogflushinterval <= 0, deferred log sync is disabled.
524     this.deferredLogSyncDisabled = conf.getLong("hbase.regionserver.optionallogflushinterval",
525         1 * 1000) <= 0;
526     this.durability = htd.getDurability() == Durability.USE_DEFAULT
527         ? DEFAULT_DURABLITY
528         : htd.getDurability();
529     if (rsServices != null) {
530       this.rsAccounting = this.rsServices.getRegionServerAccounting();
531       // don't initialize coprocessors if not running within a regionserver
532       // TODO: revisit if coprocessors should load in other cases
533       this.coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
534       this.metricsRegionWrapper = new MetricsRegionWrapperImpl(this);
535       this.metricsRegion = new MetricsRegion(this.metricsRegionWrapper);
536     } else {
537       this.metricsRegionWrapper = null;
538       this.metricsRegion = null;
539     }
540     if (LOG.isDebugEnabled()) {
541       // Write out region name as string and its encoded name.
542       LOG.debug("Instantiated " + this);
543     }
544 
545     // by default, we allow writes against a region when it's in recovering
546     this.disallowWritesInRecovering =
547         conf.getBoolean(HConstants.DISALLOW_WRITES_IN_RECOVERING,
548           HConstants.DEFAULT_DISALLOW_WRITES_IN_RECOVERING_CONFIG);
549   }
550 
551   void setHTableSpecificConf() {
552     if (this.htableDescriptor == null) return;
553     long flushSize = this.htableDescriptor.getMemStoreFlushSize();
554 
555     if (flushSize <= 0) {
556       flushSize = conf.getLong(HConstants.HREGION_MEMSTORE_FLUSH_SIZE,
557         HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
558     }
559     this.memstoreFlushSize = flushSize;
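    // Size at which updates to this region are blocked until the memstore is
    // flushed back down (flush size times the block multiplier).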
560     this.blockingMemStoreSize = this.memstoreFlushSize *
561         conf.getLong("hbase.hregion.memstore.block.multiplier", 2);
562   }
563 
564   /**
565    * Initialize this region.
566    * Used only by tests and SplitTransaction to reopen the region.
567    * You should use createHRegion() or openHRegion().
568    * @return What the next sequence (edit) id should be.
569    * @throws IOException e
570    * @deprecated use HRegion.createHRegion() or HRegion.openHRegion()
571    */
572   @Deprecated
573   public long initialize() throws IOException {
574     return initialize(null);
575   }
576 
577   /**
578    * Initialize this region.
579    *
580    * @param reporter Tickle every so often if initialize is taking a while.
581    * @return What the next sequence (edit) id should be.
582    * @throws IOException e
583    */
584   private long initialize(final CancelableProgressable reporter) throws IOException {
585     MonitoredTask status = TaskMonitor.get().createStatus("Initializing region " + this);
586     long nextSeqId = -1;
587     try {
588       nextSeqId = initializeRegionInternals(reporter, status);
589       return nextSeqId;
590     } finally {
591       // nextSeqid will be -1 if the initialization fails.
592       // Otherwise it will be at least 0.
593       if (nextSeqId == -1) {
594         status
595             .abort("Exception during region " + this.getRegionNameAsString() + " initialization.");
596       }
597     }
598   }
599 
600   private long initializeRegionInternals(final CancelableProgressable reporter,
601       final MonitoredTask status) throws IOException, UnsupportedEncodingException {
602     if (coprocessorHost != null) {
603       status.setStatus("Running coprocessor pre-open hook");
604       coprocessorHost.preOpen();
605     }
606 
607     // Write HRI to a file in case we need to recover hbase:meta
608     status.setStatus("Writing region info on filesystem");
609     fs.checkRegionInfoOnFilesystem();
610 
611     // Remove temporary data left over from old regions
612     status.setStatus("Cleaning up temporary data from old regions");
613     fs.cleanupTempDir();
614 
615     // Initialize all the HStores
616     status.setStatus("Initializing all the Stores");
617     long maxSeqId = initializeRegionStores(reporter, status);
618 
619     status.setStatus("Cleaning up detritus from prior splits");
620     // Get rid of any splits or merges that were lost in-progress.  Clean out
621     // these directories here on open.  We may be opening a region that was
622     // being split but we crashed in the middle of it all.
623     fs.cleanupAnySplitDetritus();
624     fs.cleanupMergesDir();
625 
626     this.writestate.setReadOnly(this.htableDescriptor.isReadOnly());
627     this.writestate.flushRequested = false;
628     this.writestate.compacting = 0;
629 
630     // Initialize split policy
631     this.splitPolicy = RegionSplitPolicy.create(this, conf);
632 
633     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
634     // Use the maximum of the log sequence id or that which was found in the stores
635     // (in particular, if there are no recovered edits, seqid will be -1).
636     long nextSeqid = maxSeqId + 1;
637     LOG.info("Onlined " + this.getRegionInfo().getShortNameToLog() +
638       "; next sequenceid=" + nextSeqid);
639 
640     // A region can be reopened if a split failed; reset flags
641     this.closing.set(false);
642     this.closed.set(false);
643 
644     if (coprocessorHost != null) {
645       status.setStatus("Running coprocessor post-open hooks");
646       coprocessorHost.postOpen();
647     }
648 
649     status.markComplete("Region opened successfully");
650     return nextSeqid;
651   }
652 
653   private long initializeRegionStores(final CancelableProgressable reporter, MonitoredTask status)
654       throws IOException, UnsupportedEncodingException {
655     // Load in all the HStores.
656 
657     long maxSeqId = -1;
658     // initialized to -1 so that we pick up MemstoreTS from column families
659     long maxMemstoreTS = -1;
660 
661     if (!htableDescriptor.getFamilies().isEmpty()) {
662       // initialize the thread pool for opening stores in parallel.
663       ThreadPoolExecutor storeOpenerThreadPool =
664         getStoreOpenAndCloseThreadPool("StoreOpener-" + this.getRegionInfo().getShortNameToLog());
665       CompletionService<HStore> completionService =
666         new ExecutorCompletionService<HStore>(storeOpenerThreadPool);
667 
668       // initialize each store in parallel
669       for (final HColumnDescriptor family : htableDescriptor.getFamilies()) {
670         status.setStatus("Instantiating store for column family " + family);
671         completionService.submit(new Callable<HStore>() {
672           @Override
673           public HStore call() throws IOException {
674             return instantiateHStore(family);
675           }
676         });
677       }
678       try {
679         for (int i = 0; i < htableDescriptor.getFamilies().size(); i++) {
680           Future<HStore> future = completionService.take();
681           HStore store = future.get();
682 
683           this.stores.put(store.getColumnFamilyName().getBytes(), store);
684           // Do not include bulk loaded files when determining seqIdForReplay
685           long storeSeqIdForReplay = store.getMaxSequenceId(false);
686           maxSeqIdInStores.put(store.getColumnFamilyName().getBytes(),
687               storeSeqIdForReplay);
688           // Include bulk loaded files when determining seqIdForAssignment
689           long storeSeqIdForAssignment = store.getMaxSequenceId(true);
690           if (maxSeqId == -1 || storeSeqIdForAssignment > maxSeqId) {
691             maxSeqId = storeSeqIdForAssignment;
692           }
693           long maxStoreMemstoreTS = store.getMaxMemstoreTS();
694           if (maxStoreMemstoreTS > maxMemstoreTS) {
695             maxMemstoreTS = maxStoreMemstoreTS;
696           }
697         }
698       } catch (InterruptedException e) {
699         throw new IOException(e);
700       } catch (ExecutionException e) {
701         throw new IOException(e.getCause());
702       } finally {
703         storeOpenerThreadPool.shutdownNow();
704       }
705     }
706     mvcc.initialize(maxMemstoreTS + 1);
707     // Recover any edits if available.
708     maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(
709         this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
710     return maxSeqId;
711   }
712 
713   /*
714    * Move any passed HStore files into place (if any).  Used to pick up split
715    * files and any merges from splits and merges dirs.
716    * @param initialFiles
717    * @throws IOException
718    */
719   static void moveInitialFilesIntoPlace(final FileSystem fs,
720     final Path initialFiles, final Path regiondir)
721   throws IOException {
722     if (initialFiles != null && fs.exists(initialFiles)) {
723       if (!fs.rename(initialFiles, regiondir)) {
724         LOG.warn("Unable to rename " + initialFiles + " to " + regiondir);
725       }
726     }
727   }
728 
729   /**
730    * @return True if this region has references.
731    */
732   public boolean hasReferences() {
733     for (Store store : this.stores.values()) {
734       if (store.hasReferences()) return true;
735     }
736     return false;
737   }
738 
739   /**
740    * This function will return the HDFS blocks distribution based on the data
741    * captured when the HFiles were created.
742    * @return The HDFS blocks distribution for the region.
743    */
744   public HDFSBlocksDistribution getHDFSBlocksDistribution() {
745     HDFSBlocksDistribution hdfsBlocksDistribution =
746       new HDFSBlocksDistribution();
747     synchronized (this.stores) {
748       for (Store store : this.stores.values()) {
749         for (StoreFile sf : store.getStorefiles()) {
750           HDFSBlocksDistribution storeFileBlocksDistribution =
751             sf.getHDFSBlockDistribution();
752           hdfsBlocksDistribution.add(storeFileBlocksDistribution);
753         }
754       }
755     }
756     return hdfsBlocksDistribution;
757   }
758 
759   /**
760    * This is a helper function to compute HDFS block distribution on demand
761    * @param conf configuration
762    * @param tableDescriptor HTableDescriptor of the table
763    * @param regionInfo the HRegionInfo of the region
764    * @return The HDFS blocks distribution for the given region.
765    * @throws IOException
766    */
767   public static HDFSBlocksDistribution computeHDFSBlocksDistribution(final Configuration conf,
768       final HTableDescriptor tableDescriptor, final HRegionInfo regionInfo) throws IOException {
769     HDFSBlocksDistribution hdfsBlocksDistribution = new HDFSBlocksDistribution();
770     Path tablePath = FSUtils.getTableDir(FSUtils.getRootDir(conf), tableDescriptor.getTableName());
771     FileSystem fs = tablePath.getFileSystem(conf);
772 
773     HRegionFileSystem regionFs = new HRegionFileSystem(conf, fs, tablePath, regionInfo);
774     for (HColumnDescriptor family: tableDescriptor.getFamilies()) {
775       Collection<StoreFileInfo> storeFiles = regionFs.getStoreFiles(family.getNameAsString());
776       if (storeFiles == null) continue;
777 
778       for (StoreFileInfo storeFileInfo : storeFiles) {
779         hdfsBlocksDistribution.add(storeFileInfo.computeHDFSBlocksDistribution(fs));
780       }
781     }
782     return hdfsBlocksDistribution;
783   }
784 
785   public AtomicLong getMemstoreSize() {
786     return memstoreSize;
787   }
788 
789   /**
790    * Increase the size of the memstore in this region and the size of the global
791    * memstore.
792    * @param memStoreSize amount to add to the memstore size
793    * @return the size of the memstore in this region before this addition
794    */
795   public long addAndGetGlobalMemstoreSize(long memStoreSize) {
796     if (this.rsAccounting != null) {
797       rsAccounting.addAndGetGlobalMemstoreSize(memStoreSize);
798     }
799     return this.memstoreSize.getAndAdd(memStoreSize);
800   }
801 
802   /** @return a HRegionInfo object for this region */
803   public HRegionInfo getRegionInfo() {
804     return this.fs.getRegionInfo();
805   }
806 
807   /**
808    * @return Instance of {@link RegionServerServices} used by this HRegion.
809    * Can be null.
810    */
811   RegionServerServices getRegionServerServices() {
812     return this.rsServices;
813   }
814 
815   /** @return readRequestsCount for this region */
816   long getReadRequestsCount() {
817     return this.readRequestsCount.get();
818   }
819 
820   /** @return writeRequestsCount for this region */
821   long getWriteRequestsCount() {
822     return this.writeRequestsCount.get();
823   }
824 
825   MetricsRegion getMetrics() {
826     return metricsRegion;
827   }
828 
829   /** @return true if region is closed */
830   public boolean isClosed() {
831     return this.closed.get();
832   }
833 
834   /**
835    * @return True if closing process has started.
836    */
837   public boolean isClosing() {
838     return this.closing.get();
839   }
840 
841   /**
842    * Reset recovering state of current region
843    * @param newState
844    */
845   public void setRecovering(boolean newState) {
846     this.isRecovering = newState;
847   }
848 
849   /**
850    * @return True if current region is in recovering
851    */
852   public boolean isRecovering() {
853     return this.isRecovering;
854   }
855 
856   /** @return true if region is available (not closed and not closing) */
857   public boolean isAvailable() {
858     return !isClosed() && !isClosing();
859   }
860 
861   /** @return true if region is splittable */
862   public boolean isSplittable() {
863     return isAvailable() && !hasReferences();
864   }
865 
866   /**
867    * @return true if region is mergeable
868    */
869   public boolean isMergeable() {
870     if (!isAvailable()) {
871       LOG.debug("Region " + this.getRegionNameAsString()
872           + " is not mergeable because it is closing or closed");
873       return false;
874     }
875     if (hasReferences()) {
876       LOG.debug("Region " + this.getRegionNameAsString()
877           + " is not mergeable because it has references");
878       return false;
879     }
880 
881     return true;
882   }
883 
884   public boolean areWritesEnabled() {
885     synchronized(this.writestate) {
886       return this.writestate.writesEnabled;
887     }
888   }
889 
890   public MultiVersionConsistencyControl getMVCC() {
891     return mvcc;
892   }
893 
894   public boolean isLoadingCfsOnDemandDefault() {
895     return this.isLoadingCfsOnDemandDefault;
896   }
897 
898   /**
899    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
900    * service any more calls.
901    *
902    * <p>This method could take some time to execute, so don't call it from a
903    * time-sensitive thread.
904    *
905    * @return Map of all the storage files that the HRegion's component
906    * Stores make use of, keyed by column family.  Can be null if
907    * we are already closed or if it is judged that we should not close.
908    *
909    * @throws IOException e
910    */
911   public Map<byte[], List<StoreFile>> close() throws IOException {
912     return close(false);
913   }
914 
915   private final Object closeLock = new Object();
916 
917   /** Conf key for the periodic flush interval */
918   public static final String MEMSTORE_PERIODIC_FLUSH_INTERVAL =
919       "hbase.regionserver.optionalcacheflushinterval";
920   /** Default interval for the memstore flush, in milliseconds (one hour) */
921   public static final int DEFAULT_CACHE_FLUSH_INTERVAL = 3600000;
922 
923   /**
924    * Close down this HRegion.  Flush the cache unless the abort parameter is true,
925    * shut down each HStore, and don't service any more calls.
926    *
927    * This method could take some time to execute, so don't call it from a
928    * time-sensitive thread.
929    *
930    * @param abort true if server is aborting (only during testing)
931    * @return Map of all the storage files that the HRegion's component
932    * Stores make use of, keyed by column family.  Can be null if
933    * we are not to close at this time or we are already closed.
934    *
935    * @throws IOException e
936    */
937   public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
938     // Only allow one thread to close at a time. Serialize them so dual
939     // threads attempting to close will run up against each other.
940     MonitoredTask status = TaskMonitor.get().createStatus(
941         "Closing region " + this +
942         (abort ? " due to abort" : ""));
943 
944     status.setStatus("Waiting for close lock");
945     try {
946       synchronized (closeLock) {
947         return doClose(abort, status);
948       }
949     } finally {
950       status.cleanup();
951     }
952   }
953 
954   private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
955       throws IOException {
956     if (isClosed()) {
957       LOG.warn("Region " + this + " already closed");
958       return null;
959     }
960 
961     if (coprocessorHost != null) {
962       status.setStatus("Running coprocessor pre-close hooks");
963       this.coprocessorHost.preClose(abort);
964     }
965 
966     status.setStatus("Disabling compacts and flushes for region");
967     boolean wasFlushing = false;
968     synchronized (writestate) {
969       // Disable compacting and flushing by background threads for this
970       // region.
971       writestate.writesEnabled = false;
972       wasFlushing = writestate.flushing;
973       LOG.debug("Closing " + this + ": disabling compactions & flushes");
974       waitForFlushesAndCompactions();
975     }
976     // If we were not just flushing, is it worth doing a preflush...one
977     // that will clear out the bulk of the memstore before we put up
978     // the close flag?
979     if (!abort && !wasFlushing && worthPreFlushing()) {
980       status.setStatus("Pre-flushing region before close");
981       LOG.info("Running close preflush of " + this.getRegionNameAsString());
982       internalFlushcache(status);
983     }
984 
985     this.closing.set(true);
986     status.setStatus("Disabling writes for close");
987     // block waiting for the lock for closing
988     lock.writeLock().lock();
989     try {
990       if (this.isClosed()) {
991         status.abort("Already got closed by another process");
992         // SplitTransaction handles the null
993         return null;
994       }
995       LOG.debug("Updates disabled for region " + this);
996       // Don't flush the cache if we are aborting
997       if (!abort) {
998         internalFlushcache(status);
999       }
1000 
1001       Map<byte[], List<StoreFile>> result =
1002         new TreeMap<byte[], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
1003       if (!stores.isEmpty()) {
1004         // initialize the thread pool for closing stores in parallel.
1005         ThreadPoolExecutor storeCloserThreadPool =
1006           getStoreOpenAndCloseThreadPool("StoreCloserThread-" + this.getRegionNameAsString());
1007         CompletionService<Pair<byte[], Collection<StoreFile>>> completionService =
1008           new ExecutorCompletionService<Pair<byte[], Collection<StoreFile>>>(storeCloserThreadPool);
1009 
1010         // close each store in parallel
1011         for (final Store store : stores.values()) {
1012           completionService
1013               .submit(new Callable<Pair<byte[], Collection<StoreFile>>>() {
1014                 @Override
1015                 public Pair<byte[], Collection<StoreFile>> call() throws IOException {
1016                   return new Pair<byte[], Collection<StoreFile>>(
1017                     store.getFamily().getName(), store.close());
1018                 }
1019               });
1020         }
1021         try {
1022           for (int i = 0; i < stores.size(); i++) {
1023             Future<Pair<byte[], Collection<StoreFile>>> future = completionService.take();
1024             Pair<byte[], Collection<StoreFile>> storeFiles = future.get();
1025             List<StoreFile> familyFiles = result.get(storeFiles.getFirst());
1026             if (familyFiles == null) {
1027               familyFiles = new ArrayList<StoreFile>();
1028               result.put(storeFiles.getFirst(), familyFiles);
1029             }
1030             familyFiles.addAll(storeFiles.getSecond());
1031           }
1032         } catch (InterruptedException e) {
1033           throw new IOException(e);
1034         } catch (ExecutionException e) {
1035           throw new IOException(e.getCause());
1036         } finally {
1037           storeCloserThreadPool.shutdownNow();
1038         }
1039       }
1040       this.closed.set(true);
1041 
1042       if (coprocessorHost != null) {
1043         status.setStatus("Running coprocessor post-close hooks");
1044         this.coprocessorHost.postClose(abort);
1045       }
1046       if ( this.metricsRegion != null) {
1047         this.metricsRegion.close();
1048       }
1049       if ( this.metricsRegionWrapper != null) {
1050         Closeables.closeQuietly(this.metricsRegionWrapper);
1051       }
1052       status.markComplete("Closed");
1053       LOG.info("Closed " + this);
1054       return result;
1055     } finally {
1056       lock.writeLock().unlock();
1057     }
1058   }
1059 
1060   /**
1061    * Wait for all current flushes and compactions of the region to complete.
1062    * <p>
1063    * Exposed for TESTING.
1064    */
1065   public void waitForFlushesAndCompactions() {
1066     synchronized (writestate) {
1067       while (writestate.compacting > 0 || writestate.flushing) {
1068         LOG.debug("waiting for " + writestate.compacting + " compactions"
1069             + (writestate.flushing ? " & cache flush" : "") + " to complete for region " + this);
1070         try {
1071           writestate.wait();
1072         } catch (InterruptedException iex) {
1073           // essentially ignore and propagate the interrupt back up
1074           Thread.currentThread().interrupt();
1075         }
1076       }
1077     }
1078   }
1079 
1080   protected ThreadPoolExecutor getStoreOpenAndCloseThreadPool(
1081       final String threadNamePrefix) {
1082     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1083     int maxThreads = Math.min(numStores,
1084         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1085             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX));
1086     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1087   }
1088 
1089   protected ThreadPoolExecutor getStoreFileOpenAndCloseThreadPool(
1090       final String threadNamePrefix) {
1091     int numStores = Math.max(1, this.htableDescriptor.getFamilies().size());
1092     int maxThreads = Math.max(1,
1093         conf.getInt(HConstants.HSTORE_OPEN_AND_CLOSE_THREADS_MAX,
1094             HConstants.DEFAULT_HSTORE_OPEN_AND_CLOSE_THREADS_MAX)
1095             / numStores);
1096     return getOpenAndCloseThreadPool(maxThreads, threadNamePrefix);
1097   }
1098 
1099   static ThreadPoolExecutor getOpenAndCloseThreadPool(int maxThreads,
1100       final String threadNamePrefix) {
1101     return Threads.getBoundedCachedThreadPool(maxThreads, 30L, TimeUnit.SECONDS,
1102       new ThreadFactory() {
1103         private int count = 1;
1104 
1105         @Override
1106         public Thread newThread(Runnable r) {
1107           return new Thread(r, threadNamePrefix + "-" + count++);
1108         }
1109       });
1110   }
1111 
1112   /**
1113    * @return True if it's worth doing a flush before we put up the close flag.
1114    */
1115   private boolean worthPreFlushing() {
1116     return this.memstoreSize.get() >
1117       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
1118   }
1119 
1120   //////////////////////////////////////////////////////////////////////////////
1121   // HRegion accessors
1122   //////////////////////////////////////////////////////////////////////////////
1123 
1124   /** @return start key for region */
1125   public byte [] getStartKey() {
1126     return this.getRegionInfo().getStartKey();
1127   }
1128 
1129   /** @return end key for region */
1130   public byte [] getEndKey() {
1131     return this.getRegionInfo().getEndKey();
1132   }
1133 
1134   /** @return region id */
1135   public long getRegionId() {
1136     return this.getRegionInfo().getRegionId();
1137   }
1138 
1139   /** @return region name */
1140   public byte [] getRegionName() {
1141     return this.getRegionInfo().getRegionName();
1142   }
1143 
1144   /** @return region name as string for logging */
1145   public String getRegionNameAsString() {
1146     return this.getRegionInfo().getRegionNameAsString();
1147   }
1148 
1149   /** @return HTableDescriptor for this region */
1150   public HTableDescriptor getTableDesc() {
1151     return this.htableDescriptor;
1152   }
1153 
1154   /** @return HLog in use for this region */
1155   public HLog getLog() {
1156     return this.log;
1157   }
1158 
1159   /**
1160    * A split takes the config from the parent region & passes it to the daughter
1161    * region's constructor. If 'conf' was passed, you would end up using the HTD
1162    * of the parent region in addition to the new daughter HTD. Pass 'baseConf'
1163    * to the daughter regions to avoid this tricky dedupe problem.
1164    * @return Configuration object
1165    */
1166   Configuration getBaseConf() {
1167     return this.baseConf;
1168   }
1169 
1170   /** @return {@link FileSystem} being used by this region */
1171   public FileSystem getFilesystem() {
1172     return fs.getFileSystem();
1173   }
1174 
1175   /** @return the {@link HRegionFileSystem} used by this region */
1176   public HRegionFileSystem getRegionFileSystem() {
1177     return this.fs;
1178   }
1179 
1180   /** @return the last time the region was flushed */
1181   public long getLastFlushTime() {
1182     return this.lastFlushTime;
1183   }
1184 
1185   //////////////////////////////////////////////////////////////////////////////
1186   // HRegion maintenance.
1187   //
1188   // These methods are meant to be called periodically by the HRegionServer for
1189   // upkeep.
1190   //////////////////////////////////////////////////////////////////////////////
1191 
1192   /** @return size of the largest HStore. */
1193   public long getLargestHStoreSize() {
1194     long size = 0;
1195     for (Store h : stores.values()) {
1196       long storeSize = h.getSize();
1197       if (storeSize > size) {
1198         size = storeSize;
1199       }
1200     }
1201     return size;
1202   }
1203 
1204   /*
1205    * Do preparation for pending compaction.
1206    * @throws IOException
1207    */
1208   protected void doRegionCompactionPrep() throws IOException {
1209   }
1210 
1211   void triggerMajorCompaction() {
1212     for (Store h : stores.values()) {
1213       h.triggerMajorCompaction();
1214     }
1215   }
1216 
1217   /**
1218    * This is a helper function that compacts all the stores synchronously.
1219    * It is used by utilities and testing.
1220    *
1221    * @param majorCompaction True to force a major compaction regardless of thresholds
1222    * @throws IOException e
1223    */
1224   public void compactStores(final boolean majorCompaction)
1225   throws IOException {
1226     if (majorCompaction) {
1227       this.triggerMajorCompaction();
1228     }
1229     compactStores();
1230   }
1231 
1232   /**
1233    * This is a helper function that compacts all the stores synchronously.
1234    * It is used by utilities and testing.
1235    *
1236    * @throws IOException e
1237    */
1238   public void compactStores() throws IOException {
1239     for (Store s : getStores().values()) {
1240       CompactionContext compaction = s.requestCompaction();
1241       if (compaction != null) {
1242         compact(compaction, s);
1243       }
1244     }
1245   }
1246 
1247   /*
1248    * Called by compaction thread and after region is opened to compact the
1249    * HStores if necessary.
1250    *
1251    * <p>This operation could block for a long time, so don't call it from a
1252    * time-sensitive thread.
1253    *
1254    * Note that no locking is necessary at this level because compaction only
1255    * conflicts with a region split, and that cannot happen because the region
1256    * server does them sequentially and not in parallel.
1257    *
1258    * @param compaction Compaction details, obtained by requestCompaction()
1259    * @return whether the compaction completed
1260    * @throws IOException e
1261    */
1262   public boolean compact(CompactionContext compaction, Store store) throws IOException {
1263     assert compaction != null && compaction.hasSelection();
1264     assert !compaction.getRequest().getFiles().isEmpty();
1265     if (this.closing.get() || this.closed.get()) {
1266       LOG.debug("Skipping compaction on " + this + " because closing/closed");
1267       store.cancelRequestedCompaction(compaction);
1268       return false;
1269     }
1270     MonitoredTask status = null;
1271     boolean didPerformCompaction = false;
1272     // block waiting for the lock for compaction
1273     lock.readLock().lock();
1274     try {
1275       status = TaskMonitor.get().createStatus("Compacting " + store + " in " + this);
1276       if (this.closed.get()) {
1277         String msg = "Skipping compaction on " + this + " because closed";
1278         LOG.debug(msg);
1279         status.abort(msg);
1280         return false;
1281       }
1282       boolean wasStateSet = false;
1283       try {
1284         synchronized (writestate) {
1285           if (writestate.writesEnabled) {
1286             wasStateSet = true;
1287             ++writestate.compacting;
1288           } else {
1289             String msg = "NOT compacting region " + this + ". Writes disabled.";
1290             LOG.info(msg);
1291             status.abort(msg);
1292             return false;
1293           }
1294         }
1295         LOG.info("Starting compaction on " + store + " in region " + this
1296             + (compaction.getRequest().isOffPeak()?" as an off-peak compaction":""));
1297         doRegionCompactionPrep();
1298         try {
1299           status.setStatus("Compacting store " + store);
1300           didPerformCompaction = true;
1301           store.compact(compaction);
1302         } catch (InterruptedIOException iioe) {
1303           String msg = "compaction interrupted";
1304           LOG.info(msg, iioe);
1305           status.abort(msg);
1306           return false;
1307         }
1308       } finally {
1309         if (wasStateSet) {
1310           synchronized (writestate) {
1311             --writestate.compacting;
1312             if (writestate.compacting <= 0) {
1313               writestate.notifyAll();
1314             }
1315           }
1316         }
1317       }
1318       status.markComplete("Compaction complete");
1319       return true;
1320     } finally {
1321       try {
1322         if (!didPerformCompaction) store.cancelRequestedCompaction(compaction);
1323         if (status != null) status.cleanup();
1324       } finally {
1325         lock.readLock().unlock();
1326       }
1327     }
1328   }
1329 
1330   /**
1331    * Flush the cache.
1332    *
1333    * When this method is called the cache will be flushed unless:
1334    * <ol>
1335    *   <li>the cache is empty</li>
1336    *   <li>the region is closed.</li>
1337    *   <li>a flush is already in progress</li>
1338    *   <li>writes are disabled</li>
1339    * </ol>
1340    *
1341    * <p>This method may block for some time, so it should not be called from a
1342    * time-sensitive thread.
1343    *
1344    * @return true if the region needs compacting
1345    *
1346    * @throws IOException general io exceptions
1347    * @throws DroppedSnapshotException Thrown when replay of hlog is required
1348    * because a Snapshot was not properly persisted.
1349    */
1350   public boolean flushcache() throws IOException {
1351     // fail-fast instead of waiting on the lock
1352     if (this.closing.get()) {
1353       LOG.debug("Skipping flush on " + this + " because closing");
1354       return false;
1355     }
1356     MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
1357     status.setStatus("Acquiring readlock on region");
1358     // block waiting for the lock for flushing cache
1359     lock.readLock().lock();
1360     try {
1361       if (this.closed.get()) {
1362         LOG.debug("Skipping flush on " + this + " because closed");
1363         status.abort("Skipped: closed");
1364         return false;
1365       }
1366       if (coprocessorHost != null) {
1367         status.setStatus("Running coprocessor pre-flush hooks");
1368         coprocessorHost.preFlush();
1369       }
1370       if (numMutationsWithoutWAL.get() > 0) {
1371         numMutationsWithoutWAL.set(0);
1372         dataInMemoryWithoutWAL.set(0);
1373       }
1374       synchronized (writestate) {
1375         if (!writestate.flushing && writestate.writesEnabled) {
1376           this.writestate.flushing = true;
1377         } else {
1378           if (LOG.isDebugEnabled()) {
1379             LOG.debug("NOT flushing memstore for region " + this
1380                 + ", flushing=" + writestate.flushing + ", writesEnabled="
1381                 + writestate.writesEnabled);
1382           }
1383           status.abort("Not flushing since "
1384               + (writestate.flushing ? "already flushing"
1385                   : "writes not enabled"));
1386           return false;
1387         }
1388       }
1389       try {
1390         boolean result = internalFlushcache(status);
1391 
1392         if (coprocessorHost != null) {
1393           status.setStatus("Running post-flush coprocessor hooks");
1394           coprocessorHost.postFlush();
1395         }
1396 
1397         status.markComplete("Flush successful");
1398         return result;
1399       } finally {
1400         synchronized (writestate) {
1401           writestate.flushing = false;
1402           this.writestate.flushRequested = false;
1403           writestate.notifyAll();
1404         }
1405       }
1406     } finally {
1407       lock.readLock().unlock();
1408       status.cleanup();
1409     }
1410   }
1411 
1412   /**
1413    * Should the memstore be flushed now?
1414    */
1415   boolean shouldFlush() {
1416     if (flushCheckInterval <= 0) { //disabled
1417       return false;
1418     }
1419     long now = EnvironmentEdgeManager.currentTimeMillis();
1420     //if we flushed in the recent past, we don't need to do it again now
1421     if ((now - getLastFlushTime() < flushCheckInterval)) {
1422       return false;
1423     }
1424     //since we didn't flush in the recent past, flush now if certain conditions
1425     //are met. Return true on first such memstore hit.
1426     for (Store s : this.getStores().values()) {
1427       if (s.timeOfOldestEdit() < now - flushCheckInterval) {
1428         // we have an old enough edit in the memstore, flush
1429         return true;
1430       }
1431     }
1432     return false;
1433   }
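       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): a periodic
       // chore might combine shouldFlush() and flushcache() roughly as below.
       // "region" is an assumed open HRegion handle.
       //
       //   if (region.shouldFlush()) {
       //     boolean needsCompaction = region.flushcache();
       //     if (needsCompaction) {
       //       // hand the region over to the compaction machinery
       //     }
       //   }
       // ---------------------------------------------------------------------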
1434 
1435   /**
1436    * Flush the memstore.
1437    *
1438    * Flushing the memstore is a little tricky. We have a lot of updates in the
1439    * memstore, all of which have also been written to the log. We need to
1440    * write those updates in the memstore out to disk, while being able to
1441    * process reads/writes as much as possible during the flush operation. Also,
1442    * the log has to state clearly the point in time at which the memstore was
1443    * flushed. (That way, during recovery, we know when we can rely on the
1444    * on-disk flushed structures and when we have to recover the memstore from
1445    * the log.)
1446    *
1447    * <p>So, we have a three-step process:
1448    *
1449    * <ul><li>A. Flush the memstore to the on-disk stores, noting the current
1450    * sequence ID for the log.</li>
1451    *
1452    * <li>B. Write a FLUSHCACHE-COMPLETE message to the log, using the sequence
1453    * ID that was current at the time of memstore-flush.</li>
1454    *
1455    * <li>C. Get rid of the memstore structures that are now redundant, as
1456    * they've been flushed to the on-disk HStores.</li>
1457    * </ul>
1458    * <p>This method is protected, but can be accessed via several public
1459    * routes.
1460    *
1461    * <p> This method may block for some time.
1462    * @param status
1463    *
1464    * @return true if the region needs compacting
1465    *
1466    * @throws IOException general io exceptions
1467    * @throws DroppedSnapshotException Thrown when replay of hlog is required
1468    * because a Snapshot was not properly persisted.
1469    */
1470   protected boolean internalFlushcache(MonitoredTask status)
1471       throws IOException {
1472     return internalFlushcache(this.log, -1, status);
1473   }
1474 
1475   /**
1476    * @param wal Null if we're NOT to go via hlog/wal.
1477    * @param myseqid The seqid to use if <code>wal</code> is null when writing out the
1478    * flush file.
1479    * @param status
1480    * @return true if the region needs compacting
1481    * @throws IOException
1482    * @see #internalFlushcache(MonitoredTask)
1483    */
1484   protected boolean internalFlushcache(
1485       final HLog wal, final long myseqid, MonitoredTask status)
1486   throws IOException {
1487     if (this.rsServices != null && this.rsServices.isAborted()) {
1488       // Don't flush when server aborting, it's unsafe
1489       throw new IOException("Aborting flush because server is aborted...");
1490     }
1491     final long startTime = EnvironmentEdgeManager.currentTimeMillis();
1492     // Clear flush flag.
1493     // If nothing to flush, return and avoid logging start/stop flush.
1494     if (this.memstoreSize.get() <= 0) {
1495       return false;
1496     }
1497     if (LOG.isDebugEnabled()) {
1498       LOG.debug("Started memstore flush for " + this +
1499         ", current region memstore size " +
1500         StringUtils.humanReadableInt(this.memstoreSize.get()) +
1501         ((wal != null)? "": "; wal is null, using passed sequenceid=" + myseqid));
1502     }
1503 
1504     // Stop updates while we snapshot the memstore of all stores. We only have
1505     // to do this for a moment.  It's quick.  The subsequent sequence id that
1506     // goes into the HLog after we've flushed all these snapshots also goes
1507     // into the info file that sits beside the flushed files.
1508     // We also set the memstore size to zero here before we allow updates
1509     // again so its value will represent the size of the updates received
1510     // during the flush
1511     MultiVersionConsistencyControl.WriteEntry w = null;
1512 
1513     // We have to take a write lock during snapshot, or else a write could
1514     // end up in both snapshot and memstore (makes it difficult to do atomic
1515     // rows then)
1516     status.setStatus("Obtaining lock to block concurrent updates");
1517     // block waiting for the lock for internal flush
1518     this.updatesLock.writeLock().lock();
1519     long flushsize = this.memstoreSize.get();
1520     status.setStatus("Preparing to flush by snapshotting stores");
1521     List<StoreFlushContext> storeFlushCtxs = new ArrayList<StoreFlushContext>(stores.size());
1522     long flushSeqId = -1L;
1523     try {
1524       // Record the mvcc for all transactions in progress.
1525       w = mvcc.beginMemstoreInsert();
1526       mvcc.advanceMemstore(w);
1527 
1528       if (wal != null) {
1529         Long startSeqId = wal.startCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1530         if (startSeqId == null) {
1531           status.setStatus("Flush will not be started for [" + this.getRegionInfo().getEncodedName()
1532               + "] - WAL is going away");
1533           return false;
1534         }
1535         flushSeqId = startSeqId.longValue();
1536       } else {
1537         flushSeqId = myseqid;
1538       }
1539 
1540       for (Store s : stores.values()) {
1541         storeFlushCtxs.add(s.createFlushContext(flushSeqId));
1542       }
1543 
1544       // prepare flush (take a snapshot)
1545       for (StoreFlushContext flush : storeFlushCtxs) {
1546         flush.prepare();
1547       }
1548     } finally {
1549       this.updatesLock.writeLock().unlock();
1550     }
1551     String s = "Finished memstore snapshotting " + this +
1552       ", syncing WAL and waiting on mvcc, flushsize=" + flushsize;
1553     status.setStatus(s);
1554     if (LOG.isTraceEnabled()) LOG.trace(s);
1555 
1556     // sync unflushed WAL changes when deferred log sync is enabled
1557     // see HBASE-8208 for details
1558     if (wal != null && !shouldSyncLog()) {
1559       wal.sync();
1560     }
1561 
1562     // wait for all in-progress transactions to commit to HLog before
1563     // we can start the flush. This prevents
1564     // uncommitted transactions from being written into HFiles.
1565     // We have to block before we start the flush, otherwise keys that
1566     // were removed via a rollbackMemstore could be written to Hfiles.
1567     mvcc.waitForRead(w);
1568 
1569     s = "Flushing stores of " + this;
1570     status.setStatus(s);
1571     if (LOG.isTraceEnabled()) LOG.trace(s);
1572 
1573     // Any failure from here on out will be catastrophic requiring server
1574     // restart so hlog content can be replayed and put back into the memstore.
1575     // Otherwise, the snapshot content, while backed up in the hlog, will not
1576     // be part of the current running server's state.
1577     boolean compactionRequested = false;
1578     try {
1579       // A.  Flush memstore to all the HStores.
1580       // Keep running vector of all store files that includes both old and the
1581       // just-made new flush store file. The new flushed file is still in the
1582       // tmp directory.
1583 
1584       for (StoreFlushContext flush : storeFlushCtxs) {
1585         flush.flushCache(status);
1586       }
1587 
1588       // Switch snapshot (in memstore) -> new hfile (thus causing
1589       // all the store scanners to reset/reseek).
1590       for (StoreFlushContext flush : storeFlushCtxs) {
1591         boolean needsCompaction = flush.commit(status);
1592         if (needsCompaction) {
1593           compactionRequested = true;
1594         }
1595       }
1596       storeFlushCtxs.clear();
1597 
1598       // Set down the memstore size by amount of flush.
1599       this.addAndGetGlobalMemstoreSize(-flushsize);
1600     } catch (Throwable t) {
1601       // An exception here means that the snapshot was not persisted.
1602       // The hlog needs to be replayed so its content is restored to memstore.
1603       // Currently, only a server restart will do this.
1604       // We used to only catch IOEs but its possible that we'd get other
1605       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
1606       // all and sundry.
1607       if (wal != null) {
1608         wal.abortCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1609       }
1610       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
1611           Bytes.toStringBinary(getRegionName()));
1612       dse.initCause(t);
1613       status.abort("Flush failed: " + StringUtils.stringifyException(t));
1614       throw dse;
1615     }
1616 
1617     // If we get to here, the HStores have been written.
1618     if (wal != null) {
1619       wal.completeCacheFlush(this.getRegionInfo().getEncodedNameAsBytes());
1620     }
1621 
1622     // Record latest flush time
1623     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
1624     
1625     // Update the last flushed sequence id for region
1626     if (this.rsServices != null) {
1627       completeSequenceId = flushSeqId;
1628     }
1629 
1630     // C. Finally notify anyone waiting on memstore to clear:
1631     // e.g. checkResources().
1632     synchronized (this) {
1633       notifyAll(); // FindBugs NN_NAKED_NOTIFY
1634     }
1635 
1636     long time = EnvironmentEdgeManager.currentTimeMillis() - startTime;
1637     long memstoresize = this.memstoreSize.get();
1638     String msg = "Finished memstore flush of ~" +
1639       StringUtils.humanReadableInt(flushsize) + "/" + flushsize +
1640       ", currentsize=" +
1641       StringUtils.humanReadableInt(memstoresize) + "/" + memstoresize +
1642       " for region " + this + " in " + time + "ms, sequenceid=" + flushSeqId +
1643       ", compaction requested=" + compactionRequested +
1644       ((wal == null)? "; wal=null": "");
1645     LOG.info(msg);
1646     status.setStatus(msg);
1647     this.recentFlushes.add(new Pair<Long,Long>(time/1000, flushsize));
1648 
1649     return compactionRequested;
1650   }
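       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): how a caller is
       // expected to treat a DroppedSnapshotException from the flush path.  The
       // snapshot was not persisted, so the only safe recovery is to abort the
       // server and let hlog replay rebuild the memstore.  "server" is an
       // assumed handle to the hosting region server (any Abortable works).
       //
       //   try {
       //     region.flushcache();
       //   } catch (DroppedSnapshotException dse) {
       //     server.abort("Replay of HLog required; forcing server shutdown", dse);
       //   }
       // ---------------------------------------------------------------------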
1651 
1652   //////////////////////////////////////////////////////////////////////////////
1653   // get() methods for client use.
1654   //////////////////////////////////////////////////////////////////////////////
1655   /**
1656    * Return all the data for the row that matches <i>row</i> exactly,
1657    * or the one that immediately precedes it, at or immediately before
1658    * <i>ts</i>.
1659    *
1660    * @param row row key
1661    * @return map of values
1662    * @throws IOException
1663    */
1664   Result getClosestRowBefore(final byte [] row)
1665   throws IOException{
1666     return getClosestRowBefore(row, HConstants.CATALOG_FAMILY);
1667   }
1668 
1669   /**
1670    * Return all the data for the row that matches <i>row</i> exactly,
1671    * or the one that immediately precedes it, at or immediately before
1672    * <i>ts</i>.
1673    *
1674    * @param row row key
1675    * @param family column family to find on
1676    * @return map of values
1677    * @throws IOException read exceptions
1678    */
1679   public Result getClosestRowBefore(final byte [] row, final byte [] family)
1680   throws IOException {
1681     if (coprocessorHost != null) {
1682       Result result = new Result();
1683       if (coprocessorHost.preGetClosestRowBefore(row, family, result)) {
1684         return result;
1685       }
1686     }
1687     // look across all the HStores for this region and determine what the
1688     // closest key is across all column families, since the data may be sparse
1689     checkRow(row, "getClosestRowBefore");
1690     startRegionOperation(Operation.GET);
1691     this.readRequestsCount.increment();
1692     try {
1693       Store store = getStore(family);
1694       // get the closest key. (HStore.getRowKeyAtOrBefore can return null)
1695       KeyValue key = store.getRowKeyAtOrBefore(row);
1696       Result result = null;
1697       if (key != null) {
1698         Get get = new Get(key.getRow());
1699         get.addFamily(family);
1700         result = get(get);
1701       }
1702       if (coprocessorHost != null) {
1703         coprocessorHost.postGetClosestRowBefore(row, family, result);
1704       }
1705       return result;
1706     } finally {
1707       closeRegionOperation();
1708     }
1709   }
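       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): looking up the
       // row at or immediately before a key in a single family.  The row and
       // family literals are made up for the example.
       //
       //   Result r = region.getClosestRowBefore(Bytes.toBytes("row-0042"),
       //       Bytes.toBytes("info"));
       //   if (r != null && !r.isEmpty()) {
       //     byte[] foundRow = r.getRow();
       //     // ... use the closest preceding row
       //   }
       // ---------------------------------------------------------------------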
1710 
1711   /**
1712    * Return an iterator that scans over the HRegion, returning the indicated
1713    * columns and rows specified by the {@link Scan}.
1714    * <p>
1715    * This Iterator must be closed by the caller.
1716    *
1717    * @param scan configured {@link Scan}
1718    * @return RegionScanner
1719    * @throws IOException read exceptions
1720    */
1721   public RegionScanner getScanner(Scan scan) throws IOException {
1722    return getScanner(scan, null);
1723   }
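       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): the caller owns
       // the returned RegionScanner and must close it.  The next(List) contract
       // (false once the scan is exhausted) is an assumption of this example.
       //
       //   Scan scan = new Scan();
       //   scan.addFamily(Bytes.toBytes("info"));
       //   RegionScanner scanner = region.getScanner(scan);
       //   try {
       //     List<Cell> cells = new ArrayList<Cell>();
       //     boolean moreRows;
       //     do {
       //       moreRows = scanner.next(cells);
       //       // ... process the cells for this row
       //       cells.clear();
       //     } while (moreRows);
       //   } finally {
       //     scanner.close();
       //   }
       // ---------------------------------------------------------------------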
1724 
1725   void prepareScanner(Scan scan) throws IOException {
1726     if(!scan.hasFamilies()) {
1727       // Adding all families to scanner
1728       for(byte[] family: this.htableDescriptor.getFamiliesKeys()){
1729         scan.addFamily(family);
1730       }
1731     }
1732   }
1733 
1734   protected RegionScanner getScanner(Scan scan,
1735       List<KeyValueScanner> additionalScanners) throws IOException {
1736     startRegionOperation(Operation.SCAN);
1737     try {
1738       // Verify families are all valid
1739       prepareScanner(scan);
1740       if(scan.hasFamilies()) {
1741         for(byte [] family : scan.getFamilyMap().keySet()) {
1742           checkFamily(family);
1743         }
1744       }
1745       return instantiateRegionScanner(scan, additionalScanners);
1746     } finally {
1747       closeRegionOperation();
1748     }
1749   }
1750 
1751   protected RegionScanner instantiateRegionScanner(Scan scan,
1752       List<KeyValueScanner> additionalScanners) throws IOException {
1753     return new RegionScannerImpl(scan, additionalScanners, this);
1754   }
1755 
1756   /*
1757    * @param delete The passed delete is modified by this method. WARNING!
1758    */
1759   void prepareDelete(Delete delete) throws IOException {
1760     // Check to see if this is a deleteRow insert
1761     if(delete.getFamilyCellMap().isEmpty()){
1762       for(byte [] family : this.htableDescriptor.getFamiliesKeys()){
1763         // Don't eat the timestamp
1764         delete.deleteFamily(family, delete.getTimeStamp());
1765       }
1766     } else {
1767       for(byte [] family : delete.getFamilyCellMap().keySet()) {
1768         if(family == null) {
1769           throw new NoSuchColumnFamilyException("Empty family is invalid");
1770         }
1771         checkFamily(family);
1772       }
1773     }
1774   }
1775 
1776   //////////////////////////////////////////////////////////////////////////////
1777   // set() methods for client use.
1778   //////////////////////////////////////////////////////////////////////////////
1779   /**
1780    * @param delete delete object
1781    * @throws IOException read exceptions
1782    */
1783   public void delete(Delete delete)
1784   throws IOException {
1785     checkReadOnly();
1786     checkResources();
1787     startRegionOperation(Operation.DELETE);
1788     this.writeRequestsCount.increment();
1789     try {
1790       delete.getRow();
1791       // All edits for the given row (across all column families) must happen atomically.
1792       doBatchMutate(delete);
1793     } finally {
1794       closeRegionOperation();
1795     }
1796   }
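       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): a Delete with
       // no families is expanded by prepareDelete() into a whole-row delete
       // across every family of the table.  Row/family literals are made up.
       //
       //   Delete wholeRow = new Delete(Bytes.toBytes("row-0042"));
       //   region.delete(wholeRow);                      // deletes all families
       //
       //   Delete oneFamily = new Delete(Bytes.toBytes("row-0042"));
       //   oneFamily.deleteFamily(Bytes.toBytes("info"));
       //   region.delete(oneFamily);                     // deletes one family only
       // ---------------------------------------------------------------------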
1797 
1798   /**
1799    * Row needed by below method.
1800    */
1801   private static final byte [] FOR_UNIT_TESTS_ONLY = Bytes.toBytes("ForUnitTestsOnly");
1802   /**
1803    * This is used only by unit tests. Not required to be a public API.
1804    * @param familyMap map of family to edits for the given family.
1805    * @param durability
1806    * @throws IOException
1807    */
1808   void delete(NavigableMap<byte[], List<Cell>> familyMap,
1809       Durability durability) throws IOException {
1810     Delete delete = new Delete(FOR_UNIT_TESTS_ONLY);
1811     delete.setFamilyCellMap(familyMap);
1812     delete.setDurability(durability);
1813     doBatchMutate(delete);
1814   }
1815 
1816   /**
1817    * Set up correct timestamps in the KVs in the Delete object.
1818    * Caller should have the row and region locks.
1819    * @param familyMap
1820    * @param byteNow
1821    * @throws IOException
1822    */
1823   void prepareDeleteTimestamps(Map<byte[], List<Cell>> familyMap, byte[] byteNow)
1824       throws IOException {
1825     for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
1826 
1827       byte[] family = e.getKey();
1828       List<Cell> cells = e.getValue();
1829       Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
1830 
1831       for (Cell cell: cells) {
1832         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
1833         //  Check if time is LATEST, change to time of most recent addition if so
1834         //  This is expensive.
1835         if (kv.isLatestTimestamp() && kv.isDeleteType()) {
1836           byte[] qual = kv.getQualifier();
1837           if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
1838 
1839           Integer count = kvCount.get(qual);
1840           if (count == null) {
1841             kvCount.put(qual, 1);
1842           } else {
1843             kvCount.put(qual, count + 1);
1844           }
1845           count = kvCount.get(qual);
1846 
1847           Get get = new Get(kv.getRow());
1848           get.setMaxVersions(count);
1849           get.addColumn(family, qual);
1850 
1851           List<Cell> result = get(get, false);
1852 
1853           if (result.size() < count) {
1854             // Nothing to delete
1855             kv.updateLatestStamp(byteNow);
1856             continue;
1857           }
1858           if (result.size() > count) {
1859             throw new RuntimeException("Unexpected size: " + result.size());
1860           }
1861           KeyValue getkv = KeyValueUtil.ensureKeyValue(result.get(count - 1));
1862           Bytes.putBytes(kv.getBuffer(), kv.getTimestampOffset(),
1863               getkv.getBuffer(), getkv.getTimestampOffset(), Bytes.SIZEOF_LONG);
1864         } else {
1865           kv.updateLatestStamp(byteNow);
1866         }
1867       }
1868     }
1869   }
1870 
1871   /**
1872    * @param put
1873    * @throws IOException
1874    */
1875   public void put(Put put)
1876   throws IOException {
1877     checkReadOnly();
1878 
1879     // Do a rough check that we have resources to accept a write.  The check is
1880     // 'rough' in that between the resource check and the call to obtain a
1881     // read lock, resources may run out.  For now, the thought is that this
1882     // will be extremely rare; we'll deal with it when it happens.
1883     checkResources();
1884     startRegionOperation(Operation.PUT);
1885     this.writeRequestsCount.increment();
1886     try {
1887       // All edits for the given row (across all column families) must happen atomically.
1888       doBatchMutate(put);
1889     } finally {
1890       closeRegionOperation();
1891     }
1892   }
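       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): a single-row
       // Put applied directly to the region.  The Put.add(family, qualifier,
       // value) call and the literals are assumptions of this example.
       //
       //   Put p = new Put(Bytes.toBytes("row-0042"));
       //   p.add(Bytes.toBytes("info"), Bytes.toBytes("qual"), Bytes.toBytes("value"));
       //   region.put(p);
       // ---------------------------------------------------------------------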
1893 
1894   /**
1895    * Struct-like class that tracks the progress of a batch operation,
1896    * accumulating status codes and tracking the index at which processing
1897    * is proceeding.
1898    */
1899   private static class BatchOperationInProgress<T> {
1900     T[] operations;
1901     int nextIndexToProcess = 0;
1902     OperationStatus[] retCodeDetails;
1903     WALEdit[] walEditsFromCoprocessors;
1904 
1905     public BatchOperationInProgress(T[] operations) {
1906       this.operations = operations;
1907       this.retCodeDetails = new OperationStatus[operations.length];
1908       this.walEditsFromCoprocessors = new WALEdit[operations.length];
1909       Arrays.fill(this.retCodeDetails, OperationStatus.NOT_RUN);
1910     }
1911 
1912     public boolean isDone() {
1913       return nextIndexToProcess == operations.length;
1914     }
1915   }
1916 
1917   /**
1918    * Perform a batch of mutations.
1919    * It supports only Put and Delete mutations and will ignore other types passed.
1920    * @param mutations the list of mutations
1921    * @return an array of OperationStatus which internally contains the
1922    *         OperationStatusCode and the exceptionMessage if any.
1923    * @throws IOException
1924    */
1925   public OperationStatus[] batchMutate(Mutation[] mutations) throws IOException {
1926     return batchMutate(mutations, false);
1927   }
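       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): submitting a
       // mixed batch and inspecting the per-operation status codes.  The
       // mutations (put1, put2, delete1) are assumed to have been built elsewhere.
       //
       //   Mutation[] batch = new Mutation[] { put1, put2, delete1 };
       //   OperationStatus[] codes = region.batchMutate(batch);
       //   for (int i = 0; i < codes.length; i++) {
       //     if (codes[i].getOperationStatusCode() != OperationStatusCode.SUCCESS) {
       //       // batch[i] failed; codes[i].getExceptionMsg() says why
       //     }
       //   }
       // ---------------------------------------------------------------------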
1928 
1929   /**
1930    * Perform a batch of mutations.
1931    * It supports only Put and Delete mutations and will ignore other types passed.
1932    * @param mutations the list of mutations
1933    * @return an array of OperationStatus which internally contains the
1934    *         OperationStatusCode and the exceptionMessage if any.
1935    * @throws IOException
1936    */
1937   OperationStatus[] batchMutate(Mutation[] mutations, boolean isReplay)
1938       throws IOException {
1939     BatchOperationInProgress<Mutation> batchOp =
1940       new BatchOperationInProgress<Mutation>(mutations);
1941 
1942     boolean initialized = false;
1943 
1944     while (!batchOp.isDone()) {
1945       if (!isReplay) {
1946         checkReadOnly();
1947       }
1948       checkResources();
1949 
1950       long newSize;
1951       if (isReplay) {
1952         startRegionOperation(Operation.REPLAY_BATCH_MUTATE);
1953       } else {
1954         startRegionOperation(Operation.BATCH_MUTATE);
1955       }
1956 
1957       try {
1958         if (!initialized) {
1959           if (!isReplay) {
1960             this.writeRequestsCount.increment();
1961             doPreMutationHook(batchOp);
1962           }
1963           initialized = true;
1964         }
1965         long addedSize = doMiniBatchMutation(batchOp, isReplay);
1966         newSize = this.addAndGetGlobalMemstoreSize(addedSize);
1967       } finally {
1968         closeRegionOperation();
1969       }
1970       if (isFlushSize(newSize)) {
1971         requestFlush();
1972       }
1973     }
1974     return batchOp.retCodeDetails;
1975   }
1976 
1977 
1978   private void doPreMutationHook(BatchOperationInProgress<Mutation> batchOp)
1979       throws IOException {
1980     /* Run coprocessor pre hook outside of locks to avoid deadlock */
1981     WALEdit walEdit = new WALEdit();
1982     if (coprocessorHost != null) {
1983       for (int i = 0 ; i < batchOp.operations.length; i++) {
1984         Mutation m = batchOp.operations[i];
1985         if (m instanceof Put) {
1986           if (coprocessorHost.prePut((Put) m, walEdit, m.getDurability())) {
1987             // pre hook says skip this Put
1988             // mark as success and skip in doMiniBatchMutation
1989             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
1990           }
1991         } else if (m instanceof Delete) {
1992           if (coprocessorHost.preDelete((Delete) m, walEdit, m.getDurability())) {
1993             // pre hook says skip this Delete
1994             // mark as success and skip in doMiniBatchMutation
1995             batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
1996           }
1997         } else {
1998           // If Append mutations are passed along with the Puts and Deletes in batchMutate,
1999           // mark the operation return code as failure so that it will not be considered in
2000           // doMiniBatchMutation.
2001           batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.FAILURE,
2002               "Put/Delete mutations only supported in batchMutate() now");
2003         }
2004         if (!walEdit.isEmpty()) {
2005           batchOp.walEditsFromCoprocessors[i] = walEdit;
2006           walEdit = new WALEdit();
2007         }
2008       }
2009     }
2010   }
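       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): a RegionObserver
       // whose prePut() bypasses the write, which is what makes the hook above
       // mark the operation SUCCESS without running it.  The observer signature
       // shown (ObserverContext, Put, WALEdit, Durability) and BaseRegionObserver
       // are assumptions of this example.
       //
       //   public class SkipEverythingObserver extends BaseRegionObserver {
       //     @Override
       //     public void prePut(ObserverContext<RegionCoprocessorEnvironment> c,
       //         Put put, WALEdit edit, Durability durability) throws IOException {
       //       c.bypass();   // tell the region to skip this Put
       //     }
       //   }
       // ---------------------------------------------------------------------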
2011 
2012   @SuppressWarnings("unchecked")
2013   private long doMiniBatchMutation(BatchOperationInProgress<Mutation> batchOp,
2014       boolean isInReplay) throws IOException {
2015 
2016     // variable to note if all Put items are for the same CF -- metrics related
2017     boolean putsCfSetConsistent = true;
2018     //The set of columnFamilies first seen for Put.
2019     Set<byte[]> putsCfSet = null;
2020     // variable to note if all Delete items are for the same CF -- metrics related
2021     boolean deletesCfSetConsistent = true;
2022     //The set of columnFamilies first seen for Delete.
2023     Set<byte[]> deletesCfSet = null;
2024 
2025     WALEdit walEdit = new WALEdit(isInReplay);
2026     MultiVersionConsistencyControl.WriteEntry w = null;
2027     long txid = 0;
2028     boolean doRollBackMemstore = false;
2029     boolean locked = false;
2030 
2031     /** Keep track of the locks we hold so we can release them in finally clause */
2032     List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
2033     // reference family maps directly so coprocessors can mutate them if desired
2034     Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
2035     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
2036     int firstIndex = batchOp.nextIndexToProcess;
2037     int lastIndexExclusive = firstIndex;
2038     boolean success = false;
2039     int noOfPuts = 0, noOfDeletes = 0;
2040     try {
2041       // ------------------------------------
2042       // STEP 1. Try to acquire as many locks as we can, and ensure
2043       // we acquire at least one.
2044       // ----------------------------------
2045       int numReadyToWrite = 0;
2046       long now = EnvironmentEdgeManager.currentTimeMillis();
2047       while (lastIndexExclusive < batchOp.operations.length) {
2048         Mutation mutation = batchOp.operations[lastIndexExclusive];
2049         boolean isPutMutation = mutation instanceof Put;
2050 
2051         Map<byte[], List<Cell>> familyMap = mutation.getFamilyCellMap();
2052         // store the family map reference to allow for mutations
2053         familyMaps[lastIndexExclusive] = familyMap;
2054 
2055         // skip anything that "ran" already
2056         if (batchOp.retCodeDetails[lastIndexExclusive].getOperationStatusCode()
2057             != OperationStatusCode.NOT_RUN) {
2058           lastIndexExclusive++;
2059           continue;
2060         }
2061 
2062         try {
2063           if (isPutMutation) {
2064             // Check the families in the put. If bad, skip this one.
2065             if (isInReplay) {
2066               removeNonExistentColumnFamilyForReplay(familyMap);
2067             } else {
2068               checkFamilies(familyMap.keySet());
2069             }
2070             checkTimestamps(mutation.getFamilyCellMap(), now);
2071           } else {
2072             prepareDelete((Delete) mutation);
2073           }
2074         } catch (NoSuchColumnFamilyException nscf) {
2075           LOG.warn("No such column family in batch mutation", nscf);
2076           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2077               OperationStatusCode.BAD_FAMILY, nscf.getMessage());
2078           lastIndexExclusive++;
2079           continue;
2080         } catch (FailedSanityCheckException fsce) {
2081           LOG.warn("Batch Mutation did not pass sanity check", fsce);
2082           batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
2083               OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
2084           lastIndexExclusive++;
2085           continue;
2086         }
2087 
2088         // If we haven't got any rows in our batch, we should block to
2089         // get the next one.
2090         boolean shouldBlock = numReadyToWrite == 0;
2091         RowLock rowLock = null;
2092         try {
2093           rowLock = getRowLock(mutation.getRow(), shouldBlock);
2094         } catch (IOException ioe) {
2095           LOG.warn("Failed getting lock in batch put, row="
2096             + Bytes.toStringBinary(mutation.getRow()), ioe);
2097         }
2098         if (rowLock == null) {
2099           // We failed to grab another lock
2100           assert !shouldBlock : "Should never fail to get lock when blocking";
2101           break; // stop acquiring more rows for this batch
2102         } else {
2103           acquiredRowLocks.add(rowLock);
2104         }
2105 
2106         lastIndexExclusive++;
2107         numReadyToWrite++;
2108 
2109         if (isPutMutation) {
2110           // If column families stay consistent throughout all of the
2111           // individual puts then metrics can be reported as a multiput across
2112           // column families in the first put.
2113           if (putsCfSet == null) {
2114             putsCfSet = mutation.getFamilyCellMap().keySet();
2115           } else {
2116             putsCfSetConsistent = putsCfSetConsistent
2117                 && mutation.getFamilyCellMap().keySet().equals(putsCfSet);
2118           }
2119         } else {
2120           if (deletesCfSet == null) {
2121             deletesCfSet = mutation.getFamilyCellMap().keySet();
2122           } else {
2123             deletesCfSetConsistent = deletesCfSetConsistent
2124                 && mutation.getFamilyCellMap().keySet().equals(deletesCfSet);
2125           }
2126         }
2127       }
2128 
2129       // we should record the timestamp only after we have acquired the rowLock,
2130       // otherwise, newer puts/deletes are not guaranteed to have a newer timestamp
2131       now = EnvironmentEdgeManager.currentTimeMillis();
2132       byte[] byteNow = Bytes.toBytes(now);
2133 
2134       // Nothing to put/delete -- an exception in the above such as NoSuchColumnFamily?
2135       if (numReadyToWrite <= 0) return 0L;
2136 
2137       // We've now grabbed as many mutations off the list as we can
2138 
2139       // ------------------------------------
2140       // STEP 2. Update any LATEST_TIMESTAMP timestamps
2141       // ----------------------------------
2142       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2143         // skip invalid
2144         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2145             != OperationStatusCode.NOT_RUN) continue;
2146 
2147         Mutation mutation = batchOp.operations[i];
2148         if (mutation instanceof Put) {
2149           updateKVTimestamps(familyMaps[i].values(), byteNow);
2150           noOfPuts++;
2151         } else {
2152           prepareDeleteTimestamps(familyMaps[i], byteNow);
2153           noOfDeletes++;
2154         }
2155       }
2156 
2157       lock(this.updatesLock.readLock(), numReadyToWrite);
2158       locked = true;
2159 
2160       //
2161       // ------------------------------------
2162       // Acquire the latest mvcc number
2163       // ----------------------------------
2164       w = mvcc.beginMemstoreInsert();
2165 
2166       // calling the pre CP hook for batch mutation
2167       if (!isInReplay && coprocessorHost != null) {
2168         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2169           new MiniBatchOperationInProgress<Mutation>(batchOp.operations,
2170           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2171         if (coprocessorHost.preBatchMutate(miniBatchOp)) return 0L;
2172       }
2173 
2174       // ------------------------------------
2175       // STEP 3. Write back to memstore
2176       // Write to memstore. It is ok to write to memstore
2177       // first without updating the HLog because we do not roll
2178       // forward the memstore MVCC. The MVCC will be moved up when
2179       // the complete operation is done. These changes are not yet
2180       // visible to scanners till we update the MVCC. The MVCC is
2181       // moved only when the sync is complete.
2182       // ----------------------------------
2183       long addedSize = 0;
2184       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2185         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2186             != OperationStatusCode.NOT_RUN) {
2187           continue;
2188         }
2189         doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
2190         addedSize += applyFamilyMapToMemstore(familyMaps[i], w);
2191       }
2192 
2193       // ------------------------------------
2194       // STEP 4. Build WAL edit
2195       // ----------------------------------
2196       Durability durability = Durability.USE_DEFAULT;
2197       for (int i = firstIndex; i < lastIndexExclusive; i++) {
2198         // Skip puts that were determined to be invalid during preprocessing
2199         if (batchOp.retCodeDetails[i].getOperationStatusCode()
2200             != OperationStatusCode.NOT_RUN) {
2201           continue;
2202         }
2203         batchOp.retCodeDetails[i] = OperationStatus.SUCCESS;
2204 
2205         Mutation m = batchOp.operations[i];
2206         Durability tmpDur = getEffectiveDurability(m.getDurability());
2207         if (tmpDur.ordinal() > durability.ordinal()) {
2208           durability = tmpDur;
2209         }
2210         if (tmpDur == Durability.SKIP_WAL) {
2211           recordMutationWithoutWal(m.getFamilyCellMap());
2212           continue;
2213         }
2214 
2215         // Add WAL edits by CP
2216         WALEdit fromCP = batchOp.walEditsFromCoprocessors[i];
2217         if (fromCP != null) {
2218           for (KeyValue kv : fromCP.getKeyValues()) {
2219             walEdit.add(kv);
2220           }
2221         }
2222         addFamilyMapToWALEdit(familyMaps[i], walEdit);
2223       }
2224 
2225       // -------------------------
2226       // STEP 5. Append the edit to WAL. Do not sync wal.
2227       // -------------------------
2228       Mutation mutation = batchOp.operations[firstIndex];
2229       if (walEdit.size() > 0) {
2230         txid = this.log.appendNoSync(this.getRegionInfo(), this.htableDescriptor.getTableName(),
2231               walEdit, mutation.getClusterIds(), now, this.htableDescriptor);
2232       }
2233 
2234       // -------------------------------
2235       // STEP 6. Release row locks, etc.
2236       // -------------------------------
2237       if (locked) {
2238         this.updatesLock.readLock().unlock();
2239         locked = false;
2240       }
2241       releaseRowLocks(acquiredRowLocks);
2242 
2243       // -------------------------
2244       // STEP 7. Sync wal.
2245       // -------------------------
2246       if (walEdit.size() > 0) {
2247         syncOrDefer(txid, durability);
2248       }
2249       doRollBackMemstore = false;
2250       // calling the post CP hook for batch mutation
2251       if (!isInReplay && coprocessorHost != null) {
2252         MiniBatchOperationInProgress<Mutation> miniBatchOp =
2253           new MiniBatchOperationInProgress<Mutation>(batchOp.operations,
2254           batchOp.retCodeDetails, batchOp.walEditsFromCoprocessors, firstIndex, lastIndexExclusive);
2255         coprocessorHost.postBatchMutate(miniBatchOp);
2256       }
2257 
2258       // ------------------------------------------------------------------
2259       // STEP 8. Advance mvcc. This will make this put visible to scanners and getters.
2260       // ------------------------------------------------------------------
2261       if (w != null) {
2262         mvcc.completeMemstoreInsert(w);
2263         w = null;
2264       }
2265 
2266       // ------------------------------------
2267       // STEP 9. Run coprocessor post hooks. This should be done after the wal is
2268       // synced so that the coprocessor contract is adhered to.
2269       // ------------------------------------
2270       if (!isInReplay && coprocessorHost != null) {
2271         for (int i = firstIndex; i < lastIndexExclusive; i++) {
2272           // only for successful puts
2273           if (batchOp.retCodeDetails[i].getOperationStatusCode()
2274               != OperationStatusCode.SUCCESS) {
2275             continue;
2276           }
2277           Mutation m = batchOp.operations[i];
2278           if (m instanceof Put) {
2279             coprocessorHost.postPut((Put) m, walEdit, m.getDurability());
2280           } else {
2281             coprocessorHost.postDelete((Delete) m, walEdit, m.getDurability());
2282           }
2283         }
2284       }
2285 
2286       success = true;
2287       return addedSize;
2288     } finally {
2289 
2290       // if the wal sync was unsuccessful, remove keys from memstore
2291       if (doRollBackMemstore) {
2292         rollbackMemstore(batchOp, familyMaps, firstIndex, lastIndexExclusive);
2293       }
2294       if (w != null) mvcc.completeMemstoreInsert(w);
2295 
2296       if (locked) {
2297         this.updatesLock.readLock().unlock();
2298       }
2299       releaseRowLocks(acquiredRowLocks);
2300 
2301       // See if the column families were consistent through the whole thing.
2302       // If they were then keep them. If they were not then pass a null;
2303       // null will be treated as unknown.
2304       // The total time taken may cover both Puts and Deletes, so split it
2305       // between puts and deletes based on how many of each were in the batch.
2306 
2307       if (noOfPuts > 0) {
2308         // There were some Puts in the batch.
2309         if (this.metricsRegion != null) {
2310           this.metricsRegion.updatePut();
2311         }
2312       }
2313       if (noOfDeletes > 0) {
2314         // There were some Deletes in the batch.
2315         if (this.metricsRegion != null) {
2316           this.metricsRegion.updateDelete();
2317         }
2318       }
2319       if (!success) {
2320         for (int i = firstIndex; i < lastIndexExclusive; i++) {
2321           if (batchOp.retCodeDetails[i].getOperationStatusCode() == OperationStatusCode.NOT_RUN) {
2322             batchOp.retCodeDetails[i] = OperationStatus.FAILURE;
2323           }
2324         }
2325       }
2326       batchOp.nextIndexToProcess = lastIndexExclusive;
2327     }
2328   }
2329 
2330   /**
2331    * Returns effective durability from the passed durability and
2332    * the table descriptor.
2333    */
2334   protected Durability getEffectiveDurability(Durability d) {
2335     return d == Durability.USE_DEFAULT ? this.durability : d;
2336   }
2337 
2338   //TODO, Think that gets/puts and deletes should be refactored a bit so that
2339   //the getting of the lock happens before, so that you would just pass it into
2340   //the methods. So in the case of checkAndMutate you could just do lockRow,
2341   //get, put, unlockRow or something
2342   /**
2343    *
2344    * @param row
2345    * @param family
2346    * @param qualifier
2347    * @param compareOp
2348    * @param comparator
2349    * @param w
2350    * @param writeToWAL
2351    * @throws IOException
2352    * @return true if the new put was executed, false otherwise
2353    */
2354   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
2355       CompareOp compareOp, ByteArrayComparable comparator, Mutation w,
2356       boolean writeToWAL)
2357   throws IOException{
2358     checkReadOnly();
2359     //TODO, add check for value length or maybe even better move this to the
2360     //client if this becomes a global setting
2361     checkResources();
2362     boolean isPut = w instanceof Put;
2363     if (!isPut && !(w instanceof Delete))
2364       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action must " +
2365           "be Put or Delete");
2366     Row r = (Row)w;
2367     if (!Bytes.equals(row, r.getRow())) {
2368       throw new org.apache.hadoop.hbase.DoNotRetryIOException("Action's " +
2369           "getRow must match the passed row");
2370     }
2371 
2372     startRegionOperation();
2373     try {
2374       Get get = new Get(row);
2375       checkFamily(family);
2376       get.addColumn(family, qualifier);
2377 
2378       // Lock row - note that doBatchMutate will relock this row if called
2379       RowLock rowLock = getRowLock(get.getRow());
2380       // wait for all previous transactions to complete (with lock held)
2381       mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
2382       List<Cell> result;
2383       try {
2384         result = get(get, false);
2385 
2386         boolean valueIsNull = comparator.getValue() == null ||
2387           comparator.getValue().length == 0;
2388         boolean matches = false;
2389         if (result.size() == 0 && valueIsNull) {
2390           matches = true;
2391         } else if (result.size() > 0 && result.get(0).getValueLength() == 0 &&
2392             valueIsNull) {
2393           matches = true;
2394         } else if (result.size() == 1 && !valueIsNull) {
2395           Cell kv = result.get(0);
2396           int compareResult = comparator.compareTo(kv.getValueArray(),
2397               kv.getValueOffset(), kv.getValueLength());
2398           switch (compareOp) {
2399           case LESS:
2400             matches = compareResult <= 0;
2401             break;
2402           case LESS_OR_EQUAL:
2403             matches = compareResult < 0;
2404             break;
2405           case EQUAL:
2406             matches = compareResult == 0;
2407             break;
2408           case NOT_EQUAL:
2409             matches = compareResult != 0;
2410             break;
2411           case GREATER_OR_EQUAL:
2412             matches = compareResult > 0;
2413             break;
2414           case GREATER:
2415             matches = compareResult >= 0;
2416             break;
2417           default:
2418             throw new RuntimeException("Unknown Compare op " + compareOp.name());
2419           }
2420         }
2421         // If the condition matches, apply the new Put or Delete
2422         if (matches) {
2423           // All edits for the given row (across all column families) must
2424           // happen atomically.
2425           doBatchMutate((Mutation)w);
2426           this.checkAndMutateChecksPassed.increment();
2427           return true;
2428         }
2429         this.checkAndMutateChecksFailed.increment();
2430         return false;
2431       } finally {
2432         rowLock.release();
2433       }
2434     } finally {
2435       closeRegionOperation();
2436     }
2437   }
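       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): a compare-and-set
       // style update through the method above.  BinaryComparator is assumed
       // from the filter package; row/family/qualifier/value literals are made up.
       //
       //   byte[] row = Bytes.toBytes("row-0042");
       //   byte[] fam = Bytes.toBytes("info");
       //   byte[] qual = Bytes.toBytes("counter");
       //   Put p = new Put(row);
       //   p.add(fam, qual, Bytes.toBytes("new-value"));
       //   boolean applied = region.checkAndMutate(row, fam, qual, CompareOp.EQUAL,
       //       new BinaryComparator(Bytes.toBytes("expected")), p, true);
       //   // applied == false means the stored value did not equal "expected"
       // ---------------------------------------------------------------------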
2438 
2439   private void doBatchMutate(Mutation mutation) throws IOException,
2440       org.apache.hadoop.hbase.DoNotRetryIOException {
2441     OperationStatus[] batchMutate = this.batchMutate(new Mutation[] { mutation });
2442     if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.SANITY_CHECK_FAILURE)) {
2443       throw new FailedSanityCheckException(batchMutate[0].getExceptionMsg());
2444     } else if (batchMutate[0].getOperationStatusCode().equals(OperationStatusCode.BAD_FAMILY)) {
2445       throw new NoSuchColumnFamilyException(batchMutate[0].getExceptionMsg());
2446     }
2447   }
2448 
2449   /**
2450    * Complete taking the snapshot on the region. Writes the region info and adds references to the
2451    * working snapshot directory.
2452    *
2453    * TODO for api consistency, consider adding another version with no {@link ForeignExceptionSnare}
2454    * arg.  (In the future other cancellable HRegion methods could eventually add a
2455    * {@link ForeignExceptionSnare}, or we could do something fancier).
2456    *
2457    * @param desc snapshot description object
2458    * @param exnSnare ForeignExceptionSnare that captures external exceptions in case we need to
2459    *   bail out.  This is allowed to be null and will just be ignored in that case.
2460    * @throws IOException if there is an external or internal error causing the snapshot to fail
2461    */
2462   public void addRegionToSnapshot(SnapshotDescription desc,
2463       ForeignExceptionSnare exnSnare) throws IOException {
2464     // This should be "fast" since we don't rewrite store files but instead
2465     // back up the store files by creating a reference
2466     Path rootDir = FSUtils.getRootDir(this.rsServices.getConfiguration());
2467     Path snapshotDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(desc, rootDir);
2468 
2469     // 1. dump region meta info into the snapshot directory
2470     LOG.debug("Storing region-info for snapshot.");
2471     HRegionFileSystem snapshotRegionFs = HRegionFileSystem.createRegionOnFileSystem(conf,
2472         this.fs.getFileSystem(), snapshotDir, getRegionInfo());
2473 
2474     // 2. iterate through all the stores in the region
2475     LOG.debug("Creating references for hfiles");
2476 
2477     // This ensures that we have an atomic view of the directory as long as we have < ls limit
2478     // (batch size of the files in a directory) on the namenode. Otherwise, we get back the files in
2479     // batches and may miss files being added/deleted. This could be more robust (iteratively
2480     // checking to see if we have all the files until we are sure), but the limit is currently 1000
2481     // files/batch, far more than the number of store files under a single column family.
2482     for (Store store : stores.values()) {
2483       // 2.1. build the snapshot reference directory for the store
2484       Path dstStoreDir = snapshotRegionFs.getStoreDir(store.getFamily().getNameAsString());
2485       List<StoreFile> storeFiles = new ArrayList<StoreFile>(store.getStorefiles());
2486       if (LOG.isDebugEnabled()) {
2487         LOG.debug("Adding snapshot references for " + storeFiles  + " hfiles");
2488       }
2489 
2490       // 2.2. iterate through all the store's files and create "references".
2491       int sz = storeFiles.size();
2492       for (int i = 0; i < sz; i++) {
2493         if (exnSnare != null) {
2494           exnSnare.rethrowException();
2495         }
2496         StoreFile storeFile = storeFiles.get(i);
2497         Path file = storeFile.getPath();
2498 
2499         LOG.debug("Creating reference for file (" + (i+1) + "/" + sz + ") : " + file);
2500         Path referenceFile = new Path(dstStoreDir, file.getName());
2501         boolean success = true;
2502         if (storeFile.isReference()) {
2503           // write the Reference object to the snapshot
2504           storeFile.getFileInfo().getReference().write(fs.getFileSystem(), referenceFile);
2505         } else {
2506           // create "reference" to this store file.  It is intentionally an empty file -- all
2507           // necessary information is captured by its fs location and filename.  This allows us to
2508           // only figure out what needs to be done via a single nn operation (instead of having to
2509           // open and read the files as well).
2510           success = fs.getFileSystem().createNewFile(referenceFile);
2511         }
2512         if (!success) {
2513           throw new IOException("Failed to create reference file:" + referenceFile);
2514         }
2515       }
2516     }
2517   }
2518 
2519   /**
2520    * Replaces any KV timestamps set to {@link HConstants#LATEST_TIMESTAMP} with the
2521    * provided current timestamp.
2522    */
2523   void updateKVTimestamps(final Iterable<List<Cell>> keyLists, final byte[] now) {
2524     for (List<Cell> cells: keyLists) {
2525       if (cells == null) continue;
2526       for (Cell cell : cells) {
2527         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2528         kv.updateLatestStamp(now);
2529       }
2530     }
2531   }
2532 
2533   /*
2534    * Check whether we have the resources to support an update.
2535    *
2536    * We throw RegionTooBusyException if above memstore limit
2537    * and expect the client to retry using some kind of backoff.
2538   */
2539   private void checkResources()
2540     throws RegionTooBusyException {
2541     // If catalog region, do not impose resource constraints or block updates.
2542     if (this.getRegionInfo().isMetaRegion()) return;
2543 
2544     if (this.memstoreSize.get() > this.blockingMemStoreSize) {
2545       requestFlush();
2546       throw new RegionTooBusyException("Above memstore limit, " +
2547           "regionName=" + (this.getRegionInfo() == null ? "unknown" :
2548           this.getRegionInfo().getRegionNameAsString()) +
2549           ", server=" + (this.getRegionServerServices() == null ? "unknown" :
2550           this.getRegionServerServices().getServerName()) +
2551           ", memstoreSize=" + memstoreSize.get() +
2552           ", blockingMemStoreSize=" + blockingMemStoreSize);
2553     }
2554   }
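       // ---------------------------------------------------------------------
       // Illustrative sketch (not part of the original source): the client-side
       // behaviour this check expects, i.e. retrying with backoff when the
       // region is above its blocking memstore size.  "p" is an assumed Put and
       // the delay values are made up; InterruptedException handling is elided.
       //
       //   long[] backoffMs = { 100, 500, 2000 };
       //   for (int attempt = 0; ; attempt++) {
       //     try {
       //       region.put(p);
       //       break;
       //     } catch (RegionTooBusyException rtbe) {
       //       if (attempt >= backoffMs.length) throw rtbe;
       //       Thread.sleep(backoffMs[attempt]);
       //     }
       //   }
       // ---------------------------------------------------------------------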
2555 
2556   /**
2557    * @throws IOException Throws exception if region is in read-only mode.
2558    */
2559   protected void checkReadOnly() throws IOException {
2560     if (this.writestate.isReadOnly()) {
2561       throw new IOException("region is read only");
2562     }
2563   }
2564 
2565   /**
2566    * Add updates first to the hlog and then add values to memstore.
2567    * Warning: Assumption is caller has lock on passed in row.
2568    * @param family
2569    * @param edits Cell updates by column
2570    * @param now
2571    * @throws IOException
2572    */
2573   private void put(final byte [] row, byte [] family, List<Cell> edits)
2574   throws IOException {
2575     NavigableMap<byte[], List<Cell>> familyMap;
2576     familyMap = new TreeMap<byte[], List<Cell>>(Bytes.BYTES_COMPARATOR);
2577 
2578     familyMap.put(family, edits);
2579     Put p = new Put(row);
2580     p.setFamilyCellMap(familyMap);
2581     doBatchMutate(p);
2582   }
2583 
2584   /**
2585    * Atomically apply the given map of family->edits to the memstore.
2586    * This handles the consistency control on its own, but the caller
2587    * should already have locked updatesLock.readLock(). This also does
2588    * <b>not</b> check the families for validity.
2589    *
2590    * @param familyMap Map of kvs per family
2591    * @param localizedWriteEntry The WriteEntry of the MVCC for this transaction.
2592    *        If null, then this method internally creates a mvcc transaction.
2593    * @return the additional memory usage of the memstore caused by the
2594    * new entries.
2595    */
2596   private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
2597     MultiVersionConsistencyControl.WriteEntry localizedWriteEntry) {
2598     long size = 0;
2599     boolean freemvcc = false;
2600 
2601     try {
2602       if (localizedWriteEntry == null) {
2603         localizedWriteEntry = mvcc.beginMemstoreInsert();
2604         freemvcc = true;
2605       }
2606 
2607       for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2608         byte[] family = e.getKey();
2609         List<Cell> cells = e.getValue();
2610 
2611         Store store = getStore(family);
2612         for (Cell cell: cells) {
2613           KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2614           kv.setMvccVersion(localizedWriteEntry.getWriteNumber());
2615           size += store.add(kv);
2616         }
2617       }
2618     } finally {
2619       if (freemvcc) {
2620         mvcc.completeMemstoreInsert(localizedWriteEntry);
2621       }
2622     }
2623 
2624     return size;
2625   }
2626 
2627   /**
2628    * Remove all the keys listed in the map from the memstore. This method is
2629    * called when a Put/Delete has updated memstore but subsequently fails to update
2630    * the wal. This method is then invoked to rollback the memstore.
2631    */
2632   private void rollbackMemstore(BatchOperationInProgress<Mutation> batchOp,
2633                                 Map<byte[], List<Cell>>[] familyMaps,
2634                                 int start, int end) {
2635     int kvsRolledback = 0;
2636     for (int i = start; i < end; i++) {
2637       // skip over requests that never succeeded in the first place.
2638       if (batchOp.retCodeDetails[i].getOperationStatusCode()
2639             != OperationStatusCode.SUCCESS) {
2640         continue;
2641       }
2642 
2643       // Rollback all the kvs for this row.
2644       Map<byte[], List<Cell>> familyMap  = familyMaps[i];
2645       for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
2646         byte[] family = e.getKey();
2647         List<Cell> cells = e.getValue();
2648 
2649         // Remove those keys from the memstore that match our
2650         // key's (row, cf, cq, timestamp, memstoreTS). The interesting part is
2651         // that even the memstoreTS has to match for keys that will be rolled back.
2652         Store store = getStore(family);
2653         for (Cell cell: cells) {
2654           store.rollback(KeyValueUtil.ensureKeyValue(cell));
2655           kvsRolledback++;
2656         }
2657       }
2658     }
2659     LOG.debug("rollbackMemstore rolled back " + kvsRolledback +
2660         " keyvalues from start:" + start + " to end:" + end);
2661   }
2662 
2663   /**
2664    * Check the collection of families for validity.
2665    * @throws NoSuchColumnFamilyException if a family does not exist.
2666    */
2667   void checkFamilies(Collection<byte[]> families)
2668   throws NoSuchColumnFamilyException {
2669     for (byte[] family : families) {
2670       checkFamily(family);
2671     }
2672   }
2673 
2674   /**
2675    * During replay, there could exist column families which are removed between region server
2676    * failure and replay
2677    */
2678   private void removeNonExistentColumnFamilyForReplay(
2679       final Map<byte[], List<Cell>> familyMap) {
2680     List<byte[]> nonExistentList = null;
2681     for (byte[] family : familyMap.keySet()) {
2682       if (!this.htableDescriptor.hasFamily(family)) {
2683         if (nonExistentList == null) {
2684           nonExistentList = new ArrayList<byte[]>();
2685         }
2686         nonExistentList.add(family);
2687       }
2688     }
2689     if (nonExistentList != null) {
2690       for (byte[] family : nonExistentList) {
2691         // Perhaps schema was changed between crash and replay
2692         LOG.info("No family for " + Bytes.toString(family) + ", omitting from replay.");
2693         familyMap.remove(family);
2694       }
2695     }
2696   }
2697 
2698   void checkTimestamps(final Map<byte[], List<Cell>> familyMap,
2699       long now) throws FailedSanityCheckException {
2700     if (timestampSlop == HConstants.LATEST_TIMESTAMP) {
2701       return;
2702     }
2703     long maxTs = now + timestampSlop;
2704     for (List<Cell> kvs : familyMap.values()) {
2705       for (Cell cell : kvs) {
2706         // see if the user-side TS is out of range. latest = server-side
2707         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
2708         if (!kv.isLatestTimestamp() && kv.getTimestamp() > maxTs) {
2709           throw new FailedSanityCheckException("Timestamp for KV out of range "
2710               + cell + " (too.new=" + timestampSlop + ")");
2711         }
2712       }
2713     }
2714   }
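
       // Hedged example (not in the original source): with an assumed timestampSlop of
       // 10000 ms and a server "now" of 1400000000000L, the check above rejects any
       // client-supplied timestamp greater than 1400000010000L, unless the KeyValue
       // carries HConstants.LATEST_TIMESTAMP (which the server fills in itself):
       //
       //   long maxTs = now + timestampSlop;                       // 1400000010000L
       //   boolean ok  = kv.isLatestTimestamp() || kv.getTimestamp() <= maxTs;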
2715 
2716   /**
2717    * Append the given map of family->edits to a WALEdit data structure.
2718    * This does not write to the HLog itself.
2719    * @param familyMap map of family->edits
2720    * @param walEdit the destination entry to append into
2721    */
2722   private void addFamilyMapToWALEdit(Map<byte[], List<Cell>> familyMap,
2723       WALEdit walEdit) {
2724     for (List<Cell> edits : familyMap.values()) {
2725       for (Cell cell : edits) {
2726         walEdit.add(KeyValueUtil.ensureKeyValue(cell));
2727       }
2728     }
2729   }
2730 
2731   private void requestFlush() {
2732     if (this.rsServices == null) {
2733       return;
2734     }
2735     synchronized (writestate) {
2736       if (this.writestate.isFlushRequested()) {
2737         return;
2738       }
2739       writestate.flushRequested = true;
2740     }
2741     // Make request outside of synchronize block; HBASE-818.
2742     this.rsServices.getFlushRequester().requestFlush(this);
2743     if (LOG.isDebugEnabled()) {
2744       LOG.debug("Flush requested on " + this);
2745     }
2746   }
2747 
2748   /*
2749    * @param size
2750    * @return True if size is over the flush threshold
2751    */
2752   private boolean isFlushSize(final long size) {
2753     return size > this.memstoreFlushSize;
2754   }
2755 
2756   /**
2757    * Read the edits log put under this region by the WAL splitting process.  Put
2758    * the recovered edits back up into this region.
2759    *
2760    * <p>We can ignore any log message that has a sequence ID that's equal to or
2761    * lower than minSeqId.  (Because we know such log messages are already
2762    * reflected in the HFiles.)
2763    *
2764    * <p>While this is running we are putting pressure on memory yet we are
2765    * outside of our usual accounting because we are not yet an onlined region
2766    * (this stuff is being run as part of Region initialization).  This means
2767    * that if we're up against global memory limits, we'll not be flagged to flush
2768    * because we are not online. We can't be flushed by usual mechanisms anyways;
2769    * we're not yet online so our relative sequenceids are not yet aligned with
2770    * HLog sequenceids -- not till we come up online, post processing of split
2771    * edits.
2772    *
2773    * <p>But to help relieve memory pressure, at least manage our own heap size by
2774    * flushing if we are in excess of per-region limits.  When flushing, though, we
2775    * have to be careful to avoid using the regionserver/hlog sequenceid.  It runs
2776    * on a different line from what is going on in this region context, so if we
2777    * crashed while replaying these edits, but in the midst had a flush that used the
2778    * regionserver log with a sequenceid in excess of what is going on in here
2779    * in this region and with its split editlogs, then we could miss edits the
2780    * next time we go to recover. So, we have to flush inline, using seqids that
2781    * make sense in this single region context only -- until we come online.
2782    *
2783    * @param regiondir
2784    * @param maxSeqIdInStores Any edit found in the split editlogs needs to be in excess of
2785    * the maxSeqId for the store to be applied, else it is skipped.
2786    * @param reporter
2787    * @return the sequence id of the last edit added to this region out of the
2788    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
2789    * @throws UnsupportedEncodingException
2790    * @throws IOException
2791    */
2792   protected long replayRecoveredEditsIfAny(final Path regiondir,
2793       Map<byte[], Long> maxSeqIdInStores,
2794       final CancelableProgressable reporter, final MonitoredTask status)
2795       throws UnsupportedEncodingException, IOException {
2796     long minSeqIdForTheRegion = -1;
2797     for (Long maxSeqIdInStore : maxSeqIdInStores.values()) {
2798       if (maxSeqIdInStore < minSeqIdForTheRegion || minSeqIdForTheRegion == -1) {
2799         minSeqIdForTheRegion = maxSeqIdInStore;
2800       }
2801     }
2802     long seqid = minSeqIdForTheRegion;
2803 
2804     FileSystem fs = this.fs.getFileSystem();
2805     NavigableSet<Path> files = HLogUtil.getSplitEditFilesSorted(fs, regiondir);
2806     if (LOG.isDebugEnabled()) {
2807       LOG.debug("Found " + (files == null ? 0 : files.size())
2808         + " recovered edits file(s) under " + regiondir);
2809     }
2810 
2811     if (files == null || files.isEmpty()) return seqid;
2812 
2813     for (Path edits: files) {
2814       if (edits == null || !fs.exists(edits)) {
2815         LOG.warn("Null or non-existent edits file: " + edits);
2816         continue;
2817       }
2818       if (isZeroLengthThenDelete(fs, edits)) continue;
2819 
2820       long maxSeqId = Long.MAX_VALUE;
2821       String fileName = edits.getName();
2822       maxSeqId = Math.abs(Long.parseLong(fileName));
2823       if (maxSeqId <= minSeqIdForTheRegion) {
2824         if (LOG.isDebugEnabled()) {
2825           String msg = "Maximum sequenceid for this log is " + maxSeqId
2826             + " and minimum sequenceid for the region is " + minSeqIdForTheRegion
2827             + ", skipped the whole file, path=" + edits;
2828           LOG.debug(msg);
2829         }
2830         continue;
2831       }
2832 
2833       try {
2834         seqid = replayRecoveredEdits(edits, maxSeqIdInStores, reporter);
2835       } catch (IOException e) {
2836         boolean skipErrors = conf.getBoolean(
2837             HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS,
2838             conf.getBoolean(
2839                 "hbase.skip.errors",
2840                 HConstants.DEFAULT_HREGION_EDITS_REPLAY_SKIP_ERRORS));
2841         if (conf.get("hbase.skip.errors") != null) {
2842           LOG.warn(
2843               "The property 'hbase.skip.errors' has been deprecated. Please use " +
2844               HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS + " instead.");
2845         }
2846         if (skipErrors) {
2847           Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
2848           LOG.error(HConstants.HREGION_EDITS_REPLAY_SKIP_ERRORS
2849               + "=true so continuing. Renamed " + edits +
2850               " as " + p, e);
2851         } else {
2852           throw e;
2853         }
2854       }
2855     }
2856     // The edits size added into rsAccounting during this replaying will not
2857     // be required any more. So just clear it.
2858     if (this.rsAccounting != null) {
2859       this.rsAccounting.clearRegionReplayEditsSize(this.getRegionName());
2860     }
2861     if (seqid > minSeqIdForTheRegion) {
2862       // Then we added some edits to memory. Flush and cleanup split edit files.
2863       internalFlushcache(null, seqid, status);
2864     }
2865     // Now delete the content of recovered edits.  We're done w/ them.
2866     for (Path file: files) {
2867       if (!fs.delete(file, false)) {
2868         LOG.error("Failed delete of " + file);
2869       } else {
2870         LOG.debug("Deleted recovered.edits file=" + file);
2871       }
2872     }
2873     return seqid;
2874   }
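
       // Hedged walk-through (illustrative, not from the original source): suppose the
       // region has two stores whose HFiles carry max seqids {cf1=120, cf2=95}, and the
       // recovered-edits directory holds files named "...090" and "...150" (the file name
       // encodes the file's maximum sequence id).  Then minSeqIdForTheRegion = 95, the
       // first file (90 <= 95) is skipped entirely, and only the second is replayed;
       // within it, edits are still filtered per store against maxSeqIdInStores, so cf1
       // only accepts edits with seqid > 120 while cf2 accepts edits with seqid > 95.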
2875 
2876   /*
2877    * @param edits File of recovered edits.
2878    * @param maxSeqIdInStores Maximum sequenceid found in each store.  Edits in log
2879    * must be larger than this to be replayed for each store.
2880    * @param reporter
2881    * @return the sequence id of the last edit added to this region out of the
2882    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
2883    * @throws IOException
2884    */
2885   private long replayRecoveredEdits(final Path edits,
2886       Map<byte[], Long> maxSeqIdInStores, final CancelableProgressable reporter)
2887     throws IOException {
2888     String msg = "Replaying edits from " + edits;
2889     LOG.info(msg);
2890     MonitoredTask status = TaskMonitor.get().createStatus(msg);
2891     FileSystem fs = this.fs.getFileSystem();
2892 
2893     status.setStatus("Opening logs");
2894     HLog.Reader reader = null;
2895     try {
2896       reader = HLogFactory.createReader(fs, edits, conf);
2897       long currentEditSeqId = -1;
2898       long firstSeqIdInLog = -1;
2899       long skippedEdits = 0;
2900       long editsCount = 0;
2901       long intervalEdits = 0;
2902       HLog.Entry entry;
2903       Store store = null;
2904       boolean reported_once = false;
2905 
2906       try {
2907         // How many edits seen before we check elapsed time
2908         int interval = this.conf.getInt("hbase.hstore.report.interval.edits",
2909             2000);
2910         // How often to send a progress report (default 1/2 master timeout)
2911         int period = this.conf.getInt("hbase.hstore.report.period",
2912           this.conf.getInt(AssignmentManager.ASSIGNMENT_TIMEOUT,
2913             AssignmentManager.DEFAULT_ASSIGNMENT_TIMEOUT_DEFAULT) / 2);
2914         long lastReport = EnvironmentEdgeManager.currentTimeMillis();
2915 
2916         while ((entry = reader.next()) != null) {
2917           HLogKey key = entry.getKey();
2918           WALEdit val = entry.getEdit();
2919 
2920           if (reporter != null) {
2921             intervalEdits += val.size();
2922             if (intervalEdits >= interval) {
2923               // Number of edits interval reached
2924               intervalEdits = 0;
2925               long cur = EnvironmentEdgeManager.currentTimeMillis();
2926               if (lastReport + period <= cur) {
2927                 status.setStatus("Replaying edits..." +
2928                     " skipped=" + skippedEdits +
2929                     " edits=" + editsCount);
2930                 // Timeout reached
2931                 if(!reporter.progress()) {
2932                   msg = "Progressable reporter failed, stopping replay";
2933                   LOG.warn(msg);
2934                   status.abort(msg);
2935                   throw new IOException(msg);
2936                 }
2937                 reported_once = true;
2938                 lastReport = cur;
2939               }
2940             }
2941           }
2942 
2943           // Start coprocessor replay here. The coprocessor is for each WALEdit
2944           // instead of a KeyValue.
2945           if (coprocessorHost != null) {
2946             status.setStatus("Running pre-WAL-restore hook in coprocessors");
2947             if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
2948               // if bypass this log entry, ignore it ...
2949               continue;
2950             }
2951           }
2952 
2953           if (firstSeqIdInLog == -1) {
2954             firstSeqIdInLog = key.getLogSeqNum();
2955           }
2956           boolean flush = false;
2957           for (KeyValue kv: val.getKeyValues()) {
2958             // Check this edit is for me. Also, guard against writing the special
2959             // METACOLUMN info such as HBASE::CACHEFLUSH entries
2960             if (kv.matchingFamily(WALEdit.METAFAMILY) ||
2961                 !Bytes.equals(key.getEncodedRegionName(),
2962                   this.getRegionInfo().getEncodedNameAsBytes())) {
2963               //this is a special edit, we should handle it
2964               CompactionDescriptor compaction = WALEdit.getCompaction(kv);
2965               if (compaction != null) {
2966                 //replay the compaction
2967                 completeCompactionMarker(compaction);
2968               }
2969 
2970               skippedEdits++;
2971               continue;
2972             }
2973             // Figure which store the edit is meant for.
2974             if (store == null || !kv.matchingFamily(store.getFamily().getName())) {
2975               store = this.stores.get(kv.getFamily());
2976             }
2977             if (store == null) {
2978               // This should never happen.  Perhaps schema was changed between
2979               // crash and redeploy?
2980               LOG.warn("No family for " + kv);
2981               skippedEdits++;
2982               continue;
2983             }
2984             // Now, figure if we should skip this edit.
2985             if (key.getLogSeqNum() <= maxSeqIdInStores.get(store.getFamily()
2986                 .getName())) {
2987               skippedEdits++;
2988               continue;
2989             }
2990             currentEditSeqId = key.getLogSeqNum();
2991             // Once we are over the limit, restoreEdit will keep returning true to
2992             // flush -- but don't flush until we've played all the kvs that make up
2993             // the WALEdit.
2994             flush = restoreEdit(store, kv);
2995             editsCount++;
2996           }
2997           if (flush) internalFlushcache(null, currentEditSeqId, status);
2998 
2999           if (coprocessorHost != null) {
3000             coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
3001           }
3002         }
3003       } catch (EOFException eof) {
3004         Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3005         msg = "Encountered EOF. Most likely due to Master failure during " +
3006             "log spliting, so we have this data in another edit.  " +
3007             "Continuing, but renaming " + edits + " as " + p;
3008         LOG.warn(msg, eof);
3009         status.abort(msg);
3010       } catch (IOException ioe) {
3011         // If the IOE resulted from bad file format,
3012         // then this problem is idempotent and retrying won't help
3013         if (ioe.getCause() instanceof ParseException) {
3014           Path p = HLogUtil.moveAsideBadEditsFile(fs, edits);
3015           msg = "File corruption encountered!  " +
3016               "Continuing, but renaming " + edits + " as " + p;
3017           LOG.warn(msg, ioe);
3018           status.setStatus(msg);
3019         } else {
3020           status.abort(StringUtils.stringifyException(ioe));
3021           // other IO errors may be transient (bad network connection,
3022           // checksum exception on one datanode, etc).  throw & retry
3023           throw ioe;
3024         }
3025       }
3026       if (reporter != null && !reported_once) {
3027         reporter.progress();
3028       }
3029       msg = "Applied " + editsCount + ", skipped " + skippedEdits +
3030         ", firstSequenceidInLog=" + firstSeqIdInLog +
3031         ", maxSequenceidInLog=" + currentEditSeqId + ", path=" + edits;
3032       status.markComplete(msg);
3033       LOG.debug(msg);
3034       return currentEditSeqId;
3035     } finally {
3036       status.cleanup();
3037       if (reader != null) {
3038          reader.close();
3039       }
3040     }
3041   }
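
       // Hedged example (not part of the original source): with the defaults read above,
       // elapsed time is checked every 2000 edits ("hbase.hstore.report.interval.edits")
       // and the CancelableProgressable is poked at most once per half the assignment
       // timeout ("hbase.hstore.report.period").  A test could tighten both knobs; the
       // values below are assumptions for the example:
       //
       //   Configuration conf = HBaseConfiguration.create();
       //   conf.setInt("hbase.hstore.report.interval.edits", 100);
       //   conf.setInt("hbase.hstore.report.period", 1000);   // milliseconds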
3042 
3043   /**
3044    * Call to complete a compaction. It's for the case where we find in the WAL a compaction
3045    * that was not finished.  We could find one recovering a WAL after a regionserver crash.
3046    * See HBASE-2331.
3047    * @param fs
3048    * @param compaction
3049    */
3050   void completeCompactionMarker(CompactionDescriptor compaction)
3051       throws IOException {
3052     Store store = this.getStore(compaction.getFamilyName().toByteArray());
3053     if (store == null) {
3054       LOG.warn("Found Compaction WAL edit for deleted family:" +
3055           Bytes.toString(compaction.getFamilyName().toByteArray()));
3056       return;
3057     }
3058     store.completeCompactionMarker(compaction);
3059   }
3060 
3061   /**
3062    * Used by tests
3063    * @param s Store to add edit to.
3064    * @param kv KeyValue to add.
3065    * @return True if we should flush.
3066    */
3067   protected boolean restoreEdit(final Store s, final KeyValue kv) {
3068     long kvSize = s.add(kv);
3069     if (this.rsAccounting != null) {
3070       rsAccounting.addAndGetRegionReplayEditsSize(this.getRegionName(), kvSize);
3071     }
3072     return isFlushSize(this.addAndGetGlobalMemstoreSize(kvSize));
3073   }
3074 
3075   /*
3076    * @param fs
3077    * @param p File to check.
3078    * @return True if file was zero-length (and if so, we'll delete it in here).
3079    * @throws IOException
3080    */
3081   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
3082       throws IOException {
3083     FileStatus stat = fs.getFileStatus(p);
3084     if (stat.getLen() > 0) return false;
3085     LOG.warn("File " + p + " is zero-length, deleting.");
3086     fs.delete(p, false);
3087     return true;
3088   }
3089 
3090   protected HStore instantiateHStore(final HColumnDescriptor family) throws IOException {
3091     return new HStore(this, family, this.conf);
3092   }
3093 
3094   /**
3095    * Return HStore instance.
3096    * Use with caution.  Exposed for use of fixup utilities.
3097    * @param column Name of column family hosted by this region.
3098    * @return Store that goes with the family on passed <code>column</code>.
3099    * TODO: Make this lookup faster.
3100    */
3101   public Store getStore(final byte[] column) {
3102     return this.stores.get(column);
3103   }
3104 
3105   public Map<byte[], Store> getStores() {
3106     return this.stores;
3107   }
3108 
3109   /**
3110    * Return list of storeFiles for the set of CFs.
3111    * Uses closeLock to prevent the race condition where the region closes
3112    * in the middle of the for loop; if the stores were being closed one by one,
3113    * some stores would return 0 files.
3114    * @return List of storeFiles.
3115    */
3116   public List<String> getStoreFileList(final byte [][] columns)
3117     throws IllegalArgumentException {
3118     List<String> storeFileNames = new ArrayList<String>();
3119     synchronized(closeLock) {
3120       for(byte[] column : columns) {
3121         Store store = this.stores.get(column);
3122         if (store == null) {
3123           throw new IllegalArgumentException("No column family : " +
3124               new String(column) + " available");
3125         }
3126         for (StoreFile storeFile: store.getStorefiles()) {
3127           storeFileNames.add(storeFile.getPath().toString());
3128         }
3129       }
3130     }
3131     return storeFileNames;
3132   }
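
       // Hedged usage sketch (not in the original source): list the HFiles backing two
       // column families of an open region; "region" and the family names are assumptions.
       //
       //   byte[][] families = new byte[][] { Bytes.toBytes("cf1"), Bytes.toBytes("cf2") };
       //   for (String path : region.getStoreFileList(families)) {
       //     System.out.println(path);
       //   }
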
3133   //////////////////////////////////////////////////////////////////////////////
3134   // Support code
3135   //////////////////////////////////////////////////////////////////////////////
3136 
3137   /** Make sure this is a valid row for the HRegion */
3138   void checkRow(final byte [] row, String op) throws IOException {
3139     if (!rowIsInRange(getRegionInfo(), row)) {
3140       throw new WrongRegionException("Requested row out of range for " +
3141           op + " on HRegion " + this + ", startKey='" +
3142           Bytes.toStringBinary(getStartKey()) + "', getEndKey()='" +
3143           Bytes.toStringBinary(getEndKey()) + "', row='" +
3144           Bytes.toStringBinary(row) + "'");
3145     }
3146   }
3147 
3148   /**
3149    * Tries to acquire a lock on the given row.
3150    * @param waitForLock if true, will block until the lock is available.
3151    *        Otherwise, just tries to obtain the lock and returns
3152    *        false if unavailable.
3153    * @return the row lock if acquired,
3154    *   null if waitForLock was false and the lock was not acquired
3155    * @throws IOException if waitForLock was true and the lock could not be acquired after waiting
3156    */
3157   public RowLock getRowLock(byte[] row, boolean waitForLock) throws IOException {
3158     checkRow(row, "row lock");
3159     startRegionOperation();
3160     try {
3161       HashedBytes rowKey = new HashedBytes(row);
3162       RowLockContext rowLockContext = new RowLockContext(rowKey);
3163 
3164       // loop until we acquire the row lock (unless !waitForLock)
3165       while (true) {
3166         RowLockContext existingContext = lockedRows.putIfAbsent(rowKey, rowLockContext);
3167         if (existingContext == null) {
3168           // Row is not already locked by any thread, use newly created context.
3169           break;
3170         } else if (existingContext.ownedByCurrentThread()) {
3171           // Row is already locked by current thread, reuse existing context instead.
3172           rowLockContext = existingContext;
3173           break;
3174         } else {
3175           // Row is already locked by some other thread, give up or wait for it
3176           if (!waitForLock) {
3177             return null;
3178           }
3179           try {
3180             if (!existingContext.latch.await(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
3181               throw new IOException("Timed out waiting for lock for row: " + rowKey);
3182             }
3183           } catch (InterruptedException ie) {
3184             LOG.warn("Thread interrupted waiting for lock on row: " + rowKey);
3185             InterruptedIOException iie = new InterruptedIOException();
3186             iie.initCause(ie);
3187             throw iie;
3188           }
3189         }
3190       }
3191 
3192       // allocate new lock for this thread
3193       return rowLockContext.newLock();
3194     } finally {
3195       closeRegionOperation();
3196     }
3197   }
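
       // Hedged usage sketch (not in the original source): acquire a row lock, do work
       // while holding it, and always release it; "region" is an assumed open HRegion.
       //
       //   RowLock lock = region.getRowLock(Bytes.toBytes("row-1"), true);
       //   try {
       //     // mutate or read the row while no other thread can lock it
       //   } finally {
       //     if (lock != null) {
       //       lock.release();
       //     }
       //   }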
3198 
3199   /**
3200    * Acquires a lock on the given row.
3201    * The same thread may acquire multiple locks on the same row.
3202    * @return the acquired row lock
3203    * @throws IOException if the lock could not be acquired after waiting
3204    */
3205   public RowLock getRowLock(byte[] row) throws IOException {
3206     return getRowLock(row, true);
3207   }
3208 
3209   /**
3210    * If the given list of row locks is not null, releases all locks.
3211    */
3212   public void releaseRowLocks(List<RowLock> rowLocks) {
3213     if (rowLocks != null) {
3214       for (RowLock rowLock : rowLocks) {
3215         rowLock.release();
3216       }
3217       rowLocks.clear();
3218     }
3219   }
3220 
3221   /**
3222    * Determines whether multiple column families are present.
3223    * Precondition: familyPaths is not null.
3224    *
3225    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
3226    */
3227   private static boolean hasMultipleColumnFamilies(
3228       List<Pair<byte[], String>> familyPaths) {
3229     boolean multipleFamilies = false;
3230     byte[] family = null;
3231     for (Pair<byte[], String> pair : familyPaths) {
3232       byte[] fam = pair.getFirst();
3233       if (family == null) {
3234         family = fam;
3235       } else if (!Bytes.equals(family, fam)) {
3236         multipleFamilies = true;
3237         break;
3238       }
3239     }
3240     return multipleFamilies;
3241   }
3242 
3243 
3244   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths,
3245                                 boolean assignSeqId) throws IOException {
3246     return bulkLoadHFiles(familyPaths, assignSeqId, null);
3247   }
3248 
3249   /**
3250    * Attempts to atomically load a group of hfiles.  This is critical for loading
3251    * rows with multiple column families atomically.
3252    *
3253    * @param familyPaths List of Pair<byte[] column family, String hfilePath>
3254    * @param bulkLoadListener Internal hooks enabling massaging/preparation of a
3255    * file about to be bulk loaded
3256    * @param assignSeqId
3257    * @return true if successful, false if failed recoverably
3258    * @throws IOException if failed unrecoverably.
3259    */
3260   public boolean bulkLoadHFiles(List<Pair<byte[], String>> familyPaths, boolean assignSeqId,
3261       BulkLoadListener bulkLoadListener) throws IOException {
3262     Preconditions.checkNotNull(familyPaths);
3263     // we need writeLock for multi-family bulk load
3264     startBulkRegionOperation(hasMultipleColumnFamilies(familyPaths));
3265     try {
3266       this.writeRequestsCount.increment();
3267 
3268       // There possibly was a split that happened between when the split keys
3269       // were gathered and when the HRegion's write lock was taken.  We need
3270       // to validate the HFile regions before attempting to bulk load all of them.
3271       List<IOException> ioes = new ArrayList<IOException>();
3272       List<Pair<byte[], String>> failures = new ArrayList<Pair<byte[], String>>();
3273       for (Pair<byte[], String> p : familyPaths) {
3274         byte[] familyName = p.getFirst();
3275         String path = p.getSecond();
3276 
3277         Store store = getStore(familyName);
3278         if (store == null) {
3279           IOException ioe = new org.apache.hadoop.hbase.DoNotRetryIOException(
3280               "No such column family " + Bytes.toStringBinary(familyName));
3281           ioes.add(ioe);
3282         } else {
3283           try {
3284             store.assertBulkLoadHFileOk(new Path(path));
3285           } catch (WrongRegionException wre) {
3286             // recoverable (file doesn't fit in region)
3287             failures.add(p);
3288           } catch (IOException ioe) {
3289             // unrecoverable (hdfs problem)
3290             ioes.add(ioe);
3291           }
3292         }
3293       }
3294 
3295       // validation failed because of some sort of IO problem.
3296       if (ioes.size() != 0) {
3297         IOException e = MultipleIOException.createIOException(ioes);
3298         LOG.error("There were one or more IO errors when checking if the bulk load is ok.", e);
3299         throw e;
3300       }
3301 
3302       // validation failed, bail out before doing anything permanent.
3303       if (failures.size() != 0) {
3304         StringBuilder list = new StringBuilder();
3305         for (Pair<byte[], String> p : failures) {
3306           list.append("\n").append(Bytes.toString(p.getFirst())).append(" : ")
3307             .append(p.getSecond());
3308         }
3309         // problem when validating
3310         LOG.warn("There was a recoverable bulk load failure likely due to a" +
3311             " split.  These (family, HFile) pairs were not loaded: " + list);
3312         return false;
3313       }
3314 
3315       for (Pair<byte[], String> p : familyPaths) {
3316         byte[] familyName = p.getFirst();
3317         String path = p.getSecond();
3318         Store store = getStore(familyName);
3319         try {
3320           String finalPath = path;
3321           if(bulkLoadListener != null) {
3322             finalPath = bulkLoadListener.prepareBulkLoad(familyName, path);
3323           }
3324           store.bulkLoadHFile(finalPath, assignSeqId ? this.log.obtainSeqNum() : -1);
3325           if(bulkLoadListener != null) {
3326             bulkLoadListener.doneBulkLoad(familyName, path);
3327           }
3328         } catch (IOException ioe) {
3329           // A failure here can cause an atomicity violation that we currently
3330           // cannot recover from since it is likely a failed HDFS operation.
3331 
3332           // TODO Need a better story for reverting partial failures due to HDFS.
3333           LOG.error("There was a partial failure due to IO when attempting to" +
3334               " load " + Bytes.toString(p.getFirst()) + " : "+ p.getSecond(), ioe);
3335           if(bulkLoadListener != null) {
3336             try {
3337               bulkLoadListener.failedBulkLoad(familyName, path);
3338             } catch (Exception ex) {
3339               LOG.error("Error while calling failedBulkLoad for family "+
3340                   Bytes.toString(familyName)+" with path "+path, ex);
3341             }
3342           }
3343           throw ioe;
3344         }
3345       }
3346       return true;
3347     } finally {
3348       closeBulkRegionOperation();
3349     }
3350   }
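
       // Hedged usage sketch (not in the original source): atomically bulk load two
       // prepared HFiles into their column families; paths and family names are assumptions.
       //
       //   List<Pair<byte[], String>> familyPaths = new ArrayList<Pair<byte[], String>>();
       //   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf1"), "/staging/cf1/hfile1"));
       //   familyPaths.add(new Pair<byte[], String>(Bytes.toBytes("cf2"), "/staging/cf2/hfile2"));
       //   boolean loaded = region.bulkLoadHFiles(familyPaths, true);
       //   if (!loaded) {
       //     // recoverable failure (for example a concurrent split); re-split the HFiles and retry
       //   }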
3351 
3352   @Override
3353   public boolean equals(Object o) {
3354     if (!(o instanceof HRegion)) {
3355       return false;
3356     }
3357     return Bytes.equals(this.getRegionName(), ((HRegion) o).getRegionName());
3358   }
3359 
3360   @Override
3361   public int hashCode() {
3362     return Bytes.hashCode(this.getRegionName());
3363   }
3364 
3365   @Override
3366   public String toString() {
3367     return this.getRegionNameAsString();
3368   }
3369 
3370   /**
3371    * RegionScannerImpl is used to combine scanners from multiple Stores (aka column families).
3372    */
3373   class RegionScannerImpl implements RegionScanner {
3374     // Package local for testability
3375     KeyValueHeap storeHeap = null;
3376     /** Heap of key-values that are not essential for the provided filters and are thus read
3377      * on demand, if on-demand column family loading is enabled.*/
3378     KeyValueHeap joinedHeap = null;
3379     /**
3380      * If the joined heap data gathering is interrupted due to scan limits, this will
3381      * contain the row for which we are populating the values.*/
3382     private KeyValue joinedContinuationRow = null;
3383     // KeyValue indicating that limit is reached when scanning
3384     private final KeyValue KV_LIMIT = new KeyValue();
3385     private final byte [] stopRow;
3386     private Filter filter;
3387     private int batch;
3388     private int isScan;
3389     private boolean filterClosed = false;
3390     private long readPt;
3391     private long maxResultSize;
3392     private HRegion region;
3393 
3394     @Override
3395     public HRegionInfo getRegionInfo() {
3396       return region.getRegionInfo();
3397     }
3398 
3399     RegionScannerImpl(Scan scan, List<KeyValueScanner> additionalScanners, HRegion region)
3400         throws IOException {
3401 
3402       this.region = region;
3403       this.maxResultSize = scan.getMaxResultSize();
3404       if (scan.hasFilter()) {
3405         this.filter = new FilterWrapper(scan.getFilter());
3406       } else {
3407         this.filter = null;
3408       }
3409 
3410       this.batch = scan.getBatch();
3411       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW) && !scan.isGetScan()) {
3412         this.stopRow = null;
3413       } else {
3414         this.stopRow = scan.getStopRow();
3415       }
3416       // If we are doing a get, we want the range to be [startRow,endRow]; normally
3417       // it is [startRow,endRow) and if startRow == endRow we get nothing.
3418       this.isScan = scan.isGetScan() ? -1 : 0;
3419 
3420       // synchronize on scannerReadPoints so that nobody calculates
3421       // getSmallestReadPoint before scannerReadPoints is updated.
3422       IsolationLevel isolationLevel = scan.getIsolationLevel();
3423       synchronized(scannerReadPoints) {
3424         if (isolationLevel == IsolationLevel.READ_UNCOMMITTED) {
3425           // This scan can read even uncommitted transactions
3426           this.readPt = Long.MAX_VALUE;
3427           MultiVersionConsistencyControl.setThreadReadPoint(this.readPt);
3428         } else {
3429           this.readPt = MultiVersionConsistencyControl.resetThreadReadPoint(mvcc);
3430         }
3431         scannerReadPoints.put(this, this.readPt);
3432       }
3433 
3434       // Here we separate all scanners into two lists - scanners that provide data required
3435       // by the filter to operate (scanners list) and all others (joinedScanners list).
3436       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>();
3437       List<KeyValueScanner> joinedScanners = new ArrayList<KeyValueScanner>();
3438       if (additionalScanners != null) {
3439         scanners.addAll(additionalScanners);
3440       }
3441 
3442       for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
3443           scan.getFamilyMap().entrySet()) {
3444         Store store = stores.get(entry.getKey());
3445         KeyValueScanner scanner = store.getScanner(scan, entry.getValue());
3446         if (this.filter == null || !scan.doLoadColumnFamiliesOnDemand()
3447           || this.filter.isFamilyEssential(entry.getKey())) {
3448           scanners.add(scanner);
3449         } else {
3450           joinedScanners.add(scanner);
3451         }
3452       }
3453       this.storeHeap = new KeyValueHeap(scanners, comparator);
3454       if (!joinedScanners.isEmpty()) {
3455         this.joinedHeap = new KeyValueHeap(joinedScanners, comparator);
3456       }
3457     }
3458 
3459     RegionScannerImpl(Scan scan, HRegion region) throws IOException {
3460       this(scan, null, region);
3461     }
3462 
3463     @Override
3464     public long getMaxResultSize() {
3465       return maxResultSize;
3466     }
3467 
3468     @Override
3469     public long getMvccReadPoint() {
3470       return this.readPt;
3471     }
3472 
3473     /**
3474      * Reset the filter, if one is set.
3475      *
3476      * @throws IOException in case a filter raises an I/O exception.
3477      */
3478     protected void resetFilters() throws IOException {
3479       if (filter != null) {
3480         filter.reset();
3481       }
3482     }
3483 
3484     @Override
3485     public boolean next(List<Cell> outResults)
3486         throws IOException {
3487       // apply the batching limit by default
3488       return next(outResults, batch);
3489     }
3490 
3491     @Override
3492     public synchronized boolean next(List<Cell> outResults, int limit) throws IOException {
3493       if (this.filterClosed) {
3494         throw new UnknownScannerException("Scanner was closed (timed out?) " +
3495             "after we renewed it. Could be caused by a very slow scanner " +
3496             "or a lengthy garbage collection");
3497       }
3498       startRegionOperation(Operation.SCAN);
3499       readRequestsCount.increment();
3500       try {
3501 
3502         // This could be a new thread from the last time we called next().
3503         MultiVersionConsistencyControl.setThreadReadPoint(this.readPt);
3504 
3505         return nextRaw(outResults, limit);
3506       } finally {
3507         closeRegionOperation();
3508       }
3509     }
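
         // Hedged usage sketch (not in the original source): a caller typically drains a
         // RegionScanner row by row; "scanner" is assumed to have been obtained from the
         // enclosing region, for example via a getScanner(Scan) call.
         //
         //   List<Cell> cells = new ArrayList<Cell>();
         //   boolean moreRows;
         //   do {
         //     cells.clear();
         //     moreRows = scanner.next(cells);
         //     // process the cells of this row
         //   } while (moreRows);
         //   scanner.close();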
3510 
3511     @Override
3512     public boolean nextRaw(List<Cell> outResults)
3513         throws IOException {
3514       return nextRaw(outResults, batch);
3515     }
3516 
3517     @Override
3518     public boolean nextRaw(List<Cell> outResults, int limit) throws IOException {
3519       boolean returnResult;
3520       if (outResults.isEmpty()) {
3521         // Usually outResults is empty. This is true when next is called
3522         // to handle a scan or get operation.
3523         returnResult = nextInternal(outResults, limit);
3524       } else {
3525         List<Cell> tmpList = new ArrayList<Cell>();
3526         returnResult = nextInternal(tmpList, limit);
3527         outResults.addAll(tmpList);
3528       }
3529       resetFilters();
3530       if (isFilterDone()) {
3531         return false;
3532       }
3533       if (region != null && region.metricsRegion != null) {
3534         long totalSize = 0;
3535         if (outResults != null) {
3536           for(Cell c:outResults) {
3537             // TODO clean up
3538             KeyValue kv = KeyValueUtil.ensureKeyValue(c);
3539 
3540             totalSize += kv.getLength();
3541           }
3542         }
3543         region.metricsRegion.updateScanNext(totalSize);
3544       }
3545       return returnResult;
3546     }
3547 
3548 
3549     private void populateFromJoinedHeap(List<Cell> results, int limit)
3550         throws IOException {
3551       assert joinedContinuationRow != null;
3552       KeyValue kv = populateResult(results, this.joinedHeap, limit,
3553           joinedContinuationRow.getBuffer(), joinedContinuationRow.getRowOffset(),
3554           joinedContinuationRow.getRowLength());
3555       if (kv != KV_LIMIT) {
3556         // We are done with this row, reset the continuation.
3557         joinedContinuationRow = null;
3558       }
3559       // As the data is obtained from two independent heaps, we need to
3560       // ensure that the result list is sorted, because Result relies on that.
3561       Collections.sort(results, comparator);
3562     }
3563 
3564     /**
3565      * Fetches records for currentRow into the results list, until the next row or the limit (if not -1) is reached.
3566      * @param results
3567      * @param heap KeyValueHeap to fetch data from. It must be positioned on the correct row before the call.
3568      * @param limit Max amount of KVs to place in result list, -1 means no limit.
3569      * @param currentRow Byte array with key we are fetching.
3570      * @param offset offset for currentRow
3571      * @param length length for currentRow
3572      * @return KV_LIMIT if limit reached, next KeyValue otherwise.
3573      */
3574     private KeyValue populateResult(List<Cell> results, KeyValueHeap heap, int limit,
3575         byte[] currentRow, int offset, short length) throws IOException {
3576       KeyValue nextKv;
3577       do {
3578         heap.next(results, limit - results.size());
3579         if (limit > 0 && results.size() == limit) {
3580           return KV_LIMIT;
3581         }
3582         nextKv = heap.peek();
3583       } while (nextKv != null && nextKv.matchingRow(currentRow, offset, length));
3584 
3585       return nextKv;
3586     }
3587 
3588     /*
3589      * @return True if a filter rules that the scanner is done.
3590      */
3591     @Override
3592     public synchronized boolean isFilterDone() throws IOException {
3593       return this.filter != null && this.filter.filterAllRemaining();
3594     }
3595 
3596     private boolean nextInternal(List<Cell> results, int limit)
3597     throws IOException {
3598       if (!results.isEmpty()) {
3599         throw new IllegalArgumentException("First parameter should be an empty list");
3600       }
3601       RpcCallContext rpcCall = RpcServer.getCurrentCall();
3602       // The loop here is used only when, at some point during next(), we determine
3603       // that due to effects of filters or otherwise, we have an empty row in the result.
3604       // Then we loop and try again. Otherwise, we must get out on the first iteration via return:
3605       // "true" if there's more data to read, "false" if there isn't (storeHeap is at a stop row,
3606       // and joinedHeap has no more data to read for the last row (if set, joinedContinuationRow)).
3607       while (true) {
3608         if (rpcCall != null) {
3609           // If a user specifies a too-restrictive or too-slow scanner, the
3610           // client might time out and disconnect while the server side
3611           // is still processing the request. We should abort aggressively
3612           // in that case.
3613           long afterTime = rpcCall.disconnectSince();
3614           if (afterTime >= 0) {
3615             throw new CallerDisconnectedException(
3616                 "Aborting on region " + getRegionNameAsString() + ", call " +
3617                     this + " after " + afterTime + " ms, since " +
3618                     "caller disconnected");
3619           }
3620         }
3621 
3622         // Let's see what we have in the storeHeap.
3623         KeyValue current = this.storeHeap.peek();
3624 
3625         byte[] currentRow = null;
3626         int offset = 0;
3627         short length = 0;
3628         if (current != null) {
3629           currentRow = current.getBuffer();
3630           offset = current.getRowOffset();
3631           length = current.getRowLength();
3632         }
3633         boolean stopRow = isStopRow(currentRow, offset, length);
3634         // Check if we were getting data from the joinedHeap and hit the limit.
3635         // If not, then it's main path - getting results from storeHeap.
3636         if (joinedContinuationRow == null) {
3637           // First, check if we are at a stop row. If so, there are no more results.
3638           if (stopRow) {
3639             if (filter != null && filter.hasFilterRow()) {
3640               filter.filterRowCells(results);
3641             }
3642             return false;
3643           }
3644 
3645           // Check if rowkey filter wants to exclude this row. If so, loop to next.
3646           // Technically, if we hit limits before on this row, we don't need this call.
3647           if (filterRowKey(currentRow, offset, length)) {
3648             boolean moreRows = nextRow(currentRow, offset, length);
3649             if (!moreRows) return false;
3650             results.clear();
3651             continue;
3652           }
3653 
3654           KeyValue nextKv = populateResult(results, this.storeHeap, limit, currentRow, offset,
3655               length);
3656           // Ok, we are good, let's try to get some results from the main heap.
3657           if (nextKv == KV_LIMIT) {
3658             if (this.filter != null && filter.hasFilterRow()) {
3659               throw new IncompatibleFilterException(
3660                 "Filter whose hasFilterRow() returns true is incompatible with scan with limit!");
3661             }
3662             return true; // We hit the limit.
3663           }
3664 
3665           stopRow = nextKv == null ||
3666               isStopRow(nextKv.getBuffer(), nextKv.getRowOffset(), nextKv.getRowLength());
3667           // save that the row was empty before filters were applied to it.
3668           final boolean isEmptyRow = results.isEmpty();
3669 
3670           // We have the part of the row necessary for filtering (all of it, usually).
3671           // First filter with the filterRow(List).
3672           if (filter != null && filter.hasFilterRow()) {
3673             filter.filterRowCells(results);
3674           }
3675           if (isEmptyRow) {
3676             boolean moreRows = nextRow(currentRow, offset, length);
3677             if (!moreRows) return false;
3678             results.clear();
3679             // This row was totally filtered out. If this is NOT the last row,
3680             // we should continue on. Otherwise, there is nothing else to do.
3681             if (!stopRow) continue;
3682             return false;
3683           }
3684 
3685           // Ok, we are done with storeHeap for this row.
3686           // Now we may need to fetch additional, non-essential data into row.
3687           // These values are not needed for filter to work, so we postpone their
3688           // fetch to (possibly) reduce amount of data loads from disk.
3689           if (this.joinedHeap != null) {
3690             KeyValue nextJoinedKv = joinedHeap.peek();
3691             // If joinedHeap is pointing to some other row, try to seek to a correct one.
3692             boolean mayHaveData =
3693               (nextJoinedKv != null && nextJoinedKv.matchingRow(currentRow, offset, length))
3694               || (this.joinedHeap.requestSeek(KeyValue.createFirstOnRow(currentRow, offset, length),
3695                 true, true)
3696                 && joinedHeap.peek() != null
3697                 && joinedHeap.peek().matchingRow(currentRow, offset, length));
3698             if (mayHaveData) {
3699               joinedContinuationRow = current;
3700               populateFromJoinedHeap(results, limit);
3701             }
3702           }
3703         } else {
3704           // Populating from the joined heap was stopped by limits, populate some more.
3705           populateFromJoinedHeap(results, limit);
3706         }
3707 
3708         // We may have just called populateFromJoinedHeap and hit the limits. If that is
3709         // the case, we need to call it again on the next next() invocation.
3710         if (joinedContinuationRow != null) {
3711           return true;
3712         }
3713 
3714         // Finally, we are done with both joinedHeap and storeHeap.
3715         // Double check to prevent empty rows from appearing in result. It could be
3716         // the case when SingleColumnValueExcludeFilter is used.
3717         if (results.isEmpty()) {
3718           boolean moreRows = nextRow(currentRow, offset, length);
3719           if (!moreRows) return false;
3720           if (!stopRow) continue;
3721         }
3722 
3723         // We are done. Return the result.
3724         return !stopRow;
3725       }
3726     }
3727 
3728     private boolean filterRowKey(byte[] row, int offset, short length) throws IOException {
3729       return filter != null
3730           && filter.filterRowKey(row, offset, length);
3731     }
3732 
3733     protected boolean nextRow(byte [] currentRow, int offset, short length) throws IOException {
3734       assert this.joinedContinuationRow == null: "Trying to go to next row during joinedHeap read.";
3735       KeyValue next;
3736       while ((next = this.storeHeap.peek()) != null &&
3737              next.matchingRow(currentRow, offset, length)) {
3738         this.storeHeap.next(MOCKED_LIST);
3739       }
3740       resetFilters();
3741       // Calling the hook in CP which allows it to do a fast forward
3742       if (this.region.getCoprocessorHost() != null) {
3743         return this.region.getCoprocessorHost().postScannerFilterRow(this, currentRow);
3744       }
3745       return true;
3746     }
3747 
3748     private boolean isStopRow(byte [] currentRow, int offset, short length) {
3749       return currentRow == null ||
3750           (stopRow != null &&
3751           comparator.compareRows(stopRow, 0, stopRow.length,
3752             currentRow, offset, length) <= isScan);
3753     }
3754 
3755     @Override
3756     public synchronized void close() {
3757       if (storeHeap != null) {
3758         storeHeap.close();
3759         storeHeap = null;
3760       }
3761       if (joinedHeap != null) {
3762         joinedHeap.close();
3763         joinedHeap = null;
3764       }
3765       // no need to synchronize here.
3766       scannerReadPoints.remove(this);
3767       this.filterClosed = true;
3768     }
3769 
3770     KeyValueHeap getStoreHeapForTesting() {
3771       return storeHeap;
3772     }
3773 
3774     @Override
3775     public synchronized boolean reseek(byte[] row) throws IOException {
3776       if (row == null) {
3777         throw new IllegalArgumentException("Row cannot be null.");
3778       }
3779       boolean result = false;
3780       startRegionOperation();
3781       try {
3782         // This could be a new thread from the last time we called next().
3783         MultiVersionConsistencyControl.setThreadReadPoint(this.readPt);
3784         KeyValue kv = KeyValue.createFirstOnRow(row);
3785         // use request seek to make use of the lazy seek option. See HBASE-5520
3786         result = this.storeHeap.requestSeek(kv, true, true);
3787         if (this.joinedHeap != null) {
3788           result = this.joinedHeap.requestSeek(kv, true, true) || result;
3789         }
3790       } finally {
3791         closeRegionOperation();
3792       }
3793       return result;
3794     }
3795   }
3796 
3797   // Utility methods
3798   /**
3799    * A utility method to create new instances of HRegion based on the
3800    * {@link HConstants#REGION_IMPL} configuration property.
3801    * @param tableDir qualified path of directory where region should be located,
3802    * usually the table directory.
3803    * @param log The HLog is the outbound log for any updates to the HRegion
3804    * (There's a single HLog for all the HRegions on a single HRegionServer.)
3805    * The log file is a logfile from the previous execution that's
3806    * custom-computed for this HRegion. The HRegionServer computes and sorts the
3807    * appropriate log info for this HRegion. If there is a previous log file
3808    * (implying that the HRegion has been written-to before), then read it from
3809    * the supplied path.
3810    * @param fs is the filesystem.
3811    * @param conf is global configuration settings.
3812    * @param regionInfo - HRegionInfo that describes the region
3814    * @param htd the table descriptor
3815    * @param rsServices
3816    * @return the new instance
3817    */
3818   static HRegion newHRegion(Path tableDir, HLog log, FileSystem fs,
3819       Configuration conf, HRegionInfo regionInfo, final HTableDescriptor htd,
3820       RegionServerServices rsServices) {
3821     try {
3822       @SuppressWarnings("unchecked")
3823       Class<? extends HRegion> regionClass =
3824           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
3825 
3826       Constructor<? extends HRegion> c =
3827           regionClass.getConstructor(Path.class, HLog.class, FileSystem.class,
3828               Configuration.class, HRegionInfo.class, HTableDescriptor.class,
3829               RegionServerServices.class);
3830 
3831       return c.newInstance(tableDir, log, fs, conf, regionInfo, htd, rsServices);
3832     } catch (Throwable e) {
3833       // todo: what should I throw here?
3834       throw new IllegalStateException("Could not instantiate a region instance.", e);
3835     }
3836   }
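
       // Hedged example (not part of the original source): a deployment can point
       // HConstants.REGION_IMPL at a custom subclass so that newHRegion() instantiates it
       // through the reflective constructor lookup above; "MyRegion" is hypothetical.
       //
       //   Configuration conf = HBaseConfiguration.create();
       //   conf.setClass(HConstants.REGION_IMPL, MyRegion.class, HRegion.class);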
3837 
3838   /**
3839    * Convenience method creating new HRegions. Used by createTable and by the
3840    * bootstrap code in the HMaster constructor.
3841    * Note, this method creates an {@link HLog} for the created region. It
3842    * needs to be closed explicitly.  Use {@link HRegion#getLog()} to get
3843    * access.  <b>When done with a region created using this method, you will
3844    * need to explicitly close the {@link HLog} it created too; it will not be
3845    * done for you.  Not closing the log will leave at least a daemon thread
3846    * running.</b>  Call {@link #closeHRegion(HRegion)} and it will do
3847    * necessary cleanup for you.
3848    * @param info Info for region to create.
3849    * @param rootDir Root directory for HBase instance
3850    * @param conf
3851    * @param hTableDescriptor
3852    * @return new HRegion
3853    *
3854    * @throws IOException
3855    */
3856   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
3857       final Configuration conf, final HTableDescriptor hTableDescriptor)
3858   throws IOException {
3859     return createHRegion(info, rootDir, conf, hTableDescriptor, null);
3860   }
3861 
3862   /**
3863    * This will do the necessary cleanup that a call to
3864    * {@link #createHRegion(HRegionInfo, Path, Configuration, HTableDescriptor)}
3865    * requires.  This method will close the region and then close its
3866    * associated {@link HLog} file.  You can also use it if you called the other
3867    * createHRegion, the one that takes an {@link HLog} instance, but don't be
3868    * surprised by the call to {@link HLog#closeAndDelete()} on the {@link HLog}
3869    * the HRegion was carrying.
3870    * @param r
3871    * @throws IOException
3872    */
3873   public static void closeHRegion(final HRegion r) throws IOException {
3874     if (r == null) return;
3875     r.close();
3876     if (r.getLog() == null) return;
3877     r.getLog().closeAndDelete();
3878   }
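
       // Hedged usage sketch (not in the original source): create a standalone region
       // (which also creates its own HLog) and make sure both get cleaned up afterwards;
       // info, rootDir, conf and htd are assumed to be prepared by the caller.
       //
       //   HRegion region = HRegion.createHRegion(info, rootDir, conf, htd);
       //   try {
       //     // use the region, for example in a unit test
       //   } finally {
       //     HRegion.closeHRegion(region);   // closes the region and its HLog
       //   }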
3879 
3880   /**
3881    * Convenience method creating new HRegions. Used by createTable.
3882    * The {@link HLog} for the created region needs to be closed explicitly.
3883    * Use {@link HRegion#getLog()} to get access.
3884    *
3885    * @param info Info for region to create.
3886    * @param rootDir Root directory for HBase instance
3887    * @param conf
3888    * @param hTableDescriptor
3889    * @param hlog shared HLog
3890    * @param initialize - true to initialize the region
3891    * @return new HRegion
3892    *
3893    * @throws IOException
3894    */
3895   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
3896                                       final Configuration conf,
3897                                       final HTableDescriptor hTableDescriptor,
3898                                       final HLog hlog,
3899                                       final boolean initialize)
3900       throws IOException {
3901     return createHRegion(info, rootDir, conf, hTableDescriptor,
3902         hlog, initialize, false);
3903   }
3904 
3905   /**
3906    * Convenience method creating new HRegions. Used by createTable.
3907    * The {@link HLog} for the created region needs to be closed
3908    * explicitly, if it is not null.
3909    * Use {@link HRegion#getLog()} to get access.
3910    *
3911    * @param info Info for region to create.
3912    * @param rootDir Root directory for HBase instance
3913    * @param conf
3914    * @param hTableDescriptor
3915    * @param hlog shared HLog
3916    * @param initialize - true to initialize the region
3917    * @param ignoreHLog - true to skip generating a new hlog if it is null, mostly for createTable
3918    * @return new HRegion
3919    * @throws IOException
3920    */
3921   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
3922                                       final Configuration conf,
3923                                       final HTableDescriptor hTableDescriptor,
3924                                       final HLog hlog,
3925                                       final boolean initialize, final boolean ignoreHLog)
3926       throws IOException {
3927     LOG.info("creating HRegion " + info.getTable().getNameAsString()
3928         + " HTD == " + hTableDescriptor + " RootDir = " + rootDir +
3929         " Table name == " + info.getTable().getNameAsString());
3930 
3931     Path tableDir = FSUtils.getTableDir(rootDir, info.getTable());
3932     FileSystem fs = FileSystem.get(conf);
3933     HRegionFileSystem rfs = HRegionFileSystem.createRegionOnFileSystem(conf, fs, tableDir, info);
3934     HLog effectiveHLog = hlog;
3935     if (hlog == null && !ignoreHLog) {
3936       effectiveHLog = HLogFactory.createHLog(fs, rfs.getRegionDir(),
3937                                              HConstants.HREGION_LOGDIR_NAME, conf);
3938     }
3939     HRegion region = HRegion.newHRegion(tableDir,
3940         effectiveHLog, fs, conf, info, hTableDescriptor, null);
3941     if (initialize) {
3942       region.initialize();
3943     }
3944     return region;
3945   }
3946 
3947   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
3948                                       final Configuration conf,
3949                                       final HTableDescriptor hTableDescriptor,
3950                                       final HLog hlog)
3951     throws IOException {
3952     return createHRegion(info, rootDir, conf, hTableDescriptor, hlog, true);
3953   }
3954 
3955 
3956   /**
3957    * Open a Region.
3958    * @param info Info for region to be opened.
3959    * @param wal HLog for region to use. This method will call
3960    * HLog#setSequenceNumber(long) passing the result of the call to
3961    * HRegion#getMinSequenceId() to ensure the log id is properly kept
3962    * up.  HRegionServer does this every time it opens a new region.
3963    * @param conf
3964    * @return new HRegion
3965    *
3966    * @throws IOException
3967    */
3968   public static HRegion openHRegion(final HRegionInfo info,
3969       final HTableDescriptor htd, final HLog wal,
3970       final Configuration conf)
3971   throws IOException {
3972     return openHRegion(info, htd, wal, conf, null, null);
3973   }
3974 
3975   /**
3976    * Open a Region.
3977    * @param info Info for region to be opened
3978    * @param htd the table descriptor
3979    * @param wal HLog for region to use. This method will call
3980    * HLog#setSequenceNumber(long) passing the result of the call to
3981    * HRegion#getMinSequenceId() to ensure the log id is properly kept
3982    * up.  HRegionServer does this every time it opens a new region.
3983    * @param conf The Configuration object to use.
3984    * @param rsServices An interface we can request flushes against.
3985    * @param reporter An interface we can report progress against.
3986    * @return new HRegion
3987    *
3988    * @throws IOException
3989    */
3990   public static HRegion openHRegion(final HRegionInfo info,
3991     final HTableDescriptor htd, final HLog wal, final Configuration conf,
3992     final RegionServerServices rsServices,
3993     final CancelableProgressable reporter)
3994   throws IOException {
3995     return openHRegion(FSUtils.getRootDir(conf), info, htd, wal, conf, rsServices, reporter);
3996   }
3997 
3998   /**
3999    * Open a Region.
4000    * @param rootDir Root directory for HBase instance
4001    * @param info Info for region to be opened.
4002    * @param htd the table descriptor
4003    * @param wal HLog for region to use. This method will call
4004    * HLog#setSequenceNumber(long) passing the result of the call to
4005    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4006    * up.  HRegionStore does this every time it opens a new region.
4007    * @param conf The Configuration object to use.
4008    * @return new HRegion
4009    * @throws IOException
4010    */
4011   public static HRegion openHRegion(Path rootDir, final HRegionInfo info,
4012       final HTableDescriptor htd, final HLog wal, final Configuration conf)
4013   throws IOException {
4014     return openHRegion(rootDir, info, htd, wal, conf, null, null);
4015   }
4016 
4017   /**
4018    * Open a Region.
4019    * @param rootDir Root directory for HBase instance
4020    * @param info Info for region to be opened.
4021    * @param htd the table descriptor
4022    * @param wal HLog for region to use. This method will call
4023    * HLog#setSequenceNumber(long) passing the result of the call to
4024    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4025    * up.  HRegionStore does this every time it opens a new region.
4026    * @param conf The Configuration object to use.
4027    * @param rsServices An interface we can request flushes against.
4028    * @param reporter An interface we can report progress against.
4029    * @return new HRegion
4030    * @throws IOException
4031    */
4032   public static HRegion openHRegion(final Path rootDir, final HRegionInfo info,
4033       final HTableDescriptor htd, final HLog wal, final Configuration conf,
4034       final RegionServerServices rsServices,
4035       final CancelableProgressable reporter)
4036   throws IOException {
4037     FileSystem fs = null;
4038     if (rsServices != null) {
4039       fs = rsServices.getFileSystem();
4040     }
4041     if (fs == null) {
4042       fs = FileSystem.get(conf);
4043     }
4044     return openHRegion(conf, fs, rootDir, info, htd, wal, rsServices, reporter);
4045   }
4046 
4047   /**
4048    * Open a Region.
4049    * @param conf The Configuration object to use.
4050    * @param fs Filesystem to use
4051    * @param rootDir Root directory for HBase instance
4052    * @param info Info for region to be opened.
4053    * @param htd the table descriptor
4054    * @param wal HLog for region to use. This method will call
4055    * HLog#setSequenceNumber(long) passing the result of the call to
4056    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4057    * up.  HRegionStore does this every time it opens a new region.
4058    * @return new HRegion
4059    * @throws IOException
4060    */
4061   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4062       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal)
4063       throws IOException {
4064     return openHRegion(conf, fs, rootDir, info, htd, wal, null, null);
4065   }
4066 
4067   /**
4068    * Open a Region.
4069    * @param conf The Configuration object to use.
4070    * @param fs Filesystem to use
4071    * @param rootDir Root directory for HBase instance
4072    * @param info Info for region to be opened.
4073    * @param htd the table descriptor
4074    * @param wal HLog for region to use. This method will call
4075    * HLog#setSequenceNumber(long) passing the result of the call to
4076    * HRegion#getMinSequenceId() to ensure the log id is properly kept
4077    * up.  HRegionStore does this every time it opens a new region.
4078    * @param rsServices An interface we can request flushes against.
4079    * @param reporter An interface we can report progress against.
4080    * @return new HRegion
4081    * @throws IOException
4082    */
4083   public static HRegion openHRegion(final Configuration conf, final FileSystem fs,
4084       final Path rootDir, final HRegionInfo info, final HTableDescriptor htd, final HLog wal,
4085       final RegionServerServices rsServices, final CancelableProgressable reporter)
4086       throws IOException {
4087     if (info == null) throw new NullPointerException("Passed region info is null");
4088     if (LOG.isDebugEnabled()) {
4089       LOG.debug("Opening region: " + info);
4090     }
4091     Path dir = FSUtils.getTableDir(rootDir, info.getTable());
4092     HRegion r = HRegion.newHRegion(dir, wal, fs, conf, info, htd, rsServices);
4093     return r.openHRegion(reporter);
4094   }
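  /*
   * A minimal usage sketch for the openHRegion overloads above, assuming the
   * HRegionInfo "hri" and HTableDescriptor "htd" of an existing region are
   * already in hand:
   *
   *   Configuration conf = HBaseConfiguration.create();
   *   FileSystem fs = FileSystem.get(conf);
   *   Path rootDir = FSUtils.getRootDir(conf);
   *   HLog wal = HLogFactory.createHLog(fs, rootDir,
   *       HConstants.HREGION_LOGDIR_NAME, conf);
   *   HRegion region = HRegion.openHRegion(conf, fs, rootDir, hri, htd, wal);
   *   try {
   *     Result r = region.get(new Get(Bytes.toBytes("row")));
   *   } finally {
   *     region.close();
   *     wal.close();
   *   }
   */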
4095 
4096   /**
4097    * Useful when reopening a closed region (normally for unit tests)
4098    * @param other original object
4099    * @param reporter An interface we can report progress against.
4100    * @return new HRegion
4101    * @throws IOException
4102    */
4103   public static HRegion openHRegion(final HRegion other, final CancelableProgressable reporter)
4104       throws IOException {
4105     HRegionFileSystem regionFs = other.getRegionFileSystem();
4106     HRegion r = newHRegion(regionFs.getTableDir(), other.getLog(), regionFs.getFileSystem(),
4107         other.baseConf, other.getRegionInfo(), other.getTableDesc(), null);
4108     return r.openHRegion(reporter);
4109   }
4110 
4111   /**
4112    * Open HRegion.
4113    * Calls initialize and sets sequenceid.
4114    * @param reporter
4115    * @return Returns <code>this</code>
4116    * @throws IOException
4117    */
4118   protected HRegion openHRegion(final CancelableProgressable reporter)
4119   throws IOException {
4120     checkCompressionCodecs();
4121 
4122     this.openSeqNum = initialize(reporter);
4123     if (this.log != null) {
4124       this.log.setSequenceNumber(this.openSeqNum);
4125     }
4126 
4127     return this;
4128   }
4129 
4130   private void checkCompressionCodecs() throws IOException {
4131     for (HColumnDescriptor fam: this.htableDescriptor.getColumnFamilies()) {
4132       CompressionTest.testCompression(fam.getCompression());
4133       CompressionTest.testCompression(fam.getCompactionCompression());
4134     }
4135   }
4136 
4137   /**
4138    * Create a daughter region given a temp directory with the region data.
4139    * @param hri Spec. for daughter region to open.
4140    * @throws IOException
4141    */
4142   HRegion createDaughterRegionFromSplits(final HRegionInfo hri) throws IOException {
4143     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getLog(), fs.getFileSystem(),
4144         this.getBaseConf(), hri, this.getTableDesc(), rsServices);
4145     r.readRequestsCount.set(this.getReadRequestsCount() / 2);
4146     r.writeRequestsCount.set(this.getWriteRequestsCount() / 2);
4147     fs.commitDaughterRegion(hri);
4148     return r;
4149   }
4150 
4151   /**
4152    * Create a merged region given a temp directory with the region data.
4153    * @param mergedRegionInfo
4154    * @param region_b another merging region
4155    * @return merged hregion
4156    * @throws IOException
4157    */
4158   HRegion createMergedRegionFromMerges(final HRegionInfo mergedRegionInfo,
4159       final HRegion region_b) throws IOException {
4160     HRegion r = HRegion.newHRegion(this.fs.getTableDir(), this.getLog(),
4161         fs.getFileSystem(), this.getBaseConf(), mergedRegionInfo,
4162         this.getTableDesc(), this.rsServices);
4163     r.readRequestsCount.set(this.getReadRequestsCount()
4164         + region_b.getReadRequestsCount());
4165     r.writeRequestsCount.set(this.getWriteRequestsCount()
4166         + region_b.getWriteRequestsCount());
4167     this.fs.commitMergedRegion(mergedRegionInfo);
4168     return r;
4169   }
4170 
4171   /**
4172    * Inserts a new region's meta information into the passed
4173    * <code>meta</code> region. Used by the HMaster bootstrap code adding
4174    * new table to hbase:meta table.
4175    *
4176    * @param meta hbase:meta HRegion to be updated
4177    * @param r HRegion to add to <code>meta</code>
4178    *
4179    * @throws IOException
4180    */
4181   // TODO remove since only test and merge use this
4182   public static void addRegionToMETA(final HRegion meta, final HRegion r) throws IOException {
4183     meta.checkResources();
4184     // The row key is the region name
4185     byte[] row = r.getRegionName();
4186     final long now = EnvironmentEdgeManager.currentTimeMillis();
4187     final List<Cell> cells = new ArrayList<Cell>(2);
4188     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
4189       HConstants.REGIONINFO_QUALIFIER, now,
4190       r.getRegionInfo().toByteArray()));
4191     // Set into the root table the version of the meta table.
4192     cells.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
4193       HConstants.META_VERSION_QUALIFIER, now,
4194       Bytes.toBytes(HConstants.META_VERSION)));
4195     meta.put(row, HConstants.CATALOG_FAMILY, cells);
4196   }
4197 
4198   /**
4199    * Computes the Path of the HRegion
4200    *
4201    * @param tabledir qualified path for table
4202    * @param name ENCODED region name
4203    * @return Path of HRegion directory
4204    */
4205   @Deprecated
4206   public static Path getRegionDir(final Path tabledir, final String name) {
4207     return new Path(tabledir, name);
4208   }
4209 
4210   /**
4211    * Computes the Path of the HRegion
4212    *
4213    * @param rootdir qualified path of HBase root directory
4214    * @param info HRegionInfo for the region
4215    * @return qualified path of region directory
4216    */
4217   @Deprecated
4218   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
4219     return new Path(
4220       FSUtils.getTableDir(rootdir, info.getTable()), info.getEncodedName());
4221   }
4222 
4223   /**
4224    * Determines if the specified row is within the row range specified by the
4225    * specified HRegionInfo
4226    *
4227    * @param info HRegionInfo that specifies the row range
4228    * @param row row to be checked
4229    * @return true if the row is within the range specified by the HRegionInfo
4230    */
4231   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
4232     return ((info.getStartKey().length == 0) ||
4233         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
4234         ((info.getEndKey().length == 0) ||
4235             (Bytes.compareTo(info.getEndKey(), row) > 0));
4236   }
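  /*
   * For example, assuming "info" describes a region spanning [ "b", "f" ):
   *
   *   rowIsInRange(info, Bytes.toBytes("b"));   // true, start key is inclusive
   *   rowIsInRange(info, Bytes.toBytes("e"));   // true
   *   rowIsInRange(info, Bytes.toBytes("f"));   // false, end key is exclusive
   */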
4237 
4238   /**
4239    * Merge two HRegions.  The regions must be adjacent and must not overlap.
4240    *
4241    * @param srcA
4242    * @param srcB
4243    * @return new merged HRegion
4244    * @throws IOException
4245    */
4246   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
4247   throws IOException {
4248     HRegion a = srcA;
4249     HRegion b = srcB;
4250 
4251     // Make sure that srcA comes first; important for key-ordering during
4252     // write of the merged file.
4253     if (srcA.getStartKey() == null) {
4254       if (srcB.getStartKey() == null) {
4255         throw new IOException("Cannot merge two regions with null start key");
4256       }
4257       // A's start key is null but B's isn't. Assume A comes before B
4258     } else if ((srcB.getStartKey() == null) ||
4259       (Bytes.compareTo(srcA.getStartKey(), srcB.getStartKey()) > 0)) {
4260       a = srcB;
4261       b = srcA;
4262     }
4263 
4264     if (!(Bytes.compareTo(a.getEndKey(), b.getStartKey()) == 0)) {
4265       throw new IOException("Cannot merge non-adjacent regions");
4266     }
4267     return merge(a, b);
4268   }
4269 
4270   /**
4271    * Merge two regions whether they are adjacent or not.
4272    *
4273    * @param a region a
4274    * @param b region b
4275    * @return new merged region
4276    * @throws IOException
4277    */
4278   public static HRegion merge(final HRegion a, final HRegion b) throws IOException {
4279     if (!a.getRegionInfo().getTable().equals(b.getRegionInfo().getTable())) {
4280       throw new IOException("Regions do not belong to the same table");
4281     }
4282 
4283     FileSystem fs = a.getRegionFileSystem().getFileSystem();
4284     // Make sure each region's cache is empty
4285     a.flushcache();
4286     b.flushcache();
4287 
4288     // Compact each region so we only have one store file per family
4289     a.compactStores(true);
4290     if (LOG.isDebugEnabled()) {
4291       LOG.debug("Files for region: " + a);
4292       a.getRegionFileSystem().logFileSystemState(LOG);
4293     }
4294     b.compactStores(true);
4295     if (LOG.isDebugEnabled()) {
4296       LOG.debug("Files for region: " + b);
4297       b.getRegionFileSystem().logFileSystemState(LOG);
4298     }
4299 
4300     RegionMergeTransaction rmt = new RegionMergeTransaction(a, b, true);
4301     if (!rmt.prepare(null)) {
4302       throw new IOException("Unable to merge regions " + a + " and " + b);
4303     }
4304     HRegionInfo mergedRegionInfo = rmt.getMergedRegionInfo();
4305     LOG.info("starting merge of regions: " + a + " and " + b
4306         + " into new region " + mergedRegionInfo.getRegionNameAsString()
4307         + " with start key <"
4308         + Bytes.toStringBinary(mergedRegionInfo.getStartKey())
4309         + "> and end key <"
4310         + Bytes.toStringBinary(mergedRegionInfo.getEndKey()) + ">");
4311     HRegion dstRegion = null;
4312     try {
4313       dstRegion = rmt.execute(null, null);
4314     } catch (IOException ioe) {
4315       rmt.rollback(null, null);
4316       throw new IOException("Failed merging region " + a + " and " + b
4317           + ", and successfully rolled back", ioe);
4318     }
4319     dstRegion.compactStores(true);
4320 
4321     if (LOG.isDebugEnabled()) {
4322       LOG.debug("Files for new region");
4323       dstRegion.getRegionFileSystem().logFileSystemState(LOG);
4324     }
4325 
4326     if (dstRegion.getRegionFileSystem().hasReferences(dstRegion.getTableDesc())) {
4327       throw new IOException("Merged region " + dstRegion
4328           + " still has references after the compaction, is compaction canceled?");
4329     }
4330 
4331     // Archiving the 'A' region
4332     HFileArchiver.archiveRegion(a.getBaseConf(), fs, a.getRegionInfo());
4333     // Archiving the 'B' region
4334     HFileArchiver.archiveRegion(b.getBaseConf(), fs, b.getRegionInfo());
4335 
4336     LOG.info("merge completed. New region is " + dstRegion);
4337     return dstRegion;
4338   }
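  /*
   * A minimal sketch of the merge utility above, assuming two already-opened
   * regions of the same table; the source region directories are archived as
   * part of the call:
   *
   *   HRegion merged = HRegion.mergeAdjacent(regionA, regionB);
   *   // mergeAdjacent orders the regions and verifies adjacency; merge(a, b)
   *   // can be used when the regions are known not to be adjacent.
   */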
4339 
4340   /**
4341    * @return True if needs a major compaction.
4342    * @throws IOException
4343    */
4344   boolean isMajorCompaction() throws IOException {
4345     for (Store store : this.stores.values()) {
4346       if (store.isMajorCompaction()) {
4347         return true;
4348       }
4349     }
4350     return false;
4351   }
4352 
4353   //
4354   // HBASE-880
4355   //
4356   /**
4357    * @param get get object
4358    * @return result
4359    * @throws IOException read exceptions
4360    */
4361   public Result get(final Get get) throws IOException {
4362     checkRow(get.getRow(), "Get");
4363     // Verify families are all valid
4364     if (get.hasFamilies()) {
4365       for (byte [] family: get.familySet()) {
4366         checkFamily(family);
4367       }
4368     } else { // Adding all families to scanner
4369       for (byte[] family: this.htableDescriptor.getFamiliesKeys()) {
4370         get.addFamily(family);
4371       }
4372     }
4373     List<Cell> results = get(get, true);
4374     return Result.create(results, get.isCheckExistenceOnly() ? !results.isEmpty() : null);
4375   }
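  /*
   * A minimal sketch of a server-side read against this region, assuming a
   * family "f1" and qualifier "q1" that exist in the table:
   *
   *   Get g = new Get(Bytes.toBytes("row1"));
   *   g.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("q1"));
   *   Result result = region.get(g);
   *   byte[] value = result.getValue(Bytes.toBytes("f1"), Bytes.toBytes("q1"));
   */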
4376 
4377   /*
4378    * Do a get based on the get parameter.
4379    * @param withCoprocessor invoke coprocessor or not. We don't want to
4380    * always invoke cp for this private method.
4381    */
4382   private List<Cell> get(Get get, boolean withCoprocessor)
4383   throws IOException {
4384 
4385     List<Cell> results = new ArrayList<Cell>();
4386 
4387     // pre-get CP hook
4388     if (withCoprocessor && (coprocessorHost != null)) {
4389        if (coprocessorHost.preGet(get, results)) {
4390          return results;
4391        }
4392     }
4393 
4394     Scan scan = new Scan(get);
4395 
4396     RegionScanner scanner = null;
4397     try {
4398       scanner = getScanner(scan);
4399       scanner.next(results);
4400     } finally {
4401       if (scanner != null)
4402         scanner.close();
4403     }
4404 
4405     // post-get CP hook
4406     if (withCoprocessor && (coprocessorHost != null)) {
4407       coprocessorHost.postGet(get, results);
4408     }
4409 
4410     // do after lock
4411     if (this.metricsRegion != null) {
4412       long totalSize = 0L;
4413       if (results != null) {
4414         for (Cell kv:results) {
4415           totalSize += KeyValueUtil.ensureKeyValue(kv).getLength();
4416         }
4417       }
4418       this.metricsRegion.updateGet(totalSize);
4419     }
4420 
4421     return results;
4422   }
4423 
4424   public void mutateRow(RowMutations rm) throws IOException {
4425     mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
4426   }
4427 
4428   /**
4429    * Perform atomic mutations within the region.
4430    * @param mutations The list of mutations to perform.
4431    * <code>mutations</code> can contain operations for multiple rows.
4432    * Caller has to ensure that all rows are contained in this region.
4433    * @param rowsToLock Rows to lock
4434    * If multiple rows are locked care should be taken that
4435    * <code>rowsToLock</code> is sorted in order to avoid deadlocks.
4436    * @throws IOException
4437    */
4438   public void mutateRowsWithLocks(Collection<Mutation> mutations,
4439       Collection<byte[]> rowsToLock) throws IOException {
4440 
4441     MultiRowMutationProcessor proc =
4442         new MultiRowMutationProcessor(mutations, rowsToLock);
4443     processRowsWithLocks(proc, -1);
4444   }
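  /*
   * A minimal sketch of an atomic multi-row mutation, assuming both rows fall
   * in this region and family "f1" exists; rows are passed in sorted order to
   * avoid deadlocks:
   *
   *   List<Mutation> mutations = new ArrayList<Mutation>();
   *   mutations.add(new Put(Bytes.toBytes("row1")).add(Bytes.toBytes("f1"),
   *       Bytes.toBytes("q"), Bytes.toBytes("v1")));
   *   mutations.add(new Delete(Bytes.toBytes("row2")));
   *   region.mutateRowsWithLocks(mutations,
   *       Arrays.asList(Bytes.toBytes("row1"), Bytes.toBytes("row2")));
   */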
4445 
4446   /**
4447    * Performs atomic multiple reads and writes on a given row.
4448    *
4449    * @param processor The object defines the reads and writes to a row.
4450    */
4451   public void processRowsWithLocks(RowProcessor<?,?> processor)
4452       throws IOException {
4453     processRowsWithLocks(processor, rowProcessorTimeout);
4454   }
4455 
4456   /**
4457    * Performs atomic multiple reads and writes on a given row.
4458    *
4459    * @param processor The object defines the reads and writes to a row.
4460    * @param timeout The timeout of the processor.process() execution
4461    *                Use a negative number to switch off the time bound
4462    */
4463   public void processRowsWithLocks(RowProcessor<?,?> processor, long timeout)
4464       throws IOException {
4465 
4466     for (byte[] row : processor.getRowsToLock()) {
4467       checkRow(row, "processRowsWithLocks");
4468     }
4469     if (!processor.readOnly()) {
4470       checkReadOnly();
4471     }
4472     checkResources();
4473 
4474     startRegionOperation();
4475     WALEdit walEdit = new WALEdit();
4476 
4477     // 1. Run pre-process hook
4478     processor.preProcess(this, walEdit);
4479 
4480     // Short circuit the read only case
4481     if (processor.readOnly()) {
4482       try {
4483         long now = EnvironmentEdgeManager.currentTimeMillis();
4484         doProcessRowWithTimeout(
4485             processor, now, this, null, null, timeout);
4486         processor.postProcess(this, walEdit);
4487       } catch (IOException e) {
4488         throw e;
4489       } finally {
4490         closeRegionOperation();
4491       }
4492       return;
4493     }
4494 
4495     MultiVersionConsistencyControl.WriteEntry writeEntry = null;
4496     boolean locked = false;
4497     boolean walSyncSuccessful = false;
4498     List<RowLock> acquiredRowLocks = null;
4499     long addedSize = 0;
4500     List<KeyValue> mutations = new ArrayList<KeyValue>();
4501     Collection<byte[]> rowsToLock = processor.getRowsToLock();
4502     try {
4503       // 2. Acquire the row lock(s)
4504       acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
4505       for (byte[] row : rowsToLock) {
4506         // Attempt to lock all involved rows, throw if any lock times out
4507         acquiredRowLocks.add(getRowLock(row));
4508       }
4509       // 3. Region lock
4510       lock(this.updatesLock.readLock(), acquiredRowLocks.size());
4511       locked = true;
4512 
4513       long now = EnvironmentEdgeManager.currentTimeMillis();
4514       try {
4515         // 4. Let the processor scan the rows, generate mutations and add
4516         //    waledits
4517         doProcessRowWithTimeout(
4518             processor, now, this, mutations, walEdit, timeout);
4519 
4520         if (!mutations.isEmpty()) {
4521           // 5. Get a mvcc write number
4522           writeEntry = mvcc.beginMemstoreInsert();
4523           // 6. Apply to memstore
4524           for (KeyValue kv : mutations) {
4525             kv.setMvccVersion(writeEntry.getWriteNumber());
4526             byte[] family = kv.getFamily();
4527             checkFamily(family);
4528             addedSize += stores.get(family).add(kv);
4529           }
4530 
4531           long txid = 0;
4532           // 7. Append no sync
4533           if (!walEdit.isEmpty()) {
4534             txid = this.log.appendNoSync(this.getRegionInfo(), this.htableDescriptor.getTableName(),
4535                   walEdit, processor.getClusterIds(), now, this.htableDescriptor);
4536           }
4537           // 8. Release region lock
4538           if (locked) {
4539             this.updatesLock.readLock().unlock();
4540             locked = false;
4541           }
4542 
4543           // 9. Release row lock(s)
4544           releaseRowLocks(acquiredRowLocks);
4545 
4546           // 10. Sync edit log
4547           if (txid != 0) {
4548             syncOrDefer(txid, getEffectiveDurability(processor.useDurability()));
4549           }
4550           walSyncSuccessful = true;
4551         }
4552       } finally {
4553         if (!mutations.isEmpty() && !walSyncSuccessful) {
4554           LOG.warn("Wal sync failed. Roll back " + mutations.size() +
4555               " memstore keyvalues for row(s):" +
4556               processor.getRowsToLock().iterator().next() + "...");
4557           for (KeyValue kv : mutations) {
4558             stores.get(kv.getFamily()).rollback(kv);
4559           }
4560         }
4561         // 11. Roll mvcc forward
4562         if (writeEntry != null) {
4563           mvcc.completeMemstoreInsert(writeEntry);
4564           writeEntry = null;
4565         }
4566         if (locked) {
4567           this.updatesLock.readLock().unlock();
4568           locked = false;
4569         }
4570         // release locks if some were acquired but another timed out
4571         releaseRowLocks(acquiredRowLocks);
4572       }
4573 
4574       // 12. Run post-process hook
4575       processor.postProcess(this, walEdit);
4576 
4577     } catch (IOException e) {
4578       throw e;
4579     } finally {
4580       closeRegionOperation();
4581       if (!mutations.isEmpty() &&
4582           isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize))) {
4583         requestFlush();
4584       }
4585     }
4586   }
4587 
4588   private void doProcessRowWithTimeout(final RowProcessor<?,?> processor,
4589                                        final long now,
4590                                        final HRegion region,
4591                                        final List<KeyValue> mutations,
4592                                        final WALEdit walEdit,
4593                                        final long timeout) throws IOException {
4594     // Short circuit the no time bound case.
4595     if (timeout < 0) {
4596       try {
4597         processor.process(now, region, mutations, walEdit);
4598       } catch (IOException e) {
4599         LOG.warn("RowProcessor:" + processor.getClass().getName() +
4600             " throws Exception on row(s):" +
4601             Bytes.toStringBinary(
4602               processor.getRowsToLock().iterator().next()) + "...", e);
4603         throw e;
4604       }
4605       return;
4606     }
4607 
4608     // Case with time bound
4609     FutureTask<Void> task =
4610       new FutureTask<Void>(new Callable<Void>() {
4611         @Override
4612         public Void call() throws IOException {
4613           try {
4614             processor.process(now, region, mutations, walEdit);
4615             return null;
4616           } catch (IOException e) {
4617             LOG.warn("RowProcessor:" + processor.getClass().getName() +
4618                 " throws Exception on row(s):" +
4619                 Bytes.toStringBinary(
4620                     processor.getRowsToLock().iterator().next()) + "...", e);
4621             throw e;
4622           }
4623         }
4624       });
4625     rowProcessorExecutor.execute(task);
4626     try {
4627       task.get(timeout, TimeUnit.MILLISECONDS);
4628     } catch (TimeoutException te) {
4629       LOG.error("RowProcessor timeout:" + timeout + " ms on row(s):" +
4630           Bytes.toStringBinary(processor.getRowsToLock().iterator().next()) +
4631           "...");
4632       throw new IOException(te);
4633     } catch (Exception e) {
4634       throw new IOException(e);
4635     }
4636   }
4637 
4638   // TODO: There's a lot of boiler plate code identical
4639   // to increment... See how to better unify that.
4640   /**
4641    * Perform one or more append operations on a row.
4642    *
4643    * @param append
4644    * @return new keyvalues after increment
4645    * @throws IOException
4646    */
4647   public Result append(Append append)
4648       throws IOException {
4649     byte[] row = append.getRow();
4650     checkRow(row, "append");
4651     boolean flush = false;
4652     Durability durability = getEffectiveDurability(append.getDurability());
4653     boolean writeToWAL = durability != Durability.SKIP_WAL;
4654     WALEdit walEdits = null;
4655     List<Cell> allKVs = new ArrayList<Cell>(append.size());
4656     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
4657 
4658     long size = 0;
4659     long txid = 0;
4660 
4661     checkReadOnly();
4662     checkResources();
4663     // Lock row
4664     startRegionOperation(Operation.APPEND);
4665     this.writeRequestsCount.increment();
4666     WriteEntry w = null;
4667     RowLock rowLock = null;
4668     try {
4669       rowLock = getRowLock(row);
4670       try {
4671         lock(this.updatesLock.readLock());
4672         // wait for all prior MVCC transactions to finish - while we hold the row lock
4673         // (so that we are guaranteed to see the latest state)
4674         mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
4675         // now start my own transaction
4676         w = mvcc.beginMemstoreInsert();
4677         try {
4678           long now = EnvironmentEdgeManager.currentTimeMillis();
4679           // Process each family
4680           for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) {
4681 
4682             Store store = stores.get(family.getKey());
4683             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
4684   
4685             Collections.sort(family.getValue(), store.getComparator());
4686             // Get previous values for all columns in this family
4687             Get get = new Get(row);
4688             for (Cell cell : family.getValue()) {
4689               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
4690               get.addColumn(family.getKey(), kv.getQualifier());
4691             }
4692             List<Cell> results = get(get, false);
4693   
4694             // Iterate the input columns and update existing values if they were
4695             // found, otherwise add new column initialized to the append value
4696 
4697             // Avoid as much copying as possible. Every byte is copied at most
4698             // once.
4699             // Would be nice if KeyValue had scatter/gather logic
4700             int idx = 0;
4701             for (Cell cell : family.getValue()) {
4702               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
4703               KeyValue newKV;
4704               if (idx < results.size()
4705                   && CellUtil.matchingQualifier(results.get(idx),kv)) {
4706                 KeyValue oldKv = KeyValueUtil.ensureKeyValue(results.get(idx));
4707                 // allocate an empty kv once
4708                 newKV = new KeyValue(row.length, kv.getFamilyLength(),
4709                     kv.getQualifierLength(), now, KeyValue.Type.Put,
4710                     oldKv.getValueLength() + kv.getValueLength());
4711                 // copy in the value
4712                 System.arraycopy(oldKv.getBuffer(), oldKv.getValueOffset(),
4713                     newKV.getBuffer(), newKV.getValueOffset(),
4714                     oldKv.getValueLength());
4715                 System.arraycopy(kv.getBuffer(), kv.getValueOffset(),
4716                     newKV.getBuffer(),
4717                     newKV.getValueOffset() + oldKv.getValueLength(),
4718                     kv.getValueLength());
4719                 idx++;
4720               } else {
4721                 // allocate an empty kv once
4722                 newKV = new KeyValue(row.length, kv.getFamilyLength(),
4723                     kv.getQualifierLength(), now, KeyValue.Type.Put,
4724                     kv.getValueLength());
4725                 // copy in the value
4726                 System.arraycopy(kv.getBuffer(), kv.getValueOffset(),
4727                     newKV.getBuffer(), newKV.getValueOffset(),
4728                     kv.getValueLength());
4729               }
4730               // copy in row, family, and qualifier
4731               System.arraycopy(kv.getBuffer(), kv.getRowOffset(),
4732                   newKV.getBuffer(), newKV.getRowOffset(), kv.getRowLength());
4733               System.arraycopy(kv.getBuffer(), kv.getFamilyOffset(),
4734                   newKV.getBuffer(), newKV.getFamilyOffset(),
4735                   kv.getFamilyLength());
4736               System.arraycopy(kv.getBuffer(), kv.getQualifierOffset(),
4737                   newKV.getBuffer(), newKV.getQualifierOffset(),
4738                   kv.getQualifierLength());
4739   
4740               newKV.setMvccVersion(w.getWriteNumber());
4741               kvs.add(newKV);
4742 
4743               // Append update to WAL
4744               if (writeToWAL) {
4745                 if (walEdits == null) {
4746                   walEdits = new WALEdit();
4747                 }
4748                 walEdits.add(newKV);
4749               }
4750             }
4751 
4752             //store the kvs to the temporary memstore before writing HLog
4753             tempMemstore.put(store, kvs);
4754           }
4755 
4756           // Actually write to WAL now
4757           if (writeToWAL) {
4758             // Using default cluster id, as this can only happen in the originating
4759             // cluster. A slave cluster receives the final value (not the delta)
4760             // as a Put.
4761             txid = this.log.appendNoSync(this.getRegionInfo(), this.htableDescriptor.getTableName(),
4762               walEdits, new ArrayList<UUID>(), EnvironmentEdgeManager.currentTimeMillis(),
4763                   this.htableDescriptor);
4764           } else {
4765             recordMutationWithoutWal(append.getFamilyCellMap());
4766           }
4767 
4768           //Actually write to Memstore now
4769           for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
4770             Store store = entry.getKey();
4771             if (store.getFamily().getMaxVersions() == 1) {
4772               // upsert if VERSIONS for this CF == 1
4773               size += store.upsert(entry.getValue(), getSmallestReadPoint());
4774             } else {
4775               // otherwise keep older versions around
4776               for (Cell cell: entry.getValue()) {
4777                 KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
4778                 size += store.add(kv);
4779               }
4780             }
4781             allKVs.addAll(entry.getValue());
4782           }
4783           size = this.addAndGetGlobalMemstoreSize(size);
4784           flush = isFlushSize(size);
4785         } finally {
4786           this.updatesLock.readLock().unlock();
4787         }
4788       } finally {
4789         rowLock.release();
4790       }
4791       if (writeToWAL) {
4792         // sync the transaction log outside the rowlock
4793         syncOrDefer(txid, durability);
4794       }
4795     } finally {
4796       if (w != null) {
4797         mvcc.completeMemstoreInsert(w);
4798       }
4799       closeRegionOperation();
4800     }
4801 
4802     if (this.metricsRegion != null) {
4803       this.metricsRegion.updateAppend();
4804     }
4805 
4806     if (flush) {
4807       // Request a cache flush. Do it outside update lock.
4808       requestFlush();
4809     }
4810 
4811 
4812     return append.isReturnResults() ? Result.create(allKVs) : null;
4813   }
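  /*
   * A minimal sketch of an append, assuming family "f1" exists; the returned
   * Result carries the new, concatenated values:
   *
   *   Append a = new Append(Bytes.toBytes("row1"));
   *   a.add(Bytes.toBytes("f1"), Bytes.toBytes("q1"), Bytes.toBytes("-suffix"));
   *   Result result = region.append(a);
   */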
4814 
4815   /**
4816    * Perform one or more increment operations on a row.
4817    * @param increment
4818    * @return new keyvalues after increment
4819    * @throws IOException
4820    */
4821   public Result increment(Increment increment)
4822   throws IOException {
4823     byte [] row = increment.getRow();
4824     checkRow(row, "increment");
4825     TimeRange tr = increment.getTimeRange();
4826     boolean flush = false;
4827     Durability durability = getEffectiveDurability(increment.getDurability());
4828     boolean writeToWAL = durability != Durability.SKIP_WAL;
4829     WALEdit walEdits = null;
4830     List<Cell> allKVs = new ArrayList<Cell>(increment.size());
4831     Map<Store, List<Cell>> tempMemstore = new HashMap<Store, List<Cell>>();
4832 
4833     long size = 0;
4834     long txid = 0;
4835 
4836     checkReadOnly();
4837     checkResources();
4838     // Lock row
4839     startRegionOperation(Operation.INCREMENT);
4840     this.writeRequestsCount.increment();
4841     WriteEntry w = null;
4842     try {
4843       RowLock rowLock = getRowLock(row);
4844       try {
4845         lock(this.updatesLock.readLock());
4846         // wait for all prior MVCC transactions to finish - while we hold the row lock
4847         // (so that we are guaranteed to see the latest state)
4848         mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert());
4849         // now start my own transaction
4850         w = mvcc.beginMemstoreInsert();
4851         try {
4852           long now = EnvironmentEdgeManager.currentTimeMillis();
4853           // Process each family
4854           for (Map.Entry<byte [], List<Cell>> family:
4855               increment.getFamilyCellMap().entrySet()) {
4856 
4857             Store store = stores.get(family.getKey());
4858             List<Cell> kvs = new ArrayList<Cell>(family.getValue().size());
4859 
4860             // Get previous values for all columns in this family
4861             Get get = new Get(row);
4862             for (Cell cell: family.getValue()) {
4863               KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
4864               get.addColumn(family.getKey(), kv.getQualifier());
4865             }
4866             get.setTimeRange(tr.getMin(), tr.getMax());
4867             List<Cell> results = get(get, false);
4868   
4869             // Iterate the input columns and update existing values if they were
4870             // found, otherwise add new column initialized to the increment amount
4871             int idx = 0;
4872             for (Cell kv: family.getValue()) {
4873               long amount = Bytes.toLong(CellUtil.cloneValue(kv));
4874               if (idx < results.size() && CellUtil.matchingQualifier(results.get(idx), kv)) {
4875                 Cell c = results.get(idx);
4876                 if(c.getValueLength() == Bytes.SIZEOF_LONG) {
4877                   amount += Bytes.toLong(c.getValueArray(), c.getValueOffset(), Bytes.SIZEOF_LONG);
4878                 } else {
4879                   // throw DoNotRetryIOException instead of IllegalArgumentException
4880                   throw new org.apache.hadoop.hbase.DoNotRetryIOException(
4881                       "Attempted to increment field that isn't 64 bits wide");
4882                 }
4883                 idx++;
4884               }
4885 
4886               // Append new incremented KeyValue to list
4887               KeyValue newKV =
4888                 new KeyValue(row, family.getKey(), CellUtil.cloneQualifier(kv), now, Bytes.toBytes(amount));
4889               newKV.setMvccVersion(w.getWriteNumber());
4890               kvs.add(newKV);
4891 
4892               // Prepare WAL updates
4893               if (writeToWAL) {
4894                 if (walEdits == null) {
4895                   walEdits = new WALEdit();
4896                 }
4897                 walEdits.add(newKV);
4898               }
4899             }
4900 
4901             //store the kvs to the temporary memstore before writing HLog
4902             tempMemstore.put(store, kvs);
4903           }
4904 
4905           // Actually write to WAL now
4906           if (writeToWAL) {
4907             // Using default cluster id, as this can only happen in the originating
4908             // cluster. A slave cluster receives the final value (not the delta)
4909             // as a Put.
4910             txid = this.log.appendNoSync(this.getRegionInfo(), this.htableDescriptor.getTableName(),
4911                 walEdits, new ArrayList<UUID>(), EnvironmentEdgeManager.currentTimeMillis(),
4912                   this.htableDescriptor);
4913           } else {
4914             recordMutationWithoutWal(increment.getFamilyCellMap());
4915           }
4916           //Actually write to Memstore now
4917           for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
4918             Store store = entry.getKey();
4919             if (store.getFamily().getMaxVersions() == 1) {
4920               // upsert if VERSIONS for this CF == 1
4921               size += store.upsert(entry.getValue(), getSmallestReadPoint());
4922             } else {
4923               // otherwise keep older versions around
4924               for (Cell cell : entry.getValue()) {
4925                 KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
4926                 size += store.add(kv);
4927               }
4928             }
4929             allKVs.addAll(entry.getValue());
4930           }
4931           size = this.addAndGetGlobalMemstoreSize(size);
4932           flush = isFlushSize(size);
4933         } finally {
4934           this.updatesLock.readLock().unlock();
4935         }
4936       } finally {
4937         rowLock.release();
4938       }
4939       if (writeToWAL) {
4940         // sync the transaction log outside the rowlock
4941         syncOrDefer(txid, durability);
4942       }
4943     } finally {
4944       if (w != null) {
4945         mvcc.completeMemstoreInsert(w);
4946       }
4947       closeRegionOperation();
4948       if (this.metricsRegion != null) {
4949         this.metricsRegion.updateIncrement();
4950       }
4951     }
4952 
4953     if (flush) {
4954       // Request a cache flush.  Do it outside update lock.
4955       requestFlush();
4956     }
4957 
4958     return Result.create(allKVs);
4959   }
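  /*
   * A minimal sketch of an increment, assuming family "f1" exists and any
   * current value of the column is an 8-byte long:
   *
   *   Increment inc = new Increment(Bytes.toBytes("row1"));
   *   inc.addColumn(Bytes.toBytes("f1"), Bytes.toBytes("counter"), 5L);
   *   Result result = region.increment(inc);
   *   long newValue = Bytes.toLong(result.getValue(Bytes.toBytes("f1"),
   *       Bytes.toBytes("counter")));
   */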
4960 
4961   //
4962   // New HBASE-880 Helpers
4963   //
4964 
4965   private void checkFamily(final byte [] family)
4966   throws NoSuchColumnFamilyException {
4967     if (!this.htableDescriptor.hasFamily(family)) {
4968       throw new NoSuchColumnFamilyException("Column family " +
4969           Bytes.toString(family) + " does not exist in region " + this
4970           + " in table " + this.htableDescriptor);
4971     }
4972   }
4973 
4974   public static final long FIXED_OVERHEAD = ClassSize.align(
4975       ClassSize.OBJECT +
4976       ClassSize.ARRAY +
4977       40 * ClassSize.REFERENCE + 2 * Bytes.SIZEOF_INT +
4978       (11 * Bytes.SIZEOF_LONG) +
4979       5 * Bytes.SIZEOF_BOOLEAN);
4980 
4981   // woefully out of date - currently missing:
4982   // 1 x HashMap - coprocessorServiceHandlers
4983   // 6 org.cliffc.high_scale_lib.Counter - numMutationsWithoutWAL, dataInMemoryWithoutWAL,
4984   //   checkAndMutateChecksPassed, checkAndMutateChecksFailed, readRequestsCount,
4985   //   writeRequestsCount, updatesBlockedMs
4986   // 1 x HRegion$WriteState - writestate
4987   // 1 x RegionCoprocessorHost - coprocessorHost
4988   // 1 x RegionSplitPolicy - splitPolicy
4989   // 1 x MetricsRegion - metricsRegion
4990   // 1 x MetricsRegionWrapperImpl - metricsRegionWrapper
4991   public static final long DEEP_OVERHEAD = FIXED_OVERHEAD +
4992       ClassSize.OBJECT + // closeLock
4993       (2 * ClassSize.ATOMIC_BOOLEAN) + // closed, closing
4994       (3 * ClassSize.ATOMIC_LONG) + // memStoreSize, numPutsWithoutWAL, dataInMemoryWithoutWAL
4995       (2 * ClassSize.CONCURRENT_HASHMAP) +  // lockedRows, scannerReadPoints
4996       WriteState.HEAP_SIZE + // writestate
4997       ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + // stores
4998       (2 * ClassSize.REENTRANT_LOCK) + // lock, updatesLock
4999       ClassSize.ARRAYLIST + // recentFlushes
5000       MultiVersionConsistencyControl.FIXED_SIZE // mvcc
5001       + ClassSize.TREEMAP // maxSeqIdInStores
5002       + 2 * ClassSize.ATOMIC_INTEGER // majorInProgress, minorInProgress
5003       ;
5004 
5005   @Override
5006   public long heapSize() {
5007     long heapSize = DEEP_OVERHEAD;
5008     for (Store store : this.stores.values()) {
5009       heapSize += store.heapSize();
5010     }
5011     // this does not take into account row locks, recent flushes, mvcc entries, and more
5012     return heapSize;
5013   }
5014 
5015   /*
5016    * This method calls System.exit.
5017    * @param message Message to print out.  May be null.
5018    */
5019   private static void printUsageAndExit(final String message) {
5020     if (message != null && message.length() > 0) System.out.println(message);
5021     System.out.println("Usage: HRegion CATALOG_TABLE_DIR [major_compact]");
5022     System.out.println("Options:");
5023     System.out.println(" major_compact  Pass this option to major compact " +
5024       "the passed region.");
5025     System.out.println("By default, outputs a scan of the passed region.");
5026     System.exit(1);
5027   }
5028 
5029   /**
5030    * Registers a new protocol buffer {@link Service} subclass as a coprocessor endpoint to
5031    * be available for handling
5032    * {@link HRegion#execService(com.google.protobuf.RpcController,
5033    *    org.apache.hadoop.hbase.protobuf.generated.ClientProtos.CoprocessorServiceCall)} calls.
5034    *
5035    * <p>
5036    * Only a single instance may be registered per region for a given {@link Service} subclass (the
5037    * instances are keyed on {@link com.google.protobuf.Descriptors.ServiceDescriptor#getFullName()}).
5038    * After the first registration, subsequent calls with the same service name will fail with
5039    * a return value of {@code false}.
5040    * </p>
5041    * @param instance the {@code Service} subclass instance to expose as a coprocessor endpoint
5042    * @return {@code true} if the registration was successful, {@code false}
5043    * otherwise
5044    */
5045   public boolean registerService(Service instance) {
5046     /*
5047      * No stacking of instances is allowed for a single service name
5048      */
5049     Descriptors.ServiceDescriptor serviceDesc = instance.getDescriptorForType();
5050     if (coprocessorServiceHandlers.containsKey(serviceDesc.getFullName())) {
5051       LOG.error("Coprocessor service "+serviceDesc.getFullName()+
5052           " already registered, rejecting request from "+instance
5053       );
5054       return false;
5055     }
5056 
5057     coprocessorServiceHandlers.put(serviceDesc.getFullName(), instance);
5058     if (LOG.isDebugEnabled()) {
5059       LOG.debug("Registered coprocessor service: region="+
5060           Bytes.toStringBinary(getRegionName())+" service="+serviceDesc.getFullName());
5061     }
5062     return true;
5063   }
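  /*
   * A minimal sketch of registering an endpoint, assuming MyProtos.MyService
   * is a protobuf-generated service and "impl" implements
   * MyProtos.MyService.Interface:
   *
   *   Service endpoint = MyProtos.MyService.newReflectiveService(impl);
   *   if (!region.registerService(endpoint)) {
   *     // a handler with this service name is already registered on the region
   *   }
   */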
5064 
5065   /**
5066    * Executes a single protocol buffer coprocessor endpoint {@link Service} method using
5067    * the registered protocol handlers.  {@link Service} implementations must be registered via the
5068    * {@link HRegion#registerService(com.google.protobuf.Service)}
5069    * method before they are available.
5070    *
5071    * @param controller an {@code RpcController} implementation to pass to the invoked service
5072    * @param call a {@code CoprocessorServiceCall} instance identifying the service, method,
5073    *     and parameters for the method invocation
5074    * @return a protocol buffer {@code Message} instance containing the method's result
5075    * @throws IOException if no registered service handler is found or an error
5076    *     occurs during the invocation
5077    * @see org.apache.hadoop.hbase.regionserver.HRegion#registerService(com.google.protobuf.Service)
5078    */
5079   public Message execService(RpcController controller, CoprocessorServiceCall call)
5080       throws IOException {
5081     String serviceName = call.getServiceName();
5082     String methodName = call.getMethodName();
5083     if (!coprocessorServiceHandlers.containsKey(serviceName)) {
5084       throw new UnknownProtocolException(null,
5085           "No registered coprocessor service found for name "+serviceName+
5086           " in region "+Bytes.toStringBinary(getRegionName()));
5087     }
5088 
5089     Service service = coprocessorServiceHandlers.get(serviceName);
5090     Descriptors.ServiceDescriptor serviceDesc = service.getDescriptorForType();
5091     Descriptors.MethodDescriptor methodDesc = serviceDesc.findMethodByName(methodName);
5092     if (methodDesc == null) {
5093       throw new UnknownProtocolException(service.getClass(),
5094           "Unknown method "+methodName+" called on service "+serviceName+
5095               " in region "+Bytes.toStringBinary(getRegionName()));
5096     }
5097 
5098     Message request = service.getRequestPrototype(methodDesc).newBuilderForType()
5099         .mergeFrom(call.getRequest()).build();
5100     final Message.Builder responseBuilder =
5101         service.getResponsePrototype(methodDesc).newBuilderForType();
5102     service.callMethod(methodDesc, controller, request, new RpcCallback<Message>() {
5103       @Override
5104       public void run(Message message) {
5105         if (message != null) {
5106           responseBuilder.mergeFrom(message);
5107         }
5108       }
5109     });
5110 
5111     return responseBuilder.build();
5112   }
5113 
5114   /*
5115    * Process table.
5116    * Do major compaction or list content.
5117    * @param fs
5118    * @param p
5119    * @param log
5120    * @param c
5121    * @param majorCompact
5122    * @throws IOException
5123    */
5124   private static void processTable(final FileSystem fs, final Path p,
5125       final HLog log, final Configuration c,
5126       final boolean majorCompact)
5127   throws IOException {
5128     HRegion region = null;
5129     // Currently expects tables have one region only.
5130     if (FSUtils.getTableName(p).equals(TableName.META_TABLE_NAME)) {
5131       region = HRegion.newHRegion(p, log, fs, c,
5132         HRegionInfo.FIRST_META_REGIONINFO, HTableDescriptor.META_TABLEDESC, null);
5133     } else {
5134       throw new IOException("Not a known catalog table: " + p.toString());
5135     }
5136     try {
5137       region.initialize();
5138       if (majorCompact) {
5139         region.compactStores(true);
5140       } else {
5141         // Default behavior
5142         Scan scan = new Scan();
5143         // scan.addFamily(HConstants.CATALOG_FAMILY);
5144         RegionScanner scanner = region.getScanner(scan);
5145         try {
5146           List<Cell> kvs = new ArrayList<Cell>();
5147           boolean done = false;
5148           do {
5149             kvs.clear();
5150             done = scanner.next(kvs);
5151             if (kvs.size() > 0) LOG.info(kvs);
5152           } while (done);
5153         } finally {
5154           scanner.close();
5155         }
5156       }
5157     } finally {
5158       region.close();
5159     }
5160   }
5161 
5162   boolean shouldForceSplit() {
5163     return this.splitRequest;
5164   }
5165 
5166   byte[] getExplicitSplitPoint() {
5167     return this.explicitSplitPoint;
5168   }
5169 
5170   void forceSplit(byte[] sp) {
5171     // NOTE: this HRegion will go away after the forced split is successful
5172     //        therefore, no reason to clear this value
5173     this.splitRequest = true;
5174     if (sp != null) {
5175       this.explicitSplitPoint = sp;
5176     }
5177   }
5178 
5179   void clearSplit_TESTS_ONLY() {
5180     this.splitRequest = false;
5181   }
5182 
5183   /**
5184    * Give the region a chance to prepare before it is split.
5185    */
5186   protected void prepareToSplit() {
5187     // nothing
5188   }
5189 
5190   /**
5191    * Return the splitpoint. null indicates the region isn't splittable.
5192    * If the splitpoint isn't explicitly specified, it will go over the stores
5193    * to find the best splitpoint. Currently the best splitpoint is chosen
5194    * based on the size of the store.
5195    */
5196   public byte[] checkSplit() {
5197     // Can't split META
5198     if (this.getRegionInfo().isMetaTable() ||
5199         TableName.NAMESPACE_TABLE_NAME.equals(this.getRegionInfo().getTable())) {
5200       if (shouldForceSplit()) {
5201         LOG.warn("Cannot split meta region in HBase 0.20 and above");
5202       }
5203       return null;
5204     }
5205 
5206     // Can't split region which is in recovering state
5207     if (this.isRecovering()) {
5208       LOG.info("Cannot split region " + this.getRegionInfo().getEncodedName() + " in recovery.");
5209       return null;
5210     }
5211 
5212     if (!splitPolicy.shouldSplit()) {
5213       return null;
5214     }
5215 
5216     byte[] ret = splitPolicy.getSplitPoint();
5217 
5218     if (ret != null) {
5219       try {
5220         checkRow(ret, "calculated split");
5221       } catch (IOException e) {
5222         LOG.error("Ignoring invalid split", e);
5223         return null;
5224       }
5225     }
5226     return ret;
5227   }
5228 
5229   /**
5230    * @return The priority that this region should have in the compaction queue
5231    */
5232   public int getCompactPriority() {
5233     int count = Integer.MAX_VALUE;
5234     for (Store store : stores.values()) {
5235       count = Math.min(count, store.getCompactPriority());
5236     }
5237     return count;
5238   }
5239 
5240   /**
5241    * Checks every store to see if one has too many
5242    * store files
5243    * @return true if any store has too many store files
5244    */
5245   public boolean needsCompaction() {
5246     for (Store store : stores.values()) {
5247       if(store.needsCompaction()) {
5248         return true;
5249       }
5250     }
5251     return false;
5252   }
5253 
5254   /** @return the coprocessor host */
5255   public RegionCoprocessorHost getCoprocessorHost() {
5256     return coprocessorHost;
5257   }
5258 
5259   /** @param coprocessorHost the new coprocessor host */
5260   public void setCoprocessorHost(final RegionCoprocessorHost coprocessorHost) {
5261     this.coprocessorHost = coprocessorHost;
5262   }
5263 
5264   /**
5265    * This method needs to be called before any public call that reads or
5266    * modifies data. It has to be called just before a try.
5267    * #closeRegionOperation needs to be called in the try's finally block
5268    * Acquires a read lock and checks if the region is closing or closed.
5269    * @throws NotServingRegionException when the region is closing or closed
5270    * @throws RegionTooBusyException if failed to get the lock in time
5271    * @throws InterruptedIOException if interrupted while waiting for a lock
5272    */
5273   public void startRegionOperation()
5274       throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
5275     startRegionOperation(Operation.ANY);
5276   }
5277 
5278   /**
5279    * @param op The operation is about to be taken on the region
5280    * @throws NotServingRegionException
5281    * @throws RegionTooBusyException
5282    * @throws InterruptedIOException
5283    */
5284   protected void startRegionOperation(Operation op) throws NotServingRegionException,
5285       RegionTooBusyException, InterruptedIOException {
5286     switch (op) {
5287     case INCREMENT:
5288     case APPEND:
5289     case GET:
5290     case SCAN:
5291     case SPLIT_REGION:
5292     case MERGE_REGION:
5293     case PUT:
5294     case DELETE:
5295     case BATCH_MUTATE:
5296     case COMPACT_REGION:
5297       // when a region is in recovering state, no read, split or merge is allowed
5298       if (this.isRecovering() && (this.disallowWritesInRecovering ||
5299               (op != Operation.PUT && op != Operation.DELETE && op != Operation.BATCH_MUTATE))) {
5300         throw new RegionInRecoveryException(this.getRegionNameAsString() + " is recovering");
5301       }
5302       break;
5303     default:
5304       break;
5305     }
5306     if (op == Operation.MERGE_REGION || op == Operation.SPLIT_REGION
5307         || op == Operation.COMPACT_REGION) {
5308       // split, merge or compact region doesn't need to check the closing/closed state or lock the
5309       // region
5310       return;
5311     }
5312     if (this.closing.get()) {
5313       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
5314     }
5315     lock(lock.readLock());
5316     if (this.closed.get()) {
5317       lock.readLock().unlock();
5318       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
5319     }
5320   }
5321 
5322   /**
5323    * Closes the lock. This needs to be called in the finally block corresponding
5324    * to the try block of #startRegionOperation
5325    */
5326   public void closeRegionOperation() {
5327     lock.readLock().unlock();
5328   }
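  /*
   * Callers pair startRegionOperation() and closeRegionOperation() around any
   * read or write against the region, for example:
   *
   *   region.startRegionOperation();
   *   try {
   *     // read or mutate the region here
   *   } finally {
   *     region.closeRegionOperation();
   *   }
   */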
5329 
5330   /**
5331    * This method needs to be called before any public call that reads or
5332    * modifies stores in bulk. It has to be called just before a try.
5333    * #closeBulkRegionOperation needs to be called in the try's finally block
5334    * Acquires a writelock and checks if the region is closing or closed.
5335    * @throws NotServingRegionException when the region is closing or closed
5336    * @throws RegionTooBusyException if failed to get the lock in time
5337    * @throws InterruptedIOException if interrupted while waiting for a lock
5338    */
5339   private void startBulkRegionOperation(boolean writeLockNeeded)
5340       throws NotServingRegionException, RegionTooBusyException, InterruptedIOException {
5341     if (this.closing.get()) {
5342       throw new NotServingRegionException(getRegionNameAsString() + " is closing");
5343     }
5344     if (writeLockNeeded) lock(lock.writeLock());
5345     else lock(lock.readLock());
5346     if (this.closed.get()) {
5347       if (writeLockNeeded) lock.writeLock().unlock();
5348       else lock.readLock().unlock();
5349       throw new NotServingRegionException(getRegionNameAsString() + " is closed");
5350     }
5351   }
5352 
5353   /**
5354    * Closes the lock. This needs to be called in the finally block corresponding
5355    * to the try block of #startBulkRegionOperation
5356    */
5357   private void closeBulkRegionOperation(){
5358     if (lock.writeLock().isHeldByCurrentThread()) lock.writeLock().unlock();
5359     else lock.readLock().unlock();
5360   }
5361 
5362   /**
5363    * Update counters for the number of puts without WAL and the size of possible data loss.
5364    * This information is exposed by the region server metrics.
5365    */
5366   private void recordMutationWithoutWal(final Map<byte [], List<Cell>> familyMap) {
5367     numMutationsWithoutWAL.increment();
5368     if (numMutationsWithoutWAL.get() <= 1) {
5369       LOG.info("writing data to region " + this +
5370                " with WAL disabled. Data may be lost in the event of a crash.");
5371     }
5372 
5373     long mutationSize = 0;
5374     for (List<Cell> cells: familyMap.values()) {
5375       for (Cell cell : cells) {
5376         KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
5377         mutationSize += kv.getKeyLength() + kv.getValueLength();
5378       }
5379     }
5380 
5381     dataInMemoryWithoutWAL.add(mutationSize);
5382   }
5383 
5384   private void lock(final Lock lock)
5385       throws RegionTooBusyException, InterruptedIOException {
5386     lock(lock, 1);
5387   }
5388 
5389   /**
5390    * Try to acquire a lock.  Throw RegionTooBusyException
5391    * if failed to get the lock in time. Throw InterruptedIOException
5392    * if interrupted while waiting for the lock.
5393    */
5394   private void lock(final Lock lock, final int multiplier)
5395       throws RegionTooBusyException, InterruptedIOException {
5396     try {
5397       final long waitTime = Math.min(maxBusyWaitDuration,
5398           busyWaitDuration * Math.min(multiplier, maxBusyWaitMultiplier));
5399       if (!lock.tryLock(waitTime, TimeUnit.MILLISECONDS)) {
5400         throw new RegionTooBusyException(
5401             "failed to get a lock in " + waitTime + " ms. " +
5402                 "regionName=" + (this.getRegionInfo() == null ? "unknown" :
5403                 this.getRegionInfo().getRegionNameAsString()) +
5404                 ", server=" + (this.getRegionServerServices() == null ? "unknown" :
5405                 this.getRegionServerServices().getServerName()));
5406       }
5407     } catch (InterruptedException ie) {
5408       LOG.info("Interrupted while waiting for a lock");
5409       InterruptedIOException iie = new InterruptedIOException();
5410       iie.initCause(ie);
5411       throw iie;
5412     }
5413   }
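       /*
        * Worked example of the wait-time computation above, using illustrative values
        * (these are not the configured defaults):
        *
        *   busyWaitDuration      = 1000 ms
        *   maxBusyWaitMultiplier = 4
        *   maxBusyWaitDuration   = 3000 ms
        *
        *   lock(l, 2)  => waitTime = min(3000, 1000 * min(2, 4))  = 2000 ms
        *   lock(l, 10) => waitTime = min(3000, 1000 * min(10, 4)) = 3000 ms
        *
        * The multiplier lets callers wait longer, but the wait is capped both by
        * maxBusyWaitMultiplier and by the absolute ceiling maxBusyWaitDuration.
        */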
5414 
5415   /**
5416    * Calls sync with the given transaction ID if the region's table is not
5417    * deferring it.
5418    * @param txid the transaction id to sync up to
5419    * @throws IOException If anything goes wrong with DFS
5420    */
5421   private void syncOrDefer(long txid, Durability durability) throws IOException {
5422     if (this.getRegionInfo().isMetaRegion()) {
5423       this.log.sync(txid);
5424     } else {
5425       switch(durability) {
5426       case USE_DEFAULT:
5427         // do what table defaults to
5428         if (shouldSyncLog()) {
5429           this.log.sync(txid);
5430         }
5431         break;
5432       case SKIP_WAL:
5433         // nothing to do
5434         break;
5435       case ASYNC_WAL:
5436         // defer the sync, unless we globally can't
5437         if (this.deferredLogSyncDisabled) {
5438           this.log.sync(txid);
5439         }
5440         break;
5441       case SYNC_WAL:
5442       case FSYNC_WAL:
5443         // sync the WAL edit (SYNC and FSYNC treated the same for now)
5444         this.log.sync(txid);
5445         break;
5446       }
5447     }
5448   }
5449 
5450   /**
5451    * Check whether we should sync the log from the table's durability settings
5452    * Check whether we should sync the log based on the table's durability settings.
5453   private boolean shouldSyncLog() {
5454     return this.deferredLogSyncDisabled ||
5455         durability.ordinal() >  Durability.ASYNC_WAL.ordinal();
5456   }
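       /*
        * Assuming the Durability enum is declared in the order USE_DEFAULT, SKIP_WAL,
        * ASYNC_WAL, SYNC_WAL, FSYNC_WAL, the ordinal comparison above makes shouldSyncLog()
        * return true only when the table's default durability is SYNC_WAL or FSYNC_WAL, or
        * when deferred log sync has been disabled globally (deferredLogSyncDisabled).
        */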
5457 
5458   /**
5459    * A mocked list implementation - discards all updates.
5460    */
5461   private static final List<Cell> MOCKED_LIST = new AbstractList<Cell>() {
5462 
5463     @Override
5464     public void add(int index, Cell element) {
5465       // do nothing
5466     }
5467 
5468     @Override
5469     public boolean addAll(int index, Collection<? extends Cell> c) {
5470       return false; // this list is never changed as a result of an update
5471     }
5472 
5473     @Override
5474     public KeyValue get(int index) {
5475       throw new UnsupportedOperationException();
5476     }
5477 
5478     @Override
5479     public int size() {
5480       return 0;
5481     }
5482   };
5483 
5484   /**
5485    * Facility for dumping and compacting catalog tables.
5486    * Only does catalog tables since these are the only tables whose schema we
5487    * know for sure.  For usage run:
5488    * <pre>
5489    *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
5490    * </pre>
5491    * @param args
5492    * @throws IOException
5493    */
5494   public static void main(String[] args) throws IOException {
5495     if (args.length < 1) {
5496       printUsageAndExit(null);
5497     }
5498     boolean majorCompact = false;
5499     if (args.length > 1) {
5500       if (!args[1].toLowerCase().startsWith("major")) {
5501         printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">");
5502       }
5503       majorCompact = true;
5504     }
5505     final Path tableDir = new Path(args[0]);
5506     final Configuration c = HBaseConfiguration.create();
5507     final FileSystem fs = FileSystem.get(c);
5508     final Path logdir = new Path(c.get("hbase.tmp.dir"));
5509     final String logname = "hlog" + FSUtils.getTableName(tableDir) + System.currentTimeMillis();
5510 
5511     final HLog log = HLogFactory.createHLog(fs, logdir, logname, c);
5512     try {
5513       processTable(fs, tableDir, log, c, majorCompact);
5514     } finally {
5515        log.close();
5516        // TODO: is this still right?
5517        BlockCache bc = new CacheConfig(c).getBlockCache();
5518        if (bc != null) bc.shutdown();
5519     }
5520   }
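       /*
        * Example invocations (the table directory path below is hypothetical):
        *
        *   # dump the regions of a catalog table
        *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion /hbase/data/hbase/meta
        *
        *   # the same, but also major compact each region
        *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion /hbase/data/hbase/meta major
        *
        * Any second argument that starts with "major" (case-insensitive) enables compaction;
        * anything else prints the usage message and exits.
        */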
5521 
5522   /**
5523    * Gets the latest sequence number that was read from storage when this region was opened.
5524    */
5525   public long getOpenSeqNum() {
5526     return this.openSeqNum;
5527   }
5528 
5529   /**
5530    * Gets the max sequence ids of the stores that were read from storage when this region was opened. WAL
5531    * edits with a smaller or equal sequence number will be skipped during replay.
5532    */
5533   public Map<byte[], Long> getMaxStoreSeqIdForLogReplay() {
5534     return this.maxSeqIdInStores;
5535   }
5536 
5537   /**
5538    * @return the current compaction state of this region (NONE, MINOR, MAJOR, or MAJOR_AND_MINOR).
5539    */
5540   public CompactionState getCompactionState() {
5541     boolean hasMajor = majorInProgress.get() > 0, hasMinor = minorInProgress.get() > 0;
5542     return (hasMajor ? (hasMinor ? CompactionState.MAJOR_AND_MINOR : CompactionState.MAJOR)
5543         : (hasMinor ? CompactionState.MINOR : CompactionState.NONE));
5544   }
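       /*
        * The nested ternary above maps the two in-progress counters to a state as follows:
        *
        *   majorInProgress > 0  and minorInProgress > 0   => MAJOR_AND_MINOR
        *   majorInProgress > 0  and minorInProgress == 0  => MAJOR
        *   majorInProgress == 0 and minorInProgress > 0   => MINOR
        *   both counters == 0                             => NONE
        */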
5545 
5546   public void reportCompactionRequestStart(boolean isMajor) {
5547     (isMajor ? majorInProgress : minorInProgress).incrementAndGet();
5548   }
5549 
5550   public void reportCompactionRequestEnd(boolean isMajor, int numFiles, long filesSizeCompacted) {
5551     int newValue = (isMajor ? majorInProgress : minorInProgress).decrementAndGet();
5552 
5553     // metrics
5554     compactionsFinished.incrementAndGet();
5555     compactionNumFilesCompacted.addAndGet(numFiles);
5556     compactionNumBytesCompacted.addAndGet(filesSizeCompacted);
5557 
5558     assert newValue >= 0;
5559   }
5560 
5561   /**
5562    * Listener class to enable callers of
5563    * bulkLoadHFile() to perform any necessary
5564    * pre/post processing of a given bulkload call
5565    * pre/post processing of a given bulk load call
5566   public interface BulkLoadListener {
5567 
5568     /**
5569      * Called before an HFile is actually loaded
5570      * @param family family being loaded to
5571      * @param srcPath path of HFile
5572      * @return final path to be used for actual loading
5573      * @throws IOException
5574      */
5575     String prepareBulkLoad(byte[] family, String srcPath) throws IOException;
5576 
5577     /**
5578      * Called after a successful HFile load
5579      * @param family family being loaded to
5580      * @param srcPath path of HFile
5581      * @throws IOException
5582      */
5583     void doneBulkLoad(byte[] family, String srcPath) throws IOException;
5584 
5585     /**
5586      * Called after a failed HFile load
5587      * @param family family being loaded to
5588      * @param srcPath path of HFile
5589      * @throws IOException
5590      */
5591     void failedBulkLoad(byte[] family, String srcPath) throws IOException;
5592   }
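       /*
        * A minimal BulkLoadListener sketch (illustrative only; not shipped with this class)
        * that logs each phase and loads the HFile from its original location:
        *
        *   BulkLoadListener listener = new BulkLoadListener() {
        *     @Override
        *     public String prepareBulkLoad(byte[] family, String srcPath) throws IOException {
        *       LOG.info("preparing bulk load of " + srcPath);
        *       return srcPath; // keep the source path unchanged
        *     }
        *     @Override
        *     public void doneBulkLoad(byte[] family, String srcPath) throws IOException {
        *       LOG.info("finished bulk load of " + srcPath);
        *     }
        *     @Override
        *     public void failedBulkLoad(byte[] family, String srcPath) throws IOException {
        *       LOG.warn("failed bulk load of " + srcPath);
        *     }
        *   };
        */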
5593 
5594   @VisibleForTesting class RowLockContext {
5595     private final HashedBytes row;
5596     private final CountDownLatch latch = new CountDownLatch(1);
5597     private final Thread thread;
5598     private int lockCount = 0;
5599 
5600     RowLockContext(HashedBytes row) {
5601       this.row = row;
5602       this.thread = Thread.currentThread();
5603     }
5604 
5605     boolean ownedByCurrentThread() {
5606       return thread == Thread.currentThread();
5607     }
5608 
5609     RowLock newLock() {
5610       lockCount++;
5611       return new RowLock(this);
5612     }
5613 
5614     void releaseLock() {
5615       if (!ownedByCurrentThread()) {
5616         throw new IllegalArgumentException("Lock held by thread: " + thread
5617           + " cannot be released by different thread: " + Thread.currentThread());
5618       }
5619       lockCount--;
5620       if (lockCount == 0) {
5621         // no remaining locks by the thread, unlock and allow other threads to access
5622         RowLockContext existingContext = lockedRows.remove(row);
5623         if (existingContext != this) {
5624           throw new RuntimeException(
5625               "Internal row lock state inconsistent, should not happen, row: " + row);
5626         }
5627         latch.countDown();
5628       }
5629     }
5630   }
5631 
5632   /**
5633    * Row lock held by a given thread.
5634    * One thread may acquire multiple locks on the same row simultaneously.
5635    * The locks must be released by calling release() from the same thread.
5636    */
5637   public class RowLock {
5638     @VisibleForTesting final RowLockContext context;
5639     private boolean released = false;
5640 
5641     @VisibleForTesting RowLock(RowLockContext context) {
5642       this.context = context;
5643     }
5644 
5645     /**
5646      * Release the given lock.  If there are no remaining locks held by the current thread
5647      * then unlock the row and allow other threads to acquire the lock.
5648      * @throws IllegalArgumentException if called by a different thread than the lock owning thread
5649      */
5650     public void release() {
5651       if (!released) {
5652         context.releaseLock();
5653         released = true;
5654       }
5655     }
5656   }
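       /*
        * Typical row-lock usage (sketch; getRowLock(byte[]) is assumed to be the acquisition
        * method provided elsewhere in this class):
        *
        *   RowLock rowLock = getRowLock(row);
        *   try {
        *     // read-modify-write the row while holding the lock
        *   } finally {
        *     rowLock.release();
        *   }
        *
        * release() must be invoked on the thread that acquired the lock; otherwise
        * RowLockContext.releaseLock() throws IllegalArgumentException.
        */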
5657 }