View Javadoc

1   /*
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.io.IOException;
23  import java.io.UnsupportedEncodingException;
24  import java.lang.reflect.Constructor;
25  import java.util.AbstractList;
26  import java.util.ArrayList;
27  import java.util.Arrays;
28  import java.util.Collection;
29  import java.util.HashMap;
30  import java.util.List;
31  import java.util.Map;
32  import java.util.NavigableSet;
33  import java.util.Random;
34  import java.util.Set;
35  import java.util.TreeMap;
36  import java.util.TreeSet;
37  import java.util.concurrent.Callable;
38  import java.util.concurrent.ConcurrentSkipListMap;
39  import java.util.concurrent.atomic.AtomicBoolean;
40  import java.util.concurrent.atomic.AtomicLong;
41  import java.util.concurrent.locks.ReentrantReadWriteLock;
42  
43  import org.apache.commons.logging.Log;
44  import org.apache.commons.logging.LogFactory;
45  import org.apache.hadoop.conf.Configuration;
46  import org.apache.hadoop.fs.FSDataOutputStream;
47  import org.apache.hadoop.fs.FileStatus;
48  import org.apache.hadoop.fs.FileSystem;
49  import org.apache.hadoop.fs.Path;
50  import org.apache.hadoop.hbase.DoNotRetryIOException;
51  import org.apache.hadoop.hbase.DroppedSnapshotException;
52  import org.apache.hadoop.hbase.HBaseConfiguration;
53  import org.apache.hadoop.hbase.HColumnDescriptor;
54  import org.apache.hadoop.hbase.HConstants;
55  import org.apache.hadoop.hbase.HRegionInfo;
56  import org.apache.hadoop.hbase.HTableDescriptor;
57  import org.apache.hadoop.hbase.KeyValue;
58  import org.apache.hadoop.hbase.NotServingRegionException;
59  import org.apache.hadoop.hbase.UnknownScannerException;
60  import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
61  import org.apache.hadoop.hbase.client.Delete;
62  import org.apache.hadoop.hbase.client.Get;
63  import org.apache.hadoop.hbase.client.Put;
64  import org.apache.hadoop.hbase.client.Result;
65  import org.apache.hadoop.hbase.client.RowLock;
66  import org.apache.hadoop.hbase.client.Scan;
67  import org.apache.hadoop.hbase.filter.Filter;
68  import org.apache.hadoop.hbase.filter.IncompatibleFilterException;
69  import org.apache.hadoop.hbase.io.HeapSize;
70  import org.apache.hadoop.hbase.io.Reference.Range;
71  import org.apache.hadoop.hbase.io.hfile.BlockCache;
72  import org.apache.hadoop.hbase.ipc.HRegionInterface;
73  import org.apache.hadoop.hbase.regionserver.wal.HLog;
74  import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
75  import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
76  import org.apache.hadoop.hbase.util.Bytes;
77  import org.apache.hadoop.hbase.util.ClassSize;
78  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
79  import org.apache.hadoop.hbase.util.FSUtils;
80  import org.apache.hadoop.hbase.util.Pair;
81  import org.apache.hadoop.hbase.util.Writables;
82  import org.apache.hadoop.io.Writable;
83  import org.apache.hadoop.util.Progressable;
84  import org.apache.hadoop.util.StringUtils;
85  
86  import com.google.common.collect.Lists;
87  
88  /**
89   * HRegion stores data for a certain region of a table.  It stores all columns
90   * for each row. A given table consists of one or more HRegions.
91   *
92   * <p>We maintain multiple HStores for a single HRegion.
93   *
94   * <p>A Store is a set of rows with some column data; together,
95   * the Stores make up all the data for the rows.
96   *
97   * <p>Each HRegion has a 'startKey' and 'endKey'.
98   * <p>The first is inclusive, the second is exclusive (except for
99   * the final region)  The endKey of region 0 is the same as
100  * startKey for region 1 (if it exists).  The startKey for the
101  * first region is null. The endKey for the final region is null.
102  *
103  * <p>Locking at the HRegion level serves only one purpose: preventing the
104  * region from being closed (and consequently split) while other operations
105  * are ongoing. Each row level operation obtains both a row lock and a region
106  * read lock for the duration of the operation. While a scanner is being
107  * constructed, getScanner holds a read lock. If the scanner is successfully
108  * constructed, it holds a read lock until it is closed. A close takes out a
109  * write lock and consequently will block for ongoing operations and will block
110  * new operations from starting while the close is in progress.
111  *
112  * <p>An HRegion is defined by its table and its key extent.
113  *
114  * <p>It consists of at least one Store.  The number of Stores should be
115  * configurable, so that data which is accessed together is stored in the same
116  * Store.  Right now, we approximate that by building a single Store for
117  * each column family.  (This config info will be communicated via the
118  * tabledesc.)
119  *
120  * <p>The HTableDescriptor contains metainfo about the HRegion's table.
121  * regionName is a unique identifier for this HRegion. [startKey, endKey)
122  * defines the keyspace for this HRegion (start inclusive, end exclusive).
123  */
124 public class HRegion implements HeapSize { // , Writable{
125   public static final Log LOG = LogFactory.getLog(HRegion.class);
      // Names of the working subdirectories of a region directory used by splits
      // and merges; initialize() deletes both when the region is opened.
126   static final String SPLITDIR = "splits";
127   static final String MERGEDIR = "merges";
128 
      // Set to true by close(boolean) after every Store has been closed.
129   final AtomicBoolean closed = new AtomicBoolean(false);
130   /* Closing can take some time; use the closing flag if there is stuff we don't
131    * want to do while in closing state; e.g. like offer this region up to the
132    * master as a region to close if the carrying regionserver is overloaded.
133    * Once set, it is never cleared.
134    */
135   final AtomicBoolean closing = new AtomicBoolean(false);
136 
137   //////////////////////////////////////////////////////////////////////////////
138   // Members
139   //////////////////////////////////////////////////////////////////////////////
140 
      // NOTE(review): presumably row-lock bookkeeping (the rows currently locked
      // and a lock id -> row mapping); confirm against the row-lock methods.
141   private final Set<byte[]> lockedRows =
142     new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
143   private final Map<Integer, byte []> lockIds =
144     new HashMap<Integer, byte []>();
145   private int lockIdGenerator = 1;
146   static private Random rand = new Random();
147 
      // One Store per column family, keyed by the family name; filled in by
      // initialize().
148   protected final Map<byte [], Store> stores =
149     new ConcurrentSkipListMap<byte [], Store>(Bytes.BYTES_RAWCOMPARATOR);
150 
151   // These variables were only used for getting data out of the region, to
152   // test on the client side.
153   // private int numStores = 0;
154   // private int [] storeSize = null;
155   // private byte [] name = null;
156 
      // NOTE(review): presumably the number of bytes currently held in this
      // region's memstores; worthPreFlushing() compares it with a byte threshold.
157   final AtomicLong memstoreSize = new AtomicLong(0);
158 
159   /**
160    * The directory for the table this region is part of.
161    * This directory contains the directory for this region.
162    */
163   final Path tableDir;
164   
      // Collaborators handed to the main constructor.
165   final HLog log;
166   final FileSystem fs;
167   final Configuration conf;
168   final HRegionInfo regionInfo;
      // tableDir/<encoded region name>, computed in the main constructor.
169   final Path regiondir;
      // Taken from regionInfo in the main constructor; the no-arg constructor
      // leaves it null.
170   KeyValue.KVComparator comparator;
171 
172   /*
173    * Set this when scheduling compaction if want the next compaction to be a
174    * major compaction.  Cleared each time compactStores() runs.
175    */
176   private volatile boolean forceMajorCompaction = false;
177 
178   /*
179    * Flags recording which write-side activity (flush, compaction, close) is
180    * under way.  HRegion synchronizes on the instance to coordinate them.
181    */
182   static class WriteState {
183     // True for as long as a memstore flush is running.
184     volatile boolean flushing;
185     // True once a flush has been asked for.
186     volatile boolean flushRequested;
187     // True for as long as a compaction is running.
188     volatile boolean compacting;
189     // Cleared in close and by read-only mode; while false, cannot compact or flush.
190     volatile boolean writesEnabled = true;
191     // True when the region is read-only.
192     volatile boolean readOnly;
193 
194     /**
195      * Switches read-only mode on or off; writes are enabled exactly when the
196      * region is not read-only.
197      * @param onOff true to make the region read-only, false to allow writes
198      */
199     synchronized void setReadOnly(final boolean onOff) {
200       writesEnabled = !onOff;
201       readOnly = onOff;
202     }
203 
204     boolean isReadOnly() {
205       return readOnly;
206     }
207 
208     boolean isFlushRequested() {
209       return flushRequested;
210     }
211   }
212 
213   private final WriteState writestate = new WriteState();
214 
      // Table-level memstore flush size, or the "hbase.hregion.memstore.flush.size"
      // setting when the table uses the built-in default (see the main constructor).
215   final long memstoreFlushSize;
      // Set to the current time by initialize().
216   private volatile long lastFlushTime;
217   final FlushRequester flushListener;
      // memstoreFlushSize times "hbase.hregion.memstore.block.multiplier".
218   private final long blockingMemStoreSize;
      // Read from HConstants.THREAD_WAKE_FREQUENCY; defaults to 10 * 1000.
219   final long threadWakeFrequency;
220   // Guards splits and closes: close takes the write lock, compactions the read lock
221   private final ReentrantReadWriteLock splitsAndClosesLock =
222     new ReentrantReadWriteLock();
      // close(boolean) takes this write lock before splitsAndClosesLock.
223   private final ReentrantReadWriteLock newScannerLock =
224     new ReentrantReadWriteLock();
225 
226   // Stop updates lock
227   private final ReentrantReadWriteLock updatesLock =
228     new ReentrantReadWriteLock();
      // Monitor held by both close(boolean) and splitRegion for their duration.
229   private final Object splitLock = new Object();
230   private boolean splitRequest;
231 
232   private final ReadWriteConsistencyControl rwcc =
233       new ReadWriteConsistencyControl();
234 
235   /**
236    * Name of the region info file that resides just under the region directory.
237    */
238   public final static String REGIONINFO_FILE = ".regioninfo";
239 
240   /**
241    * Test-only constructor: every final field is left null or zero.
242    */
243   public HRegion() {
244     this.regionInfo = null;
245     this.tableDir = null;
246     this.regiondir = null;
247     this.fs = null;
248     this.log = null;
249     this.conf = null;
250     this.flushListener = null;
251     this.memstoreFlushSize = 0;
252     this.blockingMemStoreSize = 0;
253     this.threadWakeFrequency = 0;
254   }
255 
256   /**
257    * HRegion constructor.  his constructor should only be used for testing and
258    * extensions.  Instances of HRegion should be instantiated with the
259    * {@link HRegion#newHRegion(Path, HLog, FileSystem, Configuration, org.apache.hadoop.hbase.HRegionInfo, FlushRequester)} method.
260    *
261    *
262    * @param tableDir qualified path of directory where region should be located,
263    * usually the table directory.
264    * @param log The HLog is the outbound log for any updates to the HRegion
265    * (There's a single HLog for all the HRegions on a single HRegionServer.)
266    * The log file is a logfile from the previous execution that's
267    * custom-computed for this HRegion. The HRegionServer computes and sorts the
268    * appropriate log info for this HRegion. If there is a previous log file
269    * (implying that the HRegion has been written-to before), then read it from
270    * the supplied path.
271    * @param fs is the filesystem.
272    * @param conf is global configuration settings.
273    * @param regionInfo - HRegionInfo that describes the region
274    * is new), then read them from the supplied path.
275    * @param flushListener an object that implements CacheFlushListener or null
276    * making progress to master -- otherwise master might think region deploy
277    * failed.  Can be null.
278    *
279    * @see HRegion#newHRegion(Path, HLog, FileSystem, Configuration, org.apache.hadoop.hbase.HRegionInfo, FlushRequester)
280 
281    */
282   public HRegion(Path tableDir, HLog log, FileSystem fs, Configuration conf,
283       HRegionInfo regionInfo, FlushRequester flushListener) {
284     this.tableDir = tableDir;
285     this.comparator = regionInfo.getComparator();
286     this.log = log;
287     this.fs = fs;
288     this.conf = conf;
289     this.regionInfo = regionInfo;
290     this.flushListener = flushListener;
291     this.threadWakeFrequency = conf.getLong(HConstants.THREAD_WAKE_FREQUENCY,
292         10 * 1000);
293     String encodedNameStr = this.regionInfo.getEncodedName();
294     this.regiondir = new Path(tableDir, encodedNameStr);
295     if (LOG.isDebugEnabled()) {
296       // Write out region name as string and its encoded name.
297       LOG.debug("Creating region " + this);
298     }
299     long flushSize = regionInfo.getTableDesc().getMemStoreFlushSize();
300     if (flushSize == HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE) {
301       flushSize = conf.getLong("hbase.hregion.memstore.flush.size",
302                       HTableDescriptor.DEFAULT_MEMSTORE_FLUSH_SIZE);
303     }
304     this.memstoreFlushSize = flushSize;
305     this.blockingMemStoreSize = this.memstoreFlushSize *
306       conf.getLong("hbase.hregion.memstore.block.multiplier", 2);
307   }
308 
309   /**
310    * Initializes this region without a progress reporter.
311    * @return the sequence (edit) id to use next
312    * @throws IOException propagated from {@link #initialize(Progressable)}
313    */
314   public long initialize() throws IOException {
315     return this.initialize((Progressable) null);
316   }
317 
318   /**
319    * Initialize this region: write the region info file, clear the temporary
320    * directory, open the Stores, replay recovered edits and remove leftover
321    * split/merge directories.
322    * @param reporter Tickle every so often if initialize is taking a while.
323    * @return What the next sequence (edit) id should be.
324    * @throws IOException e
325    */
326   public long initialize(final Progressable reporter) throws IOException {
327     // Persist the HRegionInfo beside the data in case .META. must be recovered.
328     checkRegioninfoOnFilesystem();
329 
330     // Discard temporary data left behind by an earlier opening of the region.
331     cleanupTmpDir();
332 
333     // Open a Store per column family and remember the highest sequence id.
334     long highestSeqId = -1;
335     for (HColumnDescriptor family : this.regionInfo.getTableDesc().getFamilies()) {
336       final Store familyStore = instantiateHStore(this.tableDir, family);
337       this.stores.put(family.getName(), familyStore);
338       highestSeqId = Math.max(highestSeqId, familyStore.getMaxSequenceId());
339     }
340 
341     // Replay recovered edits, if any are present.
342     highestSeqId =
343       replayRecoveredEditsIfAny(this.regiondir, highestSeqId, reporter);
344 
345     // The region may have crashed in the middle of a split or merge; remove
346     // whatever those operations left in their working directories.
347     final Path staleSplits = new Path(regiondir, SPLITDIR);
348     FSUtils.deleteDirectory(this.fs, staleSplits);
349     final Path staleMerges = new Path(regiondir, MERGEDIR);
350     FSUtils.deleteDirectory(this.fs, staleMerges);
351 
352     // A read-only table makes the region read-only as well.
353     if (this.regionInfo.getTableDesc().isReadOnly()) {
354       this.writestate.setReadOnly(true);
355     }
356 
357     this.writestate.compacting = false;
358     this.lastFlushTime = EnvironmentEdgeManager.currentTimeMillis();
359     // highestSeqId is still -1 when neither the Stores nor the recovered
360     // edits supplied a sequence id, so the next id is then 0.
361     final long nextSeqid = highestSeqId + 1;
362     LOG.info("Onlined " + this.toString() + "; next sequenceid=" + nextSeqid);
363     return nextSeqid;
364   }
365 
366   /*
367    * Moves initialFiles (split or merge output) to regiondir using fs.  Does
368    * nothing when initialFiles is null or absent.  Logs a warning when the
369    * filesystem reports the rename as unsuccessful (rename returned false).
370    * @throws IOException if the existence check or the rename itself throws
371    */
372   private static void moveInitialFilesIntoPlace(final FileSystem fs,
373     final Path initialFiles, final Path regiondir) throws IOException {
374     if (initialFiles != null && fs.exists(initialFiles)
375         && !fs.rename(initialFiles, regiondir)) {
376       LOG.warn("Unable to rename " + initialFiles + " to " + regiondir);
377     }
378   }
379 
380   /** @return true when any store file of any Store is a reference. */
381   boolean hasReferences() {
382     for (Store familyStore : stores.values()) {
383       for (StoreFile file : familyStore.getStorefiles()) {
384         if (file.isReference()) {
385           // One reference is enough; no need to look further.
386           return true;
387         }
388       }
389     }
390     return false;
391   }
392 
393   /*
394    * Write out an info file under the region directory.  Useful recovering
395    * mangled regions.
396    * @throws IOException
397    */
398   private void checkRegioninfoOnFilesystem() throws IOException {
399     // Name of this file has two leading and trailing underscores so it doesn't
400     // clash w/ a store/family name.  There is possibility, but assumption is
401     // that its slim (don't want to use control character in filename because
402     //
403     Path regioninfo = new Path(this.regiondir, REGIONINFO_FILE);
404     if (this.fs.exists(regioninfo) &&
405         this.fs.getFileStatus(regioninfo).getLen() > 0) {
406       return;
407     }
408     FSDataOutputStream out = this.fs.create(regioninfo, true);
409     try {
410       this.regionInfo.write(out);
411       out.write('\n');
412       out.write('\n');
413       out.write(Bytes.toBytes(this.regionInfo.toString()));
414     } finally {
415       out.close();
416     }
417   }
418 
419   /** @return the HRegionInfo this region was constructed with */
420   public HRegionInfo getRegionInfo() {
421     return regionInfo;
422   }
423 
424   /** @return true once close has finished closing the Stores of this region */
425   public boolean isClosed() {
426     return closed.get();
427   }
428 
429   /**
430    * @return true once a close has raised the closing flag.
431    */
432   public boolean isClosing() {
433     return closing.get();
434   }
435 
436    public ReadWriteConsistencyControl getRWCC() {
         // Hands out the region's single ReadWriteConsistencyControl instance.
437      return this.rwcc;
438    }
439 
440   /**
441    * Close down this HRegion.  Flush the cache, shut down each HStore, don't
442    * service any more calls.  Equivalent to {@code close(false)}.
443    *
444    * <p>This method could take some time to execute, so don't call it from a
445    * time-sensitive thread.
446    *
447    * @return list of all the store files that the HRegion's component Stores
448    * make use of, or null if the region was already closed.
449    *
450    * @throws IOException propagated from {@link #close(boolean)}
451    *
452    */
453   public List<StoreFile> close() throws IOException {
454     return this.close(false);
455   }
456 
457   /**
458    * Close down this HRegion.  Flush the cache unless abort parameter is true,
459    * Shut down each HStore, don't service any more calls.
460    *
461    * This method could take some time to execute, so don't call it from a
462    * time-sensitive thread.
463    *
464    * @param abort true if server is aborting (only during testing); when true
465    * neither the preflush nor the final flush is run
466    * @return list of the store files returned by each Store's close(), or null
467    * if the region was already closed.
468    *
469    * @throws IOException e
470    */
471   public List<StoreFile> close(final boolean abort) throws IOException {
        // This first check is made before splitLock is taken.
472     if (isClosed()) {
473       LOG.warn("region " + this + " already closed");
474       return null;
475     }
476     synchronized (splitLock) {
477       boolean wasFlushing = false;
478       synchronized (writestate) {
479         // Disable compacting and flushing by background threads for this
480         // region.
481         writestate.writesEnabled = false;
482         wasFlushing = writestate.flushing;
483         LOG.debug("Closing " + this + ": disabling compactions & flushes");
            // Wait for a running compaction or flush; compactStores calls
            // writestate.notifyAll() when it clears its flag.
484         while (writestate.compacting || writestate.flushing) {
485           LOG.debug("waiting for" +
486               (writestate.compacting ? " compaction" : "") +
487               (writestate.flushing ?
488                   (writestate.compacting ? "," : "") + " cache flush" :
489                     "") + " to complete for region " + this);
490           try {
491             writestate.wait();
492           } catch (InterruptedException iex) {
493             // The interrupt is swallowed and the loop re-checks the flags.
                // NOTE(review): the thread's interrupt status is not restored.
494           }
495         }
496       }
497       // If we were not just flushing, is it worth doing a preflush...one
498       // that will clear out of the bulk of the memstore before we put up
499       // the close flag?
500       if (!abort && !wasFlushing && worthPreFlushing()) {
501         LOG.info("Running close preflush of " + this.getRegionNameAsString());
502         internalFlushcache();
503       }
          // Lock order: newScannerLock first, then splitsAndClosesLock; they are
          // released in the reverse order by the nested finally blocks.
504       newScannerLock.writeLock().lock();
505       this.closing.set(true);
506       try {
507         splitsAndClosesLock.writeLock().lock();
508         LOG.debug("Updates disabled for region, no outstanding scanners on " +
509           this);
510         try {
511           // Write lock means no more row locks can be given out.  Wait on
512           // outstanding row locks to come in before we close so we do not drop
513           // outstanding updates.
514           waitOnRowLocks();
515           LOG.debug("No more row locks outstanding on region " + this);
516 
517           // Don't flush the cache if we are aborting
518           if (!abort) {
519             internalFlushcache();
520           }
521 
              // Close every Store and collect the store files each one returns.
522           List<StoreFile> result = new ArrayList<StoreFile>();
523           for (Store store: stores.values()) {
524             result.addAll(store.close());
525           }
526           this.closed.set(true);
527           LOG.info("Closed " + this);
528           return result;
529         } finally {
530           splitsAndClosesLock.writeLock().unlock();
531         }
532       } finally {
533         newScannerLock.writeLock().unlock();
534       }
535     }
536   }
537 
538    /**
539     * @return True if its worth doing a flush before we put up the close flag.
540     */
541   private boolean worthPreFlushing() {
542     return this.memstoreSize.get() >
543       this.conf.getLong("hbase.hregion.preclose.flush.size", 1024 * 1024 * 5);
544   }
545 
546   //////////////////////////////////////////////////////////////////////////////
547   // HRegion accessors
548   //////////////////////////////////////////////////////////////////////////////
549 
550   /** @return the start key recorded in this region's HRegionInfo */
551   public byte[] getStartKey() {
552     return regionInfo.getStartKey();
553   }
554 
555   /** @return the end key recorded in this region's HRegionInfo */
556   public byte[] getEndKey() {
557     return regionInfo.getEndKey();
558   }
559 
560   /** @return the region id recorded in this region's HRegionInfo */
561   public long getRegionId() {
562     return regionInfo.getRegionId();
563   }
564 
565   /** @return the region name, as bytes, from this region's HRegionInfo */
566   public byte[] getRegionName() {
567     return regionInfo.getRegionName();
568   }
569 
570   /** @return the region name in string form, for use in log messages */
571   public String getRegionNameAsString() {
572     return regionInfo.getRegionNameAsString();
573   }
574 
575   /** @return the HTableDescriptor held by this region's HRegionInfo */
576   public HTableDescriptor getTableDesc() {
577     return regionInfo.getTableDesc();
578   }
579 
580   /** @return the HLog this region was constructed with */
581   public HLog getLog() {
582     return log;
583   }
584 
585   /** @return the Configuration this region was constructed with */
586   public Configuration getConf() {
587     return conf;
588   }
589 
590   /** @return the Path of this region's directory under the table directory */
591   public Path getRegionDir() {
592     return regiondir;
593   }
594 
595   /** @return the FileSystem this region was constructed with */
596   public FileSystem getFilesystem() {
597     return fs;
598   }
599 
600   /** @return the last flush time; initialize() sets it to the current time */
601   public long getLastFlushTime() {
602     return lastFlushTime;
603   }
604 
605   //////////////////////////////////////////////////////////////////////////////
606   // HRegion maintenance.
607   //
608   // These methods are meant to be called periodically by the HRegionServer for
609   // upkeep.
610   //////////////////////////////////////////////////////////////////////////////
611 
612   /**
613    * @return the largest value of {@link Store#getSize()} over all Stores;
614    * never less than 0.
615    */
616   public long getLargestHStoreSize() {
617     long largest = 0;
618     for (Store familyStore : this.stores.values()) {
619       largest = Math.max(largest, familyStore.getSize());
620     }
621     return largest;
622   }
623 
624   /*
625    * Split the HRegion to create two brand-new ones.  This also closes
626    * current HRegion.  Split should be fast since we don't rewrite store files
627    * but instead create new 'reference' store files that read off the top and
628    * bottom ranges of parent store files.
629    * @param splitRow row on which to split region
630    * @return two brand-new HRegions, or null if this region is already closed
631    * or splitRow matches the region's start key or end key
632    * @throws IOException
633    */
634   HRegion [] splitRegion(final byte [] splitRow) throws IOException {
        // Hook; the default implementation in this class does nothing.
635     prepareToSplit();
636     synchronized (splitLock) {
637       if (closed.get()) {
638         return null;
639       }
640       // Add start/end key checking: hbase-428.
641       byte [] startKey = this.regionInfo.getStartKey();
642       byte [] endKey = this.regionInfo.getEndKey();
643       if (this.comparator.matchingRows(startKey, 0, startKey.length,
644           splitRow, 0, splitRow.length)) {
645         LOG.debug("Startkey and midkey are same, not splitting");
646         return null;
647       }
648       if (this.comparator.matchingRows(splitRow, 0, splitRow.length,
649           endKey, 0, endKey.length)) {
650         LOG.debug("Endkey and midkey are same, not splitting");
651         return null;
652       }
653       LOG.info("Starting split of region " + this);
654       Path splits = new Path(this.regiondir, SPLITDIR);
655       if(!this.fs.exists(splits)) {
656         this.fs.mkdirs(splits);
657       }
658       // Calculate regionid to use.  Can't be less than that of parent else
659       // it'll insert into wrong location over in .META. table: HBASE-710.
660       long rid = EnvironmentEdgeManager.currentTimeMillis();
661       if (rid < this.regionInfo.getRegionId()) {
662         LOG.warn("Clock skew; parent regions id is " +
663           this.regionInfo.getRegionId() + " but current time here is " + rid);
664         rid = this.regionInfo.getRegionId() + 1;
665       }
          // Daughter A covers [startKey, splitRow) and daughter B covers
          // [splitRow, endKey); both are given the same region id.
666       HRegionInfo regionAInfo = new HRegionInfo(this.regionInfo.getTableDesc(),
667         startKey, splitRow, false, rid);
668       Path dirA = getSplitDirForDaughter(splits, regionAInfo);
669       HRegionInfo regionBInfo = new HRegionInfo(this.regionInfo.getTableDesc(),
670         splitRow, endKey, false, rid);
671       Path dirB = getSplitDirForDaughter(splits, regionBInfo);
672 
673       // Now close the HRegion.  Close returns all store files or null if not
674       // supposed to close (? What to do in this case? Implement abort of close?)
675       // Close also does wait on outstanding rows and calls a flush just-in-case.
676       List<StoreFile> hstoreFilesToSplit = close(false);
677       if (hstoreFilesToSplit == null) {
678         LOG.warn("Close came back null (Implement abort of close?)");
            // NOTE(review): the message says "empty vector" although the
            // condition tested is a null return.
679         throw new RuntimeException("close returned empty vector of HStoreFiles");
680       }
681 
682       // Split each store file into a bottom half for daughter A and a top
          // half for daughter B, written under the splits directory.
683       for(StoreFile h: hstoreFilesToSplit) {
684         StoreFile.split(fs,
685           Store.getStoreHomedir(splits, regionAInfo.getEncodedName(),
686             h.getFamily()),
687           h, splitRow, Range.bottom);
688         StoreFile.split(fs,
689           Store.getStoreHomedir(splits, regionBInfo.getEncodedName(),
690             h.getFamily()),
691           h, splitRow, Range.top);
692       }
693 
694       // Create a region instance and then move the splits into place under
695       // regionA and regionB.
696       HRegion regionA =
697         HRegion.newHRegion(tableDir, log, fs, conf, regionAInfo, null);
698       moveInitialFilesIntoPlace(this.fs, dirA, regionA.getRegionDir());
699       HRegion regionB =
700         HRegion.newHRegion(tableDir, log, fs, conf, regionBInfo, null);
701       moveInitialFilesIntoPlace(this.fs, dirB, regionB.getRegionDir());
702 
703       return new HRegion [] {regionA, regionB};
704     }
705   }
705 
706   /*
707    * Get the daughter directories in the splits dir.  The splits dir is under
708    * the parent regions' directory.
709    * @param splits
710    * @param hri
711    * @return Path to split dir.
712    * @throws IOException
713    */
714   private Path getSplitDirForDaughter(final Path splits, final HRegionInfo hri)
715   throws IOException {
716     Path d =
717       new Path(splits, hri.getEncodedName());
718     if (fs.exists(d)) {
719       // This should never happen; the splits dir will be newly made when we
720       // come in here.  Even if we crashed midway through a split, the reopen
721       // of the parent region clears out the dir in its initialize method.
722       throw new IOException("Cannot split; target file collision at " + d);
723     }
724     return d;
725   }
726 
727   protected void prepareToSplit() {
728     // Intentionally empty: hook called first thing by splitRegion.
729   }
730 
731   /*
732    * Hook run by compactStores(boolean) just before the Stores are compacted.
733    * Currently does nothing.
734    */
735   private void doRegionCompactionPrep() throws IOException {
736   }
737 
738   /*
739    * Deletes this region's temporary directory (see getTmpDir()).
740    */
741   private void cleanupTmpDir() throws IOException {
742     FSUtils.deleteDirectory(fs, this.getTmpDir());
743   }
744   
745   /**
746    * @return the ".tmp" directory under the region directory; initialize()
747    * deletes it each time the region is opened.
748    */
749   Path getTmpDir() {
750     return new Path(this.getRegionDir(), ".tmp");
751   }
752 
753   void setForceMajorCompaction(final boolean b) {
        // Consumed, and reset to false, by the next call to compactStores().
754     forceMajorCompaction = b;
755   }
756 
757   boolean getForceMajorCompaction() {
        // True while a forced major compaction is pending.
758     return forceMajorCompaction;
759   }
760 
761   /**
762    * Called by compaction thread and after region is opened to compact the
763    * HStores if necessary.  A major compaction is run when one was requested
764    * through setForceMajorCompaction; the request is cleared by this call.
765    *
766    * <p>This operation could block for a long time, so don't call it from a
767    * time-sensitive thread.
768    *
769    * Note that no locking is necessary at this level because compaction only
770    * conflicts with a region split, and that cannot happen because the region
771    * server does them sequentially and not in parallel.
772    * @return split row if split is needed
773    * @throws IOException e
774    */
775   public byte[] compactStores() throws IOException {
776     final boolean forced = this.forceMajorCompaction;
777     this.forceMajorCompaction = false;
778     return compactStores(forced);
779   }
780 
781   /*
782    * Called by compaction thread and after region is opened to compact the
783    * HStores if necessary.  This operation could block for a long time, so
784    * don't call it from a time-sensitive thread.
785    *
786    * Returns without compacting when the region is closing or closed, when
787    * writes are disabled, or when another compaction already holds the
788    * writestate.compacting flag.  Only an invocation that set the flag clears
789    * it again, so a rejected call cannot reset the flag of a running compaction.
790    *
791    * @param majorCompaction True to force a major compaction regardless of
792    * thresholds
793    * @return split row if split is needed; null when no compaction ran
794    * @throws IOException e
795    */
796   byte [] compactStores(final boolean majorCompaction)
797   throws IOException {
798     if (this.closing.get() || this.closed.get()) {
799       LOG.debug("Skipping compaction on " + this + " because closing/closed");
800       return null;
801     }
802     splitsAndClosesLock.readLock().lock();
803     try {
804       byte [] splitRow = null;
805       if (this.closed.get()) {
806         return splitRow;
807       }
808       // Claim the flag before the try: its finally must only undo our own claim.
809       synchronized (writestate) {
810         if (writestate.compacting || !writestate.writesEnabled) {
811           LOG.info("NOT compacting region " + this +
812               ": compacting=" + writestate.compacting + ", writesEnabled=" +
813               writestate.writesEnabled);
814           return splitRow;
815         }
816         writestate.compacting = true;
817       }
818       try {
819         LOG.info("Starting" + (majorCompaction? " major " : " ") +
820             "compaction on region " + this);
821         long startTime = EnvironmentEdgeManager.currentTimeMillis();
822         doRegionCompactionPrep();
823         long maxSize = -1;
824         for (Store store: stores.values()) {
825           final Store.StoreSize ss = store.compact(majorCompaction);
826           if (ss != null && ss.getSize() > maxSize) {
827             maxSize = ss.getSize();
828             splitRow = ss.getSplitRow();
829           }
830         }
831         String timeTaken = StringUtils.formatTimeDiff(
832             EnvironmentEdgeManager.currentTimeMillis(), startTime);
833         LOG.info("compaction completed on region " + this + " in " + timeTaken);
834       } finally {
835         synchronized (writestate) {
836           writestate.compacting = false;
837           writestate.notifyAll();
838         }
839       }
840       return splitRow;
841     } finally {
842       splitsAndClosesLock.readLock().unlock();
843     }
844   }
845 
846   /**
847    * Flush the cache.
848    *
849    * When this method is called the cache will be flushed unless:
850    * <ol>
851    *   <li>the cache is empty</li>
852    *   <li>the region is closed.</li>
853    *   <li>a flush is already in progress</li>
854    *   <li>writes are disabled</li>
855    * </ol>
856    *
857    * <p>This method may block for some time, so it should not be called from a
858    * time-sensitive thread.
859    *
860    * @return true if cache was flushed
861    *
862    * @throws IOException general io exceptions
863    * @throws DroppedSnapshotException Thrown when replay of hlog is required
864    * because a Snapshot was not properly persisted.
865    */
866   public boolean flushcache() throws IOException {
867     if (this.closed.get()) {
868       return false;
869     }
870     synchronized (writestate) {
871       if (!writestate.flushing && writestate.writesEnabled) {
872         this.writestate.flushing = true;
873       } else {
874         if(LOG.isDebugEnabled()) {
875           LOG.debug("NOT flushing memstore for region " + this +
876             ", flushing=" +
877               writestate.flushing + ", writesEnabled=" +
878               writestate.writesEnabled);
879         }
880         return false;
881       }
882     }
883     try {
884       // Prevent splits and closes
885       splitsAndClosesLock.readLock().lock();
886       try {
887         return internalFlushcache();
888       } finally {
889         splitsAndClosesLock.readLock().unlock();
890       }
891     } finally {
892       synchronized (writestate) {
893         writestate.flushing = false;
894         this.writestate.flushRequested = false;
895         writestate.notifyAll();
896       }
897     }
898   }
899 
900   /**
901    * Flush the memstore.
902    * 
903    * Flushing the memstore is a little tricky. We have a lot of updates in the
904    * memstore, all of which have also been written to the log. We need to
905    * write those updates in the memstore out to disk, while being able to
906    * process reads/writes as much as possible during the flush operation. Also,
907    * the log has to state clearly the point in time at which the memstore was
908    * flushed. (That way, during recovery, we know when we can rely on the
909    * on-disk flushed structures and when we have to recover the memstore from
910    * the log.)
911    *
912    * <p>So, we have a three-step process:
913    *
914    * <ul><li>A. Flush the memstore to the on-disk stores, noting the current
915    * sequence ID for the log.<li>
916    *
917    * <li>B. Write a FLUSHCACHE-COMPLETE message to the log, using the sequence
918    * ID that was current at the time of memstore-flush.</li>
919    *
920    * <li>C. Get rid of the memstore structures that are now redundant, as
921    * they've been flushed to the on-disk HStores.</li>
922    * </ul>
923    * <p>This method is protected, but can be accessed via several public
924    * routes.
925    *
926    * <p> This method may block for some time.
927    *
928    * @return true if the region needs compacting
929    *
930    * @throws IOException general io exceptions
931    * @throws DroppedSnapshotException Thrown when replay of hlog is required
932    * because a Snapshot was not properly persisted.
933    */
934   protected boolean internalFlushcache() throws IOException {
935     return internalFlushcache(this.log, -1);
936   }
937 
938   /**
939    * @param wal Null if we're NOT to go via hlog/wal.
940    * @param myseqid The seqid to use if <code>wal</code> is null writing out
941    * flush file.
942    * @return true if the region needs compacting
943    * @throws IOException
944    * @see {@link #internalFlushcache()}
945    */
946   protected boolean internalFlushcache(final HLog wal, final long myseqid)
947   throws IOException {
948     final long startTime = EnvironmentEdgeManager.currentTimeMillis();
949     // Clear flush flag.
950     // Record latest flush time
951     this.lastFlushTime = startTime;
952     // If nothing to flush, return and avoid logging start/stop flush.
953     if (this.memstoreSize.get() <= 0) {
954       return false;
955     }
956     if (LOG.isDebugEnabled()) {
957       LOG.debug("Started memstore flush for region " + this +
958         ". Current region memstore size " +
959         StringUtils.humanReadableInt(this.memstoreSize.get()) +
960         ((wal != null)? "": "; wal is null, using passed sequenceid=" + myseqid));
961     }
962 
963     // Stop updates while we snapshot the memstore of all stores. We only have
964     // to do this for a moment.  Its quick.  The subsequent sequence id that
965     // goes into the HLog after we've flushed all these snapshots also goes
966     // into the info file that sits beside the flushed files.
967     // We also set the memstore size to zero here before we allow updates
968     // again so its value will represent the size of the updates received
969     // during the flush
970     long sequenceId = -1L;
971     long completeSequenceId = -1L;
972 
973     // We have to take a write lock during snapshot, or else a write could
974     // end up in both snapshot and memstore (makes it difficult to do atomic
975     // rows then)
976     this.updatesLock.writeLock().lock();
977     final long currentMemStoreSize = this.memstoreSize.get();
978     List<StoreFlusher> storeFlushers = new ArrayList<StoreFlusher>(stores.size());
979     try {
980       sequenceId = (wal == null)? myseqid: wal.startCacheFlush();
981       completeSequenceId = this.getCompleteCacheFlushSequenceId(sequenceId);
982 
983       for (Store s : stores.values()) {
984         storeFlushers.add(s.getStoreFlusher(completeSequenceId));
985       }
986 
987       // prepare flush (take a snapshot)
988       for (StoreFlusher flusher : storeFlushers) {
989         flusher.prepare();
990       }
991     } finally {
992       this.updatesLock.writeLock().unlock();
993     }
994 
995     LOG.debug("Finished snapshotting, commencing flushing stores");
996 
997     // Any failure from here on out will be catastrophic requiring server
998     // restart so hlog content can be replayed and put back into the memstore.
999     // Otherwise, the snapshot content while backed up in the hlog, it will not
1000     // be part of the current running servers state.
1001     boolean compactionRequested = false;
1002     try {
1003       // A.  Flush memstore to all the HStores.
1004       // Keep running vector of all store files that includes both old and the
1005       // just-made new flush store file.
1006 
1007       for (StoreFlusher flusher : storeFlushers) {
1008         flusher.flushCache();
1009       }
1010 
1011       Callable<Void> atomicWork = internalPreFlushcacheCommit();
1012 
1013       LOG.debug("Caches flushed, doing commit now (which includes update scanners)");
1014 
1015       /**
1016        * Switch between memstore(snapshot) and the new store file
1017        */
1018       if (atomicWork != null) {
1019         LOG.debug("internalPreFlushcacheCommit gives us work to do, acquiring newScannerLock");
1020         newScannerLock.writeLock().lock();
1021       }
1022 
1023       try {
1024         if (atomicWork != null) {
1025           atomicWork.call();
1026         }
1027 
1028         // Switch snapshot (in memstore) -> new hfile (thus causing
1029         // all the store scanners to reset/reseek).
1030         for (StoreFlusher flusher : storeFlushers) {
1031           boolean needsCompaction = flusher.commit();
1032           if (needsCompaction) {
1033             compactionRequested = true;
1034           }
1035         }
1036       } finally {
1037         if (atomicWork != null) {
1038           newScannerLock.writeLock().unlock();
1039         }
1040       }
1041 
1042       storeFlushers.clear();
1043 
1044       // Set down the memstore size by amount of flush.
1045       this.memstoreSize.addAndGet(-currentMemStoreSize);
1046     } catch (Throwable t) {
1047       // An exception here means that the snapshot was not persisted.
1048       // The hlog needs to be replayed so its content is restored to memstore.
1049       // Currently, only a server restart will do this.
1050       // We used to only catch IOEs but its possible that we'd get other
1051       // exceptions -- e.g. HBASE-659 was about an NPE -- so now we catch
1052       // all and sundry.
1053       if (wal != null) wal.abortCacheFlush();
1054       DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
1055           Bytes.toStringBinary(getRegionName()));
1056       dse.initCause(t);
1057       throw dse;
1058     }
1059 
1060     // If we get to here, the HStores have been written. If we get an
1061     // error in completeCacheFlush it will release the lock it is holding
1062 
1063     // B.  Write a FLUSHCACHE-COMPLETE message to the log.
1064     //     This tells future readers that the HStores were emitted correctly,
1065     //     and that all updates to the log for this regionName that have lower
1066     //     log-sequence-ids can be safely ignored.
1067     if (wal != null) {
1068       wal.completeCacheFlush(getRegionName(),
1069         regionInfo.getTableDesc().getName(), completeSequenceId,
1070         this.getRegionInfo().isMetaRegion());
1071     }
1072 
1073     // C. Finally notify anyone waiting on memstore to clear:
1074     // e.g. checkResources().
1075     synchronized (this) {
1076       notifyAll(); // FindBugs NN_NAKED_NOTIFY
1077     }
1078 
1079     if (LOG.isDebugEnabled()) {
1080       long now = EnvironmentEdgeManager.currentTimeMillis();
1081       LOG.info("Finished memstore flush of ~" +
1082         StringUtils.humanReadableInt(currentMemStoreSize) + " for region " +
1083         this + " in " + (now - startTime) + "ms, sequenceid=" + sequenceId +
1084         ", compaction requested=" + compactionRequested +
1085         ((wal == null)? "; wal=null": ""));
1086     }
1087     return compactionRequested;
1088   }
1089 
1090    /**
1091     * A hook for sub classed wishing to perform operations prior to the cache
1092     * flush commit stage.
1093     *
1094     * If a subclass wishes that an atomic update of their work and the
1095     * flush commit stage happens, they should return a callable. The new scanner
1096     * lock will be acquired and released.
1097 
1098     * @throws java.io.IOException allow children to throw exception
1099     */
1100    protected Callable<Void> internalPreFlushcacheCommit() throws IOException {
1101      return null;
1102    }
1103 
1104    /**
1105    * Get the sequence number to be associated with this cache flush. Used by
1106    * TransactionalRegion to not complete pending transactions.
1107    *
1108    *
1109    * @param currentSequenceId
1110    * @return sequence id to complete the cache flush with
1111    */
1112   protected long getCompleteCacheFlushSequenceId(long currentSequenceId) {
1113     return currentSequenceId;
1114   }
1115 
1116   //////////////////////////////////////////////////////////////////////////////
1117   // get() methods for client use.
1118   //////////////////////////////////////////////////////////////////////////////
1119   /**
1120    * Return all the data for the row that matches <i>row</i> exactly,
1121    * or the one that immediately preceeds it, at or immediately before
1122    * <i>ts</i>.
1123    *
1124    * @param row row key
1125    * @return map of values
1126    * @throws IOException
1127    */
1128   Result getClosestRowBefore(final byte [] row)
1129   throws IOException{
1130     return getClosestRowBefore(row, HConstants.CATALOG_FAMILY);
1131   }
1132 
1133   /**
1134    * Return all the data for the row that matches <i>row</i> exactly,
1135    * or the one that immediately preceeds it, at or immediately before
1136    * <i>ts</i>.
1137    *
1138    * @param row row key
1139    * @param family column family to find on
1140    * @return map of values
1141    * @throws IOException read exceptions
1142    */
1143   public Result getClosestRowBefore(final byte [] row, final byte [] family)
1144   throws IOException {
1145     // look across all the HStores for this region and determine what the
1146     // closest key is across all column families, since the data may be sparse
1147     KeyValue key = null;
1148     checkRow(row);
1149     splitsAndClosesLock.readLock().lock();
1150     try {
1151       Store store = getStore(family);
1152       KeyValue kv = new KeyValue(row, HConstants.LATEST_TIMESTAMP);
1153       // get the closest key. (HStore.getRowKeyAtOrBefore can return null)
1154       key = store.getRowKeyAtOrBefore(kv);
1155       if (key == null) {
1156         return null;
1157       }
1158       Get get = new Get(key.getRow());
1159       get.addFamily(family);
1160       return get(get, null);
1161     } finally {
1162       splitsAndClosesLock.readLock().unlock();
1163     }
1164   }
1165 
1166   /**
1167    * Return an iterator that scans over the HRegion, returning the indicated
1168    * columns and rows specified by the {@link Scan}.
1169    * <p>
1170    * This Iterator must be closed by the caller.
1171    *
1172    * @param scan configured {@link Scan}
1173    * @return InternalScanner
1174    * @throws IOException read exceptions
1175    */
1176   public InternalScanner getScanner(Scan scan)
1177   throws IOException {
1178    return getScanner(scan, null);
1179   }
1180 
1181   protected InternalScanner getScanner(Scan scan, List<KeyValueScanner> additionalScanners) throws IOException {
1182     newScannerLock.readLock().lock();
1183     try {
1184       if (this.closed.get()) {
1185         throw new NotServingRegionException("Region " + this + " closed");
1186       }
1187       // Verify families are all valid
1188       if(scan.hasFamilies()) {
1189         for(byte [] family : scan.getFamilyMap().keySet()) {
1190           checkFamily(family);
1191         }
1192       } else { // Adding all families to scanner
1193         for(byte[] family: regionInfo.getTableDesc().getFamiliesKeys()){
1194           scan.addFamily(family);
1195         }
1196       }
1197       return instantiateInternalScanner(scan, additionalScanners);
1198 
1199     } finally {
1200       newScannerLock.readLock().unlock();
1201     }
1202   }
1203 
1204   protected InternalScanner instantiateInternalScanner(Scan scan, List<KeyValueScanner> additionalScanners) throws IOException {
1205     return new RegionScanner(scan, additionalScanners);
1206   }
1207 
1208   /*
1209    * @param delete The passed delete is modified by this method. WARNING!
1210    */
1211   private void prepareDelete(Delete delete) throws IOException {
1212     // Check to see if this is a deleteRow insert
1213     if(delete.getFamilyMap().isEmpty()){
1214       for(byte [] family : regionInfo.getTableDesc().getFamiliesKeys()){
1215         // Don't eat the timestamp
1216         delete.deleteFamily(family, delete.getTimeStamp());
1217       }
1218     } else {
1219       for(byte [] family : delete.getFamilyMap().keySet()) {
1220         if(family == null) {
1221           throw new NoSuchColumnFamilyException("Empty family is invalid");
1222         }
1223         checkFamily(family);
1224       }
1225     }
1226   }
1227 
1228   //////////////////////////////////////////////////////////////////////////////
1229   // set() methods for client use.
1230   //////////////////////////////////////////////////////////////////////////////
1231   /**
1232    * @param delete delete object
1233    * @param lockid existing lock id, or null for grab a lock
1234    * @param writeToWAL append to the write ahead lock or not
1235    * @throws IOException read exceptions
1236    */
1237   public void delete(Delete delete, Integer lockid, boolean writeToWAL)
1238   throws IOException {
1239     checkReadOnly();
1240     checkResources();
1241     Integer lid = null;
1242     splitsAndClosesLock.readLock().lock();
1243     try {
1244       byte [] row = delete.getRow();
1245       // If we did not pass an existing row lock, obtain a new one
1246       lid = getLock(lockid, row, true);
1247 
1248       // All edits for the given row (across all column families) must happen atomically.
1249       prepareDelete(delete);
1250       delete(delete.getFamilyMap(), writeToWAL);
1251 
1252     } finally {
1253       if(lockid == null) releaseRowLock(lid);
1254       splitsAndClosesLock.readLock().unlock();
1255     }
1256   }
1257 
1258 
1259   /**
1260    * @param familyMap map of family to edits for the given family.
1261    * @param writeToWAL
1262    * @throws IOException
1263    */
1264   public void delete(Map<byte[], List<KeyValue>> familyMap, boolean writeToWAL)
1265   throws IOException {
1266     long now = EnvironmentEdgeManager.currentTimeMillis();
1267     byte [] byteNow = Bytes.toBytes(now);
1268     boolean flush = false;
1269 
1270     updatesLock.readLock().lock();
1271 
1272     try {
1273 
1274       for (Map.Entry<byte[], List<KeyValue>> e : familyMap.entrySet()) {
1275 
1276         byte[] family = e.getKey();
1277         List<KeyValue> kvs = e.getValue();
1278         Map<byte[], Integer> kvCount = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
1279 
1280         for (KeyValue kv: kvs) {
1281           //  Check if time is LATEST, change to time of most recent addition if so
1282           //  This is expensive.
1283           if (kv.isLatestTimestamp() && kv.isDeleteType()) {
1284             byte[] qual = kv.getQualifier();
1285             if (qual == null) qual = HConstants.EMPTY_BYTE_ARRAY;
1286 
1287             Integer count = kvCount.get(qual);
1288             if (count == null) {
1289               kvCount.put(qual, 1);
1290             } else {
1291               kvCount.put(qual, count + 1);
1292             }
1293             count = kvCount.get(qual);
1294 
1295             Get get = new Get(kv.getRow());
1296             get.setMaxVersions(count);
1297             get.addColumn(family, qual);
1298 
1299             List<KeyValue> result = get(get);
1300 
1301             if (result.size() < count) {
1302               // Nothing to delete
1303               kv.updateLatestStamp(byteNow);
1304               continue;
1305             }
1306             if (result.size() > count) {
1307               throw new RuntimeException("Unexpected size: " + result.size());
1308             }
1309             KeyValue getkv = result.get(count - 1);
1310             Bytes.putBytes(kv.getBuffer(), kv.getTimestampOffset(),
1311                 getkv.getBuffer(), getkv.getTimestampOffset(), Bytes.SIZEOF_LONG);
1312           } else {
1313             kv.updateLatestStamp(byteNow);
1314           }
1315         }
1316       }
1317 
1318       if (writeToWAL) {
1319         // write/sync to WAL should happen before we touch memstore.
1320         //
1321         // If order is reversed, i.e. we write to memstore first, and
1322         // for some reason fail to write/sync to commit log, the memstore
1323         // will contain uncommitted transactions.
1324         //
1325         // bunch up all edits across all column families into a
1326         // single WALEdit.
1327         WALEdit walEdit = new WALEdit();
1328         addFamilyMapToWALEdit(familyMap, walEdit);
1329         this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
1330             walEdit, now);
1331       }
1332 
1333       // Now make changes to the memstore.
1334       long addedSize = applyFamilyMapToMemstore(familyMap);
1335       flush = isFlushSize(memstoreSize.addAndGet(addedSize));
1336     } finally {
1337       this.updatesLock.readLock().unlock();
1338     }
1339 
1340     if (flush) {
1341       // Request a cache flush.  Do it outside update lock.
1342       requestFlush();
1343     }
1344   }
1345 
1346   /**
1347    * @param put
1348    * @throws IOException
1349    */
1350   public void put(Put put) throws IOException {
1351     this.put(put, null, put.getWriteToWAL());
1352   }
1353 
1354   /**
1355    * @param put
1356    * @param writeToWAL
1357    * @throws IOException
1358    */
1359   public void put(Put put, boolean writeToWAL) throws IOException {
1360     this.put(put, null, writeToWAL);
1361   }
1362 
1363   /**
1364    * @param put
1365    * @param lockid
1366    * @throws IOException
1367    */
1368   public void put(Put put, Integer lockid) throws IOException {
1369     this.put(put, lockid, put.getWriteToWAL());
1370   }
1371 
1372   /**
1373    * @param put
1374    * @param lockid
1375    * @param writeToWAL
1376    * @throws IOException
1377    */
1378   public void put(Put put, Integer lockid, boolean writeToWAL)
1379   throws IOException {
1380     checkReadOnly();
1381 
1382     // Do a rough check that we have resources to accept a write.  The check is
1383     // 'rough' in that between the resource check and the call to obtain a
1384     // read lock, resources may run out.  For now, the thought is that this
1385     // will be extremely rare; we'll deal with it when it happens.
1386     checkResources();
1387     splitsAndClosesLock.readLock().lock();
1388 
1389     try {
1390       // We obtain a per-row lock, so other clients will block while one client
1391       // performs an update. The read lock is released by the client calling
1392       // #commit or #abort or if the HRegionServer lease on the lock expires.
1393       // See HRegionServer#RegionListener for how the expire on HRegionServer
1394       // invokes a HRegion#abort.
1395       byte [] row = put.getRow();
1396       // If we did not pass an existing row lock, obtain a new one
1397       Integer lid = getLock(lockid, row, true);
1398 
1399       try {
1400         // All edits for the given row (across all column families) must happen atomically.
1401         put(put.getFamilyMap(), writeToWAL);
1402       } finally {
1403         if(lockid == null) releaseRowLock(lid);
1404       }
1405     } finally {
1406       splitsAndClosesLock.readLock().unlock();
1407     }
1408   }
1409 
1410   /**
1411    * Struct-like class that tracks the progress of a batch operation,
1412    * accumulating status codes and tracking the index at which processing
1413    * is proceeding.
1414    */
1415   private static class BatchOperationInProgress<T> {
1416     T[] operations;
1417     OperationStatusCode[] retCodes;
1418     int nextIndexToProcess = 0;
1419 
1420     public BatchOperationInProgress(T[] operations) {
1421       this.operations = operations;
1422       retCodes = new OperationStatusCode[operations.length];
1423       Arrays.fill(retCodes, OperationStatusCode.NOT_RUN);
1424     }
1425     
1426     public boolean isDone() {
1427       return nextIndexToProcess == operations.length;
1428     }
1429   }
1430   
1431   /**
1432    * Perform a batch put with no pre-specified locks
1433    * @see HRegion#put(Pair[])
1434    */
1435   public OperationStatusCode[] put(Put[] puts) throws IOException {
1436     @SuppressWarnings("unchecked")
1437     Pair<Put, Integer> putsAndLocks[] = new Pair[puts.length];
1438 
1439     for (int i = 0; i < puts.length; i++) {
1440       putsAndLocks[i] = new Pair<Put, Integer>(puts[i], null);
1441     }
1442     return put(putsAndLocks);
1443   }
1444   
1445   /**
1446    * Perform a batch of puts.
1447    * @param putsAndLocks the list of puts paired with their requested lock IDs.
1448    * @throws IOException
1449    */
1450   public OperationStatusCode[] put(Pair<Put, Integer>[] putsAndLocks) throws IOException {
1451     BatchOperationInProgress<Pair<Put, Integer>> batchOp =
1452       new BatchOperationInProgress<Pair<Put,Integer>>(putsAndLocks);
1453     
1454     while (!batchOp.isDone()) {
1455       checkReadOnly();
1456       checkResources();
1457 
1458       long newSize;
1459       splitsAndClosesLock.readLock().lock();
1460       try {
1461         long addedSize = doMiniBatchPut(batchOp);
1462         newSize = memstoreSize.addAndGet(addedSize);
1463       } finally {
1464         splitsAndClosesLock.readLock().unlock();
1465       }
1466       if (isFlushSize(newSize)) {
1467         requestFlush();
1468       }
1469     }
1470     return batchOp.retCodes;
1471   }
1472 
1473   private long doMiniBatchPut(BatchOperationInProgress<Pair<Put, Integer>> batchOp) throws IOException {
1474     long now = EnvironmentEdgeManager.currentTimeMillis();
1475     byte[] byteNow = Bytes.toBytes(now);
1476 
1477     /** Keep track of the locks we hold so we can release them in finally clause */
1478     List<Integer> acquiredLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
1479     // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
1480     int firstIndex = batchOp.nextIndexToProcess;
1481     int lastIndexExclusive = firstIndex;
1482     boolean success = false;
1483     try {
1484       // ------------------------------------
1485       // STEP 1. Try to acquire as many locks as we can, and ensure
1486       // we acquire at least one.
1487       // ----------------------------------
1488       int numReadyToWrite = 0;
1489       while (lastIndexExclusive < batchOp.operations.length) {
1490         Pair<Put, Integer> nextPair = batchOp.operations[lastIndexExclusive];
1491         Put put = nextPair.getFirst();
1492         Integer providedLockId = nextPair.getSecond();
1493 
1494         // Check the families in the put. If bad, skip this one.
1495         try {
1496           checkFamilies(put.getFamilyMap().keySet());
1497         } catch (NoSuchColumnFamilyException nscf) {
1498           LOG.warn("No such column family in batch put", nscf);
1499           batchOp.retCodes[lastIndexExclusive] = OperationStatusCode.BAD_FAMILY;
1500           lastIndexExclusive++;
1501           continue;
1502         }
1503 
1504         // If we haven't got any rows in our batch, we should block to
1505         // get the next one.
1506         boolean shouldBlock = numReadyToWrite == 0;
1507         Integer acquiredLockId = getLock(providedLockId, put.getRow(), shouldBlock);
1508         if (acquiredLockId == null) {
1509           // We failed to grab another lock
1510           assert !shouldBlock : "Should never fail to get lock when blocking";
1511           break; // stop acquiring more rows for this batch
1512         }
1513         if (providedLockId == null) {
1514           acquiredLocks.add(acquiredLockId);
1515         }
1516         lastIndexExclusive++;
1517         numReadyToWrite++;
1518       }
1519       // We've now grabbed as many puts off the list as we can
1520       assert numReadyToWrite > 0;
1521 
1522       // ------------------------------------
1523       // STEP 2. Update any LATEST_TIMESTAMP timestamps
1524       // ----------------------------------
1525       for (int i = firstIndex; i < lastIndexExclusive; i++) {
1526         updateKVTimestamps(
1527             batchOp.operations[i].getFirst().getFamilyMap().values(),
1528             byteNow);
1529       }
1530       
1531       // ------------------------------------
1532       // STEP 3. Write to WAL
1533       // ----------------------------------
1534       WALEdit walEdit = new WALEdit();
1535       for (int i = firstIndex; i < lastIndexExclusive; i++) {
1536         // Skip puts that were determined to be invalid during preprocessing
1537         if (batchOp.retCodes[i] != OperationStatusCode.NOT_RUN) continue;
1538         
1539         Put p = batchOp.operations[i].getFirst();
1540         if (!p.getWriteToWAL()) continue;
1541         addFamilyMapToWALEdit(p.getFamilyMap(), walEdit);
1542       }
1543       
1544       // Append the edit to WAL
1545       this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
1546           walEdit, now);
1547 
1548       // ------------------------------------
1549       // STEP 4. Write back to memstore
1550       // ----------------------------------
1551       long addedSize = 0;
1552       for (int i = firstIndex; i < lastIndexExclusive; i++) {
1553         if (batchOp.retCodes[i] != OperationStatusCode.NOT_RUN) continue;
1554 
1555         Put p = batchOp.operations[i].getFirst();
1556         addedSize += applyFamilyMapToMemstore(p.getFamilyMap());
1557         batchOp.retCodes[i] = OperationStatusCode.SUCCESS;
1558       }
1559       success = true;
1560       return addedSize;
1561     } finally {
1562       for (Integer toRelease : acquiredLocks) {
1563         releaseRowLock(toRelease);
1564       }
1565       if (!success) {
1566         for (int i = firstIndex; i < lastIndexExclusive; i++) {
1567           if (batchOp.retCodes[i] == OperationStatusCode.NOT_RUN) {
1568             batchOp.retCodes[i] = OperationStatusCode.FAILURE;
1569           }
1570         }
1571       }
1572       batchOp.nextIndexToProcess = lastIndexExclusive;
1573     }
1574   }
1575 
1576   //TODO, Think that gets/puts and deletes should be refactored a bit so that
1577   //the getting of the lock happens before, so that you would just pass it into
1578   //the methods. So in the case of checkAndMutate you could just do lockRow,
1579   //get, put, unlockRow or something
1580   /**
1581    *
1582    * @param row
1583    * @param family
1584    * @param qualifier
1585    * @param expectedValue
1586    * @param lockId
1587    * @param writeToWAL
1588    * @throws IOException
1589    * @return true if the new put was execute, false otherwise
1590    */
1591   public boolean checkAndMutate(byte [] row, byte [] family, byte [] qualifier,
1592       byte [] expectedValue, Writable w, Integer lockId, boolean writeToWAL)
1593   throws IOException{
1594     checkReadOnly();
1595     //TODO, add check for value length or maybe even better move this to the
1596     //client if this becomes a global setting
1597     checkResources();
1598     boolean isPut = w instanceof Put;
1599     if (!isPut && !(w instanceof Delete))
1600       throw new IOException("Action must be Put or Delete");
1601 
1602     splitsAndClosesLock.readLock().lock();
1603     try {
1604       RowLock lock = isPut ? ((Put)w).getRowLock() : ((Delete)w).getRowLock();
1605       Get get = new Get(row, lock);
1606       checkFamily(family);
1607       get.addColumn(family, qualifier);
1608 
1609       // Lock row
1610       Integer lid = getLock(lockId, get.getRow(), true);
1611       List<KeyValue> result = new ArrayList<KeyValue>();
1612       try {
1613         result = get(get);
1614 
1615         boolean matches = false;
1616         if (result.size() == 0 && expectedValue.length == 0) {
1617           matches = true;
1618         } else if (result.size() == 1) {
1619           //Compare the expected value with the actual value
1620           byte [] actualValue = result.get(0).getValue();
1621           matches = Bytes.equals(expectedValue, actualValue);
1622         }
1623         //If matches put the new put or delete the new delete
1624         if (matches) {
1625           // All edits for the given row (across all column families) must happen atomically.
1626           if (isPut) {
1627             put(((Put)w).getFamilyMap(), writeToWAL);
1628           } else {
1629             Delete d = (Delete)w;
1630             prepareDelete(d);
1631             delete(d.getFamilyMap(), writeToWAL);
1632           }
1633           return true;
1634         }
1635         return false;
1636       } finally {
1637         if(lockId == null) releaseRowLock(lid);
1638       }
1639     } finally {
1640       splitsAndClosesLock.readLock().unlock();
1641     }
1642   }
1643 
1644 
1645   /**
1646    * Replaces any KV timestamps set to {@link HConstants#LATEST_TIMESTAMP}
1647    * with the provided current timestamp.
1648    */
1649   private void updateKVTimestamps(
1650       final Iterable<List<KeyValue>> keyLists, final byte[] now) {
1651     for (List<KeyValue> keys: keyLists) {
1652       if (keys == null) continue;
1653       for (KeyValue key : keys) {
1654         key.updateLatestStamp(now);
1655       }
1656     }
1657   }
1658 
1659 //  /*
1660 //   * Utility method to verify values length.
1661 //   * @param batchUpdate The update to verify
1662 //   * @throws IOException Thrown if a value is too long
1663 //   */
1664 //  private void validateValuesLength(Put put)
1665 //  throws IOException {
1666 //    Map<byte[], List<KeyValue>> families = put.getFamilyMap();
1667 //    for(Map.Entry<byte[], List<KeyValue>> entry : families.entrySet()) {
1668 //      HColumnDescriptor hcd =
1669 //        this.regionInfo.getTableDesc().getFamily(entry.getKey());
1670 //      int maxLen = hcd.getMaxValueLength();
1671 //      for(KeyValue kv : entry.getValue()) {
1672 //        if(kv.getValueLength() > maxLen) {
1673 //          throw new ValueOverMaxLengthException("Value in column "
1674 //            + Bytes.toString(kv.getColumn()) + " is too long. "
1675 //            + kv.getValueLength() + " > " + maxLen);
1676 //        }
1677 //      }
1678 //    }
1679 //  }
1680 
1681   /*
1682    * Check if resources to support an update.
1683    *
1684    * Here we synchronize on HRegion, a broad scoped lock.  Its appropriate
1685    * given we're figuring in here whether this region is able to take on
1686    * writes.  This is only method with a synchronize (at time of writing),
1687    * this and the synchronize on 'this' inside in internalFlushCache to send
1688    * the notify.
1689    */
1690   private void checkResources() {
1691 
1692     // If catalog region, do not impose resource constraints or block updates.
1693     if (this.getRegionInfo().isMetaRegion()) return;
1694 
1695     boolean blocked = false;
1696     while (this.memstoreSize.get() > this.blockingMemStoreSize) {
1697       requestFlush();
1698       if (!blocked) {
1699         LOG.info("Blocking updates for '" + Thread.currentThread().getName() +
1700           "' on region " + Bytes.toStringBinary(getRegionName()) +
1701           ": memstore size " +
1702           StringUtils.humanReadableInt(this.memstoreSize.get()) +
1703           " is >= than blocking " +
1704           StringUtils.humanReadableInt(this.blockingMemStoreSize) + " size");
1705       }
1706       blocked = true;
1707       synchronized(this) {
1708         try {
1709           wait(threadWakeFrequency);
1710         } catch (InterruptedException e) {
1711           // continue;
1712         }
1713       }
1714     }
1715     if (blocked) {
1716       LOG.info("Unblocking updates for region " + this + " '"
1717           + Thread.currentThread().getName() + "'");
1718     }
1719   }
1720 
1721   /**
1722    * @throws IOException Throws exception if region is in read-only mode.
1723    */
1724   protected void checkReadOnly() throws IOException {
1725     if (this.writestate.isReadOnly()) {
1726       throw new IOException("region is read only");
1727     }
1728   }
1729 
1730   /**
1731    * Add updates first to the hlog and then add values to memstore.
1732    * Warning: Assumption is caller has lock on passed in row.
1733    * @param family
1734    * @param edits Cell updates by column
1735    * @praram now
1736    * @throws IOException
1737    */
1738   private void put(final byte [] family, final List<KeyValue> edits)
1739   throws IOException {
1740     Map<byte[], List<KeyValue>> familyMap = new HashMap<byte[], List<KeyValue>>();
1741     familyMap.put(family, edits);
1742     this.put(familyMap, true);
1743   }
1744 
1745   /**
1746    * Add updates first to the hlog (if writeToWal) and then add values to memstore.
1747    * Warning: Assumption is caller has lock on passed in row.
1748    * @param familyMap map of family to edits for the given family.
1749    * @param writeToWAL if true, then we should write to the log
1750    * @throws IOException
1751    */
1752   private void put(final Map<byte [], List<KeyValue>> familyMap,
1753       boolean writeToWAL) throws IOException {
1754     long now = EnvironmentEdgeManager.currentTimeMillis();
1755     byte[] byteNow = Bytes.toBytes(now);
1756     boolean flush = false;
1757     this.updatesLock.readLock().lock();
1758     try {
1759       checkFamilies(familyMap.keySet());
1760       updateKVTimestamps(familyMap.values(), byteNow);
1761       // write/sync to WAL should happen before we touch memstore.
1762       //
1763       // If order is reversed, i.e. we write to memstore first, and
1764       // for some reason fail to write/sync to commit log, the memstore
1765       // will contain uncommitted transactions.
1766       if (writeToWAL) {
1767         WALEdit walEdit = new WALEdit();
1768         addFamilyMapToWALEdit(familyMap, walEdit);
1769         this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
1770            walEdit, now);
1771       }
1772 
1773       long addedSize = applyFamilyMapToMemstore(familyMap);
1774       flush = isFlushSize(memstoreSize.addAndGet(addedSize));
1775     } finally {
1776       this.updatesLock.readLock().unlock();
1777     }
1778     if (flush) {
1779       // Request a cache flush.  Do it outside update lock.
1780       requestFlush();
1781     }
1782   }
1783 
1784   /**
1785    * Atomically apply the given map of family->edits to the memstore.
1786    * This handles the consistency control on its own, but the caller
1787    * should already have locked updatesLock.readLock(). This also does
1788    * <b>not</b> check the families for validity.
1789    *
1790    * @return the additional memory usage of the memstore caused by the
1791    * new entries.
1792    */
1793   private long applyFamilyMapToMemstore(Map<byte[], List<KeyValue>> familyMap) {
1794     ReadWriteConsistencyControl.WriteEntry w = null;
1795     long size = 0;
1796     try {
1797       w = rwcc.beginMemstoreInsert();
1798 
1799       for (Map.Entry<byte[], List<KeyValue>> e : familyMap.entrySet()) {
1800         byte[] family = e.getKey();
1801         List<KeyValue> edits = e.getValue();
1802   
1803         Store store = getStore(family);
1804         for (KeyValue kv: edits) {
1805           kv.setMemstoreTS(w.getWriteNumber());
1806           size += store.add(kv);
1807         }
1808       }
1809     } finally {
1810       rwcc.completeMemstoreInsert(w);
1811     }
1812     return size;
1813   }
1814 
1815   /**
1816    * Check the collection of families for validity.
1817    * @throws NoSuchColumnFamilyException if a family does not exist.
1818    */
1819   private void checkFamilies(Collection<byte[]> families)
1820   throws NoSuchColumnFamilyException {
1821     for (byte[] family : families) {
1822       checkFamily(family);
1823     }
1824   }
1825 
1826   /**
1827    * Append the given map of family->edits to a WALEdit data structure.
1828    * This does not write to the HLog itself.
1829    * @param familyMap map of family->edits
1830    * @param walEdit the destination entry to append into
1831    */
1832   private void addFamilyMapToWALEdit(Map<byte[], List<KeyValue>> familyMap,
1833       WALEdit walEdit) {
1834     for (List<KeyValue> edits : familyMap.values()) {
1835       for (KeyValue kv : edits) {
1836         walEdit.add(kv);
1837       }
1838     }
1839   }
1840 
1841   private void requestFlush() {
1842     if (this.flushListener == null) {
1843       return;
1844     }
1845     synchronized (writestate) {
1846       if (this.writestate.isFlushRequested()) {
1847         return;
1848       }
1849       writestate.flushRequested = true;
1850     }
1851     // Make request outside of synchronize block; HBASE-818.
1852     this.flushListener.request(this);
1853     if (LOG.isDebugEnabled()) {
1854       LOG.debug("Flush requested on " + this);
1855     }
1856   }
1857 
1858   /*
1859    * @param size
1860    * @return True if size is over the flush threshold
1861    */
1862   private boolean isFlushSize(final long size) {
1863     return size > this.memstoreFlushSize;
1864   }
1865 
1866   /**
1867    * Read the edits log put under this region by wal log splitting process.  Put
1868    * the recovered edits back up into this region.
1869    *
1870    * <p>We can ignore any log message that has a sequence ID that's equal to or
1871    * lower than minSeqId.  (Because we know such log messages are already
1872    * reflected in the HFiles.)
1873    * 
1874    * <p>While this is running we are putting pressure on memory yet we are
1875    * outside of our usual accounting because we are not yet an onlined region
1876    * (this stuff is being run as part of Region initialization).  This means
1877    * that if we're up against global memory limits, we'll not be flagged to flush
1878    * because we are not online. We can't be flushed by usual mechanisms anyways;
1879    * we're not yet online so our relative sequenceids are not yet aligned with
1880    * HLog sequenceids -- not till we come up online, post processing of split
1881    * edits.
1882    * 
1883    * <p>But to help relieve memory pressure, at least manage our own heap size
1884    * flushing if are in excess of per-region limits.  Flushing, though, we have
1885    * to be careful and avoid using the regionserver/hlog sequenceid.  Its running
1886    * on a different line to whats going on in here in this region context so if we
1887    * crashed replaying these edits, but in the midst had a flush that used the
1888    * regionserver log with a sequenceid in excess of whats going on in here
1889    * in this region and with its split editlogs, then we could miss edits the
1890    * next time we go to recover. So, we have to flush inline, using seqids that
1891    * make sense in a this single region context only -- until we online.
1892    * 
1893    * @param regiondir
1894    * @param minSeqId Any edit found in split editlogs needs to be in excess of
1895    * this minSeqId to be applied, else its skipped.
1896    * @param reporter
1897    * @return the sequence id of the last edit added to this region out of the
1898    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
1899    * @throws UnsupportedEncodingException
1900    * @throws IOException
1901    */
1902   protected long replayRecoveredEditsIfAny(final Path regiondir,
1903       final long minSeqId, final Progressable reporter)
1904   throws UnsupportedEncodingException, IOException {
1905     long seqid = minSeqId;
1906     NavigableSet<Path> files = HLog.getSplitEditFilesSorted(this.fs, regiondir);
1907     if (files == null || files.isEmpty()) return seqid;
1908     for (Path edits: files) {
1909       if (edits == null || !this.fs.exists(edits)) {
1910         LOG.warn("Null or non-existent edits file: " + edits);
1911         continue;
1912       }
1913       if (isZeroLengthThenDelete(this.fs, edits)) continue;
1914       try {
1915         seqid = replayRecoveredEdits(edits, seqid, reporter);
1916       } catch (IOException e) {
1917         boolean skipErrors = conf.getBoolean("hbase.skip.errors", false);
1918         if (skipErrors) {
1919           Path p = HLog.moveAsideBadEditsFile(fs, edits);
1920           LOG.error("hbase.skip.errors=true so continuing. Renamed " + edits +
1921             " as " + p, e);
1922         } else {
1923           throw e;
1924         }
1925       }
1926     }
1927     if (seqid > minSeqId) {
1928       // Then we added some edits to memory. Flush and cleanup split edit files.
1929       internalFlushcache(null, seqid);
1930       for (Path file: files) {
1931         if (!this.fs.delete(file, false)) {
1932           LOG.error("Failed delete of " + file);
1933         } else {
1934           LOG.debug("Deleted recovered.edits file=" + file);
1935         }
1936       }
1937     }
1938     return seqid;
1939   }
1940 
1941   /*
1942    * @param edits File of recovered edits.
1943    * @param minSeqId Minimum sequenceid found in a store file.  Edits in log
1944    * must be larger than this to be replayed.
1945    * @param reporter
1946    * @return the sequence id of the last edit added to this region out of the
1947    * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
1948    * @throws IOException
1949    */
1950   private long replayRecoveredEdits(final Path edits,
1951       final long minSeqId, final Progressable reporter)
1952     throws IOException {
1953     LOG.info("Replaying edits from " + edits + "; minSequenceid=" + minSeqId);
1954     HLog.Reader reader = HLog.getReader(this.fs, edits, conf);
1955     try {
1956       return replayRecoveredEdits(reader, minSeqId, reporter);
1957     } finally {
1958       reader.close();
1959     }
1960   }
1961 
1962  /* @param reader Reader against file of recovered edits.
1963   * @param minSeqId Any edit found in split editlogs needs to be in excess of
1964   * this minSeqId to be applied, else its skipped.
1965   * @param reporter
1966   * @return the sequence id of the last edit added to this region out of the
1967   * recovered edits log or <code>minSeqId</code> if nothing added from editlogs.
1968   * @throws IOException
1969   */
1970   private long replayRecoveredEdits(final HLog.Reader reader,
1971     final long minSeqId, final Progressable reporter)
1972   throws IOException {
1973     long currentEditSeqId = minSeqId;
1974     long firstSeqIdInLog = -1;
1975     long skippedEdits = 0;
1976     long editsCount = 0;
1977     HLog.Entry entry;
1978     Store store = null;
1979     // How many edits to apply before we send a progress report.
1980     int interval = this.conf.getInt("hbase.hstore.report.interval.edits", 2000);
1981     while ((entry = reader.next()) != null) {
1982       HLogKey key = entry.getKey();
1983       WALEdit val = entry.getEdit();
1984       if (firstSeqIdInLog == -1) {
1985         firstSeqIdInLog = key.getLogSeqNum();
1986       }
1987       // Now, figure if we should skip this edit.
1988       if (key.getLogSeqNum() <= currentEditSeqId) {
1989         skippedEdits++;
1990         continue;
1991       }
1992       currentEditSeqId = key.getLogSeqNum();
1993       boolean flush = false;
1994       for (KeyValue kv: val.getKeyValues()) {
1995         // Check this edit is for me. Also, guard against writing the special
1996         // METACOLUMN info such as HBASE::CACHEFLUSH entries
1997         if (kv.matchingFamily(HLog.METAFAMILY) ||
1998             !Bytes.equals(key.getRegionName(), this.regionInfo.getRegionName())) {
1999           skippedEdits++;
2000           continue;
2001         }
2002         // Figure which store the edit is meant for.
2003         if (store == null || !kv.matchingFamily(store.getFamily().getName())) {
2004           store = this.stores.get(kv.getFamily());
2005         }
2006         if (store == null) {
2007           // This should never happen.  Perhaps schema was changed between
2008           // crash and redeploy?
2009           LOG.warn("No family for " + kv);
2010           skippedEdits++;
2011           continue;
2012         }
2013         // Once we are over the limit, restoreEdit will keep returning true to
2014         // flush -- but don't flush until we've played all the kvs that make up
2015         // the WALEdit.
2016         flush = restoreEdit(store, kv);
2017         editsCount++;
2018      }
2019      if (flush) internalFlushcache(null, currentEditSeqId);
2020 
2021       // Every 'interval' edits, tell the reporter we're making progress.
2022       // Have seen 60k edits taking 3minutes to complete.
2023       if (reporter != null && (editsCount % interval) == 0) {
2024         reporter.progress();
2025       }
2026     }
2027     if (LOG.isDebugEnabled()) {
2028       LOG.debug("Applied " + editsCount + ", skipped " + skippedEdits +
2029         ", firstSequenceidInLog=" + firstSeqIdInLog +
2030         ", maxSequenceidInLog=" + currentEditSeqId);
2031     }
2032     return currentEditSeqId;
2033   }
2034 
2035   /**
2036    * Used by tests
2037    * @param s Store to add edit too.
2038    * @param kv KeyValue to add.
2039    * @return True if we should flush.
2040    */
2041   protected boolean restoreEdit(final Store s, final KeyValue kv) {
2042     return isFlushSize(this.memstoreSize.addAndGet(s.add(kv)));
2043   }
2044 
2045   /*
2046    * @param fs
2047    * @param p File to check.
2048    * @return True if file was zero-length (and if so, we'll delete it in here).
2049    * @throws IOException
2050    */
2051   private static boolean isZeroLengthThenDelete(final FileSystem fs, final Path p)
2052   throws IOException {
2053     FileStatus stat = fs.getFileStatus(p);
2054     if (stat.getLen() > 0) return false;
2055     LOG.warn("File " + p + " is zero-length, deleting.");
2056     fs.delete(p, false);
2057     return true;
2058   }
2059 
2060   protected Store instantiateHStore(Path tableDir, HColumnDescriptor c)
2061   throws IOException {
2062     return new Store(tableDir, this, c, this.fs, this.conf);
2063   }
2064 
2065   /**
2066    * Return HStore instance.
2067    * Use with caution.  Exposed for use of fixup utilities.
2068    * @param column Name of column family hosted by this region.
2069    * @return Store that goes with the family on passed <code>column</code>.
2070    * TODO: Make this lookup faster.
2071    */
2072   public Store getStore(final byte [] column) {
2073     return this.stores.get(column);
2074   }
2075 
2076   //////////////////////////////////////////////////////////////////////////////
2077   // Support code
2078   //////////////////////////////////////////////////////////////////////////////
2079 
2080   /** Make sure this is a valid row for the HRegion */
2081   private void checkRow(final byte [] row) throws IOException {
2082     if(!rowIsInRange(regionInfo, row)) {
2083       throw new WrongRegionException("Requested row out of range for " +
2084           "HRegion " + this + ", startKey='" +
2085           Bytes.toStringBinary(regionInfo.getStartKey()) + "', getEndKey()='" +
2086           Bytes.toStringBinary(regionInfo.getEndKey()) + "', row='" +
2087           Bytes.toStringBinary(row) + "'");
2088     }
2089   }
2090 
2091   /**
2092    * Obtain a lock on the given row.  Blocks until success.
2093    *
2094    * I know it's strange to have two mappings:
2095    * <pre>
2096    *   ROWS  ==> LOCKS
2097    * </pre>
2098    * as well as
2099    * <pre>
2100    *   LOCKS ==> ROWS
2101    * </pre>
2102    *
2103    * But it acts as a guard on the client; a miswritten client just can't
2104    * submit the name of a row and start writing to it; it must know the correct
2105    * lockid, which matches the lock list in memory.
2106    *
2107    * <p>It would be more memory-efficient to assume a correctly-written client,
2108    * which maybe we'll do in the future.
2109    *
2110    * @param row Name of row to lock.
2111    * @throws IOException
2112    * @return The id of the held lock.
2113    */
2114   public Integer obtainRowLock(final byte [] row) throws IOException {
2115     return internalObtainRowLock(row, true);
2116   }
2117 
2118   /**
2119    * Tries to obtain a row lock on the given row, but does not block if the
2120    * row lock is not available. If the lock is not available, returns false.
2121    * Otherwise behaves the same as the above method.
2122    * @see HRegion#obtainRowLock(byte[])
2123    */
2124   public Integer tryObtainRowLock(final byte[] row) throws IOException {
2125     return internalObtainRowLock(row, false);
2126   }
2127   
2128   /**
2129    * Obtains or tries to obtain the given row lock.
2130    * @param waitForLock if true, will block until the lock is available.
2131    *        Otherwise, just tries to obtain the lock and returns
2132    *        null if unavailable.
2133    */
2134   private Integer internalObtainRowLock(final byte[] row, boolean waitForLock)
2135   throws IOException {
2136     checkRow(row);
2137     splitsAndClosesLock.readLock().lock();
2138     try {
2139       if (this.closed.get()) {
2140         throw new NotServingRegionException(this + " is closed");
2141       }
2142       synchronized (lockedRows) {
2143         while (lockedRows.contains(row)) {
2144           if (!waitForLock) {
2145             return null;
2146           }
2147           try {
2148             lockedRows.wait();
2149           } catch (InterruptedException ie) {
2150             // Empty
2151           }
2152         }
2153         // generate a new lockid. Attempt to insert the new [lockid, row].
2154         // if this lockid already exists in the map then revert and retry
2155         // We could have first done a lockIds.get, and if it does not exist only
2156         // then do a lockIds.put, but the hope is that the lockIds.put will
2157         // mostly return null the first time itself because there won't be
2158         // too many lockId collisions.
2159         byte [] prev = null;
2160         Integer lockId = null;
2161         do {
2162           lockId = new Integer(lockIdGenerator++);
2163           prev = lockIds.put(lockId, row);
2164           if (prev != null) {
2165             lockIds.put(lockId, prev);    // revert old value
2166             lockIdGenerator = rand.nextInt(); // generate new start point
2167           }
2168         } while (prev != null);
2169 
2170         lockedRows.add(row);
2171         lockedRows.notifyAll();
2172         return lockId;
2173       }
2174     } finally {
2175       splitsAndClosesLock.readLock().unlock();
2176     }
2177   }
2178   
2179   /**
2180    * Used by unit tests.
2181    * @param lockid
2182    * @return Row that goes with <code>lockid</code>
2183    */
2184   byte [] getRowFromLock(final Integer lockid) {
2185     synchronized (lockedRows) {
2186       return lockIds.get(lockid);
2187     }
2188   }
2189 
2190   /**
2191    * Release the row lock!
2192    * @param lockid  The lock ID to release.
2193    */
2194   void releaseRowLock(final Integer lockid) {
2195     synchronized (lockedRows) {
2196       byte[] row = lockIds.remove(lockid);
2197       lockedRows.remove(row);
2198       lockedRows.notifyAll();
2199     }
2200   }
2201 
2202   /**
2203    * See if row is currently locked.
2204    * @param lockid
2205    * @return boolean
2206    */
2207   boolean isRowLocked(final Integer lockid) {
2208     synchronized (lockedRows) {
2209       if (lockIds.get(lockid) != null) {
2210         return true;
2211       }
2212       return false;
2213     }
2214   }
2215 
2216   /**
2217    * Returns existing row lock if found, otherwise
2218    * obtains a new row lock and returns it.
2219    * @param lockid requested by the user, or null if the user didn't already hold lock
2220    * @param row the row to lock
2221    * @param waitForLock if true, will block until the lock is available, otherwise will
2222    * simply return null if it could not acquire the lock.
2223    * @return lockid or null if waitForLock is false and the lock was unavailable.
2224    */
2225   private Integer getLock(Integer lockid, byte [] row, boolean waitForLock)
2226   throws IOException {
2227     Integer lid = null;
2228     if (lockid == null) {
2229       lid = internalObtainRowLock(row, waitForLock);
2230     } else {
2231       if (!isRowLocked(lockid)) {
2232         throw new IOException("Invalid row lock");
2233       }
2234       lid = lockid;
2235     }
2236     return lid;
2237   }
2238 
2239   private void waitOnRowLocks() {
2240     synchronized (lockedRows) {
2241       while (!this.lockedRows.isEmpty()) {
2242         if (LOG.isDebugEnabled()) {
2243           LOG.debug("Waiting on " + this.lockedRows.size() + " row locks");
2244         }
2245         try {
2246           this.lockedRows.wait();
2247         } catch (InterruptedException e) {
2248           // Catch. Let while test determine loop-end.
2249         }
2250       }
2251     }
2252   }
2253 
2254   public void bulkLoadHFile(String hfilePath, byte[] familyName)
2255   throws IOException {
2256     splitsAndClosesLock.readLock().lock();
2257     try {
2258       Store store = getStore(familyName);
2259       if (store == null) {
2260         throw new DoNotRetryIOException(
2261             "No such column family " + Bytes.toStringBinary(familyName));
2262       }
2263       store.bulkLoadHFile(hfilePath);
2264     } finally {
2265       splitsAndClosesLock.readLock().unlock();
2266     }
2267 
2268   }
2269 
2270 
2271   @Override
2272   public boolean equals(Object o) {
2273     if (!(o instanceof HRegion)) {
2274       return false;
2275     }
2276     return this.hashCode() == ((HRegion)o).hashCode();
2277   }
2278 
2279   @Override
2280   public int hashCode() {
2281     return Bytes.hashCode(this.regionInfo.getRegionName());
2282   }
2283 
2284   @Override
2285   public String toString() {
2286     return this.regionInfo.getRegionNameAsString();
2287   }
2288 
2289   /** @return Path of region base directory */
2290   public Path getTableDir() {
2291     return this.tableDir;
2292   }
2293 
2294   /**
2295    * RegionScanner is an iterator through a bunch of rows in an HRegion.
2296    * <p>
2297    * It is used to combine scanners from multiple Stores (aka column families).
2298    */
2299   class RegionScanner implements InternalScanner {
2300     // Package local for testability
2301     KeyValueHeap storeHeap = null;
2302     private final byte [] stopRow;
2303     private Filter filter;
2304     private List<KeyValue> results = new ArrayList<KeyValue>();
2305     private int batch;
2306     private int isScan;
2307     private boolean filterClosed = false;
2308     private long readPt;
2309 
2310     RegionScanner(Scan scan, List<KeyValueScanner> additionalScanners) throws IOException {
2311       //DebugPrint.println("HRegionScanner.<init>");
2312       this.filter = scan.getFilter();
2313       this.batch = scan.getBatch();
2314       if (Bytes.equals(scan.getStopRow(), HConstants.EMPTY_END_ROW)) {
2315         this.stopRow = null;
2316       } else {
2317         this.stopRow = scan.getStopRow();
2318       }
2319       // If we are doing a get, we want to be [startRow,endRow] normally
2320       // it is [startRow,endRow) and if startRow=endRow we get nothing.
2321       this.isScan = scan.isGetScan() ? -1 : 0;
2322 
2323       this.readPt = ReadWriteConsistencyControl.resetThreadReadPoint(rwcc);
2324 
2325       List<KeyValueScanner> scanners = new ArrayList<KeyValueScanner>();
2326       if (additionalScanners != null) {
2327         scanners.addAll(additionalScanners);
2328       }
2329 
2330       for (Map.Entry<byte[], NavigableSet<byte[]>> entry :
2331           scan.getFamilyMap().entrySet()) {
2332         Store store = stores.get(entry.getKey());
2333         scanners.add(store.getScanner(scan, entry.getValue()));
2334       }
2335       this.storeHeap = new KeyValueHeap(scanners, comparator);
2336     }
2337 
2338     RegionScanner(Scan scan) throws IOException {
2339       this(scan, null);
2340     }
2341 
2342     /**
2343      * Reset both the filter and the old filter.
2344      */
2345     protected void resetFilters() {
2346       if (filter != null) {
2347         filter.reset();
2348       }
2349     }
2350 
2351     public synchronized boolean next(List<KeyValue> outResults, int limit)
2352         throws IOException {
2353       if (this.filterClosed) {
2354         throw new UnknownScannerException("Scanner was closed (timed out?) " +
2355             "after we renewed it. Could be caused by a very slow scanner " +
2356             "or a lengthy garbage collection");
2357       }
2358       if (closing.get() || closed.get()) {
2359         close();
2360         throw new NotServingRegionException(regionInfo.getRegionNameAsString() +
2361           " is closing=" + closing.get() + " or closed=" + closed.get());
2362       }
2363 
2364       // This could be a new thread from the last time we called next().
2365       ReadWriteConsistencyControl.setThreadReadPoint(this.readPt);
2366 
2367       results.clear();
2368       boolean returnResult = nextInternal(limit);
2369 
2370       outResults.addAll(results);
2371       resetFilters();
2372       if (isFilterDone()) {
2373         return false;
2374       }
2375       return returnResult;
2376     }
2377 
2378     public synchronized boolean next(List<KeyValue> outResults)
2379         throws IOException {
2380       // apply the batching limit by default
2381       return next(outResults, batch);
2382     }
2383 
2384     /*
2385      * @return True if a filter rules the scanner is over, done.
2386      */
2387     synchronized boolean isFilterDone() {
2388       return this.filter != null && this.filter.filterAllRemaining();
2389     }
2390 
    /*
     * Core row-assembly loop of the scanner: pulls KeyValues for the row at the
     * head of the store heap into the 'results' field, applies the filter (if
     * any) and decides whether the caller may expect more data.
     *
     * @param limit maximum number of KeyValues to place in 'results' for this
     * call; the limit check below only applies when limit > 0.
     * @return false once the stop row (or the end of the heap) is reached;
     * true when the limit was hit, or when a row was accepted and the next row
     * is not the stop row.
     * @throws IOException from the underlying store heap
     * @throws IncompatibleFilterException if the limit is hit while the filter
     * reports hasFilterRow()
     */
    private boolean nextInternal(int limit) throws IOException {
      while (true) {
        byte [] currentRow = peekRow();
        if (isStopRow(currentRow)) {
          // Scan is over (heap exhausted or stop row reached). Give the filter a
          // last chance to act on whatever is already sitting in 'results'.
          if (filter != null && filter.hasFilterRow()) {
            filter.filterRow(results);
          }
          if (filter != null && filter.filterRow()) {
            results.clear();
          }

          return false;
        } else if (filterRowKey(currentRow)) {
          // The filter rejected the row by key alone: skip all of its KeyValues.
          nextRow(currentRow);
        } else {
          byte [] nextRow;
          do {
            // Ask the heap for at most the remaining number of KeyValues.
            this.storeHeap.next(results, limit - results.size());
            if (limit > 0 && results.size() == limit) {
              if (this.filter != null && filter.hasFilterRow()) throw new IncompatibleFilterException(
                  "Filter with filterRow(List<KeyValue>) incompatible with scan with limit!");
              return true; // we are expecting more yes, but also limited to how many we can return.
            }
            // Keep reading while the head of the heap still belongs to this row.
          } while (Bytes.equals(currentRow, nextRow = peekRow()));

          // Remember whether the row that follows ends the scan; it decides the
          // return value below.
          final boolean stopRow = isStopRow(nextRow);

          // now that we have an entire row, lets process with a filters:

          // first filter with the filterRow(List)
          if (filter != null && filter.hasFilterRow()) {
            filter.filterRow(results);
          }

          if (results.isEmpty() || filterRow()) {
            // this seems like a redundant step - we already consumed the row
            // there're no left overs.
            // the reasons for calling this method are:
            // 1. reset the filters.
            // 2. provide a hook to fast forward the row (used by subclasses)
            nextRow(currentRow);

            // This row was totally filtered out, if this is NOT the last row,
            // we should continue on.

            if (!stopRow) continue;
          }
          return !stopRow;
        }
      }
    }
2442 
2443     private boolean filterRow() {
2444       return filter != null
2445           && filter.filterRow();
2446     }
2447     private boolean filterRowKey(byte[] row) {
2448       return filter != null
2449           && filter.filterRowKey(row, 0, row.length);
2450     }
2451 
2452     protected void nextRow(byte [] currentRow) throws IOException {
2453       while (Bytes.equals(currentRow, peekRow())) {
2454         this.storeHeap.next(MOCKED_LIST);
2455       }
2456       results.clear();
2457       resetFilters();
2458     }
2459 
2460     private byte[] peekRow() {
2461       KeyValue kv = this.storeHeap.peek();
2462       return kv == null ? null : kv.getRow();
2463     }
2464 
2465     private boolean isStopRow(byte [] currentRow) {
2466       return currentRow == null ||
2467           (stopRow != null &&
2468           comparator.compareRows(stopRow, 0, stopRow.length,
2469               currentRow, 0, currentRow.length) <= isScan);
2470     }
2471 
2472     public synchronized void close() {
2473       if (storeHeap != null) {
2474         storeHeap.close();
2475         storeHeap = null;
2476       }
2477       this.filterClosed = true;
2478     }
2479   }
2480 
2481   // Utility methods
2482   /**
2483    * A utility method to create new instances of HRegion based on the
2484    * {@link HConstants#REGION_IMPL} configuration property.
2485    * @param tableDir qualified path of directory where region should be located,
2486    * usually the table directory.
2487    * @param log The HLog is the outbound log for any updates to the HRegion
2488    * (There's a single HLog for all the HRegions on a single HRegionServer.)
2489    * The log file is a logfile from the previous execution that's
2490    * custom-computed for this HRegion. The HRegionServer computes and sorts the
2491    * appropriate log info for this HRegion. If there is a previous log file
2492    * (implying that the HRegion has been written-to before), then read it from
2493    * the supplied path.
2494    * @param fs is the filesystem.
2495    * @param conf is global configuration settings.
2496    * @param regionInfo - HRegionInfo that describes the region
2497    * is new), then read them from the supplied path.
2498    * @param flushListener an object that implements CacheFlushListener or null
2499    * making progress to master -- otherwise master might think region deploy
2500    * failed.  Can be null.
2501    * @return the new instance
2502    */
2503   public static HRegion newHRegion(Path tableDir, HLog log, FileSystem fs, Configuration conf,
2504                                    HRegionInfo regionInfo, FlushRequester flushListener) {
2505     try {
2506       @SuppressWarnings("unchecked")
2507       Class<? extends HRegion> regionClass =
2508           (Class<? extends HRegion>) conf.getClass(HConstants.REGION_IMPL, HRegion.class);
2509 
2510       Constructor<? extends HRegion> c =
2511           regionClass.getConstructor(Path.class, HLog.class, FileSystem.class,
2512               Configuration.class, HRegionInfo.class, FlushRequester.class);
2513 
2514       return c.newInstance(tableDir, log, fs, conf, regionInfo, flushListener);
2515     } catch (Throwable e) {
2516       // todo: what should I throw here?
2517       throw new IllegalStateException("Could not instantiate a region instance.", e);
2518     }
2519   }
2520 
2521   /**
2522    * Convenience method creating new HRegions. Used by createTable and by the
2523    * bootstrap code in the HMaster constructor.
2524    * Note, this method creates an {@link HLog} for the created region. It
2525    * needs to be closed explicitly.  Use {@link HRegion#getLog()} to get
2526    * access.
2527    * @param info Info for region to create.
2528    * @param rootDir Root directory for HBase instance
2529    * @param conf
2530    * @return new HRegion
2531    *
2532    * @throws IOException
2533    */
2534   public static HRegion createHRegion(final HRegionInfo info, final Path rootDir,
2535     final Configuration conf)
2536   throws IOException {
2537     Path tableDir =
2538       HTableDescriptor.getTableDir(rootDir, info.getTableDesc().getName());
2539     Path regionDir = HRegion.getRegionDir(tableDir, info.getEncodedName());
2540     FileSystem fs = FileSystem.get(conf);
2541     fs.mkdirs(regionDir);
2542     HRegion region = HRegion.newHRegion(tableDir,
2543       new HLog(fs, new Path(regionDir, HConstants.HREGION_LOGDIR_NAME),
2544           new Path(regionDir, HConstants.HREGION_OLDLOGDIR_NAME), conf, null),
2545       fs, conf, info, null);
2546     region.initialize();
2547     return region;
2548   }
2549 
2550   /**
2551    * Convenience method to open a HRegion outside of an HRegionServer context.
2552    * @param info Info for region to be opened.
2553    * @param rootDir Root directory for HBase instance
2554    * @param log HLog for region to use. This method will call
2555    * HLog#setSequenceNumber(long) passing the result of the call to
2556    * HRegion#getMinSequenceId() to ensure the log id is properly kept
2557    * up.  HRegionStore does this every time it opens a new region.
2558    * @param conf
2559    * @return new HRegion
2560    *
2561    * @throws IOException
2562    */
2563   public static HRegion openHRegion(final HRegionInfo info, final Path rootDir,
2564     final HLog log, final Configuration conf)
2565   throws IOException {
2566     if (LOG.isDebugEnabled()) {
2567       LOG.debug("Opening region: " + info);
2568     }
2569     if (info == null) {
2570       throw new NullPointerException("Passed region info is null");
2571     }
2572     HRegion r = HRegion.newHRegion(
2573         HTableDescriptor.getTableDir(rootDir, info.getTableDesc().getName()),
2574         log, FileSystem.get(conf), conf, info, null);
2575     long seqid = r.initialize();
2576     // If seqid  > current wal seqid, the wal seqid is updated.
2577     if (log != null) log.setSequenceNumber(seqid);
2578     return r;
2579   }
2580 
2581   /**
2582    * Inserts a new region's meta information into the passed
2583    * <code>meta</code> region. Used by the HMaster bootstrap code adding
2584    * new table to ROOT table.
2585    *
2586    * @param meta META HRegion to be updated
2587    * @param r HRegion to add to <code>meta</code>
2588    *
2589    * @throws IOException
2590    */
2591   public static void addRegionToMETA(HRegion meta, HRegion r)
2592   throws IOException {
2593     meta.checkResources();
2594     // The row key is the region name
2595     byte[] row = r.getRegionName();
2596     Integer lid = meta.obtainRowLock(row);
2597     try {
2598       final List<KeyValue> edits = new ArrayList<KeyValue>(1);
2599       edits.add(new KeyValue(row, HConstants.CATALOG_FAMILY,
2600           HConstants.REGIONINFO_QUALIFIER,
2601           EnvironmentEdgeManager.currentTimeMillis(),
2602           Writables.getBytes(r.getRegionInfo())));
2603       meta.put(HConstants.CATALOG_FAMILY, edits);
2604     } finally {
2605       meta.releaseRowLock(lid);
2606     }
2607   }
2608 
2609   /**
2610    * Delete a region's meta information from the passed
2611    * <code>meta</code> region.  Deletes the row.
2612    * @param srvr META server to be updated
2613    * @param metaRegionName Meta region name
2614    * @param regionName HRegion to remove from <code>meta</code>
2615    *
2616    * @throws IOException
2617    */
2618   public static void removeRegionFromMETA(final HRegionInterface srvr,
2619     final byte [] metaRegionName, final byte [] regionName)
2620   throws IOException {
2621     Delete delete = new Delete(regionName);
2622     srvr.delete(metaRegionName, delete);
2623   }
2624 
2625   /**
2626    * Utility method used by HMaster marking regions offlined.
2627    * @param srvr META server to be updated
2628    * @param metaRegionName Meta region name
2629    * @param info HRegion to update in <code>meta</code>
2630    *
2631    * @throws IOException
2632    */
2633   public static void offlineRegionInMETA(final HRegionInterface srvr,
2634     final byte [] metaRegionName, final HRegionInfo info)
2635   throws IOException {
2636     // Puts and Deletes used to be "atomic" here.  We can use row locks if
2637     // we need to keep that property, or we can expand Puts and Deletes to
2638     // allow them to be committed at once.
2639     byte [] row = info.getRegionName();
2640     Put put = new Put(row);
2641     info.setOffline(true);
2642     put.add(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER,
2643         Writables.getBytes(info));
2644     srvr.put(metaRegionName, put);
2645     cleanRegionInMETA(srvr, metaRegionName, info);
2646   }
2647 
2648   /**
2649    * Clean COL_SERVER and COL_STARTCODE for passed <code>info</code> in
2650    * <code>.META.</code>
2651    * @param srvr
2652    * @param metaRegionName
2653    * @param info
2654    * @throws IOException
2655    */
2656   public static void cleanRegionInMETA(final HRegionInterface srvr,
2657     final byte [] metaRegionName, final HRegionInfo info)
2658   throws IOException {
2659     Delete del = new Delete(info.getRegionName());
2660     del.deleteColumns(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER);
2661     del.deleteColumns(HConstants.CATALOG_FAMILY,
2662         HConstants.STARTCODE_QUALIFIER);
2663     srvr.delete(metaRegionName, del);
2664   }
2665 
2666   /**
2667    * Deletes all the files for a HRegion
2668    *
2669    * @param fs the file system object
2670    * @param rootdir qualified path of HBase root directory
2671    * @param info HRegionInfo for region to be deleted
2672    * @throws IOException
2673    */
2674   public static void deleteRegion(FileSystem fs, Path rootdir, HRegionInfo info)
2675   throws IOException {
2676     deleteRegion(fs, HRegion.getRegionDir(rootdir, info));
2677   }
2678 
2679   private static void deleteRegion(FileSystem fs, Path regiondir)
2680   throws IOException {
2681     if (LOG.isDebugEnabled()) {
2682       LOG.debug("DELETING region " + regiondir.toString());
2683     }
2684     if (!fs.delete(regiondir, true)) {
2685       LOG.warn("Failed delete of " + regiondir);
2686     }
2687   }
2688 
2689   /**
2690    * Computes the Path of the HRegion
2691    *
2692    * @param tabledir qualified path for table
2693    * @param name ENCODED region name
2694    * @return Path of HRegion directory
2695    */
2696   public static Path getRegionDir(final Path tabledir, final String name) {
2697     return new Path(tabledir, name);
2698   }
2699 
2700   /**
2701    * Computes the Path of the HRegion
2702    *
2703    * @param rootdir qualified path of HBase root directory
2704    * @param info HRegionInfo for the region
2705    * @return qualified path of region directory
2706    */
2707   public static Path getRegionDir(final Path rootdir, final HRegionInfo info) {
2708     return new Path(
2709       HTableDescriptor.getTableDir(rootdir, info.getTableDesc().getName()),
2710                                    info.getEncodedName());
2711   }
2712 
2713   /**
2714    * Determines if the specified row is within the row range specified by the
2715    * specified HRegionInfo
2716    *
2717    * @param info HRegionInfo that specifies the row range
2718    * @param row row to be checked
2719    * @return true if the row is within the range specified by the HRegionInfo
2720    */
2721   public static boolean rowIsInRange(HRegionInfo info, final byte [] row) {
2722     return ((info.getStartKey().length == 0) ||
2723         (Bytes.compareTo(info.getStartKey(), row) <= 0)) &&
2724         ((info.getEndKey().length == 0) ||
2725             (Bytes.compareTo(info.getEndKey(), row) > 0));
2726   }
2727 
2728   /**
2729    * Make the directories for a specific column family
2730    *
2731    * @param fs the file system
2732    * @param tabledir base directory where region will live (usually the table dir)
2733    * @param hri
2734    * @param colFamily the column family
2735    * @throws IOException
2736    */
2737   public static void makeColumnFamilyDirs(FileSystem fs, Path tabledir,
2738     final HRegionInfo hri, byte [] colFamily)
2739   throws IOException {
2740     Path dir = Store.getStoreHomedir(tabledir, hri.getEncodedName(), colFamily);
2741     if (!fs.mkdirs(dir)) {
2742       LOG.warn("Failed to create " + dir);
2743     }
2744   }
2745 
2746   /**
2747    * Merge two HRegions.  The regions must be adjacent and must not overlap.
2748    *
2749    * @param srcA
2750    * @param srcB
2751    * @return new merged HRegion
2752    * @throws IOException
2753    */
2754   public static HRegion mergeAdjacent(final HRegion srcA, final HRegion srcB)
2755   throws IOException {
2756     HRegion a = srcA;
2757     HRegion b = srcB;
2758 
2759     // Make sure that srcA comes first; important for key-ordering during
2760     // write of the merged file.
2761     if (srcA.getStartKey() == null) {
2762       if (srcB.getStartKey() == null) {
2763         throw new IOException("Cannot merge two regions with null start key");
2764       }
2765       // A's start key is null but B's isn't. Assume A comes before B
2766     } else if ((srcB.getStartKey() == null) ||
2767       (Bytes.compareTo(srcA.getStartKey(), srcB.getStartKey()) > 0)) {
2768       a = srcB;
2769       b = srcA;
2770     }
2771 
2772     if (!(Bytes.compareTo(a.getEndKey(), b.getStartKey()) == 0)) {
2773       throw new IOException("Cannot merge non-adjacent regions");
2774     }
2775     return merge(a, b);
2776   }
2777 
  /**
   * Merge two regions whether they are adjacent or not.
   * <p>
   * Both source regions are flushed and compacted via
   * <code>compactStores(true)</code>, then closed. Their store files are
   * renamed into a freshly created region directory, the new region is
   * initialized and compacted, and finally the two source region directories
   * are deleted.
   *
   * @param a region a
   * @param b region b
   * @return new merged region
   * @throws IOException if the regions belong to different tables, if the
   * target region directory already exists, or if two store files of one
   * family share the same max sequence id
   */
  public static HRegion merge(HRegion a, HRegion b) throws IOException {
    if (!a.getRegionInfo().getTableDesc().getNameAsString().equals(
        b.getRegionInfo().getTableDesc().getNameAsString())) {
      throw new IOException("Regions do not belong to the same table");
    }

    FileSystem fs = a.getFilesystem();

    // Make sure each region's cache is empty

    a.flushcache();
    b.flushcache();

    // Compact each region so we only have one store file per family

    a.compactStores(true);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Files for region: " + a);
      listPaths(fs, a.getRegionDir());
    }
    b.compactStores(true);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Files for region: " + b);
      listPaths(fs, b.getRegionDir());
    }

    // The merged region reuses region a's configuration, table descriptor,
    // log and table directory.
    Configuration conf = a.getConf();
    HTableDescriptor tabledesc = a.getTableDesc();
    HLog log = a.getLog();
    Path tableDir = a.getTableDir();
    // Presume both are of same region type -- i.e. both user or catalog
    // table regions.  This way can use comparator.
    // Start key of the merged region: empty if either source start key is
    // empty, otherwise the smaller of the two start keys.
    final byte[] startKey =
      (a.comparator.matchingRows(a.getStartKey(), 0, a.getStartKey().length,
           HConstants.EMPTY_BYTE_ARRAY, 0, HConstants.EMPTY_BYTE_ARRAY.length)
       || b.comparator.matchingRows(b.getStartKey(), 0,
              b.getStartKey().length, HConstants.EMPTY_BYTE_ARRAY, 0,
              HConstants.EMPTY_BYTE_ARRAY.length))
      ? HConstants.EMPTY_BYTE_ARRAY
      : (a.comparator.compareRows(a.getStartKey(), 0, a.getStartKey().length,
             b.getStartKey(), 0, b.getStartKey().length) <= 0
         ? a.getStartKey()
         : b.getStartKey());
    // End key of the merged region: empty if either source end key is empty,
    // otherwise the larger of the two end keys.
    final byte[] endKey =
      (a.comparator.matchingRows(a.getEndKey(), 0, a.getEndKey().length,
           HConstants.EMPTY_BYTE_ARRAY, 0, HConstants.EMPTY_BYTE_ARRAY.length)
       || a.comparator.matchingRows(b.getEndKey(), 0, b.getEndKey().length,
              HConstants.EMPTY_BYTE_ARRAY, 0,
              HConstants.EMPTY_BYTE_ARRAY.length))
      ? HConstants.EMPTY_BYTE_ARRAY
      : (a.comparator.compareRows(a.getEndKey(), 0, a.getEndKey().length,
             b.getEndKey(), 0, b.getEndKey().length) <= 0
         ? b.getEndKey()
         : a.getEndKey());

    HRegionInfo newRegionInfo = new HRegionInfo(tabledesc, startKey, endKey);
    LOG.info("Creating new region " + newRegionInfo.toString());
    String encodedName = newRegionInfo.getEncodedName();
    Path newRegionDir = HRegion.getRegionDir(a.getTableDir(), encodedName);
    // Refuse to merge into a region directory that is already present.
    if(fs.exists(newRegionDir)) {
      throw new IOException("Cannot merge; target file collision at " +
          newRegionDir);
    }
    fs.mkdirs(newRegionDir);

    LOG.info("starting merge of regions: " + a + " and " + b +
      " into new region " + newRegionInfo.toString() +
        " with start key <" + Bytes.toString(startKey) + "> and end key <" +
        Bytes.toString(endKey) + ">");

    // Move HStoreFiles under new region directory
    // Closing each source region yields its store files, which are grouped
    // here by column family.
    Map<byte [], List<StoreFile>> byFamily =
      new TreeMap<byte [], List<StoreFile>>(Bytes.BYTES_COMPARATOR);
    byFamily = filesByFamily(byFamily, a.close());
    byFamily = filesByFamily(byFamily, b.close());
    for (Map.Entry<byte [], List<StoreFile>> es : byFamily.entrySet()) {
      byte [] colFamily = es.getKey();
      makeColumnFamilyDirs(fs, tableDir, newRegionInfo, colFamily);
      // Because we compacted the source regions we should have no more than two
      // HStoreFiles per family and there will be no reference store
      List<StoreFile> srcFiles = es.getValue();
      if (srcFiles.size() == 2) {
        long seqA = srcFiles.get(0).getMaxSequenceId();
        long seqB = srcFiles.get(1).getMaxSequenceId();
        if (seqA == seqB) {
          // Can't have same sequenceid since on open of a store, this is what
          // distinguishes the files (see the map of stores how its keyed by
          // sequenceid).
          throw new IOException("Files have same sequenceid: " + seqA);
        }
      }
      // Rename each source file to a unique name in the new region's store
      // directory for this family.
      for (StoreFile hsf: srcFiles) {
        StoreFile.rename(fs, hsf.getPath(),
          StoreFile.getUniqueFile(fs, Store.getStoreHomedir(tableDir,
            newRegionInfo.getEncodedName(), colFamily)));
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Files for new region");
      listPaths(fs, newRegionDir);
    }
    HRegion dstRegion = HRegion.newHRegion(tableDir, log, fs, conf, newRegionInfo, null);
    dstRegion.initialize();
    dstRegion.compactStores();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Files for new region");
      listPaths(fs, dstRegion.getRegionDir());
    }
    // The source directories are removed only after the merged region has
    // been initialized and compacted.
    deleteRegion(fs, a.getRegionDir());
    deleteRegion(fs, b.getRegionDir());

    LOG.info("merge completed. New region is " + dstRegion);

    return dstRegion;
  }
2901 
2902   /*
2903    * Fills a map with a vector of store files keyed by column family.
2904    * @param byFamily Map to fill.
2905    * @param storeFiles Store files to process.
2906    * @param family
2907    * @return Returns <code>byFamily</code>
2908    */
2909   private static Map<byte [], List<StoreFile>> filesByFamily(
2910       Map<byte [], List<StoreFile>> byFamily, List<StoreFile> storeFiles) {
2911     for (StoreFile src: storeFiles) {
2912       byte [] family = src.getFamily();
2913       List<StoreFile> v = byFamily.get(family);
2914       if (v == null) {
2915         v = new ArrayList<StoreFile>();
2916         byFamily.put(family, v);
2917       }
2918       v.add(src);
2919     }
2920     return byFamily;
2921   }
2922 
2923   /**
2924    * @return True if needs a mojor compaction.
2925    * @throws IOException
2926    */
2927   boolean isMajorCompaction() throws IOException {
2928     for (Store store: this.stores.values()) {
2929       if (store.isMajorCompaction()) {
2930         return true;
2931       }
2932     }
2933     return false;
2934   }
2935 
2936   /*
2937    * List the files under the specified directory
2938    *
2939    * @param fs
2940    * @param dir
2941    * @throws IOException
2942    */
2943   private static void listPaths(FileSystem fs, Path dir) throws IOException {
2944     if (LOG.isDebugEnabled()) {
2945       FileStatus[] stats = fs.listStatus(dir);
2946       if (stats == null || stats.length == 0) {
2947         return;
2948       }
2949       for (int i = 0; i < stats.length; i++) {
2950         String path = stats[i].getPath().toString();
2951         if (stats[i].isDir()) {
2952           LOG.debug("d " + path);
2953           listPaths(fs, stats[i].getPath());
2954         } else {
2955           LOG.debug("f " + path + " size=" + stats[i].getLen());
2956         }
2957       }
2958     }
2959   }
2960 
2961 
2962   //
2963   // HBASE-880
2964   //
2965   /**
2966    * @param get get object
2967    * @param lockid existing lock id, or null for no previous lock
2968    * @return result
2969    * @throws IOException read exceptions
2970    */
2971   public Result get(final Get get, final Integer lockid) throws IOException {
2972     // Verify families are all valid
2973     if (get.hasFamilies()) {
2974       for (byte [] family: get.familySet()) {
2975         checkFamily(family);
2976       }
2977     } else { // Adding all families to scanner
2978       for (byte[] family: regionInfo.getTableDesc().getFamiliesKeys()) {
2979         get.addFamily(family);
2980       }
2981     }
2982     List<KeyValue> result = get(get);
2983 
2984     return new Result(result);
2985   }
2986 
2987   /*
2988    * Do a get based on the get parameter.
2989    */
2990   private List<KeyValue> get(final Get get) throws IOException {
2991     Scan scan = new Scan(get);
2992 
2993     List<KeyValue> results = new ArrayList<KeyValue>();
2994 
2995     InternalScanner scanner = null;
2996     try {
2997       scanner = getScanner(scan);
2998       scanner.next(results);
2999     } finally {
3000       if (scanner != null)
3001         scanner.close();
3002     }
3003     return results;
3004   }
3005 
  /**
   * Adds <code>amount</code> to the long stored in the given cell and stores
   * the sum. The read, the optional WAL append and the store update all run
   * while holding the row lock. When no existing value is found the result
   * is <code>amount</code> itself.
   *
   * @param row row of the cell to increment
   * @param family column family of the cell
   * @param qualifier column qualifier of the cell
   * @param amount value to add to the current value
   * @param writeToWAL when true, the new KeyValue is appended to the log
   * before the store is updated
   * @return The new value.
   * @throws IOException
   */
  public long incrementColumnValue(byte [] row, byte [] family,
      byte [] qualifier, long amount, boolean writeToWAL)
  throws IOException {
    checkRow(row);
    boolean flush = false;
    // Lock row
    Integer lid = obtainRowLock(row);
    long result = amount;
    try {
      // NOTE(review): no checkFamily(family) call is made here, so 'store' may
      // be null for an unknown family -- confirm callers validate it.
      Store store = stores.get(family);

      // Get the old value:
      Get get = new Get(row);
      get.addColumn(family, qualifier);

      List<KeyValue> results = get(get);

      if (!results.isEmpty()) {
        // The first returned KeyValue's value is decoded as an 8-byte long.
        KeyValue kv = results.get(0);
        byte [] buffer = kv.getBuffer();
        int valueOffset = kv.getValueOffset();
        result += Bytes.toLong(buffer, valueOffset, Bytes.SIZEOF_LONG);
      }

      // build the KeyValue now:
      KeyValue newKv = new KeyValue(row, family,
          qualifier, EnvironmentEdgeManager.currentTimeMillis(),
          Bytes.toBytes(result));

      // now log it:
      if (writeToWAL) {
        long now = EnvironmentEdgeManager.currentTimeMillis();
        WALEdit walEdit = new WALEdit();
        walEdit.add(newKv);
        this.log.append(regionInfo, regionInfo.getTableDesc().getName(),
          walEdit, now);
      }

      // Now request the ICV to the store, this will set the timestamp
      // appropriately depending on if there is a value in memcache or not.
      // The returned long is added to this region's memstore size counter.
      long size = store.updateColumnValue(row, family, qualifier, result);

      size = this.memstoreSize.addAndGet(size);
      flush = isFlushSize(size);
    } finally {
      releaseRowLock(lid);
    }

    if (flush) {
      // Request a cache flush.  Do it outside update lock.
      requestFlush();
    }

    return result;
  }
3072 
3073 
3074   //
3075   // New HBASE-880 Helpers
3076   //
3077 
3078   private void checkFamily(final byte [] family)
3079   throws NoSuchColumnFamilyException {
3080     if(!regionInfo.getTableDesc().hasFamily(family)) {
3081       throw new NoSuchColumnFamilyException("Column family " +
3082           Bytes.toString(family) + " does not exist in region " + this
3083             + " in table " + regionInfo.getTableDesc());
3084     }
3085   }
3086 
  /**
   * Aligned size estimate built from four longs, one boolean, twenty
   * references, one {@link ClassSize#OBJECT} and one int.
   * NOTE(review): presumably these counts mirror the instance fields declared
   * on this class -- confirm, and keep them in step when fields change.
   */
  public static final long FIXED_OVERHEAD = ClassSize.align(
      (4 * Bytes.SIZEOF_LONG) + Bytes.SIZEOF_BOOLEAN +
      (20 * ClassSize.REFERENCE) + ClassSize.OBJECT + Bytes.SIZEOF_INT);

  /**
   * {@link #FIXED_OVERHEAD} plus fixed-size estimates for further objects:
   * an object, two atomic booleans, an atomic long, an atomic integer, two
   * tree maps, a concurrent skip-list map with one entry, an aligned object
   * holding five booleans, and three reentrant locks. This is the base value
   * that {@link #heapSize()} adds the per-store sizes to.
   * NOTE(review): which fields each term stands for is not visible here --
   * verify against the field declarations.
   */
  public static final long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
      ClassSize.OBJECT + (2 * ClassSize.ATOMIC_BOOLEAN) +
      ClassSize.ATOMIC_LONG + ClassSize.ATOMIC_INTEGER +

      // Using TreeMap for TreeSet
      ClassSize.TREEMAP +

      // Using TreeMap for HashMap
      ClassSize.TREEMAP +

      ClassSize.CONCURRENT_SKIPLISTMAP + ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY +
      ClassSize.align(ClassSize.OBJECT +
        (5 * Bytes.SIZEOF_BOOLEAN)) +
        (3 * ClassSize.REENTRANT_LOCK));
3105 
3106   public long heapSize() {
3107     long heapSize = DEEP_OVERHEAD;
3108     for(Store store : this.stores.values()) {
3109       heapSize += store.heapSize();
3110     }
3111     return heapSize;
3112   }
3113 
3114   /*
3115    * This method calls System.exit.
3116    * @param message Message to print out.  May be null.
3117    */
3118   private static void printUsageAndExit(final String message) {
3119     if (message != null && message.length() > 0) System.out.println(message);
3120     System.out.println("Usage: HRegion CATLALOG_TABLE_DIR [major_compact]");
3121     System.out.println("Options:");
3122     System.out.println(" major_compact  Pass this option to major compact " +
3123       "passed region.");
3124     System.out.println("Default outputs scan of passed region.");
3125     System.exit(1);
3126   }
3127 
3128   /*
3129    * Process table.
3130    * Do major compaction or list content.
3131    * @param fs
3132    * @param p
3133    * @param log
3134    * @param c
3135    * @param majorCompact
3136    * @throws IOException
3137    */
3138   private static void processTable(final FileSystem fs, final Path p,
3139       final HLog log, final Configuration c,
3140       final boolean majorCompact)
3141   throws IOException {
3142     HRegion region = null;
3143     String rootStr = Bytes.toString(HConstants.ROOT_TABLE_NAME);
3144     String metaStr = Bytes.toString(HConstants.META_TABLE_NAME);
3145     // Currently expects tables have one region only.
3146     if (p.getName().startsWith(rootStr)) {
3147       region = HRegion.newHRegion(p, log, fs, c, HRegionInfo.ROOT_REGIONINFO, null);
3148     } else if (p.getName().startsWith(metaStr)) {
3149       region = HRegion.newHRegion(p, log, fs, c, HRegionInfo.FIRST_META_REGIONINFO,
3150           null);
3151     } else {
3152       throw new IOException("Not a known catalog table: " + p.toString());
3153     }
3154     try {
3155       region.initialize();
3156       if (majorCompact) {
3157         region.compactStores(true);
3158       } else {
3159         // Default behavior
3160         Scan scan = new Scan();
3161         // scan.addFamily(HConstants.CATALOG_FAMILY);
3162         InternalScanner scanner = region.getScanner(scan);
3163         try {
3164           List<KeyValue> kvs = new ArrayList<KeyValue>();
3165           boolean done = false;
3166           do {
3167             kvs.clear();
3168             done = scanner.next(kvs);
3169             if (kvs.size() > 0) LOG.info(kvs);
3170           } while (done);
3171         } finally {
3172           scanner.close();
3173         }
3174         // System.out.println(region.getClosestRowBefore(Bytes.toBytes("GeneratedCSVContent2,E3652782193BC8D66A0BA1629D0FAAAB,9993372036854775807")));
3175       }
3176     } finally {
3177       region.close();
3178     }
3179   }
3180 
3181   /**
3182    * For internal use in forcing splits ahead of file size limit.
3183    * @param b
3184    * @return previous value
3185    */
3186   public boolean shouldSplit(boolean b) {
3187     boolean old = this.splitRequest;
3188     this.splitRequest = b;
3189     return old;
3190   }
3191 
3192   /**
3193    * Checks every store to see if one has too many
3194    * store files
3195    * @return true if any store has too many store files
3196    */
3197   public boolean hasTooManyStoreFiles() {
3198     for(Store store : stores.values()) {
3199       if(store.hasTooManyStoreFiles()) {
3200         return true;
3201       }
3202     }
3203     return false;
3204   }
3205 
3206   /**
3207    * A mocked list implementaion - discards all updates.
3208    */
3209   private static final List<KeyValue> MOCKED_LIST = new AbstractList<KeyValue>() {
3210 
3211     @Override
3212     public void add(int index, KeyValue element) {
3213       // do nothing
3214     }
3215 
3216     @Override
3217     public boolean addAll(int index, Collection<? extends KeyValue> c) {
3218       return false; // this list is never changed as a result of an update
3219     }
3220 
3221     @Override
3222     public KeyValue get(int index) {
3223       throw new UnsupportedOperationException();
3224     }
3225 
3226     @Override
3227     public int size() {
3228       return 0;
3229     }
3230   };
3231 
3232 
3233   /**
3234    * Facility for dumping and compacting catalog tables.
3235    * Only does catalog tables since these are only tables we for sure know
3236    * schema on.  For usage run:
3237    * <pre>
3238    *   ./bin/hbase org.apache.hadoop.hbase.regionserver.HRegion
3239    * </pre>
3240    * @param args
3241    * @throws IOException
3242    */
3243   public static void main(String[] args) throws IOException {
3244     if (args.length < 1) {
3245       printUsageAndExit(null);
3246     }
3247     boolean majorCompact = false;
3248     if (args.length > 1) {
3249       if (!args[1].toLowerCase().startsWith("major")) {
3250         printUsageAndExit("ERROR: Unrecognized option <" + args[1] + ">");
3251       }
3252       majorCompact = true;
3253     }
3254     final Path tableDir = new Path(args[0]);
3255     final Configuration c = HBaseConfiguration.create();
3256     final FileSystem fs = FileSystem.get(c);
3257     final Path logdir = new Path(c.get("hbase.tmp.dir"),
3258         "hlog" + tableDir.getName()
3259         + EnvironmentEdgeManager.currentTimeMillis());
3260     final Path oldLogDir = new Path(c.get("hbase.tmp.dir"),
3261         HConstants.HREGION_OLDLOGDIR_NAME);
3262     final HLog log = new HLog(fs, logdir, oldLogDir, c, null);
3263     try {
3264       processTable(fs, tableDir, log, c, majorCompact);
3265      } finally {
3266        log.close();
3267        BlockCache bc = StoreFile.getBlockCache(c);
3268        if (bc != null) bc.shutdown();
3269      }
3270   }
3271 }