View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.lang.management.ManagementFactory;
23  import java.lang.management.RuntimeMXBean;
24  import java.util.ArrayList;
25  import java.util.Collections;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.NavigableSet;
29  import java.util.SortedSet;
30  import java.util.concurrent.atomic.AtomicLong;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.hbase.classification.InterfaceAudience;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.hbase.Cell;
37  import org.apache.hadoop.hbase.CellUtil;
38  import org.apache.hadoop.hbase.HBaseConfiguration;
39  import org.apache.hadoop.hbase.HConstants;
40  import org.apache.hadoop.hbase.KeyValue;
41  import org.apache.hadoop.hbase.KeyValueUtil;
42  import org.apache.hadoop.hbase.client.Scan;
43  import org.apache.hadoop.hbase.util.ByteRange;
44  import org.apache.hadoop.hbase.util.Bytes;
45  import org.apache.hadoop.hbase.util.ClassSize;
46  import org.apache.hadoop.hbase.util.CollectionBackedScanner;
47  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
48  import org.apache.hadoop.hbase.util.Pair;
49  import org.apache.hadoop.hbase.util.ReflectionUtils;
50  
51  /**
52   * The MemStore holds in-memory modifications to the Store.  Modifications
53   * are {@link Cell}s.  When asked to flush, current memstore is moved
54   * to snapshot and is cleared.  We continue to serve edits out of new memstore
55   * and backing snapshot until flusher reports in that the flush succeeded. At
56   * this point we let the snapshot go.
57   *  <p>
58   * The MemStore functions should not be called in parallel. Callers should hold
59   *  write and read locks. This is done in {@link HStore}.
60   *  </p>
61   *
62   * TODO: Adjust size of the memstore when we remove items because they have
63   * been deleted.
64   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
65   * in KV size.
66   */
67  @InterfaceAudience.Private
68  public class DefaultMemStore implements MemStore {
69    private static final Log LOG = LogFactory.getLog(DefaultMemStore.class);
70    static final String USEMSLAB_KEY = "hbase.hregion.memstore.mslab.enabled";
71    private static final boolean USEMSLAB_DEFAULT = true;
72    static final String MSLAB_CLASS_NAME = "hbase.regionserver.mslab.class";
73  
74    private Configuration conf;
75  
76    // MemStore.  Use a CellSkipListSet rather than SkipListSet because of the
77    // better semantics.  The Map will overwrite if passed a key it already had
78    // whereas the Set will not add new Cell if key is same though value might be
79    // different.  Value is not important -- just make sure always same
80    // reference passed.
81    volatile CellSkipListSet cellSet;
82  
83    // Snapshot of memstore.  Made for flusher.
84    volatile CellSkipListSet snapshot;
85  
86    final KeyValue.KVComparator comparator;
87  
88    // Used to track own heapSize
89    final AtomicLong size;
90    private volatile long snapshotSize;
91  
92    // Used to track when to flush
93    volatile long timeOfOldestEdit = Long.MAX_VALUE;
94  
95    TimeRangeTracker timeRangeTracker;
96    TimeRangeTracker snapshotTimeRangeTracker;
97  
98    volatile MemStoreLAB allocator;
99    volatile MemStoreLAB snapshotAllocator;
100   volatile long snapshotId;
101 
102   /**
103    * Default constructor. Used for tests.
104    */
105   public DefaultMemStore() {
106     this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
107   }
108 
109   /**
110    * Constructor.
111    * @param c Comparator
112    */
113   public DefaultMemStore(final Configuration conf,
114                   final KeyValue.KVComparator c) {
115     this.conf = conf;
116     this.comparator = c;
117     this.cellSet = new CellSkipListSet(c);
118     this.snapshot = new CellSkipListSet(c);
119     timeRangeTracker = new TimeRangeTracker();
120     snapshotTimeRangeTracker = new TimeRangeTracker();
121     this.size = new AtomicLong(DEEP_OVERHEAD);
122     this.snapshotSize = 0;
123     if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
124       String className = conf.get(MSLAB_CLASS_NAME, HeapMemStoreLAB.class.getName());
125       this.allocator = ReflectionUtils.instantiateWithCustomCtor(className,
126           new Class[] { Configuration.class }, new Object[] { conf });
127     } else {
128       this.allocator = null;
129     }
130   }
131 
132   void dump() {
133     for (Cell cell: this.cellSet) {
134       LOG.info(cell);
135     }
136     for (Cell cell: this.snapshot) {
137       LOG.info(cell);
138     }
139   }
140 
141   /**
142    * Creates a snapshot of the current memstore.
143    * Snapshot must be cleared by call to {@link #clearSnapshot(long)}
144    */
145   @Override
146   public MemStoreSnapshot snapshot() {
147     // If snapshot currently has entries, then flusher failed or didn't call
148     // cleanup.  Log a warning.
149     if (!this.snapshot.isEmpty()) {
150       LOG.warn("Snapshot called again without clearing previous. " +
151           "Doing nothing. Another ongoing flush or did we fail last attempt?");
152     } else {
153       this.snapshotId = EnvironmentEdgeManager.currentTime();
154       this.snapshotSize = keySize();
155       if (!this.cellSet.isEmpty()) {
156         this.snapshot = this.cellSet;
157         this.cellSet = new CellSkipListSet(this.comparator);
158         this.snapshotTimeRangeTracker = this.timeRangeTracker;
159         this.timeRangeTracker = new TimeRangeTracker();
160         // Reset heap to not include any keys
161         this.size.set(DEEP_OVERHEAD);
162         this.snapshotAllocator = this.allocator;
163         // Reset allocator so we get a fresh buffer for the new memstore
164         if (allocator != null) {
165           String className = conf.get(MSLAB_CLASS_NAME, HeapMemStoreLAB.class.getName());
166           this.allocator = ReflectionUtils.instantiateWithCustomCtor(className,
167               new Class[] { Configuration.class }, new Object[] { conf });
168         }
169         timeOfOldestEdit = Long.MAX_VALUE;
170       }
171     }
172     return new MemStoreSnapshot(this.snapshotId, snapshot.size(), this.snapshotSize,
173         this.snapshotTimeRangeTracker, new CollectionBackedScanner(snapshot, this.comparator));
174   }
175 
176   /**
177    * The passed snapshot was successfully persisted; it can be let go.
178    * @param id Id of the snapshot to clean out.
179    * @throws UnexpectedStateException
180    * @see #snapshot()
181    */
182   @Override
183   public void clearSnapshot(long id) throws UnexpectedStateException {
184     MemStoreLAB tmpAllocator = null;
185     if (this.snapshotId != id) {
186       throw new UnexpectedStateException("Current snapshot id is " + this.snapshotId + ",passed "
187           + id);
188     }
189     // OK. Passed in snapshot is same as current snapshot. If not-empty,
190     // create a new snapshot and let the old one go.
191     if (!this.snapshot.isEmpty()) {
192       this.snapshot = new CellSkipListSet(this.comparator);
193       this.snapshotTimeRangeTracker = new TimeRangeTracker();
194     }
195     this.snapshotSize = 0;
196     this.snapshotId = -1;
197     if (this.snapshotAllocator != null) {
198       tmpAllocator = this.snapshotAllocator;
199       this.snapshotAllocator = null;
200     }
201     if (tmpAllocator != null) {
202       tmpAllocator.close();
203     }
204   }
205 
206   @Override
207   public long getFlushableSize() {
208     return this.snapshotSize > 0 ? this.snapshotSize : keySize();
209   }
210 
211   /**
212    * Write an update
213    * @param cell
214    * @return approximate size of the passed KV & newly added KV which maybe different than the
215    *         passed-in KV
216    */
217   @Override
218   public Pair<Long, Cell> add(Cell cell) {
219     Cell toAdd = maybeCloneWithAllocator(cell);
220     return new Pair<Long, Cell>(internalAdd(toAdd), toAdd);
221   }
222 
223   @Override
224   public long timeOfOldestEdit() {
225     return timeOfOldestEdit;
226   }
227 
228   private boolean addToCellSet(Cell e) {
229     boolean b = this.cellSet.add(e);
230     setOldestEditTimeToNow();
231     return b;
232   }
233 
234   private boolean removeFromCellSet(Cell e) {
235     boolean b = this.cellSet.remove(e);
236     setOldestEditTimeToNow();
237     return b;
238   }
239 
240   void setOldestEditTimeToNow() {
241     if (timeOfOldestEdit == Long.MAX_VALUE) {
242       timeOfOldestEdit = EnvironmentEdgeManager.currentTime();
243     }
244   }
245 
246   /**
247    * Internal version of add() that doesn't clone Cells with the
248    * allocator, and doesn't take the lock.
249    *
250    * Callers should ensure they already have the read lock taken
251    */
252   private long internalAdd(final Cell toAdd) {
253     long s = heapSizeChange(toAdd, addToCellSet(toAdd));
254     timeRangeTracker.includeTimestamp(toAdd);
255     this.size.addAndGet(s);
256     return s;
257   }
258 
259   private Cell maybeCloneWithAllocator(Cell cell) {
260     if (allocator == null) {
261       return cell;
262     }
263 
264     int len = KeyValueUtil.length(cell);
265     ByteRange alloc = allocator.allocateBytes(len);
266     if (alloc == null) {
267       // The allocation was too large, allocator decided
268       // not to do anything with it.
269       return cell;
270     }
271     assert alloc.getBytes() != null;
272     KeyValueUtil.appendToByteArray(cell, alloc.getBytes(), alloc.getOffset());
273     KeyValue newKv = new KeyValue(alloc.getBytes(), alloc.getOffset(), len);
274     newKv.setSequenceId(cell.getSequenceId());
275     return newKv;
276   }
277 
278   /**
279    * Remove n key from the memstore. Only cells that have the same key and the
280    * same memstoreTS are removed.  It is ok to not update timeRangeTracker
281    * in this call. It is possible that we can optimize this method by using
282    * tailMap/iterator, but since this method is called rarely (only for
283    * error recovery), we can leave those optimization for the future.
284    * @param cell
285    */
286   @Override
287   public void rollback(Cell cell) {
288     // If the key is in the snapshot, delete it. We should not update
289     // this.size, because that tracks the size of only the memstore and
290     // not the snapshot. The flush of this snapshot to disk has not
291     // yet started because Store.flush() waits for all rwcc transactions to
292     // commit before starting the flush to disk.
293     Cell found = this.snapshot.get(cell);
294     if (found != null && found.getSequenceId() == cell.getSequenceId()) {
295       this.snapshot.remove(cell);
296       long sz = heapSizeChange(cell, true);
297       this.snapshotSize -= sz;
298     }
299     // If the key is in the memstore, delete it. Update this.size.
300     found = this.cellSet.get(cell);
301     if (found != null && found.getSequenceId() == cell.getSequenceId()) {
302       removeFromCellSet(cell);
303       long s = heapSizeChange(cell, true);
304       this.size.addAndGet(-s);
305     }
306   }
307 
308   /**
309    * Write a delete
310    * @param deleteCell
311    * @return approximate size of the passed key and value.
312    */
313   @Override
314   public long delete(Cell deleteCell) {
315     long s = 0;
316     Cell toAdd = maybeCloneWithAllocator(deleteCell);
317     s += heapSizeChange(toAdd, addToCellSet(toAdd));
318     timeRangeTracker.includeTimestamp(toAdd);
319     this.size.addAndGet(s);
320     return s;
321   }
322 
323   /**
324    * @param cell Find the row that comes after this one.  If null, we return the
325    * first.
326    * @return Next row or null if none found.
327    */
328   Cell getNextRow(final Cell cell) {
329     return getLowest(getNextRow(cell, this.cellSet), getNextRow(cell, this.snapshot));
330   }
331 
332   /*
333    * @param a
334    * @param b
335    * @return Return lowest of a or b or null if both a and b are null
336    */
337   private Cell getLowest(final Cell a, final Cell b) {
338     if (a == null) {
339       return b;
340     }
341     if (b == null) {
342       return a;
343     }
344     return comparator.compareRows(a, b) <= 0? a: b;
345   }
346 
347   /*
348    * @param key Find row that follows this one.  If null, return first.
349    * @param map Set to look in for a row beyond <code>row</code>.
350    * @return Next row or null if none found.  If one found, will be a new
351    * KeyValue -- can be destroyed by subsequent calls to this method.
352    */
353   private Cell getNextRow(final Cell key,
354       final NavigableSet<Cell> set) {
355     Cell result = null;
356     SortedSet<Cell> tail = key == null? set: set.tailSet(key);
357     // Iterate until we fall into the next row; i.e. move off current row
358     for (Cell cell: tail) {
359       if (comparator.compareRows(cell, key) <= 0)
360         continue;
361       // Note: Not suppressing deletes or expired cells.  Needs to be handled
362       // by higher up functions.
363       result = cell;
364       break;
365     }
366     return result;
367   }
368 
369   /**
370    * @param state column/delete tracking state
371    */
372   @Override
373   public void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
374     getRowKeyAtOrBefore(cellSet, state);
375     getRowKeyAtOrBefore(snapshot, state);
376   }
377 
378   /*
379    * @param set
380    * @param state Accumulates deletes and candidates.
381    */
382   private void getRowKeyAtOrBefore(final NavigableSet<Cell> set,
383       final GetClosestRowBeforeTracker state) {
384     if (set.isEmpty()) {
385       return;
386     }
387     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
388       // Found nothing in row.  Try backing up.
389       getRowKeyBefore(set, state);
390     }
391   }
392 
393   /*
394    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
395    * we have been passed the first possible key on a row.  As we walk forward
396    * we accumulate deletes until we hit a candidate on the row at which point
397    * we return.
398    * @param set
399    * @param firstOnRow First possible key on this row.
400    * @param state
401    * @return True if we found a candidate walking this row.
402    */
403   private boolean walkForwardInSingleRow(final SortedSet<Cell> set,
404       final Cell firstOnRow, final GetClosestRowBeforeTracker state) {
405     boolean foundCandidate = false;
406     SortedSet<Cell> tail = set.tailSet(firstOnRow);
407     if (tail.isEmpty()) return foundCandidate;
408     for (Iterator<Cell> i = tail.iterator(); i.hasNext();) {
409       Cell kv = i.next();
410       // Did we go beyond the target row? If so break.
411       if (state.isTooFar(kv, firstOnRow)) break;
412       if (state.isExpired(kv)) {
413         i.remove();
414         continue;
415       }
416       // If we added something, this row is a contender. break.
417       if (state.handle(kv)) {
418         foundCandidate = true;
419         break;
420       }
421     }
422     return foundCandidate;
423   }
424 
425   /*
426    * Walk backwards through the passed set a row at a time until we run out of
427    * set or until we get a candidate.
428    * @param set
429    * @param state
430    */
431   private void getRowKeyBefore(NavigableSet<Cell> set,
432       final GetClosestRowBeforeTracker state) {
433     Cell firstOnRow = state.getTargetKey();
434     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
435         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
436       // Make sure we don't fall out of our table.
437       if (!state.isTargetTable(p.cell)) break;
438       // Stop looking if we've exited the better candidate range.
439       if (!state.isBetterCandidate(p.cell)) break;
440       // Make into firstOnRow
441       firstOnRow = new KeyValue(p.cell.getRowArray(), p.cell.getRowOffset(), p.cell.getRowLength(),
442           HConstants.LATEST_TIMESTAMP);
443       // If we find something, break;
444       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
445     }
446   }
447 
448   /**
449    * Only used by tests. TODO: Remove
450    *
451    * Given the specs of a column, update it, first by inserting a new record,
452    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
453    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
454    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
455    * get the new value, or the old value and all readers will eventually only see the new
456    * value after the old was removed.
457    *
458    * @param row
459    * @param family
460    * @param qualifier
461    * @param newValue
462    * @param now
463    * @return  Timestamp
464    */
465   public long updateColumnValue(byte[] row,
466                                 byte[] family,
467                                 byte[] qualifier,
468                                 long newValue,
469                                 long now) {
470     Cell firstCell = KeyValueUtil.createFirstOnRow(row, family, qualifier);
471     // Is there a Cell in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
472     SortedSet<Cell> snSs = snapshot.tailSet(firstCell);
473     if (!snSs.isEmpty()) {
474       Cell snc = snSs.first();
475       // is there a matching Cell in the snapshot?
476       if (CellUtil.matchingRow(snc, firstCell) && CellUtil.matchingQualifier(snc, firstCell)) {
477         if (snc.getTimestamp() == now) {
478           // poop,
479           now += 1;
480         }
481       }
482     }
483 
484     // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
485     // But the timestamp should also be max(now, mostRecentTsInMemstore)
486 
487     // so we cant add the new Cell w/o knowing what's there already, but we also
488     // want to take this chance to delete some cells. So two loops (sad)
489 
490     SortedSet<Cell> ss = cellSet.tailSet(firstCell);
491     for (Cell cell : ss) {
492       // if this isnt the row we are interested in, then bail:
493       if (!CellUtil.matchingColumn(cell, family, qualifier)
494           || !CellUtil.matchingRow(cell, firstCell)) {
495         break; // rows dont match, bail.
496       }
497 
498       // if the qualifier matches and it's a put, just RM it out of the cellSet.
499       if (cell.getTypeByte() == KeyValue.Type.Put.getCode() &&
500           cell.getTimestamp() > now && CellUtil.matchingQualifier(firstCell, cell)) {
501         now = cell.getTimestamp();
502       }
503     }
504 
505     // create or update (upsert) a new Cell with
506     // 'now' and a 0 memstoreTS == immediately visible
507     List<Cell> cells = new ArrayList<Cell>(1);
508     cells.add(new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)));
509     return upsert(cells, 1L);
510   }
511 
512   /**
513    * Update or insert the specified KeyValues.
514    * <p>
515    * For each KeyValue, insert into MemStore.  This will atomically upsert the
516    * value for that row/family/qualifier.  If a KeyValue did already exist,
517    * it will then be removed.
518    * <p>
519    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
520    * be immediately visible.  May want to change this so it is atomic across
521    * all KeyValues.
522    * <p>
523    * This is called under row lock, so Get operations will still see updates
524    * atomically.  Scans will only see each KeyValue update as atomic.
525    *
526    * @param cells
527    * @param readpoint readpoint below which we can safely remove duplicate KVs 
528    * @return change in memstore size
529    */
530   @Override
531   public long upsert(Iterable<Cell> cells, long readpoint) {
532     long size = 0;
533     for (Cell cell : cells) {
534       size += upsert(cell, readpoint);
535     }
536     return size;
537   }
538 
539   /**
540    * Inserts the specified KeyValue into MemStore and deletes any existing
541    * versions of the same row/family/qualifier as the specified KeyValue.
542    * <p>
543    * First, the specified KeyValue is inserted into the Memstore.
544    * <p>
545    * If there are any existing KeyValues in this MemStore with the same row,
546    * family, and qualifier, they are removed.
547    * <p>
548    * Callers must hold the read lock.
549    *
550    * @param cell
551    * @return change in size of MemStore
552    */
553   private long upsert(Cell cell, long readpoint) {
554     // Add the Cell to the MemStore
555     // Use the internalAdd method here since we (a) already have a lock
556     // and (b) cannot safely use the MSLAB here without potentially
557     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
558     // test that triggers the pathological case if we don't avoid MSLAB
559     // here.
560     long addedSize = internalAdd(cell);
561 
562     // Get the Cells for the row/family/qualifier regardless of timestamp.
563     // For this case we want to clean up any other puts
564     Cell firstCell = KeyValueUtil.createFirstOnRow(
565         cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
566         cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
567         cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
568     SortedSet<Cell> ss = cellSet.tailSet(firstCell);
569     Iterator<Cell> it = ss.iterator();
570     // versions visible to oldest scanner
571     int versionsVisible = 0;
572     while ( it.hasNext() ) {
573       Cell cur = it.next();
574 
575       if (cell == cur) {
576         // ignore the one just put in
577         continue;
578       }
579       // check that this is the row and column we are interested in, otherwise bail
580       if (CellUtil.matchingRow(cell, cur) && CellUtil.matchingQualifier(cell, cur)) {
581         // only remove Puts that concurrent scanners cannot possibly see
582         if (cur.getTypeByte() == KeyValue.Type.Put.getCode() &&
583             cur.getSequenceId() <= readpoint) {
584           if (versionsVisible > 1) {
585             // if we get here we have seen at least one version visible to the oldest scanner,
586             // which means we can prove that no scanner will see this version
587 
588             // false means there was a change, so give us the size.
589             long delta = heapSizeChange(cur, true);
590             addedSize -= delta;
591             this.size.addAndGet(-delta);
592             it.remove();
593             setOldestEditTimeToNow();
594           } else {
595             versionsVisible++;
596           }
597         }
598       } else {
599         // past the row or column, done
600         break;
601       }
602     }
603     return addedSize;
604   }
605 
606   /*
607    * Immutable data structure to hold member found in set and the set it was
608    * found in. Include set because it is carrying context.
609    */
610   private static class Member {
611     final Cell cell;
612     final NavigableSet<Cell> set;
613     Member(final NavigableSet<Cell> s, final Cell kv) {
614       this.cell = kv;
615       this.set = s;
616     }
617   }
618 
619   /*
620    * @param set Set to walk back in.  Pass a first in row or we'll return
621    * same row (loop).
622    * @param state Utility and context.
623    * @param firstOnRow First item on the row after the one we want to find a
624    * member in.
625    * @return Null or member of row previous to <code>firstOnRow</code>
626    */
627   private Member memberOfPreviousRow(NavigableSet<Cell> set,
628       final GetClosestRowBeforeTracker state, final Cell firstOnRow) {
629     NavigableSet<Cell> head = set.headSet(firstOnRow, false);
630     if (head.isEmpty()) return null;
631     for (Iterator<Cell> i = head.descendingIterator(); i.hasNext();) {
632       Cell found = i.next();
633       if (state.isExpired(found)) {
634         i.remove();
635         continue;
636       }
637       return new Member(head, found);
638     }
639     return null;
640   }
641 
642   /**
643    * @return scanner on memstore and snapshot in this order.
644    */
645   @Override
646   public List<KeyValueScanner> getScanners(long readPt) {
647     return Collections.<KeyValueScanner> singletonList(new MemStoreScanner(readPt));
648   }
649 
650   /**
651    * Check if this memstore may contain the required keys
652    * @param scan
653    * @return False if the key definitely does not exist in this Memstore
654    */
655   public boolean shouldSeek(Scan scan, long oldestUnexpiredTS) {
656     return (timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
657         snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange()))
658         && (Math.max(timeRangeTracker.getMaximumTimestamp(),
659                      snapshotTimeRangeTracker.getMaximumTimestamp()) >=
660             oldestUnexpiredTS);
661   }
662 
663   /*
664    * MemStoreScanner implements the KeyValueScanner.
665    * It lets the caller scan the contents of a memstore -- both current
666    * map and snapshot.
667    * This behaves as if it were a real scanner but does not maintain position.
668    */
669   protected class MemStoreScanner extends NonLazyKeyValueScanner {
670     // Next row information for either cellSet or snapshot
671     private Cell cellSetNextRow = null;
672     private Cell snapshotNextRow = null;
673 
674     // last iterated Cells for cellSet and snapshot (to restore iterator state after reseek)
675     private Cell cellSetItRow = null;
676     private Cell snapshotItRow = null;
677     
678     // iterator based scanning.
679     private Iterator<Cell> cellSetIt;
680     private Iterator<Cell> snapshotIt;
681 
682     // The cellSet and snapshot at the time of creating this scanner
683     private CellSkipListSet cellSetAtCreation;
684     private CellSkipListSet snapshotAtCreation;
685 
686     // the pre-calculated Cell to be returned by peek() or next()
687     private Cell theNext;
688 
689     // The allocator and snapshot allocator at the time of creating this scanner
690     volatile MemStoreLAB allocatorAtCreation;
691     volatile MemStoreLAB snapshotAllocatorAtCreation;
692     
693     // A flag represents whether could stop skipping Cells for MVCC
694     // if have encountered the next row. Only used for reversed scan
695     private boolean stopSkippingCellsIfNextRow = false;
696 
697     private long readPoint;
698 
699     /*
700     Some notes...
701 
702      So memstorescanner is fixed at creation time. this includes pointers/iterators into
703     existing kvset/snapshot.  during a snapshot creation, the kvset is null, and the
704     snapshot is moved.  since kvset is null there is no point on reseeking on both,
705       we can save us the trouble. During the snapshot->hfile transition, the memstore
706       scanner is re-created by StoreScanner#updateReaders().  StoreScanner should
707       potentially do something smarter by adjusting the existing memstore scanner.
708 
709       But there is a greater problem here, that being once a scanner has progressed
710       during a snapshot scenario, we currently iterate past the kvset then 'finish' up.
711       if a scan lasts a little while, there is a chance for new entries in kvset to
712       become available but we will never see them.  This needs to be handled at the
713       StoreScanner level with coordination with MemStoreScanner.
714 
715       Currently, this problem is only partly managed: during the small amount of time
716       when the StoreScanner has not yet created a new MemStoreScanner, we will miss
717       the adds to kvset in the MemStoreScanner.
718     */
719 
720     MemStoreScanner(long readPoint) {
721       super();
722 
723       this.readPoint = readPoint;
724       cellSetAtCreation = cellSet;
725       snapshotAtCreation = snapshot;
726       if (allocator != null) {
727         this.allocatorAtCreation = allocator;
728         this.allocatorAtCreation.incScannerCount();
729       }
730       if (snapshotAllocator != null) {
731         this.snapshotAllocatorAtCreation = snapshotAllocator;
732         this.snapshotAllocatorAtCreation.incScannerCount();
733       }
734     }
735 
736     /**
737      * Lock on 'this' must be held by caller.
738      * @param it
739      * @return Next Cell
740      */
741     private Cell getNext(Iterator<Cell> it) {
742       Cell startCell = theNext;
743       Cell v = null;
744       try {
745         while (it.hasNext()) {
746           v = it.next();
747           if (v.getSequenceId() <= this.readPoint) {
748             return v;
749           }
750           if (stopSkippingCellsIfNextRow && startCell != null
751               && comparator.compareRows(v, startCell) > 0) {
752             return null;
753           }
754         }
755 
756         return null;
757       } finally {
758         if (v != null) {
759           // in all cases, remember the last Cell iterated to
760           if (it == snapshotIt) {
761             snapshotItRow = v;
762           } else {
763             cellSetItRow = v;
764           }
765         }
766       }
767     }
768 
769     /**
770      *  Set the scanner at the seek key.
771      *  Must be called only once: there is no thread safety between the scanner
772      *   and the memStore.
773      * @param key seek value
774      * @return false if the key is null or if there is no data
775      */
776     @Override
777     public synchronized boolean seek(Cell key) {
778       if (key == null) {
779         close();
780         return false;
781       }
782       // kvset and snapshot will never be null.
783       // if tailSet can't find anything, SortedSet is empty (not null).
784       cellSetIt = cellSetAtCreation.tailSet(key).iterator();
785       snapshotIt = snapshotAtCreation.tailSet(key).iterator();
786       cellSetItRow = null;
787       snapshotItRow = null;
788 
789       return seekInSubLists(key);
790     }
791 
792 
793     /**
794      * (Re)initialize the iterators after a seek or a reseek.
795      */
796     private synchronized boolean seekInSubLists(Cell key){
797       cellSetNextRow = getNext(cellSetIt);
798       snapshotNextRow = getNext(snapshotIt);
799 
800       // Calculate the next value
801       theNext = getLowest(cellSetNextRow, snapshotNextRow);
802 
803       // has data
804       return (theNext != null);
805     }
806 
807 
808     /**
809      * Move forward on the sub-lists set previously by seek.
810      * @param key seek value (should be non-null)
811      * @return true if there is at least one KV to read, false otherwise
812      */
813     @Override
814     public synchronized boolean reseek(Cell key) {
815       /*
816       See HBASE-4195 & HBASE-3855 & HBASE-6591 for the background on this implementation.
817       This code is executed concurrently with flush and puts, without locks.
818       Two points must be known when working on this code:
819       1) It's not possible to use the 'kvTail' and 'snapshot'
820        variables, as they are modified during a flush.
821       2) The ideal implementation for performance would use the sub skip list
822        implicitly pointed by the iterators 'kvsetIt' and
823        'snapshotIt'. Unfortunately the Java API does not offer a method to
824        get it. So we remember the last keys we iterated to and restore
825        the reseeked set to at least that point.
826        */
827       cellSetIt = cellSetAtCreation.tailSet(getHighest(key, cellSetItRow)).iterator();
828       snapshotIt = snapshotAtCreation.tailSet(getHighest(key, snapshotItRow)).iterator();
829 
830       return seekInSubLists(key);
831     }
832 
833 
834     @Override
835     public synchronized Cell peek() {
836       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
837       return theNext;
838     }
839 
840     @Override
841     public synchronized Cell next() {
842       if (theNext == null) {
843           return null;
844       }
845 
846       final Cell ret = theNext;
847 
848       // Advance one of the iterators
849       if (theNext == cellSetNextRow) {
850         cellSetNextRow = getNext(cellSetIt);
851       } else {
852         snapshotNextRow = getNext(snapshotIt);
853       }
854 
855       // Calculate the next value
856       theNext = getLowest(cellSetNextRow, snapshotNextRow);
857 
858       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
859       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
860       //    getLowest() + " threadpoint=" + readpoint);
861       return ret;
862     }
863 
864     /*
865      * Returns the lower of the two key values, or null if they are both null.
866      * This uses comparator.compare() to compare the KeyValue using the memstore
867      * comparator.
868      */
869     private Cell getLowest(Cell first, Cell second) {
870       if (first == null && second == null) {
871         return null;
872       }
873       if (first != null && second != null) {
874         int compare = comparator.compare(first, second);
875         return (compare <= 0 ? first : second);
876       }
877       return (first != null ? first : second);
878     }
879 
880     /*
881      * Returns the higher of the two cells, or null if they are both null.
882      * This uses comparator.compare() to compare the Cell using the memstore
883      * comparator.
884      */
885     private Cell getHighest(Cell first, Cell second) {
886       if (first == null && second == null) {
887         return null;
888       }
889       if (first != null && second != null) {
890         int compare = comparator.compare(first, second);
891         return (compare > 0 ? first : second);
892       }
893       return (first != null ? first : second);
894     }
895 
896     public synchronized void close() {
897       this.cellSetNextRow = null;
898       this.snapshotNextRow = null;
899 
900       this.cellSetIt = null;
901       this.snapshotIt = null;
902       
903       if (allocatorAtCreation != null) {
904         this.allocatorAtCreation.decScannerCount();
905         this.allocatorAtCreation = null;
906       }
907       if (snapshotAllocatorAtCreation != null) {
908         this.snapshotAllocatorAtCreation.decScannerCount();
909         this.snapshotAllocatorAtCreation = null;
910       }
911 
912       this.cellSetItRow = null;
913       this.snapshotItRow = null;
914     }
915 
916     /**
917      * MemStoreScanner returns max value as sequence id because it will
918      * always have the latest data among all files.
919      */
920     @Override
921     public long getSequenceID() {
922       return Long.MAX_VALUE;
923     }
924 
925     @Override
926     public boolean shouldUseScanner(Scan scan, SortedSet<byte[]> columns,
927         long oldestUnexpiredTS) {
928       return shouldSeek(scan, oldestUnexpiredTS);
929     }
930 
931     /**
932      * Seek scanner to the given key first. If it returns false(means
933      * peek()==null) or scanner's peek row is bigger than row of given key, seek
934      * the scanner to the previous row of given key
935      */
936     @Override
937     public synchronized boolean backwardSeek(Cell key) {
938       seek(key);
939       if (peek() == null || comparator.compareRows(peek(), key) > 0) {
940         return seekToPreviousRow(key);
941       }
942       return true;
943     }
944 
945     /**
946      * Separately get the KeyValue before the specified key from kvset and
947      * snapshotset, and use the row of higher one as the previous row of
948      * specified key, then seek to the first KeyValue of previous row
949      */
950     @Override
951     public synchronized boolean seekToPreviousRow(Cell key) {
952       Cell firstKeyOnRow = KeyValueUtil.createFirstOnRow(key.getRowArray(), key.getRowOffset(),
953           key.getRowLength());
954       SortedSet<Cell> cellHead = cellSetAtCreation.headSet(firstKeyOnRow);
955       Cell cellSetBeforeRow = cellHead.isEmpty() ? null : cellHead.last();
956       SortedSet<Cell> snapshotHead = snapshotAtCreation
957           .headSet(firstKeyOnRow);
958       Cell snapshotBeforeRow = snapshotHead.isEmpty() ? null : snapshotHead
959           .last();
960       Cell lastCellBeforeRow = getHighest(cellSetBeforeRow, snapshotBeforeRow);
961       if (lastCellBeforeRow == null) {
962         theNext = null;
963         return false;
964       }
965       Cell firstKeyOnPreviousRow = KeyValueUtil.createFirstOnRow(lastCellBeforeRow.getRowArray(),
966           lastCellBeforeRow.getRowOffset(), lastCellBeforeRow.getRowLength());
967       this.stopSkippingCellsIfNextRow = true;
968       seek(firstKeyOnPreviousRow);
969       this.stopSkippingCellsIfNextRow = false;
970       if (peek() == null
971           || comparator.compareRows(peek(), firstKeyOnPreviousRow) > 0) {
972         return seekToPreviousRow(lastCellBeforeRow);
973       }
974       return true;
975     }
976 
977     @Override
978     public synchronized boolean seekToLastRow() {
979       Cell first = cellSetAtCreation.isEmpty() ? null : cellSetAtCreation
980           .last();
981       Cell second = snapshotAtCreation.isEmpty() ? null
982           : snapshotAtCreation.last();
983       Cell higherCell = getHighest(first, second);
984       if (higherCell == null) {
985         return false;
986       }
987       Cell firstCellOnLastRow = KeyValueUtil.createFirstOnRow(higherCell.getRowArray(),
988           higherCell.getRowOffset(), higherCell.getRowLength());
989       if (seek(firstCellOnLastRow)) {
990         return true;
991       } else {
992         return seekToPreviousRow(higherCell);
993       }
994 
995     }
996   }
997 
998   public final static long FIXED_OVERHEAD = ClassSize.align(
999       ClassSize.OBJECT + (9 * ClassSize.REFERENCE) + (3 * Bytes.SIZEOF_LONG));
1000 
1001   public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
1002       ClassSize.ATOMIC_LONG + (2 * ClassSize.TIMERANGE_TRACKER) +
1003       (2 * ClassSize.CELL_SKIPLIST_SET) + (2 * ClassSize.CONCURRENT_SKIPLISTMAP));
1004 
1005   /*
1006    * Calculate how the MemStore size has changed.  Includes overhead of the
1007    * backing Map.
1008    * @param cell
1009    * @param notpresent True if the cell was NOT present in the set.
1010    * @return Size
1011    */
1012   static long heapSizeChange(final Cell cell, final boolean notpresent) {
1013     return notpresent ? ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY
1014         + CellUtil.estimatedHeapSizeOf(cell)) : 0;
1015   }
1016 
1017   private long keySize() {
1018     return heapSize() - DEEP_OVERHEAD;
1019   }
1020 
1021   /**
1022    * Get the entire heap usage for this MemStore not including keys in the
1023    * snapshot.
1024    */
1025   @Override
1026   public long heapSize() {
1027     return size.get();
1028   }
1029 
1030   @Override
1031   public long size() {
1032     return heapSize();
1033   }
1034  
1035   /**
1036    * Code to help figure if our approximation of object heap sizes is close
1037    * enough.  See hbase-900.  Fills memstores then waits so user can heap
1038    * dump and bring up resultant hprof in something like jprofiler which
1039    * allows you get 'deep size' on objects.
1040    * @param args main args
1041    */
1042   public static void main(String [] args) {
1043     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
1044     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
1045       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
1046     LOG.info("vmInputArguments=" + runtime.getInputArguments());
1047     DefaultMemStore memstore1 = new DefaultMemStore();
1048     // TODO: x32 vs x64
1049     long size = 0;
1050     final int count = 10000;
1051     byte [] fam = Bytes.toBytes("col");
1052     byte [] qf = Bytes.toBytes("umn");
1053     byte [] empty = new byte[0];
1054     for (int i = 0; i < count; i++) {
1055       // Give each its own ts
1056       Pair<Long, Cell> ret = memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1057       size += ret.getFirst();
1058     }
1059     LOG.info("memstore1 estimated size=" + size);
1060     for (int i = 0; i < count; i++) {
1061       Pair<Long, Cell> ret = memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1062       size += ret.getFirst();
1063     }
1064     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
1065     // Make a variably sized memstore.
1066     DefaultMemStore memstore2 = new DefaultMemStore();
1067     for (int i = 0; i < count; i++) {
1068       Pair<Long, Cell> ret = memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
1069         new byte[i]));
1070       size += ret.getFirst();
1071     }
1072     LOG.info("memstore2 estimated size=" + size);
1073     final int seconds = 30;
1074     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
1075     for (int i = 0; i < seconds; i++) {
1076       // Thread.sleep(1000);
1077     }
1078     LOG.info("Exiting.");
1079   }
1080 
1081 }