View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.lang.management.ManagementFactory;
23  import java.lang.management.RuntimeMXBean;
24  import java.rmi.UnexpectedException;
25  import java.util.ArrayList;
26  import java.util.Collections;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.NavigableSet;
30  import java.util.SortedSet;
31  import java.util.concurrent.atomic.AtomicLong;
32  import java.util.concurrent.locks.ReentrantReadWriteLock;
33  
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.apache.hadoop.classification.InterfaceAudience;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.hbase.Cell;
39  import org.apache.hadoop.hbase.HBaseConfiguration;
40  import org.apache.hadoop.hbase.HConstants;
41  import org.apache.hadoop.hbase.KeyValue;
42  import org.apache.hadoop.hbase.KeyValueUtil;
43  import org.apache.hadoop.hbase.client.Scan;
44  import org.apache.hadoop.hbase.io.HeapSize;
45  import org.apache.hadoop.hbase.regionserver.MemStoreLAB.Allocation;
46  import org.apache.hadoop.hbase.util.Bytes;
47  import org.apache.hadoop.hbase.util.ClassSize;
48  
49  /**
50   * The MemStore holds in-memory modifications to the Store.  Modifications
51   * are {@link KeyValue}s.  When asked to flush, current memstore is moved
52   * to snapshot and is cleared.  We continue to serve edits out of new memstore
53   * and backing snapshot until flusher reports in that the flush succeeded. At
54   * this point we let the snapshot go.
55   * TODO: Adjust size of the memstore when we remove items because they have
56   * been deleted.
57   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
58   * in KV size.
59   */
60  @InterfaceAudience.Private
61  public class MemStore implements HeapSize {
62    private static final Log LOG = LogFactory.getLog(MemStore.class);
63  
      // Configuration key read by the constructor: when it resolves to true a
      // MemStoreLAB allocator (and its chunk pool) is created for this memstore.
64    static final String USEMSLAB_KEY =
65      "hbase.hregion.memstore.mslab.enabled";
      // Value used when USEMSLAB_KEY is absent from the configuration.
66    private static final boolean USEMSLAB_DEFAULT = true;
67  
      // Kept so that snapshot() can build a fresh MemStoreLAB for the new kvset.
68    private Configuration conf;
69  
70    // MemStore.  Use a KeyValueSkipListSet rather than SkipListSet because of the
71    // better semantics.  The Map will overwrite if passed a key it already had
72    // whereas the Set will not add new KV if key is same though value might be
73    // different.  Value is not important -- just make sure always same
74    // reference passed.
      // This is the live set that add()/delete()/upsert() write into; snapshot()
      // replaces it with a new empty set.
75    volatile KeyValueSkipListSet kvset;
76  
77    // Snapshot of memstore.  Made for flusher.
      // Assigned from kvset by snapshot() and replaced with an empty set by
      // clearSnapshot().
78    volatile KeyValueSkipListSet snapshot;
79  
      // snapshot() and clearSnapshot() take the write lock; the mutators
      // (add, delete, rollback, upsert) and the lookups take the read lock.
80    final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
81  
      // Comparator handed to the constructor; used to build kvset and snapshot.
82    final KeyValue.KVComparator comparator;
83  
84    // Used comparing versions -- same r/c and ts but different type.
85    final KeyValue.KVComparator comparatorIgnoreType;
86  
87    // Used comparing versions -- same r/c and type but different timestamp.
88    final KeyValue.KVComparator comparatorIgnoreTimestamp;
89  
90    // Used to track own heapSize
      // Starts at DEEP_OVERHEAD and is reset to DEEP_OVERHEAD by snapshot().
91    final AtomicLong size;
92  
      // Tracks the timestamps of edits added to kvset; snapshot() hands it over to
      // snapshotTimeRangeTracker and starts a new one.
93    TimeRangeTracker timeRangeTracker;
94    TimeRangeTracker snapshotTimeRangeTracker;
95  
      // Obtained from MemStoreChunkPool.getPool(conf); null when MSLAB is disabled.
96    MemStoreChunkPool chunkPool;
      // Allocator backing kvset; null when MSLAB is disabled.
97    volatile MemStoreLAB allocator;
      // Allocator that backed the current snapshot; closed by clearSnapshot().
98    volatile MemStoreLAB snapshotAllocator;
99  
100 
101 
/**
 * No-argument constructor, used by tests.  Delegates to
 * {@link #MemStore(Configuration, KeyValue.KVComparator)} with a newly
 * created HBase configuration and {@link KeyValue#COMPARATOR}.
 */
public MemStore() {
  this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
}
108 
109   /**
110    * Constructor.
111    * @param c Comparator
112    */
113   public MemStore(final Configuration conf,
114                   final KeyValue.KVComparator c) {
115     this.conf = conf;
116     this.comparator = c;
117     this.comparatorIgnoreTimestamp =
118       this.comparator.getComparatorIgnoringTimestamps();
119     this.comparatorIgnoreType = this.comparator.getComparatorIgnoringType();
120     this.kvset = new KeyValueSkipListSet(c);
121     this.snapshot = new KeyValueSkipListSet(c);
122     timeRangeTracker = new TimeRangeTracker();
123     snapshotTimeRangeTracker = new TimeRangeTracker();
124     this.size = new AtomicLong(DEEP_OVERHEAD);
125     if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
126       this.chunkPool = MemStoreChunkPool.getPool(conf);
127       this.allocator = new MemStoreLAB(conf, chunkPool);
128     } else {
129       this.allocator = null;
130       this.chunkPool = null;
131     }
132   }
133 
134   void dump() {
135     for (KeyValue kv: this.kvset) {
136       LOG.info(kv);
137     }
138     for (KeyValue kv: this.snapshot) {
139       LOG.info(kv);
140     }
141   }
142 
143   /**
144    * Creates a snapshot of the current memstore.
145    * Snapshot must be cleared by call to {@link #clearSnapshot(SortedSet<KeyValue>)}
146    * To get the snapshot made by this method, use {@link #getSnapshot()}
147    */
148   void snapshot() {
149     this.lock.writeLock().lock();
150     try {
151       // If snapshot currently has entries, then flusher failed or didn't call
152       // cleanup.  Log a warning.
153       if (!this.snapshot.isEmpty()) {
154         LOG.warn("Snapshot called again without clearing previous. " +
155           "Doing nothing. Another ongoing flush or did we fail last attempt?");
156       } else {
157         if (!this.kvset.isEmpty()) {
158           this.snapshot = this.kvset;
159           this.kvset = new KeyValueSkipListSet(this.comparator);
160           this.snapshotTimeRangeTracker = this.timeRangeTracker;
161           this.timeRangeTracker = new TimeRangeTracker();
162           // Reset heap to not include any keys
163           this.size.set(DEEP_OVERHEAD);
164           this.snapshotAllocator = this.allocator;
165           // Reset allocator so we get a fresh buffer for the new memstore
166           if (allocator != null) {
167             this.allocator = new MemStoreLAB(conf, chunkPool);
168           }
169         }
170       }
171     } finally {
172       this.lock.writeLock().unlock();
173     }
174   }
175 
176   /**
177    * Return the current snapshot.
178    * Called by flusher to get current snapshot made by a previous
179    * call to {@link #snapshot()}
180    * @return Return snapshot.
181    * @see {@link #snapshot()}
182    * @see {@link #clearSnapshot(SortedSet<KeyValue>)}
183    */
184   KeyValueSkipListSet getSnapshot() {
185     return this.snapshot;
186   }
187 
188   /**
189    * The passed snapshot was successfully persisted; it can be let go.
190    * @param ss The snapshot to clean out.
191    * @throws UnexpectedException
192    * @see {@link #snapshot()}
193    */
194   void clearSnapshot(final SortedSet<KeyValue> ss)
195   throws UnexpectedException {
196     MemStoreLAB tmpAllocator = null;
197     this.lock.writeLock().lock();
198     try {
199       if (this.snapshot != ss) {
200         throw new UnexpectedException("Current snapshot is " +
201           this.snapshot + ", was passed " + ss);
202       }
203       // OK. Passed in snapshot is same as current snapshot.  If not-empty,
204       // create a new snapshot and let the old one go.
205       if (!ss.isEmpty()) {
206         this.snapshot = new KeyValueSkipListSet(this.comparator);
207         this.snapshotTimeRangeTracker = new TimeRangeTracker();
208       }
209       if (this.snapshotAllocator != null) {
210         tmpAllocator = this.snapshotAllocator;
211         this.snapshotAllocator = null;
212       }
213     } finally {
214       this.lock.writeLock().unlock();
215     }
216     if (tmpAllocator != null) {
217       tmpAllocator.close();
218     }
219   }
220 
221   /**
222    * Write an update
223    * @param kv
224    * @return approximate size of the passed key and value.
225    */
226   long add(final KeyValue kv) {
227     this.lock.readLock().lock();
228     try {
229       KeyValue toAdd = maybeCloneWithAllocator(kv);
230       return internalAdd(toAdd);
231     } finally {
232       this.lock.readLock().unlock();
233     }
234   }
235 
236   /**
237    * Internal version of add() that doesn't clone KVs with the
238    * allocator, and doesn't take the lock.
239    *
240    * Callers should ensure they already have the read lock taken
241    */
242   private long internalAdd(final KeyValue toAdd) {
243     long s = heapSizeChange(toAdd, this.kvset.add(toAdd));
244     timeRangeTracker.includeTimestamp(toAdd);
245     this.size.addAndGet(s);
246     return s;
247   }
248 
249   private KeyValue maybeCloneWithAllocator(KeyValue kv) {
250     if (allocator == null) {
251       return kv;
252     }
253 
254     int len = kv.getLength();
255     Allocation alloc = allocator.allocateBytes(len);
256     if (alloc == null) {
257       // The allocation was too large, allocator decided
258       // not to do anything with it.
259       return kv;
260     }
261     assert alloc != null && alloc.getData() != null;
262     System.arraycopy(kv.getBuffer(), kv.getOffset(), alloc.getData(), alloc.getOffset(), len);
263     KeyValue newKv = new KeyValue(alloc.getData(), alloc.getOffset(), len);
264     newKv.setMemstoreTS(kv.getMemstoreTS());
265     return newKv;
266   }
267 
268   /**
269    * Remove n key from the memstore. Only kvs that have the same key and the
270    * same memstoreTS are removed.  It is ok to not update timeRangeTracker
271    * in this call. It is possible that we can optimize this method by using
272    * tailMap/iterator, but since this method is called rarely (only for
273    * error recovery), we can leave those optimization for the future.
274    * @param kv
275    */
276   void rollback(final KeyValue kv) {
277     this.lock.readLock().lock();
278     try {
279       // If the key is in the snapshot, delete it. We should not update
280       // this.size, because that tracks the size of only the memstore and
281       // not the snapshot. The flush of this snapshot to disk has not
282       // yet started because Store.flush() waits for all rwcc transactions to
283       // commit before starting the flush to disk.
284       KeyValue found = this.snapshot.get(kv);
285       if (found != null && found.getMemstoreTS() == kv.getMemstoreTS()) {
286         this.snapshot.remove(kv);
287       }
288       // If the key is in the memstore, delete it. Update this.size.
289       found = this.kvset.get(kv);
290       if (found != null && found.getMemstoreTS() == kv.getMemstoreTS()) {
291         this.kvset.remove(kv);
292         long s = heapSizeChange(kv, true);
293         this.size.addAndGet(-s);
294       }
295     } finally {
296       this.lock.readLock().unlock();
297     }
298   }
299 
300   /**
301    * Write a delete
302    * @param delete
303    * @return approximate size of the passed key and value.
304    */
305   long delete(final KeyValue delete) {
306     long s = 0;
307     this.lock.readLock().lock();
308     try {
309       KeyValue toAdd = maybeCloneWithAllocator(delete);
310       s += heapSizeChange(toAdd, this.kvset.add(toAdd));
311       timeRangeTracker.includeTimestamp(toAdd);
312     } finally {
313       this.lock.readLock().unlock();
314     }
315     this.size.addAndGet(s);
316     return s;
317   }
318 
319   /**
320    * @param kv Find the row that comes after this one.  If null, we return the
321    * first.
322    * @return Next row or null if none found.
323    */
324   KeyValue getNextRow(final KeyValue kv) {
325     this.lock.readLock().lock();
326     try {
327       return getLowest(getNextRow(kv, this.kvset), getNextRow(kv, this.snapshot));
328     } finally {
329       this.lock.readLock().unlock();
330     }
331   }
332 
333   /*
334    * @param a
335    * @param b
336    * @return Return lowest of a or b or null if both a and b are null
337    */
338   private KeyValue getLowest(final KeyValue a, final KeyValue b) {
339     if (a == null) {
340       return b;
341     }
342     if (b == null) {
343       return a;
344     }
345     return comparator.compareRows(a, b) <= 0? a: b;
346   }
347 
348   /*
349    * @param key Find row that follows this one.  If null, return first.
350    * @param map Set to look in for a row beyond <code>row</code>.
351    * @return Next row or null if none found.  If one found, will be a new
352    * KeyValue -- can be destroyed by subsequent calls to this method.
353    */
354   private KeyValue getNextRow(final KeyValue key,
355       final NavigableSet<KeyValue> set) {
356     KeyValue result = null;
357     SortedSet<KeyValue> tail = key == null? set: set.tailSet(key);
358     // Iterate until we fall into the next row; i.e. move off current row
359     for (KeyValue kv: tail) {
360       if (comparator.compareRows(kv, key) <= 0)
361         continue;
362       // Note: Not suppressing deletes or expired cells.  Needs to be handled
363       // by higher up functions.
364       result = kv;
365       break;
366     }
367     return result;
368   }
369 
370   /**
371    * @param state column/delete tracking state
372    */
373   void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
374     this.lock.readLock().lock();
375     try {
376       getRowKeyAtOrBefore(kvset, state);
377       getRowKeyAtOrBefore(snapshot, state);
378     } finally {
379       this.lock.readLock().unlock();
380     }
381   }
382 
383   /*
384    * @param set
385    * @param state Accumulates deletes and candidates.
386    */
387   private void getRowKeyAtOrBefore(final NavigableSet<KeyValue> set,
388       final GetClosestRowBeforeTracker state) {
389     if (set.isEmpty()) {
390       return;
391     }
392     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
393       // Found nothing in row.  Try backing up.
394       getRowKeyBefore(set, state);
395     }
396   }
397 
398   /*
399    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
400    * we have been passed the first possible key on a row.  As we walk forward
401    * we accumulate deletes until we hit a candidate on the row at which point
402    * we return.
403    * @param set
404    * @param firstOnRow First possible key on this row.
405    * @param state
406    * @return True if we found a candidate walking this row.
407    */
408   private boolean walkForwardInSingleRow(final SortedSet<KeyValue> set,
409       final KeyValue firstOnRow, final GetClosestRowBeforeTracker state) {
410     boolean foundCandidate = false;
411     SortedSet<KeyValue> tail = set.tailSet(firstOnRow);
412     if (tail.isEmpty()) return foundCandidate;
413     for (Iterator<KeyValue> i = tail.iterator(); i.hasNext();) {
414       KeyValue kv = i.next();
415       // Did we go beyond the target row? If so break.
416       if (state.isTooFar(kv, firstOnRow)) break;
417       if (state.isExpired(kv)) {
418         i.remove();
419         continue;
420       }
421       // If we added something, this row is a contender. break.
422       if (state.handle(kv)) {
423         foundCandidate = true;
424         break;
425       }
426     }
427     return foundCandidate;
428   }
429 
430   /*
431    * Walk backwards through the passed set a row at a time until we run out of
432    * set or until we get a candidate.
433    * @param set
434    * @param state
435    */
436   private void getRowKeyBefore(NavigableSet<KeyValue> set,
437       final GetClosestRowBeforeTracker state) {
438     KeyValue firstOnRow = state.getTargetKey();
439     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
440         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
441       // Make sure we don't fall out of our table.
442       if (!state.isTargetTable(p.kv)) break;
443       // Stop looking if we've exited the better candidate range.
444       if (!state.isBetterCandidate(p.kv)) break;
445       // Make into firstOnRow
446       firstOnRow = new KeyValue(p.kv.getRow(), HConstants.LATEST_TIMESTAMP);
447       // If we find something, break;
448       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
449     }
450   }
451 
452   /**
453    * Only used by tests. TODO: Remove
454    *
455    * Given the specs of a column, update it, first by inserting a new record,
456    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
457    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
458    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
459    * get the new value, or the old value and all readers will eventually only see the new
460    * value after the old was removed.
461    *
462    * @param row
463    * @param family
464    * @param qualifier
465    * @param newValue
466    * @param now
467    * @return  Timestamp
468    */
469   long updateColumnValue(byte[] row,
470                                 byte[] family,
471                                 byte[] qualifier,
472                                 long newValue,
473                                 long now) {
474    this.lock.readLock().lock();
475     try {
476       KeyValue firstKv = KeyValue.createFirstOnRow(
477           row, family, qualifier);
478       // Is there a KeyValue in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
479       SortedSet<KeyValue> snSs = snapshot.tailSet(firstKv);
480       if (!snSs.isEmpty()) {
481         KeyValue snKv = snSs.first();
482         // is there a matching KV in the snapshot?
483         if (snKv.matchingRow(firstKv) && snKv.matchingQualifier(firstKv)) {
484           if (snKv.getTimestamp() == now) {
485             // poop,
486             now += 1;
487           }
488         }
489       }
490 
491       // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
492       // But the timestamp should also be max(now, mostRecentTsInMemstore)
493 
494       // so we cant add the new KV w/o knowing what's there already, but we also
495       // want to take this chance to delete some kvs. So two loops (sad)
496 
497       SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
498       Iterator<KeyValue> it = ss.iterator();
499       while ( it.hasNext() ) {
500         KeyValue kv = it.next();
501 
502         // if this isnt the row we are interested in, then bail:
503         if (!kv.matchingColumn(family,qualifier) || !kv.matchingRow(firstKv) ) {
504           break; // rows dont match, bail.
505         }
506 
507         // if the qualifier matches and it's a put, just RM it out of the kvset.
508         if (kv.getType() == KeyValue.Type.Put.getCode() &&
509             kv.getTimestamp() > now && firstKv.matchingQualifier(kv)) {
510           now = kv.getTimestamp();
511         }
512       }
513 
514       // create or update (upsert) a new KeyValue with
515       // 'now' and a 0 memstoreTS == immediately visible
516       List<Cell> cells = new ArrayList<Cell>(1);
517       cells.add(new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)));
518       return upsert(cells, 1L);
519     } finally {
520       this.lock.readLock().unlock();
521     }
522   }
523 
524   /**
525    * Update or insert the specified KeyValues.
526    * <p>
527    * For each KeyValue, insert into MemStore.  This will atomically upsert the
528    * value for that row/family/qualifier.  If a KeyValue did already exist,
529    * it will then be removed.
530    * <p>
531    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
532    * be immediately visible.  May want to change this so it is atomic across
533    * all KeyValues.
534    * <p>
535    * This is called under row lock, so Get operations will still see updates
536    * atomically.  Scans will only see each KeyValue update as atomic.
537    *
538    * @param cells
539    * @param readpoint readpoint below which we can safely remove duplicate KVs 
540    * @return change in memstore size
541    */
542   public long upsert(Iterable<? extends Cell> cells, long readpoint) {
543    this.lock.readLock().lock();
544     try {
545       long size = 0;
546       for (Cell cell : cells) {
547         size += upsert(cell, readpoint);
548       }
549       return size;
550     } finally {
551       this.lock.readLock().unlock();
552     }
553   }
554 
555   /**
556    * Inserts the specified KeyValue into MemStore and deletes any existing
557    * versions of the same row/family/qualifier as the specified KeyValue.
558    * <p>
559    * First, the specified KeyValue is inserted into the Memstore.
560    * <p>
561    * If there are any existing KeyValues in this MemStore with the same row,
562    * family, and qualifier, they are removed.
563    * <p>
564    * Callers must hold the read lock.
565    *
566    * @param cell
567    * @return change in size of MemStore
568    */
569   private long upsert(Cell cell, long readpoint) {
570     // Add the KeyValue to the MemStore
571     // Use the internalAdd method here since we (a) already have a lock
572     // and (b) cannot safely use the MSLAB here without potentially
573     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
574     // test that triggers the pathological case if we don't avoid MSLAB
575     // here.
576     KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
577     long addedSize = internalAdd(kv);
578 
579     // Get the KeyValues for the row/family/qualifier regardless of timestamp.
580     // For this case we want to clean up any other puts
581     KeyValue firstKv = KeyValue.createFirstOnRow(
582         kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
583         kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
584         kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
585     SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
586     Iterator<KeyValue> it = ss.iterator();
587     // versions visible to oldest scanner
588     int versionsVisible = 0;
589     while ( it.hasNext() ) {
590       KeyValue cur = it.next();
591 
592       if (kv == cur) {
593         // ignore the one just put in
594         continue;
595       }
596       // check that this is the row and column we are interested in, otherwise bail
597       if (kv.matchingRow(cur) && kv.matchingQualifier(cur)) {
598         // only remove Puts that concurrent scanners cannot possibly see
599         if (cur.getType() == KeyValue.Type.Put.getCode() && cur.getMemstoreTS() <= readpoint) {
600           if (versionsVisible > 1) {
601             // if we get here we have seen at least one version visible to the oldest scanner,
602             // which means we can prove that no scanner will see this version
603 
604             // false means there was a change, so give us the size.
605             long delta = heapSizeChange(cur, true);
606             addedSize -= delta;
607             this.size.addAndGet(-delta);
608             it.remove();
609           } else {
610             versionsVisible++;
611           }
612         }
613       } else {
614         // past the row or column, done
615         break;
616       }
617     }
618     return addedSize;
619   }
620 
621   /*
622    * Immutable data structure to hold member found in set and the set it was
623    * found in.  Include set because it is carrying context.
624    */
625   private static class Member {
626     final KeyValue kv;
627     final NavigableSet<KeyValue> set;
628     Member(final NavigableSet<KeyValue> s, final KeyValue kv) {
629       this.kv = kv;
630       this.set = s;
631     }
632   }
633 
634   /*
635    * @param set Set to walk back in.  Pass a first in row or we'll return
636    * same row (loop).
637    * @param state Utility and context.
638    * @param firstOnRow First item on the row after the one we want to find a
639    * member in.
640    * @return Null or member of row previous to <code>firstOnRow</code>
641    */
642   private Member memberOfPreviousRow(NavigableSet<KeyValue> set,
643       final GetClosestRowBeforeTracker state, final KeyValue firstOnRow) {
644     NavigableSet<KeyValue> head = set.headSet(firstOnRow, false);
645     if (head.isEmpty()) return null;
646     for (Iterator<KeyValue> i = head.descendingIterator(); i.hasNext();) {
647       KeyValue found = i.next();
648       if (state.isExpired(found)) {
649         i.remove();
650         continue;
651       }
652       return new Member(head, found);
653     }
654     return null;
655   }
656 
657   /**
658    * @return scanner on memstore and snapshot in this order.
659    */
660   List<KeyValueScanner> getScanners() {
661     this.lock.readLock().lock();
662     try {
663       return Collections.<KeyValueScanner>singletonList(
664           new MemStoreScanner());
665     } finally {
666       this.lock.readLock().unlock();
667     }
668   }
669 
670   /**
671    * Check if this memstore may contain the required keys
672    * @param scan
673    * @return False if the key definitely does not exist in this Memstore
674    */
675   public boolean shouldSeek(Scan scan, long oldestUnexpiredTS) {
676     return (timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
677         snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange()))
678         && (Math.max(timeRangeTracker.getMaximumTimestamp(),
679                      snapshotTimeRangeTracker.getMaximumTimestamp()) >=
680             oldestUnexpiredTS);
681   }
682 
683   public TimeRangeTracker getSnapshotTimeRangeTracker() {
684     return this.snapshotTimeRangeTracker;
685   }
686 
687   /*
688    * MemStoreScanner implements the KeyValueScanner.
689    * It lets the caller scan the contents of a memstore -- both current
690    * map and snapshot.
691    * This behaves as if it were a real scanner but does not maintain position.
692    */
693   protected class MemStoreScanner extends NonLazyKeyValueScanner {
694     // Next row information for either kvset or snapshot
        // (the entry most recently fetched by getNext() from each iterator and not
        // yet returned by next()).
695     private KeyValue kvsetNextRow = null;
696     private KeyValue snapshotNextRow = null;
697 
698     // last iterated KVs for kvset and snapshot (to restore iterator state after reseek)
        // Updated by getNext() for every entry it steps over, visible or not.
699     private KeyValue kvsetItRow = null;
700     private KeyValue snapshotItRow = null;
701     
702     // iterator based scanning.
        // Both iterators are (re)created by seek() and reseek().
703     private Iterator<KeyValue> kvsetIt;
704     private Iterator<KeyValue> snapshotIt;
705 
706     // The kvset and snapshot at the time of creating this scanner
707     private KeyValueSkipListSet kvsetAtCreation;
708     private KeyValueSkipListSet snapshotAtCreation;
709 
710     // the pre-calculated KeyValue to be returned by peek() or next()
711     private KeyValue theNext;
712 
713     // The allocator and snapshot allocator at the time of creating this scanner
        // The constructor increments the scanner count of each one that is non-null.
714     volatile MemStoreLAB allocatorAtCreation;
715     volatile MemStoreLAB snapshotAllocatorAtCreation;
716 
717     /*
718     Some notes...
719 
720     The MemStoreScanner is fixed at creation time: it keeps references to (and
721     iterators into) the kvset and snapshot that existed when it was built.  When a
722     snapshot is created the old kvset becomes the snapshot, so there is little point
723     in reseeking on both.  (NOTE(review): the original note said the kvset "is null"
724     here; snapshot() as written installs a new empty set instead -- confirm.)
725     During the snapshot->hfile transition the memstore scanner is re-created by
726     StoreScanner#updateReaders().  StoreScanner should potentially do something
727     smarter by adjusting the existing memstore scanner.
728 
729     But there is a greater problem here: once a scanner has progressed during a
730     snapshot scenario, we currently iterate past the kvset and then 'finish' up.
731     If a scan lasts a little while, new entries may become available in kvset that
732     we will never see.  This needs to be handled at the StoreScanner level, in
733     coordination with MemStoreScanner.
734 
735     Currently this problem is only partly managed: during the short time in which
736     the StoreScanner has not yet created a new MemStoreScanner, adds to kvset are
        missed by the MemStoreScanner.
736     */
737 
/**
 * Captures the memstore's current kvset and snapshot, and registers this
 * scanner with whichever of the two allocators is present by incrementing
 * its scanner count.
 */
MemStoreScanner() {
  super();

  this.kvsetAtCreation = MemStore.this.kvset;
  this.snapshotAtCreation = MemStore.this.snapshot;

  if (MemStore.this.allocator != null) {
    this.allocatorAtCreation = MemStore.this.allocator;
    this.allocatorAtCreation.incScannerCount();
  }
  if (MemStore.this.snapshotAllocator != null) {
    this.snapshotAllocatorAtCreation = MemStore.this.snapshotAllocator;
    this.snapshotAllocatorAtCreation.incScannerCount();
  }
}
752 
753     private KeyValue getNext(Iterator<KeyValue> it) {
754       long readPoint = MultiVersionConsistencyControl.getThreadReadPoint();
755 
756       KeyValue v = null;
757       try {
758         while (it.hasNext()) {
759           v = it.next();
760           if (v.getMemstoreTS() <= readPoint) {
761             return v;
762           }
763         }
764 
765         return null;
766       } finally {
767         if (v != null) {
768           // in all cases, remember the last KV iterated to
769           if (it == snapshotIt) {
770             snapshotItRow = v;
771           } else {
772             kvsetItRow = v;
773           }
774         }
775       }
776     }
777 
778     /**
779      *  Set the scanner at the seek key.
780      *  Must be called only once: there is no thread safety between the scanner
781      *   and the memStore.
782      * @param key seek value
783      * @return false if the key is null or if there is no data
784      */
785     @Override
786     public synchronized boolean seek(KeyValue key) {
787       if (key == null) {
788         close();
789         return false;
790       }
791 
792       // kvset and snapshot will never be null.
793       // if tailSet can't find anything, SortedSet is empty (not null).
794       kvsetIt = kvsetAtCreation.tailSet(key).iterator();
795       snapshotIt = snapshotAtCreation.tailSet(key).iterator();
796       kvsetItRow = null;
797       snapshotItRow = null;
798 
799       return seekInSubLists(key);
800     }
801 
802 
803     /**
804      * (Re)initialize the iterators after a seek or a reseek.
805      */
806     private synchronized boolean seekInSubLists(KeyValue key){
807       kvsetNextRow = getNext(kvsetIt);
808       snapshotNextRow = getNext(snapshotIt);
809 
810       // Calculate the next value
811       theNext = getLowest(kvsetNextRow, snapshotNextRow);
812 
813       // has data
814       return (theNext != null);
815     }
816 
817 
818     /**
819      * Move forward on the sub-lists set previously by seek.
820      * @param key seek value (should be non-null)
821      * @return true if there is at least one KV to read, false otherwise
822      */
823     @Override
824     public synchronized boolean reseek(KeyValue key) {
825       /*
826       See HBASE-4195 & HBASE-3855 & HBASE-6591 for the background on this implementation.
827       This code is executed concurrently with flush and puts, without locks.
828       Two points must be known when working on this code:
829       1) It's not possible to use the 'kvTail' and 'snapshot'
830        variables, as they are modified during a flush.
831       2) The ideal implementation for performance would use the sub skip list
832        implicitly pointed by the iterators 'kvsetIt' and
833        'snapshotIt'. Unfortunately the Java API does not offer a method to
834        get it. So we remember the last keys we iterated to and restore
835        the reseeked set to at least that point.
836        */
837 
838       kvsetIt = kvsetAtCreation.tailSet(getHighest(key, kvsetItRow)).iterator();
839       snapshotIt = snapshotAtCreation.tailSet(getHighest(key, snapshotItRow)).iterator();
840 
841       return seekInSubLists(key);
842     }
843 
844 
845     @Override
846     public synchronized KeyValue peek() {
847       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
848       return theNext;
849     }
850 
851     @Override
852     public synchronized KeyValue next() {
853       if (theNext == null) {
854           return null;
855       }
856 
857       final KeyValue ret = theNext;
858 
859       // Advance one of the iterators
860       if (theNext == kvsetNextRow) {
861         kvsetNextRow = getNext(kvsetIt);
862       } else {
863         snapshotNextRow = getNext(snapshotIt);
864       }
865 
866       // Calculate the next value
867       theNext = getLowest(kvsetNextRow, snapshotNextRow);
868 
869       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
870       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
871       //    getLowest() + " threadpoint=" + readpoint);
872       return ret;
873     }
874 
875     /*
876      * Returns the lower of the two key values, or null if they are both null.
877      * This uses comparator.compare() to compare the KeyValue using the memstore
878      * comparator.
879      */
880     private KeyValue getLowest(KeyValue first, KeyValue second) {
881       if (first == null && second == null) {
882         return null;
883       }
884       if (first != null && second != null) {
885         int compare = comparator.compare(first, second);
886         return (compare <= 0 ? first : second);
887       }
888       return (first != null ? first : second);
889     }
890 
891     /*
892      * Returns the higher of the two key values, or null if they are both null.
893      * This uses comparator.compare() to compare the KeyValue using the memstore
894      * comparator.
895      */
896     private KeyValue getHighest(KeyValue first, KeyValue second) {
897       if (first == null && second == null) {
898         return null;
899       }
900       if (first != null && second != null) {
901         int compare = comparator.compare(first, second);
902         return (compare > 0 ? first : second);
903       }
904       return (first != null ? first : second);
905     }
906 
907     public synchronized void close() {
908       this.kvsetNextRow = null;
909       this.snapshotNextRow = null;
910 
911       this.kvsetIt = null;
912       this.snapshotIt = null;
913       
914       if (allocatorAtCreation != null) {
915         this.allocatorAtCreation.decScannerCount();
916         this.allocatorAtCreation = null;
917       }
918       if (snapshotAllocatorAtCreation != null) {
919         this.snapshotAllocatorAtCreation.decScannerCount();
920         this.snapshotAllocatorAtCreation = null;
921       }
922 
923       this.kvsetItRow = null;
924       this.snapshotItRow = null;
925     }
926 
927     /**
928      * MemStoreScanner returns max value as sequence id because it will
929      * always have the latest data among all files.
930      */
931     @Override
932     public long getSequenceID() {
933       return Long.MAX_VALUE;
934     }
935 
936     @Override
937     public boolean shouldUseScanner(Scan scan, SortedSet<byte[]> columns,
938         long oldestUnexpiredTS) {
939       return shouldSeek(scan, oldestUnexpiredTS);
940     }
941   }
942 
  /**
   * Aligned size of an object header plus 13 references.
   * NOTE(review): presumably 13 is the number of reference fields declared on
   * this class - confirm, and keep in sync when fields are added or removed.
   */
  public final static long FIXED_OVERHEAD = ClassSize.align(
      ClassSize.OBJECT + (13 * ClassSize.REFERENCE));

  /**
   * {@link #FIXED_OVERHEAD} plus the sizes of one reentrant lock, one atomic
   * long, one copy-on-write array set, one copy-on-write array list and two
   * concurrent skip list maps, aligned.
   * NOTE(review): presumably these mirror the objects this class owns -
   * verify against the field declarations.
   */
  public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
      ClassSize.REENTRANT_LOCK + ClassSize.ATOMIC_LONG +
      ClassSize.COPYONWRITE_ARRAYSET + ClassSize.COPYONWRITE_ARRAYLIST +
      (2 * ClassSize.CONCURRENT_SKIPLISTMAP));

  /** Used for readability when we don't store memstore timestamp in HFile */
  public static final boolean NO_PERSISTENT_TS = false;
953 
954   /*
955    * Calculate how the MemStore size has changed.  Includes overhead of the
956    * backing Map.
957    * @param kv
958    * @param notpresent True if the kv was NOT present in the set.
959    * @return Size
960    */
961   long heapSizeChange(final KeyValue kv, final boolean notpresent) {
962     return notpresent ?
963         ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + kv.heapSize()):
964         0;
965   }
966 
967   /**
968    * Get the entire heap usage for this MemStore not including keys in the
969    * snapshot.
970    */
971   @Override
972   public long heapSize() {
973     return size.get();
974   }
975 
976   /**
977    * Get the heap usage of KVs in this MemStore.
978    */
979   public long keySize() {
980     return heapSize() - DEEP_OVERHEAD;
981   }
982 
983   /**
984    * Code to help figure if our approximation of object heap sizes is close
985    * enough.  See hbase-900.  Fills memstores then waits so user can heap
986    * dump and bring up resultant hprof in something like jprofiler which
987    * allows you get 'deep size' on objects.
988    * @param args main args
989    */
990   public static void main(String [] args) {
991     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
992     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
993       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
994     LOG.info("vmInputArguments=" + runtime.getInputArguments());
995     MemStore memstore1 = new MemStore();
996     // TODO: x32 vs x64
997     long size = 0;
998     final int count = 10000;
999     byte [] fam = Bytes.toBytes("col");
1000     byte [] qf = Bytes.toBytes("umn");
1001     byte [] empty = new byte[0];
1002     for (int i = 0; i < count; i++) {
1003       // Give each its own ts
1004       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1005     }
1006     LOG.info("memstore1 estimated size=" + size);
1007     for (int i = 0; i < count; i++) {
1008       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1009     }
1010     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
1011     // Make a variably sized memstore.
1012     MemStore memstore2 = new MemStore();
1013     for (int i = 0; i < count; i++) {
1014       size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
1015         new byte[i]));
1016     }
1017     LOG.info("memstore2 estimated size=" + size);
1018     final int seconds = 30;
1019     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
1020     for (int i = 0; i < seconds; i++) {
1021       // Thread.sleep(1000);
1022     }
1023     LOG.info("Exiting.");
1024   }
1025 }