View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package org.apache.hadoop.hbase.regionserver;
22  
23  import java.lang.management.ManagementFactory;
24  import java.lang.management.RuntimeMXBean;
25  import java.rmi.UnexpectedException;
26  import java.util.Arrays;
27  import java.util.Collections;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.NavigableSet;
31  import java.util.SortedSet;
32  import java.util.concurrent.atomic.AtomicLong;
33  
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.hbase.HBaseConfiguration;
38  import org.apache.hadoop.hbase.HConstants;
39  import org.apache.hadoop.hbase.KeyValue;
40  import org.apache.hadoop.hbase.client.Scan;
41  import org.apache.hadoop.hbase.io.HeapSize;
42  import org.apache.hadoop.hbase.regionserver.MemStoreLAB.Allocation;
43  import org.apache.hadoop.hbase.util.Bytes;
44  import org.apache.hadoop.hbase.util.ClassSize;
45  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
46  
47  /**
48   * The MemStore holds in-memory modifications to the Store.  Modifications
49   * are {@link KeyValue}s.  When asked to flush, current memstore is moved
50   * to snapshot and is cleared.  We continue to serve edits out of new memstore
51   * and backing snapshot until flusher reports in that the flush succeeded. At
52   * this point we let the snapshot go.
53   * <p>
54   * The MemStore functions should not be called in parallel. Callers should hold
55   * write and read locks. This is done in {@link Store}.
56   * </p>
57   * TODO: Adjust size of the memstore when we remove items because they have
58   * been deleted.
59   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
60   * in KV size.
61   */
62  public class MemStore implements HeapSize {
63    private static final Log LOG = LogFactory.getLog(MemStore.class);
64  
65    static final String USEMSLAB_KEY =
66      "hbase.hregion.memstore.mslab.enabled";
67    private static final boolean USEMSLAB_DEFAULT = true;
68  
69    private Configuration conf;
70  
71    // MemStore.  Use a KeyValueSkipListSet rather than SkipListSet because of the
72    // better semantics.  The Map will overwrite if passed a key it already had
73    // whereas the Set will not add new KV if key is same though value might be
74    // different.  Value is not important -- just make sure always same
75    // reference passed.
76    volatile KeyValueSkipListSet kvset;
77  
78    // Snapshot of memstore.  Made for flusher.
79    volatile KeyValueSkipListSet snapshot;
80  
81    final KeyValue.KVComparator comparator;
82  
83    // Used comparing versions -- same r/c and ts but different type.
84    final KeyValue.KVComparator comparatorIgnoreType;
85  
86    // Used comparing versions -- same r/c and type but different timestamp.
87    final KeyValue.KVComparator comparatorIgnoreTimestamp;
88  
89    // Used to track own heapSize
90    final AtomicLong size;
91  
92    // Used to track when to flush
93    volatile long timeOfOldestEdit = Long.MAX_VALUE;
94  
95    TimeRangeTracker timeRangeTracker;
96    TimeRangeTracker snapshotTimeRangeTracker;
97  
98    MemStoreLAB allocator;
99  
100 
101 
/**
 * No-argument constructor, used for tests.  Delegates to
 * {@link #MemStore(Configuration, KeyValue.KVComparator)} with a freshly
 * created HBase configuration and {@link KeyValue#COMPARATOR}.
 */
public MemStore() {
  this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
}
108 
109   /**
110    * Constructor.
111    * @param c Comparator
112    */
113   public MemStore(final Configuration conf,
114                   final KeyValue.KVComparator c) {
115     this.conf = conf;
116     this.comparator = c;
117     this.comparatorIgnoreTimestamp =
118       this.comparator.getComparatorIgnoringTimestamps();
119     this.comparatorIgnoreType = this.comparator.getComparatorIgnoringType();
120     this.kvset = new KeyValueSkipListSet(c);
121     this.snapshot = new KeyValueSkipListSet(c);
122     timeRangeTracker = new TimeRangeTracker();
123     snapshotTimeRangeTracker = new TimeRangeTracker();
124     this.size = new AtomicLong(DEEP_OVERHEAD);
125     if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
126       this.allocator = new MemStoreLAB(conf);
127     } else {
128       this.allocator = null;
129     }
130   }
131 
132   void dump() {
133     for (KeyValue kv: this.kvset) {
134       LOG.info(kv);
135     }
136     for (KeyValue kv: this.snapshot) {
137       LOG.info(kv);
138     }
139   }
140 
141   /**
142    * Creates a snapshot of the current memstore.
143    * Snapshot must be cleared by call to {@link #clearSnapshot(SortedSet<KeyValue>)}
144    * To get the snapshot made by this method, use {@link #getSnapshot()}
145    */
146   void snapshot() {
147     // If snapshot currently has entries, then flusher failed or didn't call
148     // cleanup.  Log a warning.
149     if (!this.snapshot.isEmpty()) {
150       LOG.warn("Snapshot called again without clearing previous. " +
151         "Doing nothing. Another ongoing flush or did we fail last attempt?");
152     } else {
153       if (!this.kvset.isEmpty()) {
154         this.snapshot = this.kvset;
155         this.kvset = new KeyValueSkipListSet(this.comparator);
156         this.snapshotTimeRangeTracker = this.timeRangeTracker;
157         this.timeRangeTracker = new TimeRangeTracker();
158         // Reset heap to not include any keys
159         this.size.set(DEEP_OVERHEAD);
160         // Reset allocator so we get a fresh buffer for the new memstore
161         if (allocator != null) {
162           this.allocator = new MemStoreLAB(conf);
163         }
164         timeOfOldestEdit = Long.MAX_VALUE;
165       }
166     }
167   }
168 
169   /**
170    * Return the current snapshot.
171    * Called by flusher to get current snapshot made by a previous
172    * call to {@link #snapshot()}
173    * @return Return snapshot.
174    * @see {@link #snapshot()}
175    * @see {@link #clearSnapshot(SortedSet<KeyValue>)}
176    */
177   KeyValueSkipListSet getSnapshot() {
178     return this.snapshot;
179   }
180 
181   /**
182    * The passed snapshot was successfully persisted; it can be let go.
183    * @param ss The snapshot to clean out.
184    * @throws UnexpectedException
185    * @see {@link #snapshot()}
186    */
187   void clearSnapshot(final SortedSet<KeyValue> ss)
188   throws UnexpectedException {
189     if (this.snapshot != ss) {
190       throw new UnexpectedException("Current snapshot is " +
191         this.snapshot + ", was passed " + ss);
192     }
193     // OK. Passed in snapshot is same as current snapshot.  If not-empty,
194     // create a new snapshot and let the old one go.
195     if (!ss.isEmpty()) {
196       this.snapshot = new KeyValueSkipListSet(this.comparator);
197       this.snapshotTimeRangeTracker = new TimeRangeTracker();
198     }
199   }
200 
201   /**
202    * Write an update
203    * @param kv
204    * @return approximate size of the passed key and value.
205    */
206   long add(final KeyValue kv) {
207     KeyValue toAdd = maybeCloneWithAllocator(kv);
208     return internalAdd(toAdd);
209   }
210 
211   long timeOfOldestEdit() {
212     return timeOfOldestEdit;
213   }
214 
215   private boolean addToKVSet(KeyValue e) {
216     boolean b = this.kvset.add(e);
217     setOldestEditTimeToNow();
218     return b;
219   }
220 
221   private boolean removeFromKVSet(KeyValue e) {
222     boolean b = this.kvset.remove(e);
223     setOldestEditTimeToNow();
224     return b;
225   }
226 
227   void setOldestEditTimeToNow() {
228     if (timeOfOldestEdit == Long.MAX_VALUE) {
229       timeOfOldestEdit = EnvironmentEdgeManager.currentTimeMillis();
230     }
231   }
232 
233   /**
234    * Internal version of add() that doesn't clone KVs with the
235    * allocator, and doesn't take the lock.
236    *
237    * Callers should ensure they already have the read lock taken
238    */
239   private long internalAdd(final KeyValue toAdd) {
240     long s = heapSizeChange(toAdd, addToKVSet(toAdd));
241     timeRangeTracker.includeTimestamp(toAdd);
242     this.size.addAndGet(s);
243     return s;
244   }
245 
246   private KeyValue maybeCloneWithAllocator(KeyValue kv) {
247     if (allocator == null) {
248       return kv;
249     }
250 
251     int len = kv.getLength();
252     Allocation alloc = allocator.allocateBytes(len);
253     if (alloc == null) {
254       // The allocation was too large, allocator decided
255       // not to do anything with it.
256       return kv;
257     }
258     assert alloc != null && alloc.getData() != null;
259     System.arraycopy(kv.getBuffer(), kv.getOffset(), alloc.getData(), alloc.getOffset(), len);
260     KeyValue newKv = new KeyValue(alloc.getData(), alloc.getOffset(), len);
261     newKv.setMemstoreTS(kv.getMemstoreTS());
262     return newKv;
263   }
264 
265   /**
266    * Remove n key from the memstore. Only kvs that have the same key and the
267    * same memstoreTS are removed.  It is ok to not update timeRangeTracker
268    * in this call. It is possible that we can optimize this method by using
269    * tailMap/iterator, but since this method is called rarely (only for
270    * error recovery), we can leave those optimization for the future.
271    * @param kv
272    */
273   void rollback(final KeyValue kv) {
274     // If the key is in the snapshot, delete it. We should not update
275     // this.size, because that tracks the size of only the memstore and
276     // not the snapshot. The flush of this snapshot to disk has not
277     // yet started because Store.flush() waits for all rwcc transactions to
278     // commit before starting the flush to disk.
279     KeyValue found = this.snapshot.get(kv);
280     if (found != null && found.getMemstoreTS() == kv.getMemstoreTS()) {
281       this.snapshot.remove(kv);
282     }
283     // If the key is in the memstore, delete it. Update this.size.
284     found = this.kvset.get(kv);
285     if (found != null && found.getMemstoreTS() == kv.getMemstoreTS()) {
286       removeFromKVSet(kv);
287       long s = heapSizeChange(kv, true);
288       this.size.addAndGet(-s);
289     }
290   }
291 
292   /**
293    * Write a delete
294    * @param delete
295    * @return approximate size of the passed key and value.
296    */
297   long delete(final KeyValue delete) {
298     KeyValue toAdd = maybeCloneWithAllocator(delete);
299     long s = heapSizeChange(toAdd, addToKVSet(toAdd));
300     timeRangeTracker.includeTimestamp(toAdd);
301     this.size.addAndGet(s);
302     return s;
303   }
304 
305   /**
306    * @param kv Find the row that comes after this one.  If null, we return the
307    * first.
308    * @return Next row or null if none found.
309    */
310   KeyValue getNextRow(final KeyValue kv) {
311     return getLowest(getNextRow(kv, this.kvset), getNextRow(kv, this.snapshot));
312   }
313 
314   /*
315    * @param a
316    * @param b
317    * @return Return lowest of a or b or null if both a and b are null
318    */
319   private KeyValue getLowest(final KeyValue a, final KeyValue b) {
320     if (a == null) {
321       return b;
322     }
323     if (b == null) {
324       return a;
325     }
326     return comparator.compareRows(a, b) <= 0? a: b;
327   }
328 
329   /*
330    * @param key Find row that follows this one.  If null, return first.
331    * @param map Set to look in for a row beyond <code>row</code>.
332    * @return Next row or null if none found.  If one found, will be a new
333    * KeyValue -- can be destroyed by subsequent calls to this method.
334    */
335   private KeyValue getNextRow(final KeyValue key,
336       final NavigableSet<KeyValue> set) {
337     KeyValue result = null;
338     SortedSet<KeyValue> tail = key == null? set: set.tailSet(key);
339     // Iterate until we fall into the next row; i.e. move off current row
340     for (KeyValue kv: tail) {
341       if (comparator.compareRows(kv, key) <= 0)
342         continue;
343       // Note: Not suppressing deletes or expired cells.  Needs to be handled
344       // by higher up functions.
345       result = kv;
346       break;
347     }
348     return result;
349   }
350 
351   /**
352    * @param state column/delete tracking state
353    */
354   void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
355     getRowKeyAtOrBefore(kvset, state);
356     getRowKeyAtOrBefore(snapshot, state);
357   }
358 
359   /*
360    * @param set
361    * @param state Accumulates deletes and candidates.
362    */
363   private void getRowKeyAtOrBefore(final NavigableSet<KeyValue> set,
364       final GetClosestRowBeforeTracker state) {
365     if (set.isEmpty()) {
366       return;
367     }
368     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
369       // Found nothing in row.  Try backing up.
370       getRowKeyBefore(set, state);
371     }
372   }
373 
374   /*
375    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
376    * we have been passed the first possible key on a row.  As we walk forward
377    * we accumulate deletes until we hit a candidate on the row at which point
378    * we return.
379    * @param set
380    * @param firstOnRow First possible key on this row.
381    * @param state
382    * @return True if we found a candidate walking this row.
383    */
384   private boolean walkForwardInSingleRow(final SortedSet<KeyValue> set,
385       final KeyValue firstOnRow, final GetClosestRowBeforeTracker state) {
386     boolean foundCandidate = false;
387     SortedSet<KeyValue> tail = set.tailSet(firstOnRow);
388     if (tail.isEmpty()) return foundCandidate;
389     for (Iterator<KeyValue> i = tail.iterator(); i.hasNext();) {
390       KeyValue kv = i.next();
391       // Did we go beyond the target row? If so break.
392       if (state.isTooFar(kv, firstOnRow)) break;
393       if (state.isExpired(kv)) {
394         i.remove();
395         continue;
396       }
397       // If we added something, this row is a contender. break.
398       if (state.handle(kv)) {
399         foundCandidate = true;
400         break;
401       }
402     }
403     return foundCandidate;
404   }
405 
406   /*
407    * Walk backwards through the passed set a row at a time until we run out of
408    * set or until we get a candidate.
409    * @param set
410    * @param state
411    */
412   private void getRowKeyBefore(NavigableSet<KeyValue> set,
413       final GetClosestRowBeforeTracker state) {
414     KeyValue firstOnRow = state.getTargetKey();
415     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
416         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
417       // Make sure we don't fall out of our table.
418       if (!state.isTargetTable(p.kv)) break;
419       // Stop looking if we've exited the better candidate range.
420       if (!state.isBetterCandidate(p.kv)) break;
421       // Make into firstOnRow
422       firstOnRow = new KeyValue(p.kv.getRow(), HConstants.LATEST_TIMESTAMP);
423       // If we find something, break;
424       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
425     }
426   }
427 
428   /**
429    * Given the specs of a column, update it, first by inserting a new record,
430    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
431    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
432    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
433    * get the new value, or the old value and all readers will eventually only see the new
434    * value after the old was removed.
435    *
436    * @param row
437    * @param family
438    * @param qualifier
439    * @param newValue
440    * @param now
441    * @return  Timestamp
442    */
443   public long updateColumnValue(byte[] row,
444                                 byte[] family,
445                                 byte[] qualifier,
446                                 long newValue,
447                                 long now) {
448     KeyValue firstKv = KeyValue.createFirstOnRow(
449         row, family, qualifier);
450     // Is there a KeyValue in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
451     SortedSet<KeyValue> snSs = snapshot.tailSet(firstKv);
452     if (!snSs.isEmpty()) {
453       KeyValue snKv = snSs.first();
454       // is there a matching KV in the snapshot?
455       if (snKv.matchingRow(firstKv) && snKv.matchingQualifier(firstKv)) {
456         if (snKv.getTimestamp() == now) {
457           // poop,
458           now += 1;
459         }
460       }
461     }
462 
463     // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
464     // But the timestamp should also be max(now, mostRecentTsInMemstore)
465 
466     // so we cant add the new KV w/o knowing what's there already, but we also
467     // want to take this chance to delete some kvs. So two loops (sad)
468 
469     SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
470     Iterator<KeyValue> it = ss.iterator();
471     while ( it.hasNext() ) {
472       KeyValue kv = it.next();
473 
474       // if this isnt the row we are interested in, then bail:
475       if (!kv.matchingColumn(family,qualifier) || !kv.matchingRow(firstKv) ) {
476         break; // rows dont match, bail.
477       }
478 
479       // if the qualifier matches and it's a put, just RM it out of the kvset.
480       if (kv.getType() == KeyValue.Type.Put.getCode() &&
481           kv.getTimestamp() > now && firstKv.matchingQualifier(kv)) {
482         now = kv.getTimestamp();
483       }
484     }
485 
486     // create or update (upsert) a new KeyValue with
487     // 'now' and a 0 memstoreTS == immediately visible
488     return upsert(Arrays.asList(
489         new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)))
490     );
491   }
492 
493   /**
494    * Update or insert the specified KeyValues.
495    * <p>
496    * For each KeyValue, insert into MemStore.  This will atomically upsert the
497    * value for that row/family/qualifier.  If a KeyValue did already exist,
498    * it will then be removed.
499    * <p>
500    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
501    * be immediately visible.  May want to change this so it is atomic across
502    * all KeyValues.
503    * <p>
504    * This is called under row lock, so Get operations will still see updates
505    * atomically.  Scans will only see each KeyValue update as atomic.
506    *
507    * @param kvs
508    * @return change in memstore size
509    */
510   public long upsert(List<KeyValue> kvs) {
511     long size = 0;
512     for (KeyValue kv : kvs) {
513       kv.setMemstoreTS(0);
514       size += upsert(kv);
515     }
516     return size;
517   }
518 
519   /**
520    * Inserts the specified KeyValue into MemStore and deletes any existing
521    * versions of the same row/family/qualifier as the specified KeyValue.
522    * <p>
523    * First, the specified KeyValue is inserted into the Memstore.
524    * <p>
525    * If there are any existing KeyValues in this MemStore with the same row,
526    * family, and qualifier, they are removed.
527    * <p>
528    * Callers must hold the read lock.
529    *
530    * @param kv
531    * @return change in size of MemStore
532    */
533   private long upsert(KeyValue kv) {
534     // Add the KeyValue to the MemStore
535     // Use the internalAdd method here since we (a) already have a lock
536     // and (b) cannot safely use the MSLAB here without potentially
537     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
538     // test that triggers the pathological case if we don't avoid MSLAB
539     // here.
540     long addedSize = internalAdd(kv);
541 
542     // Get the KeyValues for the row/family/qualifier regardless of timestamp.
543     // For this case we want to clean up any other puts
544     KeyValue firstKv = KeyValue.createFirstOnRow(
545         kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
546         kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
547         kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
548     SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
549     Iterator<KeyValue> it = ss.iterator();
550     while ( it.hasNext() ) {
551       KeyValue cur = it.next();
552 
553       if (kv == cur) {
554         // ignore the one just put in
555         continue;
556       }
557       // if this isn't the row we are interested in, then bail
558       if (!kv.matchingRow(cur)) {
559         break;
560       }
561 
562       // if the qualifier matches and it's a put, remove it
563       if (kv.matchingQualifier(cur)) {
564 
565         // to be extra safe we only remove Puts that have a memstoreTS==0
566         if (kv.getType() == KeyValue.Type.Put.getCode() &&
567             kv.getMemstoreTS() == 0) {
568           // false means there was a change, so give us the size.
569           long delta = heapSizeChange(cur, true);
570           addedSize -= delta;
571           this.size.addAndGet(-delta);
572           it.remove();
573           setOldestEditTimeToNow();
574         }
575       } else {
576         // past the column, done
577         break;
578       }
579     }
580     return addedSize;
581   }
582 
583   /*
584    * Immutable data structure to hold member found in set and the set it was
585    * found in.  Include set because it is carrying context.
586    */
587   private static class Member {
588     final KeyValue kv;
589     final NavigableSet<KeyValue> set;
590     Member(final NavigableSet<KeyValue> s, final KeyValue kv) {
591       this.kv = kv;
592       this.set = s;
593     }
594   }
595 
596   /*
597    * @param set Set to walk back in.  Pass a first in row or we'll return
598    * same row (loop).
599    * @param state Utility and context.
600    * @param firstOnRow First item on the row after the one we want to find a
601    * member in.
602    * @return Null or member of row previous to <code>firstOnRow</code>
603    */
604   private Member memberOfPreviousRow(NavigableSet<KeyValue> set,
605       final GetClosestRowBeforeTracker state, final KeyValue firstOnRow) {
606     NavigableSet<KeyValue> head = set.headSet(firstOnRow, false);
607     if (head.isEmpty()) return null;
608     for (Iterator<KeyValue> i = head.descendingIterator(); i.hasNext();) {
609       KeyValue found = i.next();
610       if (state.isExpired(found)) {
611         i.remove();
612         continue;
613       }
614       return new Member(head, found);
615     }
616     return null;
617   }
618 
619   /**
620    * @return scanner on memstore and snapshot in this order.
621    */
622   List<KeyValueScanner> getScanners() {
623     return Collections.<KeyValueScanner>singletonList(
624         new MemStoreScanner(MultiVersionConsistencyControl.getThreadReadPoint()));
625   }
626 
627   /**
628    * Check if this memstore may contain the required keys
629    * @param scan
630    * @return False if the key definitely does not exist in this Memstore
631    */
632   public boolean shouldSeek(Scan scan, long oldestUnexpiredTS) {
633     return (timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
634         snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange()))
635         && (Math.max(timeRangeTracker.getMaximumTimestamp(),
636                      snapshotTimeRangeTracker.getMaximumTimestamp()) >=
637             oldestUnexpiredTS);
638   }
639 
640   public TimeRangeTracker getSnapshotTimeRangeTracker() {
641     return this.snapshotTimeRangeTracker;
642   }
643 
644   /*
645    * MemStoreScanner implements the KeyValueScanner.
646    * It lets the caller scan the contents of a memstore -- both current
647    * map and snapshot.
648    * This behaves as if it were a real scanner but does not maintain position.
649    */
650   protected class MemStoreScanner extends NonLazyKeyValueScanner {
651     // Next row information for either kvset or snapshot
652     private KeyValue kvsetNextRow = null;
653     private KeyValue snapshotNextRow = null;
654 
655     // last iterated KVs for kvset and snapshot (to restore iterator state after reseek)
656     private KeyValue kvsetItRow = null;
657     private KeyValue snapshotItRow = null;
658     
659     // iterator based scanning.
660     private Iterator<KeyValue> kvsetIt;
661     private Iterator<KeyValue> snapshotIt;
662 
663     // The kvset and snapshot at the time of creating this scanner
664     volatile KeyValueSkipListSet kvsetAtCreation;
665     volatile KeyValueSkipListSet snapshotAtCreation;
666 
667     // the pre-calculated KeyValue to be returned by peek() or next()
668     private KeyValue theNext;
669     private final long readPoint;
670 
671     /*
672     Some notes...
673 
674      So memstorescanner is fixed at creation time. this includes pointers/iterators into
675     existing kvset/snapshot.  during a snapshot creation, the kvset is null, and the
676     snapshot is moved.  since kvset is null there is no point on reseeking on both,
677       we can save us the trouble. During the snapshot->hfile transition, the memstore
678       scanner is re-created by StoreScanner#updateReaders().  StoreScanner should
679       potentially do something smarter by adjusting the existing memstore scanner.
680 
681       But there is a greater problem here, that being once a scanner has progressed
682       during a snapshot scenario, we currently iterate past the kvset then 'finish' up.
683       if a scan lasts a little while, there is a chance for new entries in kvset to
684       become available but we will never see them.  This needs to be handled at the
685       StoreScanner level with coordination with MemStoreScanner.
686 
687       Currently, this problem is only partly managed: during the small amount of time
688       when the StoreScanner has not yet created a new MemStoreScanner, we will miss
689       the adds to kvset in the MemStoreScanner.
690     */
691 
692     MemStoreScanner(long readPoint) {
693       super();
694 
695       this.readPoint = readPoint;
696       kvsetAtCreation = kvset;
697       snapshotAtCreation = snapshot;
698     }
699 
700     private KeyValue getNext(Iterator<KeyValue> it) {
701       KeyValue v = null;
702       try {
703         while (it.hasNext()) {
704           v = it.next();
705           if (v.getMemstoreTS() <= readPoint) {
706             return v;
707           }
708         }
709 
710         return null;
711       } finally {
712         if (v != null) {
713           // in all cases, remember the last KV iterated to
714           if (it == snapshotIt) {
715             snapshotItRow = v;
716           } else {
717             kvsetItRow = v;
718           }
719         }
720       }
721     }
722 
723     /**
724      *  Set the scanner at the seek key.
725      *  Must be called only once: there is no thread safety between the scanner
726      *   and the memStore.
727      * @param key seek value
728      * @return false if the key is null or if there is no data
729      */
730     @Override
731     public synchronized boolean seek(KeyValue key) {
732       if (key == null) {
733         close();
734         return false;
735       }
736 
737       // kvset and snapshot will never be null.
738       // if tailSet can't find anything, SortedSet is empty (not null).
739       kvsetIt = kvsetAtCreation.tailSet(key).iterator();
740       snapshotIt = snapshotAtCreation.tailSet(key).iterator();
741       kvsetItRow = null;
742       snapshotItRow = null;
743 
744       return seekInSubLists(key);
745     }
746 
747 
748     /**
749      * (Re)initialize the iterators after a seek or a reseek.
750      */
751     private synchronized boolean seekInSubLists(KeyValue key){
752       kvsetNextRow = getNext(kvsetIt);
753       snapshotNextRow = getNext(snapshotIt);
754 
755       // Calculate the next value
756       theNext = getLowest(kvsetNextRow, snapshotNextRow);
757 
758       // has data
759       return (theNext != null);
760     }
761 
762 
763     /**
764      * Move forward on the sub-lists set previously by seek.
765      * @param key seek value (should be non-null)
766      * @return true if there is at least one KV to read, false otherwise
767      */
768     @Override
769     public synchronized boolean reseek(KeyValue key) {
770       /*
771       See HBASE-4195 & HBASE-3855 & HBASE-6591 for the background on this implementation.
772       This code is executed concurrently with flush and puts, without locks.
773       Two points must be known when working on this code:
774       1) It's not possible to use the 'kvTail' and 'snapshot'
775        variables, as they are modified during a flush.
776       2) The ideal implementation for performance would use the sub skip list
777        implicitly pointed by the iterators 'kvsetIt' and
778        'snapshotIt'. Unfortunately the Java API does not offer a method to
779        get it. So we remember the last keys we iterated to and restore
780        the reseeked set to at least that point.
781        */
782 
783       kvsetIt = kvsetAtCreation.tailSet(getHighest(key, kvsetItRow)).iterator();
784       snapshotIt = snapshotAtCreation.tailSet(getHighest(key, snapshotItRow)).iterator();
785 
786       return seekInSubLists(key);
787     }
788 
789 
790     @Override
791     public synchronized KeyValue peek() {
792       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
793       return theNext;
794     }
795 
796     @Override
797     public synchronized KeyValue next() {
798       if (theNext == null) {
799           return null;
800       }
801 
802       final KeyValue ret = theNext;
803 
804       // Advance one of the iterators
805       if (theNext == kvsetNextRow) {
806         kvsetNextRow = getNext(kvsetIt);
807       } else {
808         snapshotNextRow = getNext(snapshotIt);
809       }
810 
811       // Calculate the next value
812       theNext = getLowest(kvsetNextRow, snapshotNextRow);
813 
814       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
815       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
816       //    getLowest() + " threadpoint=" + readpoint);
817       return ret;
818     }
819 
820     /*
821      * Returns the lower of the two key values, or null if they are both null.
822      * This uses comparator.compare() to compare the KeyValue using the memstore
823      * comparator.
824      */
825     private KeyValue getLowest(KeyValue first, KeyValue second) {
826       if (first == null && second == null) {
827         return null;
828       }
829       if (first != null && second != null) {
830         int compare = comparator.compare(first, second);
831         return (compare <= 0 ? first : second);
832       }
833       return (first != null ? first : second);
834     }
835 
836     /*
837      * Returns the higher of the two key values, or null if they are both null.
838      * This uses comparator.compare() to compare the KeyValue using the memstore
839      * comparator.
840      */
841     private KeyValue getHighest(KeyValue first, KeyValue second) {
842       if (first == null && second == null) {
843         return null;
844       }
845       if (first != null && second != null) {
846         int compare = comparator.compare(first, second);
847         return (compare > 0 ? first : second);
848       }
849       return (first != null ? first : second);
850     }
851 
852     public synchronized void close() {
853       this.kvsetNextRow = null;
854       this.snapshotNextRow = null;
855 
856       this.kvsetIt = null;
857       this.snapshotIt = null;
858 
859       this.kvsetItRow = null;
860       this.snapshotItRow = null;
861     }
862 
863     /**
864      * MemStoreScanner returns max value as sequence id because it will
865      * always have the latest data among all files.
866      */
867     @Override
868     public long getSequenceID() {
869       return Long.MAX_VALUE;
870     }
871 
872     @Override
873     public boolean shouldUseScanner(Scan scan, SortedSet<byte[]> columns,
874         long oldestUnexpiredTS) {
875       return shouldSeek(scan, oldestUnexpiredTS);
876     }
877   }
878 
879   public final static long FIXED_OVERHEAD = ClassSize.align(
880       ClassSize.OBJECT + (10 * ClassSize.REFERENCE) + Bytes.SIZEOF_LONG);
881 
882   public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
883       ClassSize.ATOMIC_LONG +
884       (2 * ClassSize.TIMERANGE_TRACKER) +
885       (2 * ClassSize.KEYVALUE_SKIPLIST_SET) + (2 * ClassSize.CONCURRENT_SKIPLISTMAP));
886 
887   /** Used for readability when we don't store memstore timestamp in HFile */
888   public static final boolean NO_PERSISTENT_TS = false;
889 
890   /*
891    * Calculate how the MemStore size has changed.  Includes overhead of the
892    * backing Map.
893    * @param kv
894    * @param notpresent True if the kv was NOT present in the set.
895    * @return Size
896    */
897   long heapSizeChange(final KeyValue kv, final boolean notpresent) {
898     return notpresent ?
899         ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + kv.heapSize()):
900         0;
901   }
902 
903   /**
904    * Get the entire heap usage for this MemStore not including keys in the
905    * snapshot.
906    */
907   @Override
908   public long heapSize() {
909     return size.get();
910   }
911 
912   /**
913    * Get the heap usage of KVs in this MemStore.
914    */
915   public long keySize() {
916     return heapSize() - DEEP_OVERHEAD;
917   }
918 
919   /**
920    * Code to help figure if our approximation of object heap sizes is close
921    * enough.  See hbase-900.  Fills memstores then waits so user can heap
922    * dump and bring up resultant hprof in something like jprofiler which
923    * allows you get 'deep size' on objects.
924    * @param args main args
925    */
926   public static void main(String [] args) {
927     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
928     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
929       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
930     LOG.info("vmInputArguments=" + runtime.getInputArguments());
931     MemStore memstore1 = new MemStore();
932     // TODO: x32 vs x64
933     long size = 0;
934     final int count = 10000;
935     byte [] fam = Bytes.toBytes("col");
936     byte [] qf = Bytes.toBytes("umn");
937     byte [] empty = new byte[0];
938     for (int i = 0; i < count; i++) {
939       // Give each its own ts
940       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
941     }
942     LOG.info("memstore1 estimated size=" + size);
943     for (int i = 0; i < count; i++) {
944       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
945     }
946     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
947     // Make a variably sized memstore.
948     MemStore memstore2 = new MemStore();
949     for (int i = 0; i < count; i++) {
950       size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
951         new byte[i]));
952     }
953     LOG.info("memstore2 estimated size=" + size);
954     final int seconds = 30;
955     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
956     for (int i = 0; i < seconds; i++) {
957       // Thread.sleep(1000);
958     }
959     LOG.info("Exiting.");
960   }
961 }