View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package org.apache.hadoop.hbase.regionserver;
22  
23  import java.lang.management.ManagementFactory;
24  import java.lang.management.RuntimeMXBean;
25  import java.rmi.UnexpectedException;
26  import java.util.Arrays;
27  import java.util.Collections;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.NavigableSet;
31  import java.util.SortedSet;
32  import java.util.concurrent.atomic.AtomicLong;
33  import java.util.concurrent.locks.ReentrantReadWriteLock;
34  
35  import org.apache.commons.logging.Log;
36  import org.apache.commons.logging.LogFactory;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.hbase.HBaseConfiguration;
39  import org.apache.hadoop.hbase.HConstants;
40  import org.apache.hadoop.hbase.KeyValue;
41  import org.apache.hadoop.hbase.client.Scan;
42  import org.apache.hadoop.hbase.io.HeapSize;
43  import org.apache.hadoop.hbase.regionserver.MemStoreLAB.Allocation;
44  import org.apache.hadoop.hbase.util.Bytes;
45  import org.apache.hadoop.hbase.util.ClassSize;
46  
47  /**
48   * The MemStore holds in-memory modifications to the Store.  Modifications
49   * are {@link KeyValue}s.  When asked to flush, current memstore is moved
50   * to snapshot and is cleared.  We continue to serve edits out of new memstore
51   * and backing snapshot until flusher reports in that the flush succeeded. At
52   * this point we let the snapshot go.
53   * TODO: Adjust size of the memstore when we remove items because they have
54   * been deleted.
55   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
56   * in KV size.
57   */
58  public class MemStore implements HeapSize {
59    private static final Log LOG = LogFactory.getLog(MemStore.class);
60    // Configuration key; the constructor reads it to decide whether a MemStoreLAB is created.
61    static final String USEMSLAB_KEY =
62      "hbase.hregion.memstore.mslab.enabled";
63    private static final boolean USEMSLAB_DEFAULT = true;
64    // Retained so that snapshot() can build a fresh MemStoreLAB for the new kvset.
65    private Configuration conf;
66  
67    // MemStore.  Use a KeyValueSkipListSet rather than SkipListSet because of the
68    // better semantics.  The Map will overwrite if passed a key it already had
69    // whereas the Set will not add new KV if key is same though value might be
70    // different.  Value is not important -- just make sure always same
71    // reference passed.
72    volatile KeyValueSkipListSet kvset;
73  
74    // Snapshot of memstore.  Made for flusher.
75    volatile KeyValueSkipListSet snapshot;
76    // snapshot() and clearSnapshot() take the write lock; edits and lookups take the read lock.
77    final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
78  
79    final KeyValue.KVComparator comparator;
80  
81    // Used comparing versions -- same r/c and ts but different type.
82    final KeyValue.KVComparator comparatorIgnoreType;
83  
84    // Used comparing versions -- same r/c and type but different timestamp.
85    final KeyValue.KVComparator comparatorIgnoreTimestamp;
86  
87    // Used to track own heapSize; starts at DEEP_OVERHEAD and is reset to it by snapshot().
88    final AtomicLong size;
89    // Timestamp ranges of the entries in kvset and in snapshot, respectively.
90    TimeRangeTracker timeRangeTracker;
91    TimeRangeTracker snapshotTimeRangeTracker;
92    // Allocator that incoming KeyValues are copied into; null when USEMSLAB_KEY is false.
93    MemStoreLAB allocator;
94  
95  
96  
97    /**
98     * Default constructor. Used for tests.
99     */
100   public MemStore() {
101     this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
102   }
103 
104   /**
105    * Constructor.
106    * @param c Comparator
107    */
108   public MemStore(final Configuration conf,
109                   final KeyValue.KVComparator c) {
110     this.conf = conf;
111     this.comparator = c;
112     this.comparatorIgnoreTimestamp =
113       this.comparator.getComparatorIgnoringTimestamps();
114     this.comparatorIgnoreType = this.comparator.getComparatorIgnoringType();
115     this.kvset = new KeyValueSkipListSet(c);
116     this.snapshot = new KeyValueSkipListSet(c);
117     timeRangeTracker = new TimeRangeTracker();
118     snapshotTimeRangeTracker = new TimeRangeTracker();
119     this.size = new AtomicLong(DEEP_OVERHEAD);
120     if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
121       this.allocator = new MemStoreLAB(conf);
122     } else {
123       this.allocator = null;
124     }
125   }
126 
127   void dump() {
128     for (KeyValue kv: this.kvset) {
129       LOG.info(kv);
130     }
131     for (KeyValue kv: this.snapshot) {
132       LOG.info(kv);
133     }
134   }
135 
136   /**
137    * Creates a snapshot of the current memstore.
138    * Snapshot must be cleared by call to {@link #clearSnapshot(SortedSet<KeyValue>)}
139    * To get the snapshot made by this method, use {@link #getSnapshot()}
140    */
141   void snapshot() {
142     this.lock.writeLock().lock();
143     try {
144       // If snapshot currently has entries, then flusher failed or didn't call
145       // cleanup.  Log a warning.
146       if (!this.snapshot.isEmpty()) {
147         LOG.warn("Snapshot called again without clearing previous. " +
148           "Doing nothing. Another ongoing flush or did we fail last attempt?");
149       } else {
150         if (!this.kvset.isEmpty()) {
151           this.snapshot = this.kvset;
152           this.kvset = new KeyValueSkipListSet(this.comparator);
153           this.snapshotTimeRangeTracker = this.timeRangeTracker;
154           this.timeRangeTracker = new TimeRangeTracker();
155           // Reset heap to not include any keys
156           this.size.set(DEEP_OVERHEAD);
157           // Reset allocator so we get a fresh buffer for the new memstore
158           if (allocator != null) {
159             this.allocator = new MemStoreLAB(conf);
160           }
161         }
162       }
163     } finally {
164       this.lock.writeLock().unlock();
165     }
166   }
167 
168   /**
169    * Return the current snapshot.
170    * Called by flusher to get current snapshot made by a previous
171    * call to {@link #snapshot()}
172    * @return Return snapshot.
173    * @see {@link #snapshot()}
174    * @see {@link #clearSnapshot(SortedSet<KeyValue>)}
175    */
176   KeyValueSkipListSet getSnapshot() {
177     return this.snapshot;
178   }
179 
180   /**
181    * The passed snapshot was successfully persisted; it can be let go.
182    * @param ss The snapshot to clean out.
183    * @throws UnexpectedException
184    * @see {@link #snapshot()}
185    */
186   void clearSnapshot(final SortedSet<KeyValue> ss)
187   throws UnexpectedException {
188     this.lock.writeLock().lock();
189     try {
190       if (this.snapshot != ss) {
191         throw new UnexpectedException("Current snapshot is " +
192           this.snapshot + ", was passed " + ss);
193       }
194       // OK. Passed in snapshot is same as current snapshot.  If not-empty,
195       // create a new snapshot and let the old one go.
196       if (!ss.isEmpty()) {
197         this.snapshot = new KeyValueSkipListSet(this.comparator);
198         this.snapshotTimeRangeTracker = new TimeRangeTracker();
199       }
200     } finally {
201       this.lock.writeLock().unlock();
202     }
203   }
204 
205   /**
206    * Write an update
207    * @param kv
208    * @return approximate size of the passed key and value.
209    */
210   long add(final KeyValue kv) {
211     this.lock.readLock().lock();
212     try {
213       KeyValue toAdd = maybeCloneWithAllocator(kv);
214       return internalAdd(toAdd);
215     } finally {
216       this.lock.readLock().unlock();
217     }
218   }
219 
220   /**
221    * Internal version of add() that doesn't clone KVs with the
222    * allocator, and doesn't take the lock.
223    *
224    * Callers should ensure they already have the read lock taken
225    */
226   private long internalAdd(final KeyValue toAdd) {
227     long s = heapSizeChange(toAdd, this.kvset.add(toAdd));
228     timeRangeTracker.includeTimestamp(toAdd);
229     this.size.addAndGet(s);
230     return s;
231   }
232 
233   private KeyValue maybeCloneWithAllocator(KeyValue kv) {
234     if (allocator == null) {
235       return kv;
236     }
237 
238     int len = kv.getLength();
239     Allocation alloc = allocator.allocateBytes(len);
240     if (alloc == null) {
241       // The allocation was too large, allocator decided
242       // not to do anything with it.
243       return kv;
244     }
245     assert alloc != null && alloc.getData() != null;
246     System.arraycopy(kv.getBuffer(), kv.getOffset(), alloc.getData(), alloc.getOffset(), len);
247     KeyValue newKv = new KeyValue(alloc.getData(), alloc.getOffset(), len);
248     newKv.setMemstoreTS(kv.getMemstoreTS());
249     return newKv;
250   }
251 
252   /**
253    * Remove n key from the memstore. Only kvs that have the same key and the
254    * same memstoreTS are removed.  It is ok to not update timeRangeTracker
255    * in this call. It is possible that we can optimize this method by using
256    * tailMap/iterator, but since this method is called rarely (only for
257    * error recovery), we can leave those optimization for the future.
258    * @param kv
259    */
260   void rollback(final KeyValue kv) {
261     this.lock.readLock().lock();
262     try {
263       // If the key is in the snapshot, delete it. We should not update
264       // this.size, because that tracks the size of only the memstore and
265       // not the snapshot. The flush of this snapshot to disk has not
266       // yet started because Store.flush() waits for all rwcc transactions to
267       // commit before starting the flush to disk.
268       KeyValue found = this.snapshot.get(kv);
269       if (found != null && found.getMemstoreTS() == kv.getMemstoreTS()) {
270         this.snapshot.remove(kv);
271       }
272       // If the key is in the memstore, delete it. Update this.size.
273       found = this.kvset.get(kv);
274       if (found != null && found.getMemstoreTS() == kv.getMemstoreTS()) {
275         this.kvset.remove(kv);
276         long s = heapSizeChange(kv, true);
277         this.size.addAndGet(-s);
278       }
279     } finally {
280       this.lock.readLock().unlock();
281     }
282   }
283 
284   /**
285    * Write a delete
286    * @param delete
287    * @return approximate size of the passed key and value.
288    */
289   long delete(final KeyValue delete) {
290     long s = 0;
291     this.lock.readLock().lock();
292     try {
293       KeyValue toAdd = maybeCloneWithAllocator(delete);
294       s += heapSizeChange(toAdd, this.kvset.add(toAdd));
295       timeRangeTracker.includeTimestamp(toAdd);
296     } finally {
297       this.lock.readLock().unlock();
298     }
299     this.size.addAndGet(s);
300     return s;
301   }
302 
303   /**
304    * @param kv Find the row that comes after this one.  If null, we return the
305    * first.
306    * @return Next row or null if none found.
307    */
308   KeyValue getNextRow(final KeyValue kv) {
309     this.lock.readLock().lock();
310     try {
311       return getLowest(getNextRow(kv, this.kvset), getNextRow(kv, this.snapshot));
312     } finally {
313       this.lock.readLock().unlock();
314     }
315   }
316 
317   /*
318    * @param a
319    * @param b
320    * @return Return lowest of a or b or null if both a and b are null
321    */
322   private KeyValue getLowest(final KeyValue a, final KeyValue b) {
323     if (a == null) {
324       return b;
325     }
326     if (b == null) {
327       return a;
328     }
329     return comparator.compareRows(a, b) <= 0? a: b;
330   }
331 
332   /*
333    * @param key Find row that follows this one.  If null, return first.
334    * @param map Set to look in for a row beyond <code>row</code>.
335    * @return Next row or null if none found.  If one found, will be a new
336    * KeyValue -- can be destroyed by subsequent calls to this method.
337    */
338   private KeyValue getNextRow(final KeyValue key,
339       final NavigableSet<KeyValue> set) {
340     KeyValue result = null;
341     SortedSet<KeyValue> tail = key == null? set: set.tailSet(key);
342     // Iterate until we fall into the next row; i.e. move off current row
343     for (KeyValue kv: tail) {
344       if (comparator.compareRows(kv, key) <= 0)
345         continue;
346       // Note: Not suppressing deletes or expired cells.  Needs to be handled
347       // by higher up functions.
348       result = kv;
349       break;
350     }
351     return result;
352   }
353 
354   /**
355    * @param state column/delete tracking state
356    */
357   void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
358     this.lock.readLock().lock();
359     try {
360       getRowKeyAtOrBefore(kvset, state);
361       getRowKeyAtOrBefore(snapshot, state);
362     } finally {
363       this.lock.readLock().unlock();
364     }
365   }
366 
367   /*
368    * @param set
369    * @param state Accumulates deletes and candidates.
370    */
371   private void getRowKeyAtOrBefore(final NavigableSet<KeyValue> set,
372       final GetClosestRowBeforeTracker state) {
373     if (set.isEmpty()) {
374       return;
375     }
376     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
377       // Found nothing in row.  Try backing up.
378       getRowKeyBefore(set, state);
379     }
380   }
381 
382   /*
383    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
384    * we have been passed the first possible key on a row.  As we walk forward
385    * we accumulate deletes until we hit a candidate on the row at which point
386    * we return.
387    * @param set
388    * @param firstOnRow First possible key on this row.
389    * @param state
390    * @return True if we found a candidate walking this row.
391    */
392   private boolean walkForwardInSingleRow(final SortedSet<KeyValue> set,
393       final KeyValue firstOnRow, final GetClosestRowBeforeTracker state) {
394     boolean foundCandidate = false;
395     SortedSet<KeyValue> tail = set.tailSet(firstOnRow);
396     if (tail.isEmpty()) return foundCandidate;
397     for (Iterator<KeyValue> i = tail.iterator(); i.hasNext();) {
398       KeyValue kv = i.next();
399       // Did we go beyond the target row? If so break.
400       if (state.isTooFar(kv, firstOnRow)) break;
401       if (state.isExpired(kv)) {
402         i.remove();
403         continue;
404       }
405       // If we added something, this row is a contender. break.
406       if (state.handle(kv)) {
407         foundCandidate = true;
408         break;
409       }
410     }
411     return foundCandidate;
412   }
413 
414   /*
415    * Walk backwards through the passed set a row at a time until we run out of
416    * set or until we get a candidate.
417    * @param set
418    * @param state
419    */
420   private void getRowKeyBefore(NavigableSet<KeyValue> set,
421       final GetClosestRowBeforeTracker state) {
422     KeyValue firstOnRow = state.getTargetKey();
423     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
424         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
425       // Make sure we don't fall out of our table.
426       if (!state.isTargetTable(p.kv)) break;
427       // Stop looking if we've exited the better candidate range.
428       if (!state.isBetterCandidate(p.kv)) break;
429       // Make into firstOnRow
430       firstOnRow = new KeyValue(p.kv.getRow(), HConstants.LATEST_TIMESTAMP);
431       // If we find something, break;
432       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
433     }
434   }
435 
436   /**
437    * Given the specs of a column, update it, first by inserting a new record,
438    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
439    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
440    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
441    * get the new value, or the old value and all readers will eventually only see the new
442    * value after the old was removed.
443    *
444    * @param row
445    * @param family
446    * @param qualifier
447    * @param newValue
448    * @param now
449    * @return  Timestamp
450    */
451   public long updateColumnValue(byte[] row,
452                                 byte[] family,
453                                 byte[] qualifier,
454                                 long newValue,
455                                 long now) {
456    this.lock.readLock().lock();
457     try {
458       KeyValue firstKv = KeyValue.createFirstOnRow(
459           row, family, qualifier);
460       // Is there a KeyValue in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
461       SortedSet<KeyValue> snSs = snapshot.tailSet(firstKv);
462       if (!snSs.isEmpty()) {
463         KeyValue snKv = snSs.first();
464         // is there a matching KV in the snapshot?
465         if (snKv.matchingRow(firstKv) && snKv.matchingQualifier(firstKv)) {
466           if (snKv.getTimestamp() == now) {
467             // poop,
468             now += 1;
469           }
470         }
471       }
472 
473       // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
474       // But the timestamp should also be max(now, mostRecentTsInMemstore)
475 
476       // so we cant add the new KV w/o knowing what's there already, but we also
477       // want to take this chance to delete some kvs. So two loops (sad)
478 
479       SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
480       Iterator<KeyValue> it = ss.iterator();
481       while ( it.hasNext() ) {
482         KeyValue kv = it.next();
483 
484         // if this isnt the row we are interested in, then bail:
485         if (!kv.matchingColumn(family,qualifier) || !kv.matchingRow(firstKv) ) {
486           break; // rows dont match, bail.
487         }
488 
489         // if the qualifier matches and it's a put, just RM it out of the kvset.
490         if (kv.getType() == KeyValue.Type.Put.getCode() &&
491             kv.getTimestamp() > now && firstKv.matchingQualifier(kv)) {
492           now = kv.getTimestamp();
493         }
494       }
495 
496       // create or update (upsert) a new KeyValue with
497       // 'now' and a 0 memstoreTS == immediately visible
498       return upsert(Arrays.asList(
499           new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)))
500       );
501     } finally {
502       this.lock.readLock().unlock();
503     }
504   }
505 
506   /**
507    * Update or insert the specified KeyValues.
508    * <p>
509    * For each KeyValue, insert into MemStore.  This will atomically upsert the
510    * value for that row/family/qualifier.  If a KeyValue did already exist,
511    * it will then be removed.
512    * <p>
513    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
514    * be immediately visible.  May want to change this so it is atomic across
515    * all KeyValues.
516    * <p>
517    * This is called under row lock, so Get operations will still see updates
518    * atomically.  Scans will only see each KeyValue update as atomic.
519    *
520    * @param kvs
521    * @return change in memstore size
522    */
523   public long upsert(List<KeyValue> kvs) {
524    this.lock.readLock().lock();
525     try {
526       long size = 0;
527       for (KeyValue kv : kvs) {
528         kv.setMemstoreTS(0);
529         size += upsert(kv);
530       }
531       return size;
532     } finally {
533       this.lock.readLock().unlock();
534     }
535   }
536 
537   /**
538    * Inserts the specified KeyValue into MemStore and deletes any existing
539    * versions of the same row/family/qualifier as the specified KeyValue.
540    * <p>
541    * First, the specified KeyValue is inserted into the Memstore.
542    * <p>
543    * If there are any existing KeyValues in this MemStore with the same row,
544    * family, and qualifier, they are removed.
545    * <p>
546    * Callers must hold the read lock.
547    *
548    * @param kv
549    * @return change in size of MemStore
550    */
551   private long upsert(KeyValue kv) {
552     // Add the KeyValue to the MemStore
553     // Use the internalAdd method here since we (a) already have a lock
554     // and (b) cannot safely use the MSLAB here without potentially
555     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
556     // test that triggers the pathological case if we don't avoid MSLAB
557     // here.
558     long addedSize = internalAdd(kv);
559 
560     // Get the KeyValues for the row/family/qualifier regardless of timestamp.
561     // For this case we want to clean up any other puts
562     KeyValue firstKv = KeyValue.createFirstOnRow(
563         kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
564         kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
565         kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
566     SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
567     Iterator<KeyValue> it = ss.iterator();
568     while ( it.hasNext() ) {
569       KeyValue cur = it.next();
570 
571       if (kv == cur) {
572         // ignore the one just put in
573         continue;
574       }
575       // if this isn't the row we are interested in, then bail
576       if (!kv.matchingRow(cur)) {
577         break;
578       }
579 
580       // if the qualifier matches and it's a put, remove it
581       if (kv.matchingQualifier(cur)) {
582 
583         // to be extra safe we only remove Puts that have a memstoreTS==0
584         if (kv.getType() == KeyValue.Type.Put.getCode() &&
585             kv.getMemstoreTS() == 0) {
586           // false means there was a change, so give us the size.
587           long delta = heapSizeChange(cur, true);
588           addedSize -= delta;
589           this.size.addAndGet(-delta);
590           it.remove();
591         }
592       } else {
593         // past the column, done
594         break;
595       }
596     }
597     return addedSize;
598   }
599 
600   /*
601    * Immutable data structure to hold member found in set and the set it was
602    * found in.  Include set because it is carrying context.
603    */
604   private static class Member {
605     final KeyValue kv;
606     final NavigableSet<KeyValue> set;
607     Member(final NavigableSet<KeyValue> s, final KeyValue kv) {
608       this.kv = kv;
609       this.set = s;
610     }
611   }
612 
613   /*
614    * @param set Set to walk back in.  Pass a first in row or we'll return
615    * same row (loop).
616    * @param state Utility and context.
617    * @param firstOnRow First item on the row after the one we want to find a
618    * member in.
619    * @return Null or member of row previous to <code>firstOnRow</code>
620    */
621   private Member memberOfPreviousRow(NavigableSet<KeyValue> set,
622       final GetClosestRowBeforeTracker state, final KeyValue firstOnRow) {
623     NavigableSet<KeyValue> head = set.headSet(firstOnRow, false);
624     if (head.isEmpty()) return null;
625     for (Iterator<KeyValue> i = head.descendingIterator(); i.hasNext();) {
626       KeyValue found = i.next();
627       if (state.isExpired(found)) {
628         i.remove();
629         continue;
630       }
631       return new Member(head, found);
632     }
633     return null;
634   }
635 
636   /**
637    * @return scanner on memstore and snapshot in this order.
638    */
639   List<KeyValueScanner> getScanners() {
640     this.lock.readLock().lock();
641     try {
642       return Collections.<KeyValueScanner>singletonList(
643           new MemStoreScanner());
644     } finally {
645       this.lock.readLock().unlock();
646     }
647   }
648 
649   /**
650    * Check if this memstore may contain the required keys
651    * @param scan
652    * @return False if the key definitely does not exist in this Memstore
653    */
654   public boolean shouldSeek(Scan scan, long oldestUnexpiredTS) {
655     return (timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
656         snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange()))
657         && (Math.max(timeRangeTracker.getMaximumTimestamp(),
658                      snapshotTimeRangeTracker.getMaximumTimestamp()) >=
659             oldestUnexpiredTS);
660   }
661 
662   public TimeRangeTracker getSnapshotTimeRangeTracker() {
663     return this.snapshotTimeRangeTracker;
664   }
665 
666   /*
667    * MemStoreScanner implements the KeyValueScanner.
668    * It lets the caller scan the contents of a memstore -- both current
669    * map and snapshot.
670    * This behaves as if it were a real scanner but does not maintain position.
671    */
672   protected class MemStoreScanner extends NonLazyKeyValueScanner {
673     // Next row information for either kvset or snapshot
674     private KeyValue kvsetNextRow = null;
675     private KeyValue snapshotNextRow = null;
676 
677     // last iterated KVs for kvset and snapshot (to restore iterator state after reseek)
678     private KeyValue kvsetItRow = null;
679     private KeyValue snapshotItRow = null;
680     
681     // iterator based scanning.
682     private Iterator<KeyValue> kvsetIt;
683     private Iterator<KeyValue> snapshotIt;
684 
685     // The kvset and snapshot at the time of creating this scanner
686     volatile KeyValueSkipListSet kvsetAtCreation;
687     volatile KeyValueSkipListSet snapshotAtCreation;
688 
689     // the pre-calculated KeyValue to be returned by peek() or next()
690     private KeyValue theNext;
691 
692     /*
693     Some notes...
694 
695      So memstorescanner is fixed at creation time. this includes pointers/iterators into
696     existing kvset/snapshot.  during a snapshot creation, the kvset is null, and the
697     snapshot is moved.  since kvset is null there is no point on reseeking on both,
698       we can save us the trouble. During the snapshot->hfile transition, the memstore
699       scanner is re-created by StoreScanner#updateReaders().  StoreScanner should
700       potentially do something smarter by adjusting the existing memstore scanner.
701 
702       But there is a greater problem here, that being once a scanner has progressed
703       during a snapshot scenario, we currently iterate past the kvset then 'finish' up.
704       if a scan lasts a little while, there is a chance for new entries in kvset to
705       become available but we will never see them.  This needs to be handled at the
706       StoreScanner level with coordination with MemStoreScanner.
707 
708       Currently, this problem is only partly managed: during the small amount of time
709       when the StoreScanner has not yet created a new MemStoreScanner, we will miss
710       the adds to kvset in the MemStoreScanner.
711     */
712 
713     MemStoreScanner() {
714       super();
715 
716       kvsetAtCreation = kvset;
717       snapshotAtCreation = snapshot;
718     }
719 
720     private KeyValue getNext(Iterator<KeyValue> it) {
721       long readPoint = MultiVersionConsistencyControl.getThreadReadPoint();
722 
723       KeyValue v = null;
724       try {
725         while (it.hasNext()) {
726           v = it.next();
727           if (v.getMemstoreTS() <= readPoint) {
728             return v;
729           }
730         }
731 
732         return null;
733       } finally {
734         if (v != null) {
735           // in all cases, remember the last KV iterated to
736           if (it == snapshotIt) {
737             snapshotItRow = v;
738           } else {
739             kvsetItRow = v;
740           }
741         }
742       }
743     }
744 
745     /**
746      *  Set the scanner at the seek key.
747      *  Must be called only once: there is no thread safety between the scanner
748      *   and the memStore.
749      * @param key seek value
750      * @return false if the key is null or if there is no data
751      */
752     @Override
753     public synchronized boolean seek(KeyValue key) {
754       if (key == null) {
755         close();
756         return false;
757       }
758 
759       // kvset and snapshot will never be null.
760       // if tailSet can't find anything, SortedSet is empty (not null).
761       kvsetIt = kvsetAtCreation.tailSet(key).iterator();
762       snapshotIt = snapshotAtCreation.tailSet(key).iterator();
763       kvsetItRow = null;
764       snapshotItRow = null;
765 
766       return seekInSubLists(key);
767     }
768 
769 
770     /**
771      * (Re)initialize the iterators after a seek or a reseek.
772      */
773     private synchronized boolean seekInSubLists(KeyValue key){
774       kvsetNextRow = getNext(kvsetIt);
775       snapshotNextRow = getNext(snapshotIt);
776 
777       // Calculate the next value
778       theNext = getLowest(kvsetNextRow, snapshotNextRow);
779 
780       // has data
781       return (theNext != null);
782     }
783 
784 
785     /**
786      * Move forward on the sub-lists set previously by seek.
787      * @param key seek value (should be non-null)
788      * @return true if there is at least one KV to read, false otherwise
789      */
790     @Override
791     public synchronized boolean reseek(KeyValue key) {
792       /*
793       See HBASE-4195 & HBASE-3855 & HBASE-6591 for the background on this implementation.
794       This code is executed concurrently with flush and puts, without locks.
795       Two points must be known when working on this code:
796       1) It's not possible to use the 'kvTail' and 'snapshot'
797        variables, as they are modified during a flush.
798       2) The ideal implementation for performance would use the sub skip list
799        implicitly pointed by the iterators 'kvsetIt' and
800        'snapshotIt'. Unfortunately the Java API does not offer a method to
801        get it. So we remember the last keys we iterated to and restore
802        the reseeked set to at least that point.
803        */
804 
805       kvsetIt = kvsetAtCreation.tailSet(getHighest(key, kvsetItRow)).iterator();
806       snapshotIt = snapshotAtCreation.tailSet(getHighest(key, snapshotItRow)).iterator();
807 
808       return seekInSubLists(key);
809     }
810 
811 
812     @Override
813     public synchronized KeyValue peek() {
814       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
815       return theNext;
816     }
817 
818     @Override
819     public synchronized KeyValue next() {
820       if (theNext == null) {
821           return null;
822       }
823 
824       final KeyValue ret = theNext;
825 
826       // Advance one of the iterators
827       if (theNext == kvsetNextRow) {
828         kvsetNextRow = getNext(kvsetIt);
829       } else {
830         snapshotNextRow = getNext(snapshotIt);
831       }
832 
833       // Calculate the next value
834       theNext = getLowest(kvsetNextRow, snapshotNextRow);
835 
836       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
837       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
838       //    getLowest() + " threadpoint=" + readpoint);
839       return ret;
840     }
841 
842     /*
843      * Returns the lower of the two key values, or null if they are both null.
844      * This uses comparator.compare() to compare the KeyValue using the memstore
845      * comparator.
846      */
847     private KeyValue getLowest(KeyValue first, KeyValue second) {
848       if (first == null && second == null) {
849         return null;
850       }
851       if (first != null && second != null) {
852         int compare = comparator.compare(first, second);
853         return (compare <= 0 ? first : second);
854       }
855       return (first != null ? first : second);
856     }
857 
858     /*
859      * Returns the higher of the two key values, or null if they are both null.
860      * This uses comparator.compare() to compare the KeyValue using the memstore
861      * comparator.
862      */
863     private KeyValue getHighest(KeyValue first, KeyValue second) {
864       if (first == null && second == null) {
865         return null;
866       }
867       if (first != null && second != null) {
868         int compare = comparator.compare(first, second);
869         return (compare > 0 ? first : second);
870       }
871       return (first != null ? first : second);
872     }
873 
874     public synchronized void close() {
875       this.kvsetNextRow = null;
876       this.snapshotNextRow = null;
877 
878       this.kvsetIt = null;
879       this.snapshotIt = null;
880 
881       this.kvsetItRow = null;
882       this.snapshotItRow = null;
883     }
884 
885     /**
886      * MemStoreScanner returns max value as sequence id because it will
887      * always have the latest data among all files.
888      */
889     @Override
890     public long getSequenceID() {
891       return Long.MAX_VALUE;
892     }
893 
894     @Override
895     public boolean shouldUseScanner(Scan scan, SortedSet<byte[]> columns,
896         long oldestUnexpiredTS) {
897       return shouldSeek(scan, oldestUnexpiredTS);
898     }
899   }
900 
901   public final static long FIXED_OVERHEAD = ClassSize.align(
902       ClassSize.OBJECT + (11 * ClassSize.REFERENCE));
903 
904   public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
905       ClassSize.REENTRANT_LOCK + ClassSize.ATOMIC_LONG +
906       ClassSize.COPYONWRITE_ARRAYSET + ClassSize.COPYONWRITE_ARRAYLIST +
907       (2 * ClassSize.CONCURRENT_SKIPLISTMAP));
908 
909   /** Used for readability when we don't store memstore timestamp in HFile */
910   public static final boolean NO_PERSISTENT_TS = false;
911 
912   /*
913    * Calculate how the MemStore size has changed.  Includes overhead of the
914    * backing Map.
915    * @param kv
916    * @param notpresent True if the kv was NOT present in the set.
917    * @return Size
918    */
919   long heapSizeChange(final KeyValue kv, final boolean notpresent) {
920     return notpresent ?
921         ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + kv.heapSize()):
922         0;
923   }
924 
925   /**
926    * Get the entire heap usage for this MemStore not including keys in the
927    * snapshot.
928    */
929   @Override
930   public long heapSize() {
931     return size.get();
932   }
933 
934   /**
935    * Get the heap usage of KVs in this MemStore.
936    */
937   public long keySize() {
938     return heapSize() - DEEP_OVERHEAD;
939   }
940 
941   /**
942    * Code to help figure if our approximation of object heap sizes is close
943    * enough.  See hbase-900.  Fills memstores then waits so user can heap
944    * dump and bring up resultant hprof in something like jprofiler which
945    * allows you get 'deep size' on objects.
946    * @param args main args
947    */
948   public static void main(String [] args) {
949     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
950     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
951       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
952     LOG.info("vmInputArguments=" + runtime.getInputArguments());
953     MemStore memstore1 = new MemStore();
954     // TODO: x32 vs x64
955     long size = 0;
956     final int count = 10000;
957     byte [] fam = Bytes.toBytes("col");
958     byte [] qf = Bytes.toBytes("umn");
959     byte [] empty = new byte[0];
960     for (int i = 0; i < count; i++) {
961       // Give each its own ts
962       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
963     }
964     LOG.info("memstore1 estimated size=" + size);
965     for (int i = 0; i < count; i++) {
966       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
967     }
968     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
969     // Make a variably sized memstore.
970     MemStore memstore2 = new MemStore();
971     for (int i = 0; i < count; i++) {
972       size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
973         new byte[i]));
974     }
975     LOG.info("memstore2 estimated size=" + size);
976     final int seconds = 30;
977     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
978     for (int i = 0; i < seconds; i++) {
979       // Thread.sleep(1000);
980     }
981     LOG.info("Exiting.");
982   }
983 }