View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.lang.management.ManagementFactory;
23  import java.lang.management.RuntimeMXBean;
24  import java.rmi.UnexpectedException;
25  import java.util.ArrayList;
26  import java.util.Collections;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.NavigableSet;
30  import java.util.SortedSet;
31  import java.util.concurrent.atomic.AtomicLong;
32  
33  import org.apache.commons.logging.Log;
34  import org.apache.commons.logging.LogFactory;
35  import org.apache.hadoop.classification.InterfaceAudience;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.hbase.Cell;
38  import org.apache.hadoop.hbase.HBaseConfiguration;
39  import org.apache.hadoop.hbase.HConstants;
40  import org.apache.hadoop.hbase.KeyValue;
41  import org.apache.hadoop.hbase.KeyValueUtil;
42  import org.apache.hadoop.hbase.client.Scan;
43  import org.apache.hadoop.hbase.io.HeapSize;
44  import org.apache.hadoop.hbase.regionserver.MemStoreLAB.Allocation;
45  import org.apache.hadoop.hbase.util.Bytes;
46  import org.apache.hadoop.hbase.util.ClassSize;
47  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
48  
49  /**
50   * The MemStore holds in-memory modifications to the Store.  Modifications
51   * are {@link KeyValue}s.  When asked to flush, current memstore is moved
52   * to snapshot and is cleared.  We continue to serve edits out of new memstore
53   * and backing snapshot until flusher reports in that the flush succeeded. At
54   * this point we let the snapshot go.
55   *  <p>
56   * The MemStore functions should not be called in parallel. Callers should hold
57   *  write and read locks. This is done in {@link HStore}.
58   *  </p>
59   *
60   * TODO: Adjust size of the memstore when we remove items because they have
61   * been deleted.
62   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
63   * in KV size.
64   */
65  @InterfaceAudience.Private
66  public class MemStore implements HeapSize {
  // Class-wide logger.
  private static final Log LOG = LogFactory.getLog(MemStore.class);

  // Configuration key the constructor reads to decide whether a MemStoreLAB
  // allocator is created for this memstore.
  static final String USEMSLAB_KEY =
    "hbase.hregion.memstore.mslab.enabled";
  // Value used for USEMSLAB_KEY when the configuration does not set it.
  private static final boolean USEMSLAB_DEFAULT = true;

  // Configuration passed to the constructor; also handed to every
  // MemStoreLAB this class creates.
  private Configuration conf;

  // MemStore.  Use a KeyValueSkipListSet rather than SkipListSet because of the
  // better semantics.  The Map will overwrite if passed a key it already had
  // whereas the Set will not add new KV if key is same though value might be
  // different.  Value is not important -- just make sure always same
  // reference passed.
  volatile KeyValueSkipListSet kvset;

  // Snapshot of memstore.  Made for flusher.
  volatile KeyValueSkipListSet snapshot;

  // Comparator given to the constructor; used to build kvset and snapshot
  // and for row comparisons in this class.
  final KeyValue.KVComparator comparator;

  // Used to track own heapSize
  final AtomicLong size;

  // Used to track when to flush.  Long.MAX_VALUE means "not set";
  // snapshot() resets it to that value.
  volatile long timeOfOldestEdit = Long.MAX_VALUE;

  // Timestamp trackers for the active set and for the snapshot; snapshot()
  // moves the active tracker into the snapshot slot.
  TimeRangeTracker timeRangeTracker;
  TimeRangeTracker snapshotTimeRangeTracker;

  // chunkPool and allocator are null when USEMSLAB_KEY is disabled.
  // snapshotAllocator holds the allocator that was active when the current
  // snapshot was taken; clearSnapshot() closes it.
  MemStoreChunkPool chunkPool;
  volatile MemStoreLAB allocator;
  volatile MemStoreLAB snapshotAllocator;
99  
100   /**
101    * Default constructor. Used for tests.
102    */
103   public MemStore() {
104     this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
105   }
106 
107   /**
108    * Constructor.
109    * @param c Comparator
110    */
111   public MemStore(final Configuration conf,
112                   final KeyValue.KVComparator c) {
113     this.conf = conf;
114     this.comparator = c;
115     this.kvset = new KeyValueSkipListSet(c);
116     this.snapshot = new KeyValueSkipListSet(c);
117     timeRangeTracker = new TimeRangeTracker();
118     snapshotTimeRangeTracker = new TimeRangeTracker();
119     this.size = new AtomicLong(DEEP_OVERHEAD);
120     if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
121       this.chunkPool = MemStoreChunkPool.getPool(conf);
122       this.allocator = new MemStoreLAB(conf, chunkPool);
123     } else {
124       this.allocator = null;
125       this.chunkPool = null;
126     }
127   }
128 
129   void dump() {
130     for (KeyValue kv: this.kvset) {
131       LOG.info(kv);
132     }
133     for (KeyValue kv: this.snapshot) {
134       LOG.info(kv);
135     }
136   }
137 
138   /**
139    * Creates a snapshot of the current memstore.
140    * Snapshot must be cleared by call to {@link #clearSnapshot(SortedSet)}
141    * To get the snapshot made by this method, use {@link #getSnapshot()}
142    */
143   void snapshot() {
144     // If snapshot currently has entries, then flusher failed or didn't call
145     // cleanup.  Log a warning.
146     if (!this.snapshot.isEmpty()) {
147       LOG.warn("Snapshot called again without clearing previous. " +
148           "Doing nothing. Another ongoing flush or did we fail last attempt?");
149     } else {
150       if (!this.kvset.isEmpty()) {
151         this.snapshot = this.kvset;
152         this.kvset = new KeyValueSkipListSet(this.comparator);
153         this.snapshotTimeRangeTracker = this.timeRangeTracker;
154         this.timeRangeTracker = new TimeRangeTracker();
155         // Reset heap to not include any keys
156         this.size.set(DEEP_OVERHEAD);
157         this.snapshotAllocator = this.allocator;
158         // Reset allocator so we get a fresh buffer for the new memstore
159         if (allocator != null) {
160           this.allocator = new MemStoreLAB(conf, chunkPool);
161         }
162         timeOfOldestEdit = Long.MAX_VALUE;
163       }
164     }
165   }
166 
167   /**
168    * Return the current snapshot.
169    * Called by flusher to get current snapshot made by a previous
170    * call to {@link #snapshot()}
171    * @return Return snapshot.
172    * @see #snapshot()
173    * @see #clearSnapshot(SortedSet)
174    */
175   KeyValueSkipListSet getSnapshot() {
176     return this.snapshot;
177   }
178 
179   /**
180    * The passed snapshot was successfully persisted; it can be let go.
181    * @param ss The snapshot to clean out.
182    * @throws UnexpectedException
183    * @see #snapshot()
184    */
185   void clearSnapshot(final SortedSet<KeyValue> ss)
186   throws UnexpectedException {
187     MemStoreLAB tmpAllocator = null;
188     if (this.snapshot != ss) {
189       throw new UnexpectedException("Current snapshot is " +
190           this.snapshot + ", was passed " + ss);
191     }
192     // OK. Passed in snapshot is same as current snapshot.  If not-empty,
193     // create a new snapshot and let the old one go.
194     if (!ss.isEmpty()) {
195       this.snapshot = new KeyValueSkipListSet(this.comparator);
196       this.snapshotTimeRangeTracker = new TimeRangeTracker();
197     }
198     if (this.snapshotAllocator != null) {
199       tmpAllocator = this.snapshotAllocator;
200       this.snapshotAllocator = null;
201     }
202     if (tmpAllocator != null) {
203       tmpAllocator.close();
204     }
205   }
206 
207   /**
208    * Write an update
209    * @param kv
210    * @return approximate size of the passed key and value.
211    */
212   long add(final KeyValue kv) {
213     KeyValue toAdd = maybeCloneWithAllocator(kv);
214     return internalAdd(toAdd);
215   }
216 
217   long timeOfOldestEdit() {
218     return timeOfOldestEdit;
219   }
220 
221   private boolean addToKVSet(KeyValue e) {
222     boolean b = this.kvset.add(e);
223     setOldestEditTimeToNow();
224     return b;
225   }
226 
227   private boolean removeFromKVSet(KeyValue e) {
228     boolean b = this.kvset.remove(e);
229     setOldestEditTimeToNow();
230     return b;
231   }
232 
233   void setOldestEditTimeToNow() {
234     if (timeOfOldestEdit == Long.MAX_VALUE) {
235       timeOfOldestEdit = EnvironmentEdgeManager.currentTimeMillis();
236     }
237   }
238 
239   /**
240    * Internal version of add() that doesn't clone KVs with the
241    * allocator, and doesn't take the lock.
242    *
243    * Callers should ensure they already have the read lock taken
244    */
245   private long internalAdd(final KeyValue toAdd) {
246     long s = heapSizeChange(toAdd, addToKVSet(toAdd));
247     timeRangeTracker.includeTimestamp(toAdd);
248     this.size.addAndGet(s);
249     return s;
250   }
251 
252   private KeyValue maybeCloneWithAllocator(KeyValue kv) {
253     if (allocator == null) {
254       return kv;
255     }
256 
257     int len = kv.getLength();
258     Allocation alloc = allocator.allocateBytes(len);
259     if (alloc == null) {
260       // The allocation was too large, allocator decided
261       // not to do anything with it.
262       return kv;
263     }
264     assert alloc.getData() != null;
265     System.arraycopy(kv.getBuffer(), kv.getOffset(), alloc.getData(), alloc.getOffset(), len);
266     KeyValue newKv = new KeyValue(alloc.getData(), alloc.getOffset(), len);
267     newKv.setMvccVersion(kv.getMvccVersion());
268     return newKv;
269   }
270 
271   /**
272    * Remove n key from the memstore. Only kvs that have the same key and the
273    * same memstoreTS are removed.  It is ok to not update timeRangeTracker
274    * in this call. It is possible that we can optimize this method by using
275    * tailMap/iterator, but since this method is called rarely (only for
276    * error recovery), we can leave those optimization for the future.
277    * @param kv
278    */
279   void rollback(final KeyValue kv) {
280     // If the key is in the snapshot, delete it. We should not update
281     // this.size, because that tracks the size of only the memstore and
282     // not the snapshot. The flush of this snapshot to disk has not
283     // yet started because Store.flush() waits for all rwcc transactions to
284     // commit before starting the flush to disk.
285     KeyValue found = this.snapshot.get(kv);
286     if (found != null && found.getMvccVersion() == kv.getMvccVersion()) {
287       this.snapshot.remove(kv);
288     }
289     // If the key is in the memstore, delete it. Update this.size.
290     found = this.kvset.get(kv);
291     if (found != null && found.getMvccVersion() == kv.getMvccVersion()) {
292       removeFromKVSet(kv);
293       long s = heapSizeChange(kv, true);
294       this.size.addAndGet(-s);
295     }
296   }
297 
298   /**
299    * Write a delete
300    * @param delete
301    * @return approximate size of the passed key and value.
302    */
303   long delete(final KeyValue delete) {
304     long s = 0;
305     KeyValue toAdd = maybeCloneWithAllocator(delete);
306     s += heapSizeChange(toAdd, addToKVSet(toAdd));
307     timeRangeTracker.includeTimestamp(toAdd);
308     this.size.addAndGet(s);
309     return s;
310   }
311 
312   /**
313    * @param kv Find the row that comes after this one.  If null, we return the
314    * first.
315    * @return Next row or null if none found.
316    */
317   KeyValue getNextRow(final KeyValue kv) {
318     return getLowest(getNextRow(kv, this.kvset), getNextRow(kv, this.snapshot));
319   }
320 
321   /*
322    * @param a
323    * @param b
324    * @return Return lowest of a or b or null if both a and b are null
325    */
326   private KeyValue getLowest(final KeyValue a, final KeyValue b) {
327     if (a == null) {
328       return b;
329     }
330     if (b == null) {
331       return a;
332     }
333     return comparator.compareRows(a, b) <= 0? a: b;
334   }
335 
336   /*
337    * @param key Find row that follows this one.  If null, return first.
338    * @param map Set to look in for a row beyond <code>row</code>.
339    * @return Next row or null if none found.  If one found, will be a new
340    * KeyValue -- can be destroyed by subsequent calls to this method.
341    */
342   private KeyValue getNextRow(final KeyValue key,
343       final NavigableSet<KeyValue> set) {
344     KeyValue result = null;
345     SortedSet<KeyValue> tail = key == null? set: set.tailSet(key);
346     // Iterate until we fall into the next row; i.e. move off current row
347     for (KeyValue kv: tail) {
348       if (comparator.compareRows(kv, key) <= 0)
349         continue;
350       // Note: Not suppressing deletes or expired cells.  Needs to be handled
351       // by higher up functions.
352       result = kv;
353       break;
354     }
355     return result;
356   }
357 
358   /**
359    * @param state column/delete tracking state
360    */
361   void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
362     getRowKeyAtOrBefore(kvset, state);
363     getRowKeyAtOrBefore(snapshot, state);
364   }
365 
366   /*
367    * @param set
368    * @param state Accumulates deletes and candidates.
369    */
370   private void getRowKeyAtOrBefore(final NavigableSet<KeyValue> set,
371       final GetClosestRowBeforeTracker state) {
372     if (set.isEmpty()) {
373       return;
374     }
375     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
376       // Found nothing in row.  Try backing up.
377       getRowKeyBefore(set, state);
378     }
379   }
380 
381   /*
382    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
383    * we have been passed the first possible key on a row.  As we walk forward
384    * we accumulate deletes until we hit a candidate on the row at which point
385    * we return.
386    * @param set
387    * @param firstOnRow First possible key on this row.
388    * @param state
389    * @return True if we found a candidate walking this row.
390    */
391   private boolean walkForwardInSingleRow(final SortedSet<KeyValue> set,
392       final KeyValue firstOnRow, final GetClosestRowBeforeTracker state) {
393     boolean foundCandidate = false;
394     SortedSet<KeyValue> tail = set.tailSet(firstOnRow);
395     if (tail.isEmpty()) return foundCandidate;
396     for (Iterator<KeyValue> i = tail.iterator(); i.hasNext();) {
397       KeyValue kv = i.next();
398       // Did we go beyond the target row? If so break.
399       if (state.isTooFar(kv, firstOnRow)) break;
400       if (state.isExpired(kv)) {
401         i.remove();
402         continue;
403       }
404       // If we added something, this row is a contender. break.
405       if (state.handle(kv)) {
406         foundCandidate = true;
407         break;
408       }
409     }
410     return foundCandidate;
411   }
412 
413   /*
414    * Walk backwards through the passed set a row at a time until we run out of
415    * set or until we get a candidate.
416    * @param set
417    * @param state
418    */
419   private void getRowKeyBefore(NavigableSet<KeyValue> set,
420       final GetClosestRowBeforeTracker state) {
421     KeyValue firstOnRow = state.getTargetKey();
422     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
423         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
424       // Make sure we don't fall out of our table.
425       if (!state.isTargetTable(p.kv)) break;
426       // Stop looking if we've exited the better candidate range.
427       if (!state.isBetterCandidate(p.kv)) break;
428       // Make into firstOnRow
429       firstOnRow = new KeyValue(p.kv.getRowArray(), p.kv.getRowOffset(), p.kv.getRowLength(),
430           HConstants.LATEST_TIMESTAMP);
431       // If we find something, break;
432       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
433     }
434   }
435 
436   /**
437    * Only used by tests. TODO: Remove
438    *
439    * Given the specs of a column, update it, first by inserting a new record,
440    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
441    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
442    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
443    * get the new value, or the old value and all readers will eventually only see the new
444    * value after the old was removed.
445    *
446    * @param row
447    * @param family
448    * @param qualifier
449    * @param newValue
450    * @param now
451    * @return  Timestamp
452    */
453   long updateColumnValue(byte[] row,
454                                 byte[] family,
455                                 byte[] qualifier,
456                                 long newValue,
457                                 long now) {
458     KeyValue firstKv = KeyValue.createFirstOnRow(
459         row, family, qualifier);
460     // Is there a KeyValue in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
461     SortedSet<KeyValue> snSs = snapshot.tailSet(firstKv);
462     if (!snSs.isEmpty()) {
463       KeyValue snKv = snSs.first();
464       // is there a matching KV in the snapshot?
465       if (snKv.matchingRow(firstKv) && snKv.matchingQualifier(firstKv)) {
466         if (snKv.getTimestamp() == now) {
467           // poop,
468           now += 1;
469         }
470       }
471     }
472 
473     // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
474     // But the timestamp should also be max(now, mostRecentTsInMemstore)
475 
476     // so we cant add the new KV w/o knowing what's there already, but we also
477     // want to take this chance to delete some kvs. So two loops (sad)
478 
479     SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
480     for (KeyValue kv : ss) {
481       // if this isnt the row we are interested in, then bail:
482       if (!kv.matchingColumn(family, qualifier) || !kv.matchingRow(firstKv)) {
483         break; // rows dont match, bail.
484       }
485 
486       // if the qualifier matches and it's a put, just RM it out of the kvset.
487       if (kv.getTypeByte() == KeyValue.Type.Put.getCode() &&
488           kv.getTimestamp() > now && firstKv.matchingQualifier(kv)) {
489         now = kv.getTimestamp();
490       }
491     }
492 
493     // create or update (upsert) a new KeyValue with
494     // 'now' and a 0 memstoreTS == immediately visible
495     List<Cell> cells = new ArrayList<Cell>(1);
496     cells.add(new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)));
497     return upsert(cells, 1L);
498   }
499 
500   /**
501    * Update or insert the specified KeyValues.
502    * <p>
503    * For each KeyValue, insert into MemStore.  This will atomically upsert the
504    * value for that row/family/qualifier.  If a KeyValue did already exist,
505    * it will then be removed.
506    * <p>
507    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
508    * be immediately visible.  May want to change this so it is atomic across
509    * all KeyValues.
510    * <p>
511    * This is called under row lock, so Get operations will still see updates
512    * atomically.  Scans will only see each KeyValue update as atomic.
513    *
514    * @param cells
515    * @param readpoint readpoint below which we can safely remove duplicate KVs 
516    * @return change in memstore size
517    */
518   public long upsert(Iterable<Cell> cells, long readpoint) {
519     long size = 0;
520     for (Cell cell : cells) {
521       size += upsert(cell, readpoint);
522     }
523     return size;
524   }
525 
526   /**
527    * Inserts the specified KeyValue into MemStore and deletes any existing
528    * versions of the same row/family/qualifier as the specified KeyValue.
529    * <p>
530    * First, the specified KeyValue is inserted into the Memstore.
531    * <p>
532    * If there are any existing KeyValues in this MemStore with the same row,
533    * family, and qualifier, they are removed.
534    * <p>
535    * Callers must hold the read lock.
536    *
537    * @param cell
538    * @return change in size of MemStore
539    */
540   private long upsert(Cell cell, long readpoint) {
541     // Add the KeyValue to the MemStore
542     // Use the internalAdd method here since we (a) already have a lock
543     // and (b) cannot safely use the MSLAB here without potentially
544     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
545     // test that triggers the pathological case if we don't avoid MSLAB
546     // here.
547     KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
548     long addedSize = internalAdd(kv);
549 
550     // Get the KeyValues for the row/family/qualifier regardless of timestamp.
551     // For this case we want to clean up any other puts
552     KeyValue firstKv = KeyValue.createFirstOnRow(
553         kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
554         kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
555         kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
556     SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
557     Iterator<KeyValue> it = ss.iterator();
558     // versions visible to oldest scanner
559     int versionsVisible = 0;
560     while ( it.hasNext() ) {
561       KeyValue cur = it.next();
562 
563       if (kv == cur) {
564         // ignore the one just put in
565         continue;
566       }
567       // check that this is the row and column we are interested in, otherwise bail
568       if (kv.matchingRow(cur) && kv.matchingQualifier(cur)) {
569         // only remove Puts that concurrent scanners cannot possibly see
570         if (cur.getTypeByte() == KeyValue.Type.Put.getCode() &&
571             cur.getMvccVersion() <= readpoint) {
572           if (versionsVisible > 1) {
573             // if we get here we have seen at least one version visible to the oldest scanner,
574             // which means we can prove that no scanner will see this version
575 
576             // false means there was a change, so give us the size.
577             long delta = heapSizeChange(cur, true);
578             addedSize -= delta;
579             this.size.addAndGet(-delta);
580             it.remove();
581             setOldestEditTimeToNow();
582           } else {
583             versionsVisible++;
584           }
585         }
586       } else {
587         // past the row or column, done
588         break;
589       }
590     }
591     return addedSize;
592   }
593 
594   /*
595    * Immutable data structure to hold member found in set and the set it was
596    * found in.  Include set because it is carrying context.
597    */
598   private static class Member {
599     final KeyValue kv;
600     final NavigableSet<KeyValue> set;
601     Member(final NavigableSet<KeyValue> s, final KeyValue kv) {
602       this.kv = kv;
603       this.set = s;
604     }
605   }
606 
607   /*
608    * @param set Set to walk back in.  Pass a first in row or we'll return
609    * same row (loop).
610    * @param state Utility and context.
611    * @param firstOnRow First item on the row after the one we want to find a
612    * member in.
613    * @return Null or member of row previous to <code>firstOnRow</code>
614    */
615   private Member memberOfPreviousRow(NavigableSet<KeyValue> set,
616       final GetClosestRowBeforeTracker state, final KeyValue firstOnRow) {
617     NavigableSet<KeyValue> head = set.headSet(firstOnRow, false);
618     if (head.isEmpty()) return null;
619     for (Iterator<KeyValue> i = head.descendingIterator(); i.hasNext();) {
620       KeyValue found = i.next();
621       if (state.isExpired(found)) {
622         i.remove();
623         continue;
624       }
625       return new Member(head, found);
626     }
627     return null;
628   }
629 
630   /**
631    * @return scanner on memstore and snapshot in this order.
632    */
633   List<KeyValueScanner> getScanners(long readPt) {
634     return Collections.<KeyValueScanner>singletonList(
635         new MemStoreScanner(readPt));
636   }
637 
638   /**
639    * Check if this memstore may contain the required keys
640    * @param scan
641    * @return False if the key definitely does not exist in this Memstore
642    */
643   public boolean shouldSeek(Scan scan, long oldestUnexpiredTS) {
644     return (timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
645         snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange()))
646         && (Math.max(timeRangeTracker.getMaximumTimestamp(),
647                      snapshotTimeRangeTracker.getMaximumTimestamp()) >=
648             oldestUnexpiredTS);
649   }
650 
651   public TimeRangeTracker getSnapshotTimeRangeTracker() {
652     return this.snapshotTimeRangeTracker;
653   }
654 
655   /*
656    * MemStoreScanner implements the KeyValueScanner.
657    * It lets the caller scan the contents of a memstore -- both current
658    * map and snapshot.
659    * This behaves as if it were a real scanner but does not maintain position.
660    */
661   protected class MemStoreScanner extends NonLazyKeyValueScanner {
    // Next row information for either kvset or snapshot
    private KeyValue kvsetNextRow = null;
    private KeyValue snapshotNextRow = null;

    // last iterated KVs for kvset and snapshot (to restore iterator state after reseek)
    private KeyValue kvsetItRow = null;
    private KeyValue snapshotItRow = null;

    // iterator based scanning.
    private Iterator<KeyValue> kvsetIt;
    private Iterator<KeyValue> snapshotIt;

    // The kvset and snapshot at the time of creating this scanner
    private KeyValueSkipListSet kvsetAtCreation;
    private KeyValueSkipListSet snapshotAtCreation;

    // the pre-calculated KeyValue to be returned by peek() or next()
    private KeyValue theNext;

    // The allocator and snapshot allocator at the time of creating this scanner.
    // Each stays null when the memstore had no such allocator at that time.
    volatile MemStoreLAB allocatorAtCreation;
    volatile MemStoreLAB snapshotAllocatorAtCreation;

    // When set, getNext() stops skipping KeyValues hidden by MVCC as soon as
    // it reaches a row beyond the one it started on. Only used for reversed scan
    private boolean stopSkippingKVsIfNextRow = false;

    // KeyValues with an MVCC version above this value are skipped by getNext().
    private long readPoint;
690 
691     /*
692     Some notes...
693 
694      So memstorescanner is fixed at creation time. this includes pointers/iterators into
695     existing kvset/snapshot.  during a snapshot creation, the kvset is null, and the
696     snapshot is moved.  since kvset is null there is no point on reseeking on both,
697       we can save us the trouble. During the snapshot->hfile transition, the memstore
698       scanner is re-created by StoreScanner#updateReaders().  StoreScanner should
699       potentially do something smarter by adjusting the existing memstore scanner.
700 
701       But there is a greater problem here, that being once a scanner has progressed
702       during a snapshot scenario, we currently iterate past the kvset then 'finish' up.
703       if a scan lasts a little while, there is a chance for new entries in kvset to
704       become available but we will never see them.  This needs to be handled at the
705       StoreScanner level with coordination with MemStoreScanner.
706 
707       Currently, this problem is only partly managed: during the small amount of time
708       when the StoreScanner has not yet created a new MemStoreScanner, we will miss
709       the adds to kvset in the MemStoreScanner.
710     */
711 
712     MemStoreScanner(long readPoint) {
713       super();
714 
715       this.readPoint = readPoint;
716       kvsetAtCreation = kvset;
717       snapshotAtCreation = snapshot;
718       if (allocator != null) {
719         this.allocatorAtCreation = allocator;
720         this.allocatorAtCreation.incScannerCount();
721       }
722       if (snapshotAllocator != null) {
723         this.snapshotAllocatorAtCreation = snapshotAllocator;
724         this.snapshotAllocatorAtCreation.incScannerCount();
725       }
726     }
727 
728     private KeyValue getNext(Iterator<KeyValue> it) {
729       KeyValue startKV = theNext;
730       KeyValue v = null;
731       try {
732         while (it.hasNext()) {
733           v = it.next();
734           if (v.getMvccVersion() <= this.readPoint) {
735             return v;
736           }
737           if (stopSkippingKVsIfNextRow && startKV != null
738               && comparator.compareRows(v, startKV) > 0) {
739             return null;
740           }
741         }
742 
743         return null;
744       } finally {
745         if (v != null) {
746           // in all cases, remember the last KV iterated to
747           if (it == snapshotIt) {
748             snapshotItRow = v;
749           } else {
750             kvsetItRow = v;
751           }
752         }
753       }
754     }
755 
756     /**
757      *  Set the scanner at the seek key.
758      *  Must be called only once: there is no thread safety between the scanner
759      *   and the memStore.
760      * @param key seek value
761      * @return false if the key is null or if there is no data
762      */
763     @Override
764     public synchronized boolean seek(KeyValue key) {
765       if (key == null) {
766         close();
767         return false;
768       }
769 
770       // kvset and snapshot will never be null.
771       // if tailSet can't find anything, SortedSet is empty (not null).
772       kvsetIt = kvsetAtCreation.tailSet(key).iterator();
773       snapshotIt = snapshotAtCreation.tailSet(key).iterator();
774       kvsetItRow = null;
775       snapshotItRow = null;
776 
777       return seekInSubLists(key);
778     }
779 
780 
781     /**
782      * (Re)initialize the iterators after a seek or a reseek.
783      */
784     private synchronized boolean seekInSubLists(KeyValue key){
785       kvsetNextRow = getNext(kvsetIt);
786       snapshotNextRow = getNext(snapshotIt);
787 
788       // Calculate the next value
789       theNext = getLowest(kvsetNextRow, snapshotNextRow);
790 
791       // has data
792       return (theNext != null);
793     }
794 
795 
796     /**
797      * Move forward on the sub-lists set previously by seek.
798      * @param key seek value (should be non-null)
799      * @return true if there is at least one KV to read, false otherwise
800      */
801     @Override
802     public synchronized boolean reseek(KeyValue key) {
803       /*
804       See HBASE-4195 & HBASE-3855 & HBASE-6591 for the background on this implementation.
805       This code is executed concurrently with flush and puts, without locks.
806       Two points must be known when working on this code:
807       1) It's not possible to use the 'kvTail' and 'snapshot'
808        variables, as they are modified during a flush.
809       2) The ideal implementation for performance would use the sub skip list
810        implicitly pointed by the iterators 'kvsetIt' and
811        'snapshotIt'. Unfortunately the Java API does not offer a method to
812        get it. So we remember the last keys we iterated to and restore
813        the reseeked set to at least that point.
814        */
815 
816       kvsetIt = kvsetAtCreation.tailSet(getHighest(key, kvsetItRow)).iterator();
817       snapshotIt = snapshotAtCreation.tailSet(getHighest(key, snapshotItRow)).iterator();
818 
819       return seekInSubLists(key);
820     }
821 
822 
823     @Override
824     public synchronized KeyValue peek() {
825       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
826       return theNext;
827     }
828 
829     @Override
830     public synchronized KeyValue next() {
831       if (theNext == null) {
832           return null;
833       }
834 
835       final KeyValue ret = theNext;
836 
837       // Advance one of the iterators
838       if (theNext == kvsetNextRow) {
839         kvsetNextRow = getNext(kvsetIt);
840       } else {
841         snapshotNextRow = getNext(snapshotIt);
842       }
843 
844       // Calculate the next value
845       theNext = getLowest(kvsetNextRow, snapshotNextRow);
846 
847       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
848       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
849       //    getLowest() + " threadpoint=" + readpoint);
850       return ret;
851     }
852 
853     /*
854      * Returns the lower of the two key values, or null if they are both null.
855      * This uses comparator.compare() to compare the KeyValue using the memstore
856      * comparator.
857      */
858     private KeyValue getLowest(KeyValue first, KeyValue second) {
859       if (first == null && second == null) {
860         return null;
861       }
862       if (first != null && second != null) {
863         int compare = comparator.compare(first, second);
864         return (compare <= 0 ? first : second);
865       }
866       return (first != null ? first : second);
867     }
868 
869     /*
870      * Returns the higher of the two key values, or null if they are both null.
871      * This uses comparator.compare() to compare the KeyValue using the memstore
872      * comparator.
873      */
874     private KeyValue getHighest(KeyValue first, KeyValue second) {
875       if (first == null && second == null) {
876         return null;
877       }
878       if (first != null && second != null) {
879         int compare = comparator.compare(first, second);
880         return (compare > 0 ? first : second);
881       }
882       return (first != null ? first : second);
883     }
884 
885     public synchronized void close() {
886       this.kvsetNextRow = null;
887       this.snapshotNextRow = null;
888 
889       this.kvsetIt = null;
890       this.snapshotIt = null;
891       
892       if (allocatorAtCreation != null) {
893         this.allocatorAtCreation.decScannerCount();
894         this.allocatorAtCreation = null;
895       }
896       if (snapshotAllocatorAtCreation != null) {
897         this.snapshotAllocatorAtCreation.decScannerCount();
898         this.snapshotAllocatorAtCreation = null;
899       }
900 
901       this.kvsetItRow = null;
902       this.snapshotItRow = null;
903     }
904 
905     /**
906      * MemStoreScanner returns max value as sequence id because it will
907      * always have the latest data among all files.
908      */
909     @Override
910     public long getSequenceID() {
911       return Long.MAX_VALUE;
912     }
913 
914     @Override
915     public boolean shouldUseScanner(Scan scan, SortedSet<byte[]> columns,
916         long oldestUnexpiredTS) {
917       return shouldSeek(scan, oldestUnexpiredTS);
918     }
919 
920     /**
921      * Seek scanner to the given key first. If it returns false(means
922      * peek()==null) or scanner's peek row is bigger than row of given key, seek
923      * the scanner to the previous row of given key
924      */
925     @Override
926     public synchronized boolean backwardSeek(KeyValue key) {
927       seek(key);
928       if (peek() == null || comparator.compareRows(peek(), key) > 0) {
929         return seekToPreviousRow(key);
930       }
931       return true;
932     }
933 
934     /**
935      * Separately get the KeyValue before the specified key from kvset and
936      * snapshotset, and use the row of higher one as the previous row of
937      * specified key, then seek to the first KeyValue of previous row
938      */
939     @Override
940     public synchronized boolean seekToPreviousRow(KeyValue key) {
941       KeyValue firstKeyOnRow = KeyValue.createFirstOnRow(key.getRow());
942       SortedSet<KeyValue> kvHead = kvsetAtCreation.headSet(firstKeyOnRow);
943       KeyValue kvsetBeforeRow = kvHead.isEmpty() ? null : kvHead.last();
944       SortedSet<KeyValue> snapshotHead = snapshotAtCreation
945           .headSet(firstKeyOnRow);
946       KeyValue snapshotBeforeRow = snapshotHead.isEmpty() ? null : snapshotHead
947           .last();
948       KeyValue lastKVBeforeRow = getHighest(kvsetBeforeRow, snapshotBeforeRow);
949       if (lastKVBeforeRow == null) {
950         theNext = null;
951         return false;
952       }
953       KeyValue firstKeyOnPreviousRow = KeyValue
954           .createFirstOnRow(lastKVBeforeRow.getRow());
955       this.stopSkippingKVsIfNextRow = true;
956       seek(firstKeyOnPreviousRow);
957       this.stopSkippingKVsIfNextRow = false;
958       if (peek() == null
959           || comparator.compareRows(peek(), firstKeyOnPreviousRow) > 0) {
960         return seekToPreviousRow(lastKVBeforeRow);
961       }
962       return true;
963     }
964 
965     @Override
966     public synchronized boolean seekToLastRow() {
967       KeyValue first = kvsetAtCreation.isEmpty() ? null : kvsetAtCreation
968           .last();
969       KeyValue second = snapshotAtCreation.isEmpty() ? null
970           : snapshotAtCreation.last();
971       KeyValue higherKv = getHighest(first, second);
972       if (higherKv == null) {
973         return false;
974       }
975       KeyValue firstKvOnLastRow = KeyValue.createFirstOnRow(higherKv.getRow());
976       if (seek(firstKvOnLastRow)) {
977         return true;
978       } else {
979         return seekToPreviousRow(higherKv);
980       }
981 
982     }
983   }
984 
985   public final static long FIXED_OVERHEAD = ClassSize.align(
986       ClassSize.OBJECT + (10 * ClassSize.REFERENCE) + Bytes.SIZEOF_LONG);
987 
988   public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
989       ClassSize.ATOMIC_LONG + (2 * ClassSize.TIMERANGE_TRACKER) +
990       (2 * ClassSize.KEYVALUE_SKIPLIST_SET) + (2 * ClassSize.CONCURRENT_SKIPLISTMAP));
991 
992   /*
993    * Calculate how the MemStore size has changed.  Includes overhead of the
994    * backing Map.
995    * @param kv
996    * @param notpresent True if the kv was NOT present in the set.
997    * @return Size
998    */
999   static long heapSizeChange(final KeyValue kv, final boolean notpresent) {
1000     return notpresent ?
1001         ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + kv.heapSize()):
1002         0;
1003   }
1004 
1005   /**
1006    * Get the entire heap usage for this MemStore not including keys in the
1007    * snapshot.
1008    */
1009   @Override
1010   public long heapSize() {
1011     return size.get();
1012   }
1013 
1014   /**
1015    * Get the heap usage of KVs in this MemStore.
1016    */
1017   public long keySize() {
1018     return heapSize() - DEEP_OVERHEAD;
1019   }
1020 
1021   /**
1022    * Code to help figure if our approximation of object heap sizes is close
1023    * enough.  See hbase-900.  Fills memstores then waits so user can heap
1024    * dump and bring up resultant hprof in something like jprofiler which
1025    * allows you get 'deep size' on objects.
1026    * @param args main args
1027    */
1028   public static void main(String [] args) {
1029     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
1030     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
1031       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
1032     LOG.info("vmInputArguments=" + runtime.getInputArguments());
1033     MemStore memstore1 = new MemStore();
1034     // TODO: x32 vs x64
1035     long size = 0;
1036     final int count = 10000;
1037     byte [] fam = Bytes.toBytes("col");
1038     byte [] qf = Bytes.toBytes("umn");
1039     byte [] empty = new byte[0];
1040     for (int i = 0; i < count; i++) {
1041       // Give each its own ts
1042       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1043     }
1044     LOG.info("memstore1 estimated size=" + size);
1045     for (int i = 0; i < count; i++) {
1046       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1047     }
1048     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
1049     // Make a variably sized memstore.
1050     MemStore memstore2 = new MemStore();
1051     for (int i = 0; i < count; i++) {
1052       size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
1053         new byte[i]));
1054     }
1055     LOG.info("memstore2 estimated size=" + size);
1056     final int seconds = 30;
1057     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
1058     for (int i = 0; i < seconds; i++) {
1059       // Thread.sleep(1000);
1060     }
1061     LOG.info("Exiting.");
1062   }
1063 }