View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.lang.management.ManagementFactory;
23  import java.lang.management.RuntimeMXBean;
24  import java.util.ArrayList;
25  import java.util.Collections;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.NavigableSet;
29  import java.util.SortedSet;
30  import java.util.concurrent.atomic.AtomicLong;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.hbase.classification.InterfaceAudience;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.hbase.Cell;
37  import org.apache.hadoop.hbase.CellUtil;
38  import org.apache.hadoop.hbase.HBaseConfiguration;
39  import org.apache.hadoop.hbase.HConstants;
40  import org.apache.hadoop.hbase.KeyValue;
41  import org.apache.hadoop.hbase.KeyValueUtil;
42  import org.apache.hadoop.hbase.client.Scan;
43  import org.apache.hadoop.hbase.util.ByteRange;
44  import org.apache.hadoop.hbase.util.Bytes;
45  import org.apache.hadoop.hbase.util.ClassSize;
46  import org.apache.hadoop.hbase.util.CollectionBackedScanner;
47  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
48  import org.apache.hadoop.hbase.util.Pair;
49  import org.apache.hadoop.hbase.util.ReflectionUtils;
50  import org.apache.htrace.Trace;
51  
52  /**
53   * The MemStore holds in-memory modifications to the Store.  Modifications
54   * are {@link Cell}s.  When asked to flush, current memstore is moved
55   * to snapshot and is cleared.  We continue to serve edits out of new memstore
56   * and backing snapshot until flusher reports in that the flush succeeded. At
57   * this point we let the snapshot go.
58   *  <p>
59   * The MemStore functions should not be called in parallel. Callers should hold
60   *  write and read locks. This is done in {@link HStore}.
61   *  </p>
62   *
63   * TODO: Adjust size of the memstore when we remove items because they have
64   * been deleted.
65   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
66   * in KV size.
67   */
68  @InterfaceAudience.Private
69  public class DefaultMemStore implements MemStore {
70    private static final Log LOG = LogFactory.getLog(DefaultMemStore.class);
71    static final String USEMSLAB_KEY = "hbase.hregion.memstore.mslab.enabled";
72    private static final boolean USEMSLAB_DEFAULT = true;
73    static final String MSLAB_CLASS_NAME = "hbase.regionserver.mslab.class";
74  
75    private Configuration conf;
76  
77    // MemStore.  Use a CellSkipListSet rather than SkipListSet because of the
78    // better semantics.  The Map will overwrite if passed a key it already had
79    // whereas the Set will not add new Cell if key is same though value might be
80    // different.  Value is not important -- just make sure always same
81    // reference passed.
82    volatile CellSkipListSet cellSet;
83  
84    // Snapshot of memstore.  Made for flusher.
85    volatile CellSkipListSet snapshot;
86  
87    final KeyValue.KVComparator comparator;
88  
89    // Used to track own heapSize
90    final AtomicLong size;
91    private volatile long snapshotSize;
92  
93    // Used to track when to flush
94    volatile long timeOfOldestEdit = Long.MAX_VALUE;
95  
96    TimeRangeTracker timeRangeTracker;
97    TimeRangeTracker snapshotTimeRangeTracker;
98  
99    volatile MemStoreLAB allocator;
100   volatile MemStoreLAB snapshotAllocator;
101   volatile long snapshotId;
102 
/**
 * No-argument constructor, used for tests. Delegates to
 * {@link #DefaultMemStore(Configuration, KeyValue.KVComparator)} with a freshly created
 * HBase configuration and {@link KeyValue#COMPARATOR}.
 */
public DefaultMemStore() {
  this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
}
109 
110   /**
111    * Constructor.
112    * @param c Comparator
113    */
114   public DefaultMemStore(final Configuration conf,
115                   final KeyValue.KVComparator c) {
116     this.conf = conf;
117     this.comparator = c;
118     this.cellSet = new CellSkipListSet(c);
119     this.snapshot = new CellSkipListSet(c);
120     timeRangeTracker = new TimeRangeTracker();
121     snapshotTimeRangeTracker = new TimeRangeTracker();
122     this.size = new AtomicLong(DEEP_OVERHEAD);
123     this.snapshotSize = 0;
124     if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
125       String className = conf.get(MSLAB_CLASS_NAME, HeapMemStoreLAB.class.getName());
126       this.allocator = ReflectionUtils.instantiateWithCustomCtor(className,
127           new Class[] { Configuration.class }, new Object[] { conf });
128     } else {
129       this.allocator = null;
130     }
131   }
132 
133   void dump() {
134     for (Cell cell: this.cellSet) {
135       LOG.info(cell);
136     }
137     for (Cell cell: this.snapshot) {
138       LOG.info(cell);
139     }
140   }
141 
142   /**
143    * Creates a snapshot of the current memstore.
144    * Snapshot must be cleared by call to {@link #clearSnapshot(long)}
145    */
146   @Override
147   public MemStoreSnapshot snapshot() {
148     // If snapshot currently has entries, then flusher failed or didn't call
149     // cleanup.  Log a warning.
150     if (!this.snapshot.isEmpty()) {
151       LOG.warn("Snapshot called again without clearing previous. " +
152           "Doing nothing. Another ongoing flush or did we fail last attempt?");
153     } else {
154       this.snapshotId = EnvironmentEdgeManager.currentTime();
155       this.snapshotSize = keySize();
156       if (!this.cellSet.isEmpty()) {
157         this.snapshot = this.cellSet;
158         this.cellSet = new CellSkipListSet(this.comparator);
159         this.snapshotTimeRangeTracker = this.timeRangeTracker;
160         this.timeRangeTracker = new TimeRangeTracker();
161         // Reset heap to not include any keys
162         this.size.set(DEEP_OVERHEAD);
163         this.snapshotAllocator = this.allocator;
164         // Reset allocator so we get a fresh buffer for the new memstore
165         if (allocator != null) {
166           String className = conf.get(MSLAB_CLASS_NAME, HeapMemStoreLAB.class.getName());
167           this.allocator = ReflectionUtils.instantiateWithCustomCtor(className,
168               new Class[] { Configuration.class }, new Object[] { conf });
169         }
170         timeOfOldestEdit = Long.MAX_VALUE;
171       }
172     }
173     return new MemStoreSnapshot(this.snapshotId, snapshot.size(), this.snapshotSize,
174         this.snapshotTimeRangeTracker, new CollectionBackedScanner(snapshot, this.comparator));
175   }
176 
177   /**
178    * The passed snapshot was successfully persisted; it can be let go.
179    * @param id Id of the snapshot to clean out.
180    * @throws UnexpectedStateException
181    * @see #snapshot()
182    */
183   @Override
184   public void clearSnapshot(long id) throws UnexpectedStateException {
185     MemStoreLAB tmpAllocator = null;
186     if (this.snapshotId != id) {
187       throw new UnexpectedStateException("Current snapshot id is " + this.snapshotId + ",passed "
188           + id);
189     }
190     // OK. Passed in snapshot is same as current snapshot. If not-empty,
191     // create a new snapshot and let the old one go.
192     if (!this.snapshot.isEmpty()) {
193       this.snapshot = new CellSkipListSet(this.comparator);
194       this.snapshotTimeRangeTracker = new TimeRangeTracker();
195     }
196     this.snapshotSize = 0;
197     this.snapshotId = -1;
198     if (this.snapshotAllocator != null) {
199       tmpAllocator = this.snapshotAllocator;
200       this.snapshotAllocator = null;
201     }
202     if (tmpAllocator != null) {
203       tmpAllocator.close();
204     }
205   }
206 
207   @Override
208   public long getFlushableSize() {
209     return this.snapshotSize > 0 ? this.snapshotSize : keySize();
210   }
211 
212   /**
213    * Write an update
214    * @param cell
215    * @return approximate size of the passed KV & newly added KV which maybe different than the
216    *         passed-in KV
217    */
218   @Override
219   public Pair<Long, Cell> add(Cell cell) {
220     Cell toAdd = maybeCloneWithAllocator(cell);
221     return new Pair<Long, Cell>(internalAdd(toAdd), toAdd);
222   }
223 
224   @Override
225   public long timeOfOldestEdit() {
226     return timeOfOldestEdit;
227   }
228 
229   private boolean addToCellSet(Cell e) {
230     boolean b = this.cellSet.add(e);
231     setOldestEditTimeToNow();
232     return b;
233   }
234 
235   private boolean removeFromCellSet(Cell e) {
236     boolean b = this.cellSet.remove(e);
237     setOldestEditTimeToNow();
238     return b;
239   }
240 
241   void setOldestEditTimeToNow() {
242     if (timeOfOldestEdit == Long.MAX_VALUE) {
243       timeOfOldestEdit = EnvironmentEdgeManager.currentTime();
244     }
245   }
246 
247   /**
248    * Internal version of add() that doesn't clone Cells with the
249    * allocator, and doesn't take the lock.
250    *
251    * Callers should ensure they already have the read lock taken
252    */
253   private long internalAdd(final Cell toAdd) {
254     long s = heapSizeChange(toAdd, addToCellSet(toAdd));
255     timeRangeTracker.includeTimestamp(toAdd);
256     this.size.addAndGet(s);
257     return s;
258   }
259 
260   private Cell maybeCloneWithAllocator(Cell cell) {
261     if (allocator == null) {
262       return cell;
263     }
264 
265     int len = KeyValueUtil.length(cell);
266     ByteRange alloc = allocator.allocateBytes(len);
267     if (alloc == null) {
268       // The allocation was too large, allocator decided
269       // not to do anything with it.
270       return cell;
271     }
272     assert alloc.getBytes() != null;
273     KeyValueUtil.appendToByteArray(cell, alloc.getBytes(), alloc.getOffset());
274     KeyValue newKv = new KeyValue(alloc.getBytes(), alloc.getOffset(), len);
275     newKv.setSequenceId(cell.getSequenceId());
276     return newKv;
277   }
278 
279   /**
280    * Remove n key from the memstore. Only cells that have the same key and the
281    * same memstoreTS are removed.  It is ok to not update timeRangeTracker
282    * in this call. It is possible that we can optimize this method by using
283    * tailMap/iterator, but since this method is called rarely (only for
284    * error recovery), we can leave those optimization for the future.
285    * @param cell
286    */
287   @Override
288   public void rollback(Cell cell) {
289     // If the key is in the snapshot, delete it. We should not update
290     // this.size, because that tracks the size of only the memstore and
291     // not the snapshot. The flush of this snapshot to disk has not
292     // yet started because Store.flush() waits for all rwcc transactions to
293     // commit before starting the flush to disk.
294     Cell found = this.snapshot.get(cell);
295     if (found != null && found.getSequenceId() == cell.getSequenceId()) {
296       this.snapshot.remove(cell);
297       long sz = heapSizeChange(cell, true);
298       this.snapshotSize -= sz;
299     }
300     // If the key is in the memstore, delete it. Update this.size.
301     found = this.cellSet.get(cell);
302     if (found != null && found.getSequenceId() == cell.getSequenceId()) {
303       removeFromCellSet(cell);
304       long s = heapSizeChange(cell, true);
305       this.size.addAndGet(-s);
306     }
307   }
308 
309   /**
310    * Write a delete
311    * @param deleteCell
312    * @return approximate size of the passed key and value.
313    */
314   @Override
315   public long delete(Cell deleteCell) {
316     long s = 0;
317     Cell toAdd = maybeCloneWithAllocator(deleteCell);
318     s += heapSizeChange(toAdd, addToCellSet(toAdd));
319     timeRangeTracker.includeTimestamp(toAdd);
320     this.size.addAndGet(s);
321     return s;
322   }
323 
324   /**
325    * @param cell Find the row that comes after this one.  If null, we return the
326    * first.
327    * @return Next row or null if none found.
328    */
329   Cell getNextRow(final Cell cell) {
330     return getLowest(getNextRow(cell, this.cellSet), getNextRow(cell, this.snapshot));
331   }
332 
333   /*
334    * @param a
335    * @param b
336    * @return Return lowest of a or b or null if both a and b are null
337    */
338   private Cell getLowest(final Cell a, final Cell b) {
339     if (a == null) {
340       return b;
341     }
342     if (b == null) {
343       return a;
344     }
345     return comparator.compareRows(a, b) <= 0? a: b;
346   }
347 
348   /*
349    * @param key Find row that follows this one.  If null, return first.
350    * @param map Set to look in for a row beyond <code>row</code>.
351    * @return Next row or null if none found.  If one found, will be a new
352    * KeyValue -- can be destroyed by subsequent calls to this method.
353    */
354   private Cell getNextRow(final Cell key,
355       final NavigableSet<Cell> set) {
356     Cell result = null;
357     SortedSet<Cell> tail = key == null? set: set.tailSet(key);
358     // Iterate until we fall into the next row; i.e. move off current row
359     for (Cell cell: tail) {
360       if (comparator.compareRows(cell, key) <= 0)
361         continue;
362       // Note: Not suppressing deletes or expired cells.  Needs to be handled
363       // by higher up functions.
364       result = cell;
365       break;
366     }
367     return result;
368   }
369 
370   /**
371    * @param state column/delete tracking state
372    */
373   @Override
374   public void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
375     getRowKeyAtOrBefore(cellSet, state);
376     getRowKeyAtOrBefore(snapshot, state);
377   }
378 
379   /*
380    * @param set
381    * @param state Accumulates deletes and candidates.
382    */
383   private void getRowKeyAtOrBefore(final NavigableSet<Cell> set,
384       final GetClosestRowBeforeTracker state) {
385     if (set.isEmpty()) {
386       return;
387     }
388     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
389       // Found nothing in row.  Try backing up.
390       getRowKeyBefore(set, state);
391     }
392   }
393 
394   /*
395    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
396    * we have been passed the first possible key on a row.  As we walk forward
397    * we accumulate deletes until we hit a candidate on the row at which point
398    * we return.
399    * @param set
400    * @param firstOnRow First possible key on this row.
401    * @param state
402    * @return True if we found a candidate walking this row.
403    */
404   private boolean walkForwardInSingleRow(final SortedSet<Cell> set,
405       final Cell firstOnRow, final GetClosestRowBeforeTracker state) {
406     boolean foundCandidate = false;
407     SortedSet<Cell> tail = set.tailSet(firstOnRow);
408     if (tail.isEmpty()) return foundCandidate;
409     for (Iterator<Cell> i = tail.iterator(); i.hasNext();) {
410       Cell kv = i.next();
411       // Did we go beyond the target row? If so break.
412       if (state.isTooFar(kv, firstOnRow)) break;
413       if (state.isExpired(kv)) {
414         i.remove();
415         continue;
416       }
417       // If we added something, this row is a contender. break.
418       if (state.handle(kv)) {
419         foundCandidate = true;
420         break;
421       }
422     }
423     return foundCandidate;
424   }
425 
426   /*
427    * Walk backwards through the passed set a row at a time until we run out of
428    * set or until we get a candidate.
429    * @param set
430    * @param state
431    */
432   private void getRowKeyBefore(NavigableSet<Cell> set,
433       final GetClosestRowBeforeTracker state) {
434     Cell firstOnRow = state.getTargetKey();
435     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
436         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
437       // Make sure we don't fall out of our table.
438       if (!state.isTargetTable(p.cell)) break;
439       // Stop looking if we've exited the better candidate range.
440       if (!state.isBetterCandidate(p.cell)) break;
441       // Make into firstOnRow
442       firstOnRow = new KeyValue(p.cell.getRowArray(), p.cell.getRowOffset(), p.cell.getRowLength(),
443           HConstants.LATEST_TIMESTAMP);
444       // If we find something, break;
445       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
446     }
447   }
448 
449   /**
450    * Only used by tests. TODO: Remove
451    *
452    * Given the specs of a column, update it, first by inserting a new record,
453    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
454    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
455    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
456    * get the new value, or the old value and all readers will eventually only see the new
457    * value after the old was removed.
458    *
459    * @param row
460    * @param family
461    * @param qualifier
462    * @param newValue
463    * @param now
464    * @return  Timestamp
465    */
466   public long updateColumnValue(byte[] row,
467                                 byte[] family,
468                                 byte[] qualifier,
469                                 long newValue,
470                                 long now) {
471     Cell firstCell = KeyValueUtil.createFirstOnRow(row, family, qualifier);
472     // Is there a Cell in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
473     SortedSet<Cell> snSs = snapshot.tailSet(firstCell);
474     if (!snSs.isEmpty()) {
475       Cell snc = snSs.first();
476       // is there a matching Cell in the snapshot?
477       if (CellUtil.matchingRow(snc, firstCell) && CellUtil.matchingQualifier(snc, firstCell)) {
478         if (snc.getTimestamp() == now) {
479           // poop,
480           now += 1;
481         }
482       }
483     }
484 
485     // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
486     // But the timestamp should also be max(now, mostRecentTsInMemstore)
487 
488     // so we cant add the new Cell w/o knowing what's there already, but we also
489     // want to take this chance to delete some cells. So two loops (sad)
490 
491     SortedSet<Cell> ss = cellSet.tailSet(firstCell);
492     for (Cell cell : ss) {
493       // if this isnt the row we are interested in, then bail:
494       if (!CellUtil.matchingColumn(cell, family, qualifier)
495           || !CellUtil.matchingRow(cell, firstCell)) {
496         break; // rows dont match, bail.
497       }
498 
499       // if the qualifier matches and it's a put, just RM it out of the cellSet.
500       if (cell.getTypeByte() == KeyValue.Type.Put.getCode() &&
501           cell.getTimestamp() > now && CellUtil.matchingQualifier(firstCell, cell)) {
502         now = cell.getTimestamp();
503       }
504     }
505 
506     // create or update (upsert) a new Cell with
507     // 'now' and a 0 memstoreTS == immediately visible
508     List<Cell> cells = new ArrayList<Cell>(1);
509     cells.add(new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)));
510     return upsert(cells, 1L);
511   }
512 
513   /**
514    * Update or insert the specified KeyValues.
515    * <p>
516    * For each KeyValue, insert into MemStore.  This will atomically upsert the
517    * value for that row/family/qualifier.  If a KeyValue did already exist,
518    * it will then be removed.
519    * <p>
520    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
521    * be immediately visible.  May want to change this so it is atomic across
522    * all KeyValues.
523    * <p>
524    * This is called under row lock, so Get operations will still see updates
525    * atomically.  Scans will only see each KeyValue update as atomic.
526    *
527    * @param cells
528    * @param readpoint readpoint below which we can safely remove duplicate KVs 
529    * @return change in memstore size
530    */
531   @Override
532   public long upsert(Iterable<Cell> cells, long readpoint) {
533     long size = 0;
534     for (Cell cell : cells) {
535       size += upsert(cell, readpoint);
536     }
537     return size;
538   }
539 
540   /**
541    * Inserts the specified KeyValue into MemStore and deletes any existing
542    * versions of the same row/family/qualifier as the specified KeyValue.
543    * <p>
544    * First, the specified KeyValue is inserted into the Memstore.
545    * <p>
546    * If there are any existing KeyValues in this MemStore with the same row,
547    * family, and qualifier, they are removed.
548    * <p>
549    * Callers must hold the read lock.
550    *
551    * @param cell
552    * @return change in size of MemStore
553    */
554   private long upsert(Cell cell, long readpoint) {
555     // Add the Cell to the MemStore
556     // Use the internalAdd method here since we (a) already have a lock
557     // and (b) cannot safely use the MSLAB here without potentially
558     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
559     // test that triggers the pathological case if we don't avoid MSLAB
560     // here.
561     long addedSize = internalAdd(cell);
562 
563     // Get the Cells for the row/family/qualifier regardless of timestamp.
564     // For this case we want to clean up any other puts
565     Cell firstCell = KeyValueUtil.createFirstOnRow(
566         cell.getRowArray(), cell.getRowOffset(), cell.getRowLength(),
567         cell.getFamilyArray(), cell.getFamilyOffset(), cell.getFamilyLength(),
568         cell.getQualifierArray(), cell.getQualifierOffset(), cell.getQualifierLength());
569     SortedSet<Cell> ss = cellSet.tailSet(firstCell);
570     Iterator<Cell> it = ss.iterator();
571     // versions visible to oldest scanner
572     int versionsVisible = 0;
573     while ( it.hasNext() ) {
574       Cell cur = it.next();
575 
576       if (cell == cur) {
577         // ignore the one just put in
578         continue;
579       }
580       // check that this is the row and column we are interested in, otherwise bail
581       if (CellUtil.matchingRow(cell, cur) && CellUtil.matchingQualifier(cell, cur)) {
582         // only remove Puts that concurrent scanners cannot possibly see
583         if (cur.getTypeByte() == KeyValue.Type.Put.getCode() &&
584             cur.getSequenceId() <= readpoint) {
585           if (versionsVisible >= 1) {
586             // if we get here we have seen at least one version visible to the oldest scanner,
587             // which means we can prove that no scanner will see this version
588 
589             // false means there was a change, so give us the size.
590             long delta = heapSizeChange(cur, true);
591             addedSize -= delta;
592             this.size.addAndGet(-delta);
593             it.remove();
594             setOldestEditTimeToNow();
595           } else {
596             versionsVisible++;
597           }
598         }
599       } else {
600         // past the row or column, done
601         break;
602       }
603     }
604     return addedSize;
605   }
606 
607   /*
608    * Immutable data structure to hold member found in set and the set it was
609    * found in. Include set because it is carrying context.
610    */
611   private static class Member {
612     final Cell cell;
613     final NavigableSet<Cell> set;
614     Member(final NavigableSet<Cell> s, final Cell kv) {
615       this.cell = kv;
616       this.set = s;
617     }
618   }
619 
620   /*
621    * @param set Set to walk back in.  Pass a first in row or we'll return
622    * same row (loop).
623    * @param state Utility and context.
624    * @param firstOnRow First item on the row after the one we want to find a
625    * member in.
626    * @return Null or member of row previous to <code>firstOnRow</code>
627    */
628   private Member memberOfPreviousRow(NavigableSet<Cell> set,
629       final GetClosestRowBeforeTracker state, final Cell firstOnRow) {
630     NavigableSet<Cell> head = set.headSet(firstOnRow, false);
631     if (head.isEmpty()) return null;
632     for (Iterator<Cell> i = head.descendingIterator(); i.hasNext();) {
633       Cell found = i.next();
634       if (state.isExpired(found)) {
635         i.remove();
636         continue;
637       }
638       return new Member(head, found);
639     }
640     return null;
641   }
642 
643   /**
644    * @return scanner on memstore and snapshot in this order.
645    */
646   @Override
647   public List<KeyValueScanner> getScanners(long readPt) {
648     return Collections.<KeyValueScanner> singletonList(new MemStoreScanner(readPt));
649   }
650 
651   /**
652    * Check if this memstore may contain the required keys
653    * @param scan
654    * @return False if the key definitely does not exist in this Memstore
655    */
656   public boolean shouldSeek(Scan scan, long oldestUnexpiredTS) {
657     return (timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
658         snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange()))
659         && (Math.max(timeRangeTracker.getMaximumTimestamp(),
660                      snapshotTimeRangeTracker.getMaximumTimestamp()) >=
661             oldestUnexpiredTS);
662   }
663 
664   /*
665    * MemStoreScanner implements the KeyValueScanner.
666    * It lets the caller scan the contents of a memstore -- both current
667    * map and snapshot.
668    * This behaves as if it were a real scanner but does not maintain position.
669    */
670   protected class MemStoreScanner extends NonLazyKeyValueScanner {
671     // Next row information for either cellSet or snapshot
672     private Cell cellSetNextRow = null;
673     private Cell snapshotNextRow = null;
674 
675     // last iterated Cells for cellSet and snapshot (to restore iterator state after reseek)
676     private Cell cellSetItRow = null;
677     private Cell snapshotItRow = null;
678     
679     // iterator based scanning.
680     private Iterator<Cell> cellSetIt;
681     private Iterator<Cell> snapshotIt;
682 
683     // The cellSet and snapshot at the time of creating this scanner
684     private CellSkipListSet cellSetAtCreation;
685     private CellSkipListSet snapshotAtCreation;
686 
687     // the pre-calculated Cell to be returned by peek() or next()
688     private Cell theNext;
689 
690     // The allocator and snapshot allocator at the time of creating this scanner
691     volatile MemStoreLAB allocatorAtCreation;
692     volatile MemStoreLAB snapshotAllocatorAtCreation;
693     
694     // A flag represents whether could stop skipping Cells for MVCC
695     // if have encountered the next row. Only used for reversed scan
696     private boolean stopSkippingCellsIfNextRow = false;
697 
698     private long readPoint;
699 
700     /*
701     Some notes...
702 
703      So memstorescanner is fixed at creation time. this includes pointers/iterators into
704     existing kvset/snapshot.  during a snapshot creation, the kvset is null, and the
705     snapshot is moved.  since kvset is null there is no point on reseeking on both,
706       we can save us the trouble. During the snapshot->hfile transition, the memstore
707       scanner is re-created by StoreScanner#updateReaders().  StoreScanner should
708       potentially do something smarter by adjusting the existing memstore scanner.
709 
710       But there is a greater problem here, that being once a scanner has progressed
711       during a snapshot scenario, we currently iterate past the kvset then 'finish' up.
712       if a scan lasts a little while, there is a chance for new entries in kvset to
713       become available but we will never see them.  This needs to be handled at the
714       StoreScanner level with coordination with MemStoreScanner.
715 
716       Currently, this problem is only partly managed: during the small amount of time
717       when the StoreScanner has not yet created a new MemStoreScanner, we will miss
718       the adds to kvset in the MemStoreScanner.
719     */
720 
721     MemStoreScanner(long readPoint) {
722       super();
723 
724       this.readPoint = readPoint;
725       cellSetAtCreation = cellSet;
726       snapshotAtCreation = snapshot;
727       if (allocator != null) {
728         this.allocatorAtCreation = allocator;
729         this.allocatorAtCreation.incScannerCount();
730       }
731       if (snapshotAllocator != null) {
732         this.snapshotAllocatorAtCreation = snapshotAllocator;
733         this.snapshotAllocatorAtCreation.incScannerCount();
734       }
735       if (Trace.isTracing() && Trace.currentSpan() != null) {
736         Trace.currentSpan().addTimelineAnnotation("Creating MemStoreScanner");
737       }
738     }
739 
740     /**
741      * Lock on 'this' must be held by caller.
742      * @param it
743      * @return Next Cell
744      */
745     private Cell getNext(Iterator<Cell> it) {
746       Cell startCell = theNext;
747       Cell v = null;
748       try {
749         while (it.hasNext()) {
750           v = it.next();
751           if (v.getSequenceId() <= this.readPoint) {
752             return v;
753           }
754           if (stopSkippingCellsIfNextRow && startCell != null
755               && comparator.compareRows(v, startCell) > 0) {
756             return null;
757           }
758         }
759 
760         return null;
761       } finally {
762         if (v != null) {
763           // in all cases, remember the last Cell iterated to
764           if (it == snapshotIt) {
765             snapshotItRow = v;
766           } else {
767             cellSetItRow = v;
768           }
769         }
770       }
771     }
772 
773     /**
774      *  Set the scanner at the seek key.
775      *  Must be called only once: there is no thread safety between the scanner
776      *   and the memStore.
777      * @param key seek value
778      * @return false if the key is null or if there is no data
779      */
780     @Override
781     public synchronized boolean seek(Cell key) {
782       if (key == null) {
783         close();
784         return false;
785       }
786       // kvset and snapshot will never be null.
787       // if tailSet can't find anything, SortedSet is empty (not null).
788       cellSetIt = cellSetAtCreation.tailSet(key).iterator();
789       snapshotIt = snapshotAtCreation.tailSet(key).iterator();
790       cellSetItRow = null;
791       snapshotItRow = null;
792 
793       return seekInSubLists(key);
794     }
795 
796 
797     /**
798      * (Re)initialize the iterators after a seek or a reseek.
799      */
800     private synchronized boolean seekInSubLists(Cell key){
801       cellSetNextRow = getNext(cellSetIt);
802       snapshotNextRow = getNext(snapshotIt);
803 
804       // Calculate the next value
805       theNext = getLowest(cellSetNextRow, snapshotNextRow);
806 
807       // has data
808       return (theNext != null);
809     }
810 
811 
812     /**
813      * Move forward on the sub-lists set previously by seek.
814      * @param key seek value (should be non-null)
815      * @return true if there is at least one KV to read, false otherwise
816      */
817     @Override
818     public synchronized boolean reseek(Cell key) {
819       /*
820       See HBASE-4195 & HBASE-3855 & HBASE-6591 for the background on this implementation.
821       This code is executed concurrently with flush and puts, without locks.
822       Two points must be known when working on this code:
823       1) It's not possible to use the 'kvTail' and 'snapshot'
824        variables, as they are modified during a flush.
825       2) The ideal implementation for performance would use the sub skip list
826        implicitly pointed by the iterators 'kvsetIt' and
827        'snapshotIt'. Unfortunately the Java API does not offer a method to
828        get it. So we remember the last keys we iterated to and restore
829        the reseeked set to at least that point.
830        */
831       cellSetIt = cellSetAtCreation.tailSet(getHighest(key, cellSetItRow)).iterator();
832       snapshotIt = snapshotAtCreation.tailSet(getHighest(key, snapshotItRow)).iterator();
833 
834       return seekInSubLists(key);
835     }
836 
837 
838     @Override
839     public synchronized Cell peek() {
840       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
841       return theNext;
842     }
843 
844     @Override
845     public synchronized Cell next() {
846       if (theNext == null) {
847           return null;
848       }
849 
850       final Cell ret = theNext;
851 
852       // Advance one of the iterators
853       if (theNext == cellSetNextRow) {
854         cellSetNextRow = getNext(cellSetIt);
855       } else {
856         snapshotNextRow = getNext(snapshotIt);
857       }
858 
859       // Calculate the next value
860       theNext = getLowest(cellSetNextRow, snapshotNextRow);
861 
862       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
863       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
864       //    getLowest() + " threadpoint=" + readpoint);
865       return ret;
866     }
867 
868     /*
869      * Returns the lower of the two key values, or null if they are both null.
870      * This uses comparator.compare() to compare the KeyValue using the memstore
871      * comparator.
872      */
873     private Cell getLowest(Cell first, Cell second) {
874       if (first == null && second == null) {
875         return null;
876       }
877       if (first != null && second != null) {
878         int compare = comparator.compare(first, second);
879         return (compare <= 0 ? first : second);
880       }
881       return (first != null ? first : second);
882     }
883 
884     /*
885      * Returns the higher of the two cells, or null if they are both null.
886      * This uses comparator.compare() to compare the Cell using the memstore
887      * comparator.
888      */
889     private Cell getHighest(Cell first, Cell second) {
890       if (first == null && second == null) {
891         return null;
892       }
893       if (first != null && second != null) {
894         int compare = comparator.compare(first, second);
895         return (compare > 0 ? first : second);
896       }
897       return (first != null ? first : second);
898     }
899 
900     public synchronized void close() {
901       this.cellSetNextRow = null;
902       this.snapshotNextRow = null;
903 
904       this.cellSetIt = null;
905       this.snapshotIt = null;
906       
907       if (allocatorAtCreation != null) {
908         this.allocatorAtCreation.decScannerCount();
909         this.allocatorAtCreation = null;
910       }
911       if (snapshotAllocatorAtCreation != null) {
912         this.snapshotAllocatorAtCreation.decScannerCount();
913         this.snapshotAllocatorAtCreation = null;
914       }
915 
916       this.cellSetItRow = null;
917       this.snapshotItRow = null;
918     }
919 
920     /**
921      * MemStoreScanner returns max value as sequence id because it will
922      * always have the latest data among all files.
923      */
924     @Override
925     public long getSequenceID() {
926       return Long.MAX_VALUE;
927     }
928 
929     @Override
930     public boolean shouldUseScanner(Scan scan, SortedSet<byte[]> columns,
931         long oldestUnexpiredTS) {
932       return shouldSeek(scan, oldestUnexpiredTS);
933     }
934 
935     /**
936      * Seek scanner to the given key first. If it returns false(means
937      * peek()==null) or scanner's peek row is bigger than row of given key, seek
938      * the scanner to the previous row of given key
939      */
940     @Override
941     public synchronized boolean backwardSeek(Cell key) {
942       seek(key);
943       if (peek() == null || comparator.compareRows(peek(), key) > 0) {
944         return seekToPreviousRow(key);
945       }
946       return true;
947     }
948 
949     /**
950      * Separately get the KeyValue before the specified key from kvset and
951      * snapshotset, and use the row of higher one as the previous row of
952      * specified key, then seek to the first KeyValue of previous row
953      */
954     @Override
955     public synchronized boolean seekToPreviousRow(Cell key) {
956       Cell firstKeyOnRow = KeyValueUtil.createFirstOnRow(key.getRowArray(), key.getRowOffset(),
957           key.getRowLength());
958       SortedSet<Cell> cellHead = cellSetAtCreation.headSet(firstKeyOnRow);
959       Cell cellSetBeforeRow = cellHead.isEmpty() ? null : cellHead.last();
960       SortedSet<Cell> snapshotHead = snapshotAtCreation
961           .headSet(firstKeyOnRow);
962       Cell snapshotBeforeRow = snapshotHead.isEmpty() ? null : snapshotHead
963           .last();
964       Cell lastCellBeforeRow = getHighest(cellSetBeforeRow, snapshotBeforeRow);
965       if (lastCellBeforeRow == null) {
966         theNext = null;
967         return false;
968       }
969       Cell firstKeyOnPreviousRow = KeyValueUtil.createFirstOnRow(lastCellBeforeRow.getRowArray(),
970           lastCellBeforeRow.getRowOffset(), lastCellBeforeRow.getRowLength());
971       this.stopSkippingCellsIfNextRow = true;
972       seek(firstKeyOnPreviousRow);
973       this.stopSkippingCellsIfNextRow = false;
974       if (peek() == null
975           || comparator.compareRows(peek(), firstKeyOnPreviousRow) > 0) {
976         return seekToPreviousRow(lastCellBeforeRow);
977       }
978       return true;
979     }
980 
981     @Override
982     public synchronized boolean seekToLastRow() {
983       Cell first = cellSetAtCreation.isEmpty() ? null : cellSetAtCreation
984           .last();
985       Cell second = snapshotAtCreation.isEmpty() ? null
986           : snapshotAtCreation.last();
987       Cell higherCell = getHighest(first, second);
988       if (higherCell == null) {
989         return false;
990       }
991       Cell firstCellOnLastRow = KeyValueUtil.createFirstOnRow(higherCell.getRowArray(),
992           higherCell.getRowOffset(), higherCell.getRowLength());
993       if (seek(firstCellOnLastRow)) {
994         return true;
995       } else {
996         return seekToPreviousRow(higherCell);
997       }
998 
999     }
1000   }
1001 
1002   public final static long FIXED_OVERHEAD = ClassSize.align(
1003       ClassSize.OBJECT + (9 * ClassSize.REFERENCE) + (3 * Bytes.SIZEOF_LONG));
1004 
1005   public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
1006       ClassSize.ATOMIC_LONG + (2 * ClassSize.TIMERANGE_TRACKER) +
1007       (2 * ClassSize.CELL_SKIPLIST_SET) + (2 * ClassSize.CONCURRENT_SKIPLISTMAP));
1008 
1009   /*
1010    * Calculate how the MemStore size has changed.  Includes overhead of the
1011    * backing Map.
1012    * @param cell
1013    * @param notpresent True if the cell was NOT present in the set.
1014    * @return Size
1015    */
1016   static long heapSizeChange(final Cell cell, final boolean notpresent) {
1017     return notpresent ? ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY
1018         + CellUtil.estimatedHeapSizeOf(cell)) : 0;
1019   }
1020 
1021   private long keySize() {
1022     return heapSize() - DEEP_OVERHEAD;
1023   }
1024 
1025   /**
1026    * Get the entire heap usage for this MemStore not including keys in the
1027    * snapshot.
1028    */
1029   @Override
1030   public long heapSize() {
1031     return size.get();
1032   }
1033 
1034   @Override
1035   public long size() {
1036     return heapSize();
1037   }
1038  
1039   /**
1040    * Code to help figure if our approximation of object heap sizes is close
1041    * enough.  See hbase-900.  Fills memstores then waits so user can heap
1042    * dump and bring up resultant hprof in something like jprofiler which
1043    * allows you get 'deep size' on objects.
1044    * @param args main args
1045    */
1046   public static void main(String [] args) {
1047     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
1048     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
1049       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
1050     LOG.info("vmInputArguments=" + runtime.getInputArguments());
1051     DefaultMemStore memstore1 = new DefaultMemStore();
1052     // TODO: x32 vs x64
1053     long size = 0;
1054     final int count = 10000;
1055     byte [] fam = Bytes.toBytes("col");
1056     byte [] qf = Bytes.toBytes("umn");
1057     byte [] empty = new byte[0];
1058     for (int i = 0; i < count; i++) {
1059       // Give each its own ts
1060       Pair<Long, Cell> ret = memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1061       size += ret.getFirst();
1062     }
1063     LOG.info("memstore1 estimated size=" + size);
1064     for (int i = 0; i < count; i++) {
1065       Pair<Long, Cell> ret = memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1066       size += ret.getFirst();
1067     }
1068     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
1069     // Make a variably sized memstore.
1070     DefaultMemStore memstore2 = new DefaultMemStore();
1071     for (int i = 0; i < count; i++) {
1072       Pair<Long, Cell> ret = memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
1073         new byte[i]));
1074       size += ret.getFirst();
1075     }
1076     LOG.info("memstore2 estimated size=" + size);
1077     final int seconds = 30;
1078     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
1079     for (int i = 0; i < seconds; i++) {
1080       // Thread.sleep(1000);
1081     }
1082     LOG.info("Exiting.");
1083   }
1084 
1085 }