View Javadoc

1   /**
2    * Copyright 2010 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  
21  package org.apache.hadoop.hbase.regionserver;
22  
23  import java.lang.management.ManagementFactory;
24  import java.lang.management.RuntimeMXBean;
25  import java.rmi.UnexpectedException;
26  import java.util.Arrays;
27  import java.util.Collections;
28  import java.util.Iterator;
29  import java.util.List;
30  import java.util.NavigableSet;
31  import java.util.SortedSet;
32  import java.util.concurrent.atomic.AtomicLong;
33  import java.util.concurrent.locks.ReentrantReadWriteLock;
34  
35  import org.apache.commons.logging.Log;
36  import org.apache.commons.logging.LogFactory;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.hbase.HBaseConfiguration;
39  import org.apache.hadoop.hbase.HConstants;
40  import org.apache.hadoop.hbase.KeyValue;
41  import org.apache.hadoop.hbase.client.Scan;
42  import org.apache.hadoop.hbase.io.HeapSize;
43  import org.apache.hadoop.hbase.regionserver.MemStoreLAB.Allocation;
44  import org.apache.hadoop.hbase.util.Bytes;
45  import org.apache.hadoop.hbase.util.ClassSize;
46  
47  /**
48   * The MemStore holds in-memory modifications to the Store.  Modifications
49   * are {@link KeyValue}s.  When asked to flush, current memstore is moved
50   * to snapshot and is cleared.  We continue to serve edits out of new memstore
51   * and backing snapshot until flusher reports in that the flush succeeded. At
52   * this point we let the snapshot go.
53   * TODO: Adjust size of the memstore when we remove items because they have
54   * been deleted.
55   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
56   * in KV size.
57   */
58  public class MemStore implements HeapSize {
59    private static final Log LOG = LogFactory.getLog(MemStore.class);
60  
61    static final String USEMSLAB_KEY =
62      "hbase.hregion.memstore.mslab.enabled";
63    private static final boolean USEMSLAB_DEFAULT = false;
64  
65  
66    private Configuration conf;
67  
68    // MemStore.  Use a KeyValueSkipListSet rather than SkipListSet because of the
69    // better semantics.  The Map will overwrite if passed a key it already had
70    // whereas the Set will not add new KV if key is same though value might be
71    // different.  Value is not important -- just make sure always same
72    // reference passed.
73    volatile KeyValueSkipListSet kvset;
74  
75    // Snapshot of memstore.  Made for flusher.
76    volatile KeyValueSkipListSet snapshot;
77  
78    final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
79  
80    final KeyValue.KVComparator comparator;
81  
82    // Used comparing versions -- same r/c and ts but different type.
83    final KeyValue.KVComparator comparatorIgnoreType;
84  
85    // Used comparing versions -- same r/c and type but different timestamp.
86    final KeyValue.KVComparator comparatorIgnoreTimestamp;
87  
88    // Used to track own heapSize
89    final AtomicLong size;
90  
91    TimeRangeTracker timeRangeTracker;
92    TimeRangeTracker snapshotTimeRangeTracker;
93    
94    MemStoreLAB allocator;
95  
  /**
   * Default constructor. Used for tests.
   * Delegates to {@link #MemStore(Configuration, KeyValue.KVComparator)} with
   * a configuration obtained from {@link HBaseConfiguration#create()} and
   * {@link KeyValue#COMPARATOR}.
   */
  public MemStore() {
    this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
  }
102 
103   /**
104    * Constructor.
105    * @param c Comparator
106    */
107   public MemStore(final Configuration conf,
108                   final KeyValue.KVComparator c) {
109     this.conf = conf;
110     this.comparator = c;
111     this.comparatorIgnoreTimestamp =
112       this.comparator.getComparatorIgnoringTimestamps();
113     this.comparatorIgnoreType = this.comparator.getComparatorIgnoringType();
114     this.kvset = new KeyValueSkipListSet(c);
115     this.snapshot = new KeyValueSkipListSet(c);
116     timeRangeTracker = new TimeRangeTracker();
117     snapshotTimeRangeTracker = new TimeRangeTracker();
118     this.size = new AtomicLong(DEEP_OVERHEAD);
119     if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
120       this.allocator = new MemStoreLAB(conf);
121     } else {
122       this.allocator = null;
123     }
124   }
125 
126   void dump() {
127     for (KeyValue kv: this.kvset) {
128       LOG.info(kv);
129     }
130     for (KeyValue kv: this.snapshot) {
131       LOG.info(kv);
132     }
133   }
134 
135   /**
136    * Creates a snapshot of the current memstore.
137    * Snapshot must be cleared by call to {@link #clearSnapshot(SortedSet<KeyValue>)}
138    * To get the snapshot made by this method, use {@link #getSnapshot()}
139    */
140   void snapshot() {
141     this.lock.writeLock().lock();
142     try {
143       // If snapshot currently has entries, then flusher failed or didn't call
144       // cleanup.  Log a warning.
145       if (!this.snapshot.isEmpty()) {
146         LOG.warn("Snapshot called again without clearing previous. " +
147           "Doing nothing. Another ongoing flush or did we fail last attempt?");
148       } else {
149         if (!this.kvset.isEmpty()) {
150           this.snapshot = this.kvset;
151           this.kvset = new KeyValueSkipListSet(this.comparator);
152           this.snapshotTimeRangeTracker = this.timeRangeTracker;
153           this.timeRangeTracker = new TimeRangeTracker();
154           // Reset heap to not include any keys
155           this.size.set(DEEP_OVERHEAD);
156           // Reset allocator so we get a fresh buffer for the new memstore
157           if (allocator != null) {
158             this.allocator = new MemStoreLAB(conf);
159           }
160         }
161       }
162     } finally {
163       this.lock.writeLock().unlock();
164     }
165   }
166 
167   /**
168    * Return the current snapshot.
169    * Called by flusher to get current snapshot made by a previous
170    * call to {@link #snapshot()}
171    * @return Return snapshot.
172    * @see {@link #snapshot()}
173    * @see {@link #clearSnapshot(SortedSet<KeyValue>)}
174    */
175   KeyValueSkipListSet getSnapshot() {
176     return this.snapshot;
177   }
178 
179   /**
180    * The passed snapshot was successfully persisted; it can be let go.
181    * @param ss The snapshot to clean out.
182    * @throws UnexpectedException
183    * @see {@link #snapshot()}
184    */
185   void clearSnapshot(final SortedSet<KeyValue> ss)
186   throws UnexpectedException {
187     this.lock.writeLock().lock();
188     try {
189       if (this.snapshot != ss) {
190         throw new UnexpectedException("Current snapshot is " +
191           this.snapshot + ", was passed " + ss);
192       }
193       // OK. Passed in snapshot is same as current snapshot.  If not-empty,
194       // create a new snapshot and let the old one go.
195       if (!ss.isEmpty()) {
196         this.snapshot = new KeyValueSkipListSet(this.comparator);
197         this.snapshotTimeRangeTracker = new TimeRangeTracker();
198       }
199     } finally {
200       this.lock.writeLock().unlock();
201     }
202   }
203 
204   /**
205    * Write an update
206    * @param kv
207    * @return approximate size of the passed key and value.
208    */
209   long add(final KeyValue kv) {
210     this.lock.readLock().lock();
211     try {
212       KeyValue toAdd = maybeCloneWithAllocator(kv);
213       return internalAdd(toAdd);
214     } finally {
215       this.lock.readLock().unlock();
216     }
217   }
218   
219   /**
220    * Internal version of add() that doesn't clone KVs with the
221    * allocator, and doesn't take the lock.
222    * 
223    * Callers should ensure they already have the read lock taken
224    */
225   private long internalAdd(final KeyValue toAdd) {
226     long s = heapSizeChange(toAdd, this.kvset.add(toAdd));
227     timeRangeTracker.includeTimestamp(toAdd);
228     this.size.addAndGet(s);
229     return s;
230   }
231 
232   private KeyValue maybeCloneWithAllocator(KeyValue kv) {
233     if (allocator == null) {
234       return kv;
235     }
236 
237     int len = kv.getLength();
238     Allocation alloc = allocator.allocateBytes(len);
239     if (alloc == null) {
240       // The allocation was too large, allocator decided
241       // not to do anything with it.
242       return kv;
243     }
244     assert alloc != null && alloc.getData() != null;
245     System.arraycopy(kv.getBuffer(), kv.getOffset(), alloc.getData(), alloc.getOffset(), len);
246     KeyValue newKv = new KeyValue(alloc.getData(), alloc.getOffset(), len);
247     newKv.setMemstoreTS(kv.getMemstoreTS());
248     return newKv;
249   }
250 
251   /**
252    * Write a delete
253    * @param delete
254    * @return approximate size of the passed key and value.
255    */
256   long delete(final KeyValue delete) {
257     long s = 0;
258     this.lock.readLock().lock();
259     try {
260       KeyValue toAdd = maybeCloneWithAllocator(delete);
261       s += heapSizeChange(toAdd, this.kvset.add(toAdd));
262       timeRangeTracker.includeTimestamp(toAdd);
263     } finally {
264       this.lock.readLock().unlock();
265     }
266     this.size.addAndGet(s);
267     return s;
268   }
269 
270   /**
271    * @param kv Find the row that comes after this one.  If null, we return the
272    * first.
273    * @return Next row or null if none found.
274    */
275   KeyValue getNextRow(final KeyValue kv) {
276     this.lock.readLock().lock();
277     try {
278       return getLowest(getNextRow(kv, this.kvset), getNextRow(kv, this.snapshot));
279     } finally {
280       this.lock.readLock().unlock();
281     }
282   }
283 
284   /*
285    * @param a
286    * @param b
287    * @return Return lowest of a or b or null if both a and b are null
288    */
289   private KeyValue getLowest(final KeyValue a, final KeyValue b) {
290     if (a == null) {
291       return b;
292     }
293     if (b == null) {
294       return a;
295     }
296     return comparator.compareRows(a, b) <= 0? a: b;
297   }
298 
299   /*
300    * @param key Find row that follows this one.  If null, return first.
301    * @param map Set to look in for a row beyond <code>row</code>.
302    * @return Next row or null if none found.  If one found, will be a new
303    * KeyValue -- can be destroyed by subsequent calls to this method.
304    */
305   private KeyValue getNextRow(final KeyValue key,
306       final NavigableSet<KeyValue> set) {
307     KeyValue result = null;
308     SortedSet<KeyValue> tail = key == null? set: set.tailSet(key);
309     // Iterate until we fall into the next row; i.e. move off current row
310     for (KeyValue kv: tail) {
311       if (comparator.compareRows(kv, key) <= 0)
312         continue;
313       // Note: Not suppressing deletes or expired cells.  Needs to be handled
314       // by higher up functions.
315       result = kv;
316       break;
317     }
318     return result;
319   }
320 
321   /**
322    * @param state column/delete tracking state
323    */
324   void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
325     this.lock.readLock().lock();
326     try {
327       getRowKeyAtOrBefore(kvset, state);
328       getRowKeyAtOrBefore(snapshot, state);
329     } finally {
330       this.lock.readLock().unlock();
331     }
332   }
333 
334   /*
335    * @param set
336    * @param state Accumulates deletes and candidates.
337    */
338   private void getRowKeyAtOrBefore(final NavigableSet<KeyValue> set,
339       final GetClosestRowBeforeTracker state) {
340     if (set.isEmpty()) {
341       return;
342     }
343     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
344       // Found nothing in row.  Try backing up.
345       getRowKeyBefore(set, state);
346     }
347   }
348 
349   /*
350    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
351    * we have been passed the first possible key on a row.  As we walk forward
352    * we accumulate deletes until we hit a candidate on the row at which point
353    * we return.
354    * @param set
355    * @param firstOnRow First possible key on this row.
356    * @param state
357    * @return True if we found a candidate walking this row.
358    */
359   private boolean walkForwardInSingleRow(final SortedSet<KeyValue> set,
360       final KeyValue firstOnRow, final GetClosestRowBeforeTracker state) {
361     boolean foundCandidate = false;
362     SortedSet<KeyValue> tail = set.tailSet(firstOnRow);
363     if (tail.isEmpty()) return foundCandidate;
364     for (Iterator<KeyValue> i = tail.iterator(); i.hasNext();) {
365       KeyValue kv = i.next();
366       // Did we go beyond the target row? If so break.
367       if (state.isTooFar(kv, firstOnRow)) break;
368       if (state.isExpired(kv)) {
369         i.remove();
370         continue;
371       }
372       // If we added something, this row is a contender. break.
373       if (state.handle(kv)) {
374         foundCandidate = true;
375         break;
376       }
377     }
378     return foundCandidate;
379   }
380 
381   /*
382    * Walk backwards through the passed set a row at a time until we run out of
383    * set or until we get a candidate.
384    * @param set
385    * @param state
386    */
387   private void getRowKeyBefore(NavigableSet<KeyValue> set,
388       final GetClosestRowBeforeTracker state) {
389     KeyValue firstOnRow = state.getTargetKey();
390     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
391         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
392       // Make sure we don't fall out of our table.
393       if (!state.isTargetTable(p.kv)) break;
394       // Stop looking if we've exited the better candidate range.
395       if (!state.isBetterCandidate(p.kv)) break;
396       // Make into firstOnRow
397       firstOnRow = new KeyValue(p.kv.getRow(), HConstants.LATEST_TIMESTAMP);
398       // If we find something, break;
399       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
400     }
401   }
402 
403   /**
404    * Given the specs of a column, update it, first by inserting a new record,
405    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
406    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
407    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
408    * get the new value, or the old value and all readers will eventually only see the new
409    * value after the old was removed.
410    *
411    * @param row
412    * @param family
413    * @param qualifier
414    * @param newValue
415    * @param now
416    * @return  Timestamp
417    */
418   public long updateColumnValue(byte[] row,
419                                 byte[] family,
420                                 byte[] qualifier,
421                                 long newValue,
422                                 long now) {
423    this.lock.readLock().lock();
424     try {
425       KeyValue firstKv = KeyValue.createFirstOnRow(
426           row, family, qualifier);
427       // Is there a KeyValue in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
428       SortedSet<KeyValue> snSs = snapshot.tailSet(firstKv);
429       if (!snSs.isEmpty()) {
430         KeyValue snKv = snSs.first();
431         // is there a matching KV in the snapshot?
432         if (snKv.matchingRow(firstKv) && snKv.matchingQualifier(firstKv)) {
433           if (snKv.getTimestamp() == now) {
434             // poop,
435             now += 1;
436           }
437         }
438       }
439 
440       // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
441       // But the timestamp should also be max(now, mostRecentTsInMemstore)
442 
443       // so we cant add the new KV w/o knowing what's there already, but we also
444       // want to take this chance to delete some kvs. So two loops (sad)
445 
446       SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
447       Iterator<KeyValue> it = ss.iterator();
448       while ( it.hasNext() ) {
449         KeyValue kv = it.next();
450 
451         // if this isnt the row we are interested in, then bail:
452         if (!firstKv.matchingColumn(family,qualifier) || !firstKv.matchingRow(kv) ) {
453           break; // rows dont match, bail.
454         }
455 
456         // if the qualifier matches and it's a put, just RM it out of the kvset.
457         if (firstKv.matchingQualifier(kv)) {
458           // to be extra safe we only remove Puts that have a memstoreTS==0
459           if (kv.getType() == KeyValue.Type.Put.getCode()) {
460             now = Math.max(now, kv.getTimestamp());
461           }
462         }
463       }
464 
465       // create or update (upsert) a new KeyValue with
466       // 'now' and a 0 memstoreTS == immediately visible
467       return upsert(Arrays.asList(new KeyValue [] {
468           new KeyValue(row, family, qualifier, now,
469               Bytes.toBytes(newValue))
470       }));
471     } finally {
472       this.lock.readLock().unlock();
473     }
474   }
475 
476   /**
477    * Update or insert the specified KeyValues.
478    * <p>
479    * For each KeyValue, insert into MemStore.  This will atomically upsert the
480    * value for that row/family/qualifier.  If a KeyValue did already exist,
481    * it will then be removed.
482    * <p>
483    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
484    * be immediately visible.  May want to change this so it is atomic across
485    * all KeyValues.
486    * <p>
487    * This is called under row lock, so Get operations will still see updates
488    * atomically.  Scans will only see each KeyValue update as atomic.
489    *
490    * @param kvs
491    * @return change in memstore size
492    */
493   public long upsert(List<KeyValue> kvs) {
494    this.lock.readLock().lock();
495     try {
496       long size = 0;
497       for (KeyValue kv : kvs) {
498         kv.setMemstoreTS(0);
499         size += upsert(kv);
500       }
501       return size;
502     } finally {
503       this.lock.readLock().unlock();
504     }
505   }
506 
507   /**
508    * Inserts the specified KeyValue into MemStore and deletes any existing
509    * versions of the same row/family/qualifier as the specified KeyValue.
510    * <p>
511    * First, the specified KeyValue is inserted into the Memstore.
512    * <p>
513    * If there are any existing KeyValues in this MemStore with the same row,
514    * family, and qualifier, they are removed.
515    * <p>
516    * Callers must hold the read lock.
517    * 
518    * @param kv
519    * @return change in size of MemStore
520    */
521   private long upsert(KeyValue kv) {
522     // Add the KeyValue to the MemStore
523     // Use the internalAdd method here since we (a) already have a lock
524     // and (b) cannot safely use the MSLAB here without potentially
525     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
526     // test that triggers the pathological case if we don't avoid MSLAB
527     // here.
528     long addedSize = internalAdd(kv);
529 
530     // Get the KeyValues for the row/family/qualifier regardless of timestamp.
531     // For this case we want to clean up any other puts
532     KeyValue firstKv = KeyValue.createFirstOnRow(
533         kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
534         kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
535         kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
536     SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
537     Iterator<KeyValue> it = ss.iterator();
538     while ( it.hasNext() ) {
539       KeyValue cur = it.next();
540 
541       if (kv == cur) {
542         // ignore the one just put in
543         continue;
544       }
545       // if this isn't the row we are interested in, then bail
546       if (!kv.matchingRow(cur)) {
547         break;
548       }
549 
550       // if the qualifier matches and it's a put, remove it
551       if (kv.matchingQualifier(cur)) {
552 
553         // to be extra safe we only remove Puts that have a memstoreTS==0
554         if (kv.getType() == KeyValue.Type.Put.getCode() &&
555             kv.getMemstoreTS() == 0) {
556           // false means there was a change, so give us the size.
557           addedSize -= heapSizeChange(kv, true);
558           it.remove();
559         }
560       } else {
561         // past the column, done
562         break;
563       }
564     }
565     return addedSize;
566   }
567 
568   /*
569    * Immutable data structure to hold member found in set and the set it was
570    * found in.  Include set because it is carrying context.
571    */
572   private static class Member {
573     final KeyValue kv;
574     final NavigableSet<KeyValue> set;
575     Member(final NavigableSet<KeyValue> s, final KeyValue kv) {
576       this.kv = kv;
577       this.set = s;
578     }
579   }
580 
581   /*
582    * @param set Set to walk back in.  Pass a first in row or we'll return
583    * same row (loop).
584    * @param state Utility and context.
585    * @param firstOnRow First item on the row after the one we want to find a
586    * member in.
587    * @return Null or member of row previous to <code>firstOnRow</code>
588    */
589   private Member memberOfPreviousRow(NavigableSet<KeyValue> set,
590       final GetClosestRowBeforeTracker state, final KeyValue firstOnRow) {
591     NavigableSet<KeyValue> head = set.headSet(firstOnRow, false);
592     if (head.isEmpty()) return null;
593     for (Iterator<KeyValue> i = head.descendingIterator(); i.hasNext();) {
594       KeyValue found = i.next();
595       if (state.isExpired(found)) {
596         i.remove();
597         continue;
598       }
599       return new Member(head, found);
600     }
601     return null;
602   }
603 
604   /**
605    * @return scanner on memstore and snapshot in this order.
606    */
607   List<KeyValueScanner> getScanners() {
608     this.lock.readLock().lock();
609     try {
610       return Collections.<KeyValueScanner>singletonList(
611           new MemStoreScanner());
612     } finally {
613       this.lock.readLock().unlock();
614     }
615   }
616 
617   /**
618    * Check if this memstore may contain the required keys
619    * @param scan
620    * @return False if the key definitely does not exist in this Memstore
621    */
622   public boolean shouldSeek(Scan scan) {
623     return timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
624         snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange());
625   }
626 
627   public TimeRangeTracker getSnapshotTimeRangeTracker() {
628     return this.snapshotTimeRangeTracker;
629   }
630 
631   /*
632    * MemStoreScanner implements the KeyValueScanner.
633    * It lets the caller scan the contents of a memstore -- both current
634    * map and snapshot.
635    * This behaves as if it were a real scanner but does not maintain position.
636    */
637   protected class MemStoreScanner implements KeyValueScanner {
638     // Next row information for either kvset or snapshot
639     private KeyValue kvsetNextRow = null;
640     private KeyValue snapshotNextRow = null;
641 
642     // iterator based scanning.
643     Iterator<KeyValue> kvsetIt;
644     Iterator<KeyValue> snapshotIt;
645 
646     /*
647     Some notes...
648 
649      So memstorescanner is fixed at creation time. this includes pointers/iterators into
650     existing kvset/snapshot.  during a snapshot creation, the kvset is null, and the
651     snapshot is moved.  since kvset is null there is no point on reseeking on both,
652       we can save us the trouble. During the snapshot->hfile transition, the memstore
653       scanner is re-created by StoreScanner#updateReaders().  StoreScanner should
654       potentially do something smarter by adjusting the existing memstore scanner.
655 
656       But there is a greater problem here, that being once a scanner has progressed
657       during a snapshot scenario, we currently iterate past the kvset then 'finish' up.
658       if a scan lasts a little while, there is a chance for new entries in kvset to
659       become available but we will never see them.  This needs to be handled at the
660       StoreScanner level with coordination with MemStoreScanner.
661 
662     */
663 
664     MemStoreScanner() {
665       super();
666 
667       //DebugPrint.println(" MS new@" + hashCode());
668     }
669 
670     protected KeyValue getNext(Iterator<KeyValue> it) {
671       KeyValue ret = null;
672       long readPoint = ReadWriteConsistencyControl.getThreadReadPoint();
673       //DebugPrint.println( " MS@" + hashCode() + ": threadpoint = " + readPoint);
674 
675       while (ret == null && it.hasNext()) {
676         KeyValue v = it.next();
677         if (v.getMemstoreTS() <= readPoint) {
678           // keep it.
679           ret = v;
680         }
681       }
682       return ret;
683     }
684 
685     public synchronized boolean seek(KeyValue key) {
686       if (key == null) {
687         close();
688         return false;
689       }
690 
691       // kvset and snapshot will never be empty.
692       // if tailSet cant find anything, SS is empty (not null).
693       SortedSet<KeyValue> kvTail = kvset.tailSet(key);
694       SortedSet<KeyValue> snapshotTail = snapshot.tailSet(key);
695 
696       kvsetIt = kvTail.iterator();
697       snapshotIt = snapshotTail.iterator();
698 
699       kvsetNextRow = getNext(kvsetIt);
700       snapshotNextRow = getNext(snapshotIt);
701 
702 
703       //long readPoint = ReadWriteConsistencyControl.getThreadReadPoint();
704       //DebugPrint.println( " MS@" + hashCode() + " kvset seek: " + kvsetNextRow + " with size = " +
705       //    kvset.size() + " threadread = " + readPoint);
706       //DebugPrint.println( " MS@" + hashCode() + " snapshot seek: " + snapshotNextRow + " with size = " +
707       //    snapshot.size() + " threadread = " + readPoint);
708 
709 
710       KeyValue lowest = getLowest();
711 
712       // has data := (lowest != null)
713       return lowest != null;
714     }
715 
716     @Override
717     public boolean reseek(KeyValue key) {
718       while (kvsetNextRow != null &&
719           comparator.compare(kvsetNextRow, key) < 0) {
720         kvsetNextRow = getNext(kvsetIt);
721       }
722 
723       while (snapshotNextRow != null &&
724           comparator.compare(snapshotNextRow, key) < 0) {
725         snapshotNextRow = getNext(snapshotIt);
726       }
727       return (kvsetNextRow != null || snapshotNextRow != null);
728     }
729 
730     public synchronized KeyValue peek() {
731       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
732       return getLowest();
733     }
734 
735 
736     public synchronized KeyValue next() {
737       KeyValue theNext = getLowest();
738 
739       if (theNext == null) {
740           return null;
741       }
742 
743       // Advance one of the iterators
744       if (theNext == kvsetNextRow) {
745         kvsetNextRow = getNext(kvsetIt);
746       } else {
747         snapshotNextRow = getNext(snapshotIt);
748       }
749 
750       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
751       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
752       //    getLowest() + " threadpoint=" + readpoint);
753       return theNext;
754     }
755 
756     protected KeyValue getLowest() {
757       return getLower(kvsetNextRow,
758           snapshotNextRow);
759     }
760 
761     /*
762      * Returns the lower of the two key values, or null if they are both null.
763      * This uses comparator.compare() to compare the KeyValue using the memstore
764      * comparator.
765      */
766     protected KeyValue getLower(KeyValue first, KeyValue second) {
767       if (first == null && second == null) {
768         return null;
769       }
770       if (first != null && second != null) {
771         int compare = comparator.compare(first, second);
772         return (compare <= 0 ? first : second);
773       }
774       return (first != null ? first : second);
775     }
776 
777     public synchronized void close() {
778       this.kvsetNextRow = null;
779       this.snapshotNextRow = null;
780 
781       this.kvsetIt = null;
782       this.snapshotIt = null;
783     }
784 
785     /**
786      * MemStoreScanner returns max value as sequence id because it will
787      * always have the latest data among all files.
788      */
789     @Override
790     public long getSequenceID() {
791       return Long.MAX_VALUE;
792     }
793   }
794 
795   public final static long FIXED_OVERHEAD = ClassSize.align(
796       ClassSize.OBJECT + (11 * ClassSize.REFERENCE));
797 
798   public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
799       ClassSize.REENTRANT_LOCK + ClassSize.ATOMIC_LONG +
800       ClassSize.COPYONWRITE_ARRAYSET + ClassSize.COPYONWRITE_ARRAYLIST +
801       (2 * ClassSize.CONCURRENT_SKIPLISTMAP));
802 
803   /*
804    * Calculate how the MemStore size has changed.  Includes overhead of the
805    * backing Map.
806    * @param kv
807    * @param notpresent True if the kv was NOT present in the set.
808    * @return Size
809    */
810   long heapSizeChange(final KeyValue kv, final boolean notpresent) {
811     return notpresent ?
812         ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + kv.heapSize()):
813         0;
814   }
815 
816   /**
817    * Get the entire heap usage for this MemStore not including keys in the
818    * snapshot.
819    */
820   @Override
821   public long heapSize() {
822     return size.get();
823   }
824 
825   /**
826    * Get the heap usage of KVs in this MemStore.
827    */
828   public long keySize() {
829     return heapSize() - DEEP_OVERHEAD;
830   }
831 
832   /**
833    * Code to help figure if our approximation of object heap sizes is close
834    * enough.  See hbase-900.  Fills memstores then waits so user can heap
835    * dump and bring up resultant hprof in something like jprofiler which
836    * allows you get 'deep size' on objects.
837    * @param args main args
838    */
839   public static void main(String [] args) {
840     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
841     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
842       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
843     LOG.info("vmInputArguments=" + runtime.getInputArguments());
844     MemStore memstore1 = new MemStore();
845     // TODO: x32 vs x64
846     long size = 0;
847     final int count = 10000;
848     byte [] fam = Bytes.toBytes("col");
849     byte [] qf = Bytes.toBytes("umn");
850     byte [] empty = new byte[0];
851     for (int i = 0; i < count; i++) {
852       // Give each its own ts
853       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
854     }
855     LOG.info("memstore1 estimated size=" + size);
856     for (int i = 0; i < count; i++) {
857       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
858     }
859     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
860     // Make a variably sized memstore.
861     MemStore memstore2 = new MemStore();
862     for (int i = 0; i < count; i++) {
863       size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
864         new byte[i]));
865     }
866     LOG.info("memstore2 estimated size=" + size);
867     final int seconds = 30;
868     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
869     for (int i = 0; i < seconds; i++) {
870       // Thread.sleep(1000);
871     }
872     LOG.info("Exiting.");
873   }
874 }