View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.lang.management.ManagementFactory;
23  import java.lang.management.RuntimeMXBean;
24  import java.rmi.UnexpectedException;
25  import java.util.ArrayList;
26  import java.util.Collections;
27  import java.util.Iterator;
28  import java.util.List;
29  import java.util.NavigableSet;
30  import java.util.SortedSet;
31  import java.util.concurrent.atomic.AtomicLong;
32  import java.util.concurrent.locks.ReentrantReadWriteLock;
33  
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.apache.hadoop.classification.InterfaceAudience;
37  import org.apache.hadoop.conf.Configuration;
38  import org.apache.hadoop.hbase.Cell;
39  import org.apache.hadoop.hbase.HBaseConfiguration;
40  import org.apache.hadoop.hbase.HConstants;
41  import org.apache.hadoop.hbase.KeyValue;
42  import org.apache.hadoop.hbase.KeyValueUtil;
43  import org.apache.hadoop.hbase.client.Scan;
44  import org.apache.hadoop.hbase.io.HeapSize;
45  import org.apache.hadoop.hbase.regionserver.MemStoreLAB.Allocation;
46  import org.apache.hadoop.hbase.util.Bytes;
47  import org.apache.hadoop.hbase.util.ClassSize;
48  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
49  
50  /**
51   * The MemStore holds in-memory modifications to the Store.  Modifications
52   * are {@link KeyValue}s.  When asked to flush, current memstore is moved
53   * to snapshot and is cleared.  We continue to serve edits out of new memstore
54   * and backing snapshot until flusher reports in that the flush succeeded. At
55   * this point we let the snapshot go.
56   * TODO: Adjust size of the memstore when we remove items because they have
57   * been deleted.
58   * TODO: With new KVSLS, need to make sure we update HeapSize with difference
59   * in KV size.
60   */
61  @InterfaceAudience.Private
62  public class MemStore implements HeapSize {
  // Logger shared by all MemStore instances.
  private static final Log LOG = LogFactory.getLog(MemStore.class);

  // Configuration key the constructor reads to decide whether a MemStoreLAB
  // allocator (and its chunk pool) is created.
  static final String USEMSLAB_KEY =
    "hbase.hregion.memstore.mslab.enabled";
  // Value used for USEMSLAB_KEY when the configuration does not set it.
  private static final boolean USEMSLAB_DEFAULT = true;

  // Configuration supplied at construction; reused whenever a new MemStoreLAB is built.
  private Configuration conf;

  // MemStore.  Use a KeyValueSkipListSet rather than SkipListSet because of the
  // better semantics.  The Map will overwrite if passed a key it already had
  // whereas the Set will not add new KV if key is same though value might be
  // different.  Value is not important -- just make sure always same
  // reference passed.
  volatile KeyValueSkipListSet kvset;

  // Snapshot of memstore.  Made for flusher.
  volatile KeyValueSkipListSet snapshot;

  // In the methods visible here, edits and lookups take the read lock while
  // snapshot() and clearSnapshot() take the write lock.
  final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();

  // Ordering used for both kvset and snapshot.
  final KeyValue.KVComparator comparator;

  // Used to track own heapSize
  final AtomicLong size;

  // Used to track when to flush.  Long.MAX_VALUE means "no edit recorded since
  // the last snapshot()".
  volatile long timeOfOldestEdit = Long.MAX_VALUE;

  // Timestamp ranges of the live set and of the snapshot respectively.
  TimeRangeTracker timeRangeTracker;
  TimeRangeTracker snapshotTimeRangeTracker;

  // Chunk pool and allocators; all remain null when MSLAB is disabled.
  MemStoreChunkPool chunkPool;
  volatile MemStoreLAB allocator;
  volatile MemStoreLAB snapshotAllocator;
97  
98    /**
99     * Default constructor. Used for tests.
100    */
101   public MemStore() {
102     this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
103   }
104 
105   /**
106    * Constructor.
107    * @param c Comparator
108    */
109   public MemStore(final Configuration conf,
110                   final KeyValue.KVComparator c) {
111     this.conf = conf;
112     this.comparator = c;
113     this.kvset = new KeyValueSkipListSet(c);
114     this.snapshot = new KeyValueSkipListSet(c);
115     timeRangeTracker = new TimeRangeTracker();
116     snapshotTimeRangeTracker = new TimeRangeTracker();
117     this.size = new AtomicLong(DEEP_OVERHEAD);
118     if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
119       this.chunkPool = MemStoreChunkPool.getPool(conf);
120       this.allocator = new MemStoreLAB(conf, chunkPool);
121     } else {
122       this.allocator = null;
123       this.chunkPool = null;
124     }
125   }
126 
127   void dump() {
128     for (KeyValue kv: this.kvset) {
129       LOG.info(kv);
130     }
131     for (KeyValue kv: this.snapshot) {
132       LOG.info(kv);
133     }
134   }
135 
136   /**
137    * Creates a snapshot of the current memstore.
138    * Snapshot must be cleared by call to {@link #clearSnapshot(SortedSet<KeyValue>)}
139    * To get the snapshot made by this method, use {@link #getSnapshot()}
140    */
141   void snapshot() {
142     this.lock.writeLock().lock();
143     try {
144       // If snapshot currently has entries, then flusher failed or didn't call
145       // cleanup.  Log a warning.
146       if (!this.snapshot.isEmpty()) {
147         LOG.warn("Snapshot called again without clearing previous. " +
148           "Doing nothing. Another ongoing flush or did we fail last attempt?");
149       } else {
150         if (!this.kvset.isEmpty()) {
151           this.snapshot = this.kvset;
152           this.kvset = new KeyValueSkipListSet(this.comparator);
153           this.snapshotTimeRangeTracker = this.timeRangeTracker;
154           this.timeRangeTracker = new TimeRangeTracker();
155           // Reset heap to not include any keys
156           this.size.set(DEEP_OVERHEAD);
157           this.snapshotAllocator = this.allocator;
158           // Reset allocator so we get a fresh buffer for the new memstore
159           if (allocator != null) {
160             this.allocator = new MemStoreLAB(conf, chunkPool);
161           }
162           timeOfOldestEdit = Long.MAX_VALUE;
163         }
164       }
165     } finally {
166       this.lock.writeLock().unlock();
167     }
168   }
169 
170   /**
171    * Return the current snapshot.
172    * Called by flusher to get current snapshot made by a previous
173    * call to {@link #snapshot()}
174    * @return Return snapshot.
175    * @see {@link #snapshot()}
176    * @see {@link #clearSnapshot(SortedSet<KeyValue>)}
177    */
178   KeyValueSkipListSet getSnapshot() {
179     return this.snapshot;
180   }
181 
182   /**
183    * The passed snapshot was successfully persisted; it can be let go.
184    * @param ss The snapshot to clean out.
185    * @throws UnexpectedException
186    * @see {@link #snapshot()}
187    */
188   void clearSnapshot(final SortedSet<KeyValue> ss)
189   throws UnexpectedException {
190     MemStoreLAB tmpAllocator = null;
191     this.lock.writeLock().lock();
192     try {
193       if (this.snapshot != ss) {
194         throw new UnexpectedException("Current snapshot is " +
195           this.snapshot + ", was passed " + ss);
196       }
197       // OK. Passed in snapshot is same as current snapshot.  If not-empty,
198       // create a new snapshot and let the old one go.
199       if (!ss.isEmpty()) {
200         this.snapshot = new KeyValueSkipListSet(this.comparator);
201         this.snapshotTimeRangeTracker = new TimeRangeTracker();
202       }
203       if (this.snapshotAllocator != null) {
204         tmpAllocator = this.snapshotAllocator;
205         this.snapshotAllocator = null;
206       }
207     } finally {
208       this.lock.writeLock().unlock();
209     }
210     if (tmpAllocator != null) {
211       tmpAllocator.close();
212     }
213   }
214 
215   /**
216    * Write an update
217    * @param kv
218    * @return approximate size of the passed key and value.
219    */
220   long add(final KeyValue kv) {
221     this.lock.readLock().lock();
222     try {
223       KeyValue toAdd = maybeCloneWithAllocator(kv);
224       return internalAdd(toAdd);
225     } finally {
226       this.lock.readLock().unlock();
227     }
228   }
229 
230   long timeOfOldestEdit() {
231     return timeOfOldestEdit;
232   }
233 
234   private boolean addToKVSet(KeyValue e) {
235     boolean b = this.kvset.add(e);
236     setOldestEditTimeToNow();
237     return b;
238   }
239 
240   private boolean removeFromKVSet(KeyValue e) {
241     boolean b = this.kvset.remove(e);
242     setOldestEditTimeToNow();
243     return b;
244   }
245 
246   void setOldestEditTimeToNow() {
247     if (timeOfOldestEdit == Long.MAX_VALUE) {
248       timeOfOldestEdit = EnvironmentEdgeManager.currentTimeMillis();
249     }
250   }
251 
252   /**
253    * Internal version of add() that doesn't clone KVs with the
254    * allocator, and doesn't take the lock.
255    *
256    * Callers should ensure they already have the read lock taken
257    */
258   private long internalAdd(final KeyValue toAdd) {
259     long s = heapSizeChange(toAdd, addToKVSet(toAdd));
260     timeRangeTracker.includeTimestamp(toAdd);
261     this.size.addAndGet(s);
262     return s;
263   }
264 
265   private KeyValue maybeCloneWithAllocator(KeyValue kv) {
266     if (allocator == null) {
267       return kv;
268     }
269 
270     int len = kv.getLength();
271     Allocation alloc = allocator.allocateBytes(len);
272     if (alloc == null) {
273       // The allocation was too large, allocator decided
274       // not to do anything with it.
275       return kv;
276     }
277     assert alloc != null && alloc.getData() != null;
278     System.arraycopy(kv.getBuffer(), kv.getOffset(), alloc.getData(), alloc.getOffset(), len);
279     KeyValue newKv = new KeyValue(alloc.getData(), alloc.getOffset(), len);
280     newKv.setMvccVersion(kv.getMvccVersion());
281     return newKv;
282   }
283 
284   /**
285    * Remove n key from the memstore. Only kvs that have the same key and the
286    * same memstoreTS are removed.  It is ok to not update timeRangeTracker
287    * in this call. It is possible that we can optimize this method by using
288    * tailMap/iterator, but since this method is called rarely (only for
289    * error recovery), we can leave those optimization for the future.
290    * @param kv
291    */
292   void rollback(final KeyValue kv) {
293     this.lock.readLock().lock();
294     try {
295       // If the key is in the snapshot, delete it. We should not update
296       // this.size, because that tracks the size of only the memstore and
297       // not the snapshot. The flush of this snapshot to disk has not
298       // yet started because Store.flush() waits for all rwcc transactions to
299       // commit before starting the flush to disk.
300       KeyValue found = this.snapshot.get(kv);
301       if (found != null && found.getMvccVersion() == kv.getMvccVersion()) {
302         this.snapshot.remove(kv);
303       }
304       // If the key is in the memstore, delete it. Update this.size.
305       found = this.kvset.get(kv);
306       if (found != null && found.getMvccVersion() == kv.getMvccVersion()) {
307         removeFromKVSet(kv);
308         long s = heapSizeChange(kv, true);
309         this.size.addAndGet(-s);
310       }
311     } finally {
312       this.lock.readLock().unlock();
313     }
314   }
315 
316   /**
317    * Write a delete
318    * @param delete
319    * @return approximate size of the passed key and value.
320    */
321   long delete(final KeyValue delete) {
322     long s = 0;
323     this.lock.readLock().lock();
324     try {
325       KeyValue toAdd = maybeCloneWithAllocator(delete);
326       s += heapSizeChange(toAdd, addToKVSet(toAdd));
327       timeRangeTracker.includeTimestamp(toAdd);
328     } finally {
329       this.lock.readLock().unlock();
330     }
331     this.size.addAndGet(s);
332     return s;
333   }
334 
335   /**
336    * @param kv Find the row that comes after this one.  If null, we return the
337    * first.
338    * @return Next row or null if none found.
339    */
340   KeyValue getNextRow(final KeyValue kv) {
341     this.lock.readLock().lock();
342     try {
343       return getLowest(getNextRow(kv, this.kvset), getNextRow(kv, this.snapshot));
344     } finally {
345       this.lock.readLock().unlock();
346     }
347   }
348 
349   /*
350    * @param a
351    * @param b
352    * @return Return lowest of a or b or null if both a and b are null
353    */
354   private KeyValue getLowest(final KeyValue a, final KeyValue b) {
355     if (a == null) {
356       return b;
357     }
358     if (b == null) {
359       return a;
360     }
361     return comparator.compareRows(a, b) <= 0? a: b;
362   }
363 
364   /*
365    * @param key Find row that follows this one.  If null, return first.
366    * @param map Set to look in for a row beyond <code>row</code>.
367    * @return Next row or null if none found.  If one found, will be a new
368    * KeyValue -- can be destroyed by subsequent calls to this method.
369    */
370   private KeyValue getNextRow(final KeyValue key,
371       final NavigableSet<KeyValue> set) {
372     KeyValue result = null;
373     SortedSet<KeyValue> tail = key == null? set: set.tailSet(key);
374     // Iterate until we fall into the next row; i.e. move off current row
375     for (KeyValue kv: tail) {
376       if (comparator.compareRows(kv, key) <= 0)
377         continue;
378       // Note: Not suppressing deletes or expired cells.  Needs to be handled
379       // by higher up functions.
380       result = kv;
381       break;
382     }
383     return result;
384   }
385 
386   /**
387    * @param state column/delete tracking state
388    */
389   void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
390     this.lock.readLock().lock();
391     try {
392       getRowKeyAtOrBefore(kvset, state);
393       getRowKeyAtOrBefore(snapshot, state);
394     } finally {
395       this.lock.readLock().unlock();
396     }
397   }
398 
399   /*
400    * @param set
401    * @param state Accumulates deletes and candidates.
402    */
403   private void getRowKeyAtOrBefore(final NavigableSet<KeyValue> set,
404       final GetClosestRowBeforeTracker state) {
405     if (set.isEmpty()) {
406       return;
407     }
408     if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
409       // Found nothing in row.  Try backing up.
410       getRowKeyBefore(set, state);
411     }
412   }
413 
414   /*
415    * Walk forward in a row from <code>firstOnRow</code>.  Presumption is that
416    * we have been passed the first possible key on a row.  As we walk forward
417    * we accumulate deletes until we hit a candidate on the row at which point
418    * we return.
419    * @param set
420    * @param firstOnRow First possible key on this row.
421    * @param state
422    * @return True if we found a candidate walking this row.
423    */
424   private boolean walkForwardInSingleRow(final SortedSet<KeyValue> set,
425       final KeyValue firstOnRow, final GetClosestRowBeforeTracker state) {
426     boolean foundCandidate = false;
427     SortedSet<KeyValue> tail = set.tailSet(firstOnRow);
428     if (tail.isEmpty()) return foundCandidate;
429     for (Iterator<KeyValue> i = tail.iterator(); i.hasNext();) {
430       KeyValue kv = i.next();
431       // Did we go beyond the target row? If so break.
432       if (state.isTooFar(kv, firstOnRow)) break;
433       if (state.isExpired(kv)) {
434         i.remove();
435         continue;
436       }
437       // If we added something, this row is a contender. break.
438       if (state.handle(kv)) {
439         foundCandidate = true;
440         break;
441       }
442     }
443     return foundCandidate;
444   }
445 
446   /*
447    * Walk backwards through the passed set a row at a time until we run out of
448    * set or until we get a candidate.
449    * @param set
450    * @param state
451    */
452   private void getRowKeyBefore(NavigableSet<KeyValue> set,
453       final GetClosestRowBeforeTracker state) {
454     KeyValue firstOnRow = state.getTargetKey();
455     for (Member p = memberOfPreviousRow(set, state, firstOnRow);
456         p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
457       // Make sure we don't fall out of our table.
458       if (!state.isTargetTable(p.kv)) break;
459       // Stop looking if we've exited the better candidate range.
460       if (!state.isBetterCandidate(p.kv)) break;
461       // Make into firstOnRow
462       firstOnRow = new KeyValue(p.kv.getRow(), HConstants.LATEST_TIMESTAMP);
463       // If we find something, break;
464       if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
465     }
466   }
467 
468   /**
469    * Only used by tests. TODO: Remove
470    *
471    * Given the specs of a column, update it, first by inserting a new record,
472    * then removing the old one.  Since there is only 1 KeyValue involved, the memstoreTS
473    * will be set to 0, thus ensuring that they instantly appear to anyone. The underlying
474    * store will ensure that the insert/delete each are atomic. A scanner/reader will either
475    * get the new value, or the old value and all readers will eventually only see the new
476    * value after the old was removed.
477    *
478    * @param row
479    * @param family
480    * @param qualifier
481    * @param newValue
482    * @param now
483    * @return  Timestamp
484    */
485   long updateColumnValue(byte[] row,
486                                 byte[] family,
487                                 byte[] qualifier,
488                                 long newValue,
489                                 long now) {
490    this.lock.readLock().lock();
491     try {
492       KeyValue firstKv = KeyValue.createFirstOnRow(
493           row, family, qualifier);
494       // Is there a KeyValue in 'snapshot' with the same TS? If so, upgrade the timestamp a bit.
495       SortedSet<KeyValue> snSs = snapshot.tailSet(firstKv);
496       if (!snSs.isEmpty()) {
497         KeyValue snKv = snSs.first();
498         // is there a matching KV in the snapshot?
499         if (snKv.matchingRow(firstKv) && snKv.matchingQualifier(firstKv)) {
500           if (snKv.getTimestamp() == now) {
501             // poop,
502             now += 1;
503           }
504         }
505       }
506 
507       // logic here: the new ts MUST be at least 'now'. But it could be larger if necessary.
508       // But the timestamp should also be max(now, mostRecentTsInMemstore)
509 
510       // so we cant add the new KV w/o knowing what's there already, but we also
511       // want to take this chance to delete some kvs. So two loops (sad)
512 
513       SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
514       Iterator<KeyValue> it = ss.iterator();
515       while ( it.hasNext() ) {
516         KeyValue kv = it.next();
517 
518         // if this isnt the row we are interested in, then bail:
519         if (!kv.matchingColumn(family,qualifier) || !kv.matchingRow(firstKv) ) {
520           break; // rows dont match, bail.
521         }
522 
523         // if the qualifier matches and it's a put, just RM it out of the kvset.
524         if (kv.getType() == KeyValue.Type.Put.getCode() &&
525             kv.getTimestamp() > now && firstKv.matchingQualifier(kv)) {
526           now = kv.getTimestamp();
527         }
528       }
529 
530       // create or update (upsert) a new KeyValue with
531       // 'now' and a 0 memstoreTS == immediately visible
532       List<Cell> cells = new ArrayList<Cell>(1);
533       cells.add(new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)));
534       return upsert(cells, 1L);
535     } finally {
536       this.lock.readLock().unlock();
537     }
538   }
539 
540   /**
541    * Update or insert the specified KeyValues.
542    * <p>
543    * For each KeyValue, insert into MemStore.  This will atomically upsert the
544    * value for that row/family/qualifier.  If a KeyValue did already exist,
545    * it will then be removed.
546    * <p>
547    * Currently the memstoreTS is kept at 0 so as each insert happens, it will
548    * be immediately visible.  May want to change this so it is atomic across
549    * all KeyValues.
550    * <p>
551    * This is called under row lock, so Get operations will still see updates
552    * atomically.  Scans will only see each KeyValue update as atomic.
553    *
554    * @param cells
555    * @param readpoint readpoint below which we can safely remove duplicate KVs 
556    * @return change in memstore size
557    */
558   public long upsert(Iterable<Cell> cells, long readpoint) {
559    this.lock.readLock().lock();
560     try {
561       long size = 0;
562       for (Cell cell : cells) {
563         size += upsert(cell, readpoint);
564       }
565       return size;
566     } finally {
567       this.lock.readLock().unlock();
568     }
569   }
570 
571   /**
572    * Inserts the specified KeyValue into MemStore and deletes any existing
573    * versions of the same row/family/qualifier as the specified KeyValue.
574    * <p>
575    * First, the specified KeyValue is inserted into the Memstore.
576    * <p>
577    * If there are any existing KeyValues in this MemStore with the same row,
578    * family, and qualifier, they are removed.
579    * <p>
580    * Callers must hold the read lock.
581    *
582    * @param cell
583    * @return change in size of MemStore
584    */
585   private long upsert(Cell cell, long readpoint) {
586     // Add the KeyValue to the MemStore
587     // Use the internalAdd method here since we (a) already have a lock
588     // and (b) cannot safely use the MSLAB here without potentially
589     // hitting OOME - see TestMemStore.testUpsertMSLAB for a
590     // test that triggers the pathological case if we don't avoid MSLAB
591     // here.
592     KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
593     long addedSize = internalAdd(kv);
594 
595     // Get the KeyValues for the row/family/qualifier regardless of timestamp.
596     // For this case we want to clean up any other puts
597     KeyValue firstKv = KeyValue.createFirstOnRow(
598         kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
599         kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
600         kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
601     SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
602     Iterator<KeyValue> it = ss.iterator();
603     // versions visible to oldest scanner
604     int versionsVisible = 0;
605     while ( it.hasNext() ) {
606       KeyValue cur = it.next();
607 
608       if (kv == cur) {
609         // ignore the one just put in
610         continue;
611       }
612       // check that this is the row and column we are interested in, otherwise bail
613       if (kv.matchingRow(cur) && kv.matchingQualifier(cur)) {
614         // only remove Puts that concurrent scanners cannot possibly see
615         if (cur.getType() == KeyValue.Type.Put.getCode() && cur.getMvccVersion() <= readpoint) {
616           if (versionsVisible > 1) {
617             // if we get here we have seen at least one version visible to the oldest scanner,
618             // which means we can prove that no scanner will see this version
619 
620             // false means there was a change, so give us the size.
621             long delta = heapSizeChange(cur, true);
622             addedSize -= delta;
623             this.size.addAndGet(-delta);
624             it.remove();
625             setOldestEditTimeToNow();
626           } else {
627             versionsVisible++;
628           }
629         }
630       } else {
631         // past the row or column, done
632         break;
633       }
634     }
635     return addedSize;
636   }
637 
638   /*
639    * Immutable data structure to hold member found in set and the set it was
640    * found in.  Include set because it is carrying context.
641    */
642   private static class Member {
643     final KeyValue kv;
644     final NavigableSet<KeyValue> set;
645     Member(final NavigableSet<KeyValue> s, final KeyValue kv) {
646       this.kv = kv;
647       this.set = s;
648     }
649   }
650 
651   /*
652    * @param set Set to walk back in.  Pass a first in row or we'll return
653    * same row (loop).
654    * @param state Utility and context.
655    * @param firstOnRow First item on the row after the one we want to find a
656    * member in.
657    * @return Null or member of row previous to <code>firstOnRow</code>
658    */
659   private Member memberOfPreviousRow(NavigableSet<KeyValue> set,
660       final GetClosestRowBeforeTracker state, final KeyValue firstOnRow) {
661     NavigableSet<KeyValue> head = set.headSet(firstOnRow, false);
662     if (head.isEmpty()) return null;
663     for (Iterator<KeyValue> i = head.descendingIterator(); i.hasNext();) {
664       KeyValue found = i.next();
665       if (state.isExpired(found)) {
666         i.remove();
667         continue;
668       }
669       return new Member(head, found);
670     }
671     return null;
672   }
673 
674   /**
675    * @return scanner on memstore and snapshot in this order.
676    */
677   List<KeyValueScanner> getScanners() {
678     this.lock.readLock().lock();
679     try {
680       return Collections.<KeyValueScanner>singletonList(
681           new MemStoreScanner());
682     } finally {
683       this.lock.readLock().unlock();
684     }
685   }
686 
687   /**
688    * Check if this memstore may contain the required keys
689    * @param scan
690    * @return False if the key definitely does not exist in this Memstore
691    */
692   public boolean shouldSeek(Scan scan, long oldestUnexpiredTS) {
693     return (timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
694         snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange()))
695         && (Math.max(timeRangeTracker.getMaximumTimestamp(),
696                      snapshotTimeRangeTracker.getMaximumTimestamp()) >=
697             oldestUnexpiredTS);
698   }
699 
700   public TimeRangeTracker getSnapshotTimeRangeTracker() {
701     return this.snapshotTimeRangeTracker;
702   }
703 
704   /*
705    * MemStoreScanner implements the KeyValueScanner.
706    * It lets the caller scan the contents of a memstore -- both current
707    * map and snapshot.
708    * This behaves as if it were a real scanner but does not maintain position.
709    */
710   protected class MemStoreScanner extends NonLazyKeyValueScanner {
    // Next row information for either kvset or snapshot: the entry most
    // recently pulled from each iterator and not yet returned by next().
    private KeyValue kvsetNextRow = null;
    private KeyValue snapshotNextRow = null;

    // last iterated KVs for kvset and snapshot (to restore iterator state after reseek)
    private KeyValue kvsetItRow = null;
    private KeyValue snapshotItRow = null;
    
    // iterator based scanning.  Both are (re)created by seek() and reseek().
    private Iterator<KeyValue> kvsetIt;
    private Iterator<KeyValue> snapshotIt;

    // The kvset and snapshot at the time of creating this scanner
    private KeyValueSkipListSet kvsetAtCreation;
    private KeyValueSkipListSet snapshotAtCreation;

    // the pre-calculated KeyValue to be returned by peek() or next()
    private KeyValue theNext;

    // The allocator and snapshot allocator at the time of creating this scanner.
    // Each stays null when the corresponding MemStore field was null at creation.
    volatile MemStoreLAB allocatorAtCreation;
    volatile MemStoreLAB snapshotAllocatorAtCreation;
733 
734     /*
735     Some notes...
736 
737      So memstorescanner is fixed at creation time. this includes pointers/iterators into
738     existing kvset/snapshot.  during a snapshot creation, the kvset is null, and the
739     snapshot is moved.  since kvset is null there is no point on reseeking on both,
740       we can save us the trouble. During the snapshot->hfile transition, the memstore
741       scanner is re-created by StoreScanner#updateReaders().  StoreScanner should
742       potentially do something smarter by adjusting the existing memstore scanner.
743 
744       But there is a greater problem here, that being once a scanner has progressed
745       during a snapshot scenario, we currently iterate past the kvset then 'finish' up.
746       if a scan lasts a little while, there is a chance for new entries in kvset to
747       become available but we will never see them.  This needs to be handled at the
748       StoreScanner level with coordination with MemStoreScanner.
749 
750       Currently, this problem is only partly managed: during the small amount of time
751       when the StoreScanner has not yet created a new MemStoreScanner, we will miss
752       the adds to kvset in the MemStoreScanner.
753     */
754 
755     MemStoreScanner() {
756       super();
757 
758       kvsetAtCreation = kvset;
759       snapshotAtCreation = snapshot;
760       if (allocator != null) {
761         this.allocatorAtCreation = allocator;
762         this.allocatorAtCreation.incScannerCount();
763       }
764       if (snapshotAllocator != null) {
765         this.snapshotAllocatorAtCreation = snapshotAllocator;
766         this.snapshotAllocatorAtCreation.incScannerCount();
767       }
768     }
769 
770     private KeyValue getNext(Iterator<KeyValue> it) {
771       long readPoint = MultiVersionConsistencyControl.getThreadReadPoint();
772 
773       KeyValue v = null;
774       try {
775         while (it.hasNext()) {
776           v = it.next();
777           if (v.getMvccVersion() <= readPoint) {
778             return v;
779           }
780         }
781 
782         return null;
783       } finally {
784         if (v != null) {
785           // in all cases, remember the last KV iterated to
786           if (it == snapshotIt) {
787             snapshotItRow = v;
788           } else {
789             kvsetItRow = v;
790           }
791         }
792       }
793     }
794 
795     /**
796      *  Set the scanner at the seek key.
797      *  Must be called only once: there is no thread safety between the scanner
798      *   and the memStore.
799      * @param key seek value
800      * @return false if the key is null or if there is no data
801      */
802     @Override
803     public synchronized boolean seek(KeyValue key) {
804       if (key == null) {
805         close();
806         return false;
807       }
808 
809       // kvset and snapshot will never be null.
810       // if tailSet can't find anything, SortedSet is empty (not null).
811       kvsetIt = kvsetAtCreation.tailSet(key).iterator();
812       snapshotIt = snapshotAtCreation.tailSet(key).iterator();
813       kvsetItRow = null;
814       snapshotItRow = null;
815 
816       return seekInSubLists(key);
817     }
818 
819 
820     /**
821      * (Re)initialize the iterators after a seek or a reseek.
822      */
823     private synchronized boolean seekInSubLists(KeyValue key){
824       kvsetNextRow = getNext(kvsetIt);
825       snapshotNextRow = getNext(snapshotIt);
826 
827       // Calculate the next value
828       theNext = getLowest(kvsetNextRow, snapshotNextRow);
829 
830       // has data
831       return (theNext != null);
832     }
833 
834 
835     /**
836      * Move forward on the sub-lists set previously by seek.
837      * @param key seek value (should be non-null)
838      * @return true if there is at least one KV to read, false otherwise
839      */
840     @Override
841     public synchronized boolean reseek(KeyValue key) {
842       /*
843       See HBASE-4195 & HBASE-3855 & HBASE-6591 for the background on this implementation.
844       This code is executed concurrently with flush and puts, without locks.
845       Two points must be known when working on this code:
846       1) It's not possible to use the 'kvTail' and 'snapshot'
847        variables, as they are modified during a flush.
848       2) The ideal implementation for performance would use the sub skip list
849        implicitly pointed by the iterators 'kvsetIt' and
850        'snapshotIt'. Unfortunately the Java API does not offer a method to
851        get it. So we remember the last keys we iterated to and restore
852        the reseeked set to at least that point.
853        */
854 
855       kvsetIt = kvsetAtCreation.tailSet(getHighest(key, kvsetItRow)).iterator();
856       snapshotIt = snapshotAtCreation.tailSet(getHighest(key, snapshotItRow)).iterator();
857 
858       return seekInSubLists(key);
859     }
860 
861 
862     @Override
863     public synchronized KeyValue peek() {
864       //DebugPrint.println(" MS@" + hashCode() + " peek = " + getLowest());
865       return theNext;
866     }
867 
868     @Override
869     public synchronized KeyValue next() {
870       if (theNext == null) {
871           return null;
872       }
873 
874       final KeyValue ret = theNext;
875 
876       // Advance one of the iterators
877       if (theNext == kvsetNextRow) {
878         kvsetNextRow = getNext(kvsetIt);
879       } else {
880         snapshotNextRow = getNext(snapshotIt);
881       }
882 
883       // Calculate the next value
884       theNext = getLowest(kvsetNextRow, snapshotNextRow);
885 
886       //long readpoint = ReadWriteConsistencyControl.getThreadReadPoint();
887       //DebugPrint.println(" MS@" + hashCode() + " next: " + theNext + " next_next: " +
888       //    getLowest() + " threadpoint=" + readpoint);
889       return ret;
890     }
891 
892     /*
893      * Returns the lower of the two key values, or null if they are both null.
894      * This uses comparator.compare() to compare the KeyValue using the memstore
895      * comparator.
896      */
897     private KeyValue getLowest(KeyValue first, KeyValue second) {
898       if (first == null && second == null) {
899         return null;
900       }
901       if (first != null && second != null) {
902         int compare = comparator.compare(first, second);
903         return (compare <= 0 ? first : second);
904       }
905       return (first != null ? first : second);
906     }
907 
908     /*
909      * Returns the higher of the two key values, or null if they are both null.
910      * This uses comparator.compare() to compare the KeyValue using the memstore
911      * comparator.
912      */
913     private KeyValue getHighest(KeyValue first, KeyValue second) {
914       if (first == null && second == null) {
915         return null;
916       }
917       if (first != null && second != null) {
918         int compare = comparator.compare(first, second);
919         return (compare > 0 ? first : second);
920       }
921       return (first != null ? first : second);
922     }
923 
924     public synchronized void close() {
925       this.kvsetNextRow = null;
926       this.snapshotNextRow = null;
927 
928       this.kvsetIt = null;
929       this.snapshotIt = null;
930       
931       if (allocatorAtCreation != null) {
932         this.allocatorAtCreation.decScannerCount();
933         this.allocatorAtCreation = null;
934       }
935       if (snapshotAllocatorAtCreation != null) {
936         this.snapshotAllocatorAtCreation.decScannerCount();
937         this.snapshotAllocatorAtCreation = null;
938       }
939 
940       this.kvsetItRow = null;
941       this.snapshotItRow = null;
942     }
943 
944     /**
945      * MemStoreScanner returns max value as sequence id because it will
946      * always have the latest data among all files.
947      */
948     @Override
949     public long getSequenceID() {
950       return Long.MAX_VALUE;
951     }
952 
953     @Override
954     public boolean shouldUseScanner(Scan scan, SortedSet<byte[]> columns,
955         long oldestUnexpiredTS) {
956       return shouldSeek(scan, oldestUnexpiredTS);
957     }
958   }
959 
960   public final static long FIXED_OVERHEAD = ClassSize.align(
961       ClassSize.OBJECT + (11 * ClassSize.REFERENCE) + Bytes.SIZEOF_LONG);
962 
963   public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
964       ClassSize.REENTRANT_LOCK + ClassSize.ATOMIC_LONG +
965       ClassSize.COPYONWRITE_ARRAYSET + ClassSize.COPYONWRITE_ARRAYLIST +
966       (2 * ClassSize.CONCURRENT_SKIPLISTMAP));
967 
968   /** Used for readability when we don't store memstore timestamp in HFile */
969   public static final boolean NO_PERSISTENT_TS = false;
970 
971   /*
972    * Calculate how the MemStore size has changed.  Includes overhead of the
973    * backing Map.
974    * @param kv
975    * @param notpresent True if the kv was NOT present in the set.
976    * @return Size
977    */
978   static long heapSizeChange(final KeyValue kv, final boolean notpresent) {
979     return notpresent ?
980         ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + kv.heapSize()):
981         0;
982   }
983 
984   /**
985    * Get the entire heap usage for this MemStore not including keys in the
986    * snapshot.
987    */
988   @Override
989   public long heapSize() {
990     return size.get();
991   }
992 
993   /**
994    * Get the heap usage of KVs in this MemStore.
995    */
996   public long keySize() {
997     return heapSize() - DEEP_OVERHEAD;
998   }
999 
1000   /**
1001    * Code to help figure if our approximation of object heap sizes is close
1002    * enough.  See hbase-900.  Fills memstores then waits so user can heap
1003    * dump and bring up resultant hprof in something like jprofiler which
1004    * allows you get 'deep size' on objects.
1005    * @param args main args
1006    */
1007   public static void main(String [] args) {
1008     RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
1009     LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
1010       runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
1011     LOG.info("vmInputArguments=" + runtime.getInputArguments());
1012     MemStore memstore1 = new MemStore();
1013     // TODO: x32 vs x64
1014     long size = 0;
1015     final int count = 10000;
1016     byte [] fam = Bytes.toBytes("col");
1017     byte [] qf = Bytes.toBytes("umn");
1018     byte [] empty = new byte[0];
1019     for (int i = 0; i < count; i++) {
1020       // Give each its own ts
1021       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1022     }
1023     LOG.info("memstore1 estimated size=" + size);
1024     for (int i = 0; i < count; i++) {
1025       size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
1026     }
1027     LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
1028     // Make a variably sized memstore.
1029     MemStore memstore2 = new MemStore();
1030     for (int i = 0; i < count; i++) {
1031       size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
1032         new byte[i]));
1033     }
1034     LOG.info("memstore2 estimated size=" + size);
1035     final int seconds = 30;
1036     LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
1037     for (int i = 0; i < seconds; i++) {
1038       // Thread.sleep(1000);
1039     }
1040     LOG.info("Exiting.");
1041   }
1042 }