1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package org.apache.hadoop.hbase.regionserver;
21
22 import java.lang.management.ManagementFactory;
23 import java.lang.management.RuntimeMXBean;
24 import java.rmi.UnexpectedException;
25 import java.util.ArrayList;
26 import java.util.Collections;
27 import java.util.Iterator;
28 import java.util.List;
29 import java.util.NavigableSet;
30 import java.util.SortedSet;
31 import java.util.concurrent.atomic.AtomicLong;
32
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35 import org.apache.hadoop.classification.InterfaceAudience;
36 import org.apache.hadoop.conf.Configuration;
37 import org.apache.hadoop.hbase.Cell;
38 import org.apache.hadoop.hbase.HBaseConfiguration;
39 import org.apache.hadoop.hbase.HConstants;
40 import org.apache.hadoop.hbase.KeyValue;
41 import org.apache.hadoop.hbase.KeyValueUtil;
42 import org.apache.hadoop.hbase.client.Scan;
43 import org.apache.hadoop.hbase.io.HeapSize;
44 import org.apache.hadoop.hbase.regionserver.MemStoreLAB.Allocation;
45 import org.apache.hadoop.hbase.util.Bytes;
46 import org.apache.hadoop.hbase.util.ClassSize;
47 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65 @InterfaceAudience.Private
66 public class MemStore implements HeapSize {
67 private static final Log LOG = LogFactory.getLog(MemStore.class);
68
69 static final String USEMSLAB_KEY =
70 "hbase.hregion.memstore.mslab.enabled";
71 private static final boolean USEMSLAB_DEFAULT = true;
72
73 private Configuration conf;
74
75
76
77
78
79
80 volatile KeyValueSkipListSet kvset;
81
82
83 volatile KeyValueSkipListSet snapshot;
84
85 final KeyValue.KVComparator comparator;
86
87
88 final AtomicLong size;
89 private volatile long snapshotSize;
90
91
92 volatile long timeOfOldestEdit = Long.MAX_VALUE;
93
94 TimeRangeTracker timeRangeTracker;
95 TimeRangeTracker snapshotTimeRangeTracker;
96
97 MemStoreChunkPool chunkPool;
98 volatile MemStoreLAB allocator;
99 volatile MemStoreLAB snapshotAllocator;
100
101
102
103
104 public MemStore() {
105 this(HBaseConfiguration.create(), KeyValue.COMPARATOR);
106 }
107
108
109
110
111
112 public MemStore(final Configuration conf,
113 final KeyValue.KVComparator c) {
114 this.conf = conf;
115 this.comparator = c;
116 this.kvset = new KeyValueSkipListSet(c);
117 this.snapshot = new KeyValueSkipListSet(c);
118 timeRangeTracker = new TimeRangeTracker();
119 snapshotTimeRangeTracker = new TimeRangeTracker();
120 this.size = new AtomicLong(DEEP_OVERHEAD);
121 this.snapshotSize = 0;
122 if (conf.getBoolean(USEMSLAB_KEY, USEMSLAB_DEFAULT)) {
123 this.chunkPool = MemStoreChunkPool.getPool(conf);
124 this.allocator = new MemStoreLAB(conf, chunkPool);
125 } else {
126 this.allocator = null;
127 this.chunkPool = null;
128 }
129 }
130
131 void dump() {
132 for (KeyValue kv: this.kvset) {
133 LOG.info(kv);
134 }
135 for (KeyValue kv: this.snapshot) {
136 LOG.info(kv);
137 }
138 }
139
140
141
142
143
144
145 void snapshot() {
146
147
148 if (!this.snapshot.isEmpty()) {
149 LOG.warn("Snapshot called again without clearing previous. " +
150 "Doing nothing. Another ongoing flush or did we fail last attempt?");
151 } else {
152 if (!this.kvset.isEmpty()) {
153 this.snapshotSize = keySize();
154 this.snapshot = this.kvset;
155 this.kvset = new KeyValueSkipListSet(this.comparator);
156 this.snapshotTimeRangeTracker = this.timeRangeTracker;
157 this.timeRangeTracker = new TimeRangeTracker();
158
159 this.size.set(DEEP_OVERHEAD);
160 this.snapshotAllocator = this.allocator;
161
162 if (allocator != null) {
163 this.allocator = new MemStoreLAB(conf, chunkPool);
164 }
165 timeOfOldestEdit = Long.MAX_VALUE;
166 }
167 }
168 }
169
170
171
172
173
174
175
176
177
178 KeyValueSkipListSet getSnapshot() {
179 return this.snapshot;
180 }
181
182
183
184
185
186
187
188
189
190 long getFlushableSize() {
191 return this.snapshotSize > 0 ? this.snapshotSize : keySize();
192 }
193
194
195
196
197
198
199
200 void clearSnapshot(final SortedSet<KeyValue> ss)
201 throws UnexpectedException {
202 MemStoreLAB tmpAllocator = null;
203 if (this.snapshot != ss) {
204 throw new UnexpectedException("Current snapshot is " +
205 this.snapshot + ", was passed " + ss);
206 }
207
208
209 if (!ss.isEmpty()) {
210 this.snapshot = new KeyValueSkipListSet(this.comparator);
211 this.snapshotTimeRangeTracker = new TimeRangeTracker();
212 }
213 this.snapshotSize = 0;
214 if (this.snapshotAllocator != null) {
215 tmpAllocator = this.snapshotAllocator;
216 this.snapshotAllocator = null;
217 }
218 if (tmpAllocator != null) {
219 tmpAllocator.close();
220 }
221 }
222
223
224
225
226
227
228 long add(final KeyValue kv) {
229 KeyValue toAdd = maybeCloneWithAllocator(kv);
230 return internalAdd(toAdd);
231 }
232
233 long timeOfOldestEdit() {
234 return timeOfOldestEdit;
235 }
236
237 private boolean addToKVSet(KeyValue e) {
238 boolean b = this.kvset.add(e);
239 setOldestEditTimeToNow();
240 return b;
241 }
242
243 private boolean removeFromKVSet(KeyValue e) {
244 boolean b = this.kvset.remove(e);
245 setOldestEditTimeToNow();
246 return b;
247 }
248
249 void setOldestEditTimeToNow() {
250 if (timeOfOldestEdit == Long.MAX_VALUE) {
251 timeOfOldestEdit = EnvironmentEdgeManager.currentTimeMillis();
252 }
253 }
254
255
256
257
258
259
260
261 private long internalAdd(final KeyValue toAdd) {
262 long s = heapSizeChange(toAdd, addToKVSet(toAdd));
263 timeRangeTracker.includeTimestamp(toAdd);
264 this.size.addAndGet(s);
265 return s;
266 }
267
268 private KeyValue maybeCloneWithAllocator(KeyValue kv) {
269 if (allocator == null) {
270 return kv;
271 }
272
273 int len = kv.getLength();
274 Allocation alloc = allocator.allocateBytes(len);
275 if (alloc == null) {
276
277
278 return kv;
279 }
280 assert alloc.getData() != null;
281 System.arraycopy(kv.getBuffer(), kv.getOffset(), alloc.getData(), alloc.getOffset(), len);
282 KeyValue newKv = new KeyValue(alloc.getData(), alloc.getOffset(), len);
283 newKv.setMvccVersion(kv.getMvccVersion());
284 return newKv;
285 }
286
287
288
289
290
291
292
293
294
295 void rollback(final KeyValue kv) {
296
297
298
299
300
301 KeyValue found = this.snapshot.get(kv);
302 if (found != null && found.getMvccVersion() == kv.getMvccVersion()) {
303 this.snapshot.remove(kv);
304 }
305
306 found = this.kvset.get(kv);
307 if (found != null && found.getMvccVersion() == kv.getMvccVersion()) {
308 removeFromKVSet(kv);
309 long s = heapSizeChange(kv, true);
310 this.size.addAndGet(-s);
311 }
312 }
313
314
315
316
317
318
319 long delete(final KeyValue delete) {
320 long s = 0;
321 KeyValue toAdd = maybeCloneWithAllocator(delete);
322 s += heapSizeChange(toAdd, addToKVSet(toAdd));
323 timeRangeTracker.includeTimestamp(toAdd);
324 this.size.addAndGet(s);
325 return s;
326 }
327
328
329
330
331
332
333 KeyValue getNextRow(final KeyValue kv) {
334 return getLowest(getNextRow(kv, this.kvset), getNextRow(kv, this.snapshot));
335 }
336
337
338
339
340
341
342 private KeyValue getLowest(final KeyValue a, final KeyValue b) {
343 if (a == null) {
344 return b;
345 }
346 if (b == null) {
347 return a;
348 }
349 return comparator.compareRows(a, b) <= 0? a: b;
350 }
351
352
353
354
355
356
357
358 private KeyValue getNextRow(final KeyValue key,
359 final NavigableSet<KeyValue> set) {
360 KeyValue result = null;
361 SortedSet<KeyValue> tail = key == null? set: set.tailSet(key);
362
363 for (KeyValue kv: tail) {
364 if (comparator.compareRows(kv, key) <= 0)
365 continue;
366
367
368 result = kv;
369 break;
370 }
371 return result;
372 }
373
374
375
376
377 void getRowKeyAtOrBefore(final GetClosestRowBeforeTracker state) {
378 getRowKeyAtOrBefore(kvset, state);
379 getRowKeyAtOrBefore(snapshot, state);
380 }
381
382
383
384
385
386 private void getRowKeyAtOrBefore(final NavigableSet<KeyValue> set,
387 final GetClosestRowBeforeTracker state) {
388 if (set.isEmpty()) {
389 return;
390 }
391 if (!walkForwardInSingleRow(set, state.getTargetKey(), state)) {
392
393 getRowKeyBefore(set, state);
394 }
395 }
396
397
398
399
400
401
402
403
404
405
406
407 private boolean walkForwardInSingleRow(final SortedSet<KeyValue> set,
408 final KeyValue firstOnRow, final GetClosestRowBeforeTracker state) {
409 boolean foundCandidate = false;
410 SortedSet<KeyValue> tail = set.tailSet(firstOnRow);
411 if (tail.isEmpty()) return foundCandidate;
412 for (Iterator<KeyValue> i = tail.iterator(); i.hasNext();) {
413 KeyValue kv = i.next();
414
415 if (state.isTooFar(kv, firstOnRow)) break;
416 if (state.isExpired(kv)) {
417 i.remove();
418 continue;
419 }
420
421 if (state.handle(kv)) {
422 foundCandidate = true;
423 break;
424 }
425 }
426 return foundCandidate;
427 }
428
429
430
431
432
433
434
435 private void getRowKeyBefore(NavigableSet<KeyValue> set,
436 final GetClosestRowBeforeTracker state) {
437 KeyValue firstOnRow = state.getTargetKey();
438 for (Member p = memberOfPreviousRow(set, state, firstOnRow);
439 p != null; p = memberOfPreviousRow(p.set, state, firstOnRow)) {
440
441 if (!state.isTargetTable(p.kv)) break;
442
443 if (!state.isBetterCandidate(p.kv)) break;
444
445 firstOnRow = new KeyValue(p.kv.getRowArray(), p.kv.getRowOffset(), p.kv.getRowLength(),
446 HConstants.LATEST_TIMESTAMP);
447
448 if (walkForwardInSingleRow(p.set, firstOnRow, state)) break;
449 }
450 }
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469 long updateColumnValue(byte[] row,
470 byte[] family,
471 byte[] qualifier,
472 long newValue,
473 long now) {
474 KeyValue firstKv = KeyValue.createFirstOnRow(
475 row, family, qualifier);
476
477 SortedSet<KeyValue> snSs = snapshot.tailSet(firstKv);
478 if (!snSs.isEmpty()) {
479 KeyValue snKv = snSs.first();
480
481 if (snKv.matchingRow(firstKv) && snKv.matchingQualifier(firstKv)) {
482 if (snKv.getTimestamp() == now) {
483
484 now += 1;
485 }
486 }
487 }
488
489
490
491
492
493
494
495 SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
496 for (KeyValue kv : ss) {
497
498 if (!kv.matchingColumn(family, qualifier) || !kv.matchingRow(firstKv)) {
499 break;
500 }
501
502
503 if (kv.getTypeByte() == KeyValue.Type.Put.getCode() &&
504 kv.getTimestamp() > now && firstKv.matchingQualifier(kv)) {
505 now = kv.getTimestamp();
506 }
507 }
508
509
510
511 List<Cell> cells = new ArrayList<Cell>(1);
512 cells.add(new KeyValue(row, family, qualifier, now, Bytes.toBytes(newValue)));
513 return upsert(cells, 1L);
514 }
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534 public long upsert(Iterable<Cell> cells, long readpoint) {
535 long size = 0;
536 for (Cell cell : cells) {
537 size += upsert(cell, readpoint);
538 }
539 return size;
540 }
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556 private long upsert(Cell cell, long readpoint) {
557
558
559
560
561
562
563 KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
564 long addedSize = internalAdd(kv);
565
566
567
568 KeyValue firstKv = KeyValue.createFirstOnRow(
569 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
570 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
571 kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
572 SortedSet<KeyValue> ss = kvset.tailSet(firstKv);
573 Iterator<KeyValue> it = ss.iterator();
574
575 int versionsVisible = 0;
576 while ( it.hasNext() ) {
577 KeyValue cur = it.next();
578
579 if (kv == cur) {
580
581 continue;
582 }
583
584 if (kv.matchingRow(cur) && kv.matchingQualifier(cur)) {
585
586 if (cur.getTypeByte() == KeyValue.Type.Put.getCode() &&
587 cur.getMvccVersion() <= readpoint) {
588 if (versionsVisible > 1) {
589
590
591
592
593 long delta = heapSizeChange(cur, true);
594 addedSize -= delta;
595 this.size.addAndGet(-delta);
596 it.remove();
597 setOldestEditTimeToNow();
598 } else {
599 versionsVisible++;
600 }
601 }
602 } else {
603
604 break;
605 }
606 }
607 return addedSize;
608 }
609
610
611
612
613
614 private static class Member {
615 final KeyValue kv;
616 final NavigableSet<KeyValue> set;
617 Member(final NavigableSet<KeyValue> s, final KeyValue kv) {
618 this.kv = kv;
619 this.set = s;
620 }
621 }
622
623
624
625
626
627
628
629
630
631 private Member memberOfPreviousRow(NavigableSet<KeyValue> set,
632 final GetClosestRowBeforeTracker state, final KeyValue firstOnRow) {
633 NavigableSet<KeyValue> head = set.headSet(firstOnRow, false);
634 if (head.isEmpty()) return null;
635 for (Iterator<KeyValue> i = head.descendingIterator(); i.hasNext();) {
636 KeyValue found = i.next();
637 if (state.isExpired(found)) {
638 i.remove();
639 continue;
640 }
641 return new Member(head, found);
642 }
643 return null;
644 }
645
646
647
648
649 List<KeyValueScanner> getScanners() {
650 return Collections.<KeyValueScanner>singletonList(
651 new MemStoreScanner(MultiVersionConsistencyControl.getThreadReadPoint()));
652 }
653
654
655
656
657
658
659 public boolean shouldSeek(Scan scan, long oldestUnexpiredTS) {
660 return (timeRangeTracker.includesTimeRange(scan.getTimeRange()) ||
661 snapshotTimeRangeTracker.includesTimeRange(scan.getTimeRange()))
662 && (Math.max(timeRangeTracker.getMaximumTimestamp(),
663 snapshotTimeRangeTracker.getMaximumTimestamp()) >=
664 oldestUnexpiredTS);
665 }
666
667 public TimeRangeTracker getSnapshotTimeRangeTracker() {
668 return this.snapshotTimeRangeTracker;
669 }
670
671
672
673
674
675
676
677 protected class MemStoreScanner extends NonLazyKeyValueScanner {
678
679 private KeyValue kvsetNextRow = null;
680 private KeyValue snapshotNextRow = null;
681
682
683 private KeyValue kvsetItRow = null;
684 private KeyValue snapshotItRow = null;
685
686
687 private Iterator<KeyValue> kvsetIt;
688 private Iterator<KeyValue> snapshotIt;
689
690
691 private KeyValueSkipListSet kvsetAtCreation;
692 private KeyValueSkipListSet snapshotAtCreation;
693
694
695 private KeyValue theNext;
696
697
698 volatile MemStoreLAB allocatorAtCreation;
699 volatile MemStoreLAB snapshotAllocatorAtCreation;
700
701 private long readPoint;
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724 MemStoreScanner(long readPoint) {
725 super();
726
727 this.readPoint = readPoint;
728 kvsetAtCreation = kvset;
729 snapshotAtCreation = snapshot;
730 if (allocator != null) {
731 this.allocatorAtCreation = allocator;
732 this.allocatorAtCreation.incScannerCount();
733 }
734 if (snapshotAllocator != null) {
735 this.snapshotAllocatorAtCreation = snapshotAllocator;
736 this.snapshotAllocatorAtCreation.incScannerCount();
737 }
738 }
739
740 private KeyValue getNext(Iterator<KeyValue> it) {
741 KeyValue v = null;
742 try {
743 while (it.hasNext()) {
744 v = it.next();
745 if (v.getMvccVersion() <= this.readPoint) {
746 return v;
747 }
748 }
749
750 return null;
751 } finally {
752 if (v != null) {
753
754 if (it == snapshotIt) {
755 snapshotItRow = v;
756 } else {
757 kvsetItRow = v;
758 }
759 }
760 }
761 }
762
763
764
765
766
767
768
769
770 @Override
771 public synchronized boolean seek(KeyValue key) {
772 if (key == null) {
773 close();
774 return false;
775 }
776
777
778
779 kvsetIt = kvsetAtCreation.tailSet(key).iterator();
780 snapshotIt = snapshotAtCreation.tailSet(key).iterator();
781 kvsetItRow = null;
782 snapshotItRow = null;
783
784 return seekInSubLists(key);
785 }
786
787
788
789
790
791 private synchronized boolean seekInSubLists(KeyValue key){
792 kvsetNextRow = getNext(kvsetIt);
793 snapshotNextRow = getNext(snapshotIt);
794
795
796 theNext = getLowest(kvsetNextRow, snapshotNextRow);
797
798
799 return (theNext != null);
800 }
801
802
803
804
805
806
807
808 @Override
809 public synchronized boolean reseek(KeyValue key) {
810
811
812
813
814
815
816
817
818
819
820
821
822
823 kvsetIt = kvsetAtCreation.tailSet(getHighest(key, kvsetItRow)).iterator();
824 snapshotIt = snapshotAtCreation.tailSet(getHighest(key, snapshotItRow)).iterator();
825
826 return seekInSubLists(key);
827 }
828
829
830 @Override
831 public synchronized KeyValue peek() {
832
833 return theNext;
834 }
835
836 @Override
837 public synchronized KeyValue next() {
838 if (theNext == null) {
839 return null;
840 }
841
842 final KeyValue ret = theNext;
843
844
845 if (theNext == kvsetNextRow) {
846 kvsetNextRow = getNext(kvsetIt);
847 } else {
848 snapshotNextRow = getNext(snapshotIt);
849 }
850
851
852 theNext = getLowest(kvsetNextRow, snapshotNextRow);
853
854
855
856
857 return ret;
858 }
859
860
861
862
863
864
865 private KeyValue getLowest(KeyValue first, KeyValue second) {
866 if (first == null && second == null) {
867 return null;
868 }
869 if (first != null && second != null) {
870 int compare = comparator.compare(first, second);
871 return (compare <= 0 ? first : second);
872 }
873 return (first != null ? first : second);
874 }
875
876
877
878
879
880
881 private KeyValue getHighest(KeyValue first, KeyValue second) {
882 if (first == null && second == null) {
883 return null;
884 }
885 if (first != null && second != null) {
886 int compare = comparator.compare(first, second);
887 return (compare > 0 ? first : second);
888 }
889 return (first != null ? first : second);
890 }
891
892 public synchronized void close() {
893 this.kvsetNextRow = null;
894 this.snapshotNextRow = null;
895
896 this.kvsetIt = null;
897 this.snapshotIt = null;
898
899 if (allocatorAtCreation != null) {
900 this.allocatorAtCreation.decScannerCount();
901 this.allocatorAtCreation = null;
902 }
903 if (snapshotAllocatorAtCreation != null) {
904 this.snapshotAllocatorAtCreation.decScannerCount();
905 this.snapshotAllocatorAtCreation = null;
906 }
907
908 this.kvsetItRow = null;
909 this.snapshotItRow = null;
910 }
911
912
913
914
915
916 @Override
917 public long getSequenceID() {
918 return Long.MAX_VALUE;
919 }
920
921 @Override
922 public boolean shouldUseScanner(Scan scan, SortedSet<byte[]> columns,
923 long oldestUnexpiredTS) {
924 return shouldSeek(scan, oldestUnexpiredTS);
925 }
926 }
927
928 public final static long FIXED_OVERHEAD = ClassSize.align(
929 ClassSize.OBJECT + (10 * ClassSize.REFERENCE) + (2 * Bytes.SIZEOF_LONG));
930
931 public final static long DEEP_OVERHEAD = ClassSize.align(FIXED_OVERHEAD +
932 ClassSize.ATOMIC_LONG + (2 * ClassSize.TIMERANGE_TRACKER) +
933 (2 * ClassSize.KEYVALUE_SKIPLIST_SET) + (2 * ClassSize.CONCURRENT_SKIPLISTMAP));
934
935
936
937
938
939
940
941
942 static long heapSizeChange(final KeyValue kv, final boolean notpresent) {
943 return notpresent ?
944 ClassSize.align(ClassSize.CONCURRENT_SKIPLISTMAP_ENTRY + kv.heapSize()):
945 0;
946 }
947
948
949
950
951
952 @Override
953 public long heapSize() {
954 return size.get();
955 }
956
957
958
959
960 public long keySize() {
961 return heapSize() - DEEP_OVERHEAD;
962 }
963
964
965
966
967
968
969
970
971 public static void main(String [] args) {
972 RuntimeMXBean runtime = ManagementFactory.getRuntimeMXBean();
973 LOG.info("vmName=" + runtime.getVmName() + ", vmVendor=" +
974 runtime.getVmVendor() + ", vmVersion=" + runtime.getVmVersion());
975 LOG.info("vmInputArguments=" + runtime.getInputArguments());
976 MemStore memstore1 = new MemStore();
977
978 long size = 0;
979 final int count = 10000;
980 byte [] fam = Bytes.toBytes("col");
981 byte [] qf = Bytes.toBytes("umn");
982 byte [] empty = new byte[0];
983 for (int i = 0; i < count; i++) {
984
985 size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
986 }
987 LOG.info("memstore1 estimated size=" + size);
988 for (int i = 0; i < count; i++) {
989 size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
990 }
991 LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
992
993 MemStore memstore2 = new MemStore();
994 for (int i = 0; i < count; i++) {
995 size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
996 new byte[i]));
997 }
998 LOG.info("memstore2 estimated size=" + size);
999 final int seconds = 30;
1000 LOG.info("Waiting " + seconds + " seconds while heap dump is taken");
1001 for (int i = 0; i < seconds; i++) {
1002
1003 }
1004 LOG.info("Exiting.");
1005 }
1006 }