1 /** 2 * Copyright 2011 The Apache Software Foundation 3 * 4 * Licensed to the Apache Software Foundation (ASF) under one 5 * or more contributor license agreements. See the NOTICE file 6 * distributed with this work for additional information 7 * regarding copyright ownership. The ASF licenses this file 8 * to you under the Apache License, Version 2.0 (the 9 * "License"); you may not use this file except in compliance 10 * with the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 package org.apache.hadoop.hbase.regionserver; 21 22 import java.util.concurrent.atomic.AtomicInteger; 23 import java.util.concurrent.atomic.AtomicReference; 24 25 import org.apache.hadoop.conf.Configuration; 26 import com.google.common.base.Preconditions; 27 28 /** 29 * A memstore-local allocation buffer. 30 * <p> 31 * The MemStoreLAB is basically a bump-the-pointer allocator that allocates 32 * big (2MB) byte[] chunks from and then doles it out to threads that request 33 * slices into the array. 34 * <p> 35 * The purpose of this class is to combat heap fragmentation in the 36 * regionserver. By ensuring that all KeyValues in a given memstore refer 37 * only to large chunks of contiguous memory, we ensure that large blocks 38 * get freed up when the memstore is flushed. 39 * <p> 40 * Without the MSLAB, the byte array allocated during insertion end up 41 * interleaved throughout the heap, and the old generation gets progressively 42 * more fragmented until a stop-the-world compacting collection occurs. 43 * <p> 44 * TODO: we should probably benchmark whether word-aligning the allocations 45 * would provide a performance improvement - probably would speed up the 46 * Bytes.toLong/Bytes.toInt calls in KeyValue, but some of those are cached 47 * anyway 48 */ 49 public class MemStoreLAB { 50 private AtomicReference<Chunk> curChunk = new AtomicReference<Chunk>(); 51 52 final static String CHUNK_SIZE_KEY = "hbase.hregion.memstore.mslab.chunksize"; 53 final static int CHUNK_SIZE_DEFAULT = 2048 * 1024; 54 final int chunkSize; 55 56 final static String MAX_ALLOC_KEY = "hbase.hregion.memstore.mslab.max.allocation"; 57 final static int MAX_ALLOC_DEFAULT = 256 * 1024; // allocs bigger than this don't go through allocator 58 final int maxAlloc; 59 60 public MemStoreLAB() { 61 this(new Configuration()); 62 } 63 64 public MemStoreLAB(Configuration conf) { 65 chunkSize = conf.getInt(CHUNK_SIZE_KEY, CHUNK_SIZE_DEFAULT); 66 maxAlloc = conf.getInt(MAX_ALLOC_KEY, MAX_ALLOC_DEFAULT); 67 68 // if we don't exclude allocations >CHUNK_SIZE, we'd infiniteloop on one! 69 Preconditions.checkArgument( 70 maxAlloc <= chunkSize, 71 MAX_ALLOC_KEY + " must be less than " + CHUNK_SIZE_KEY); 72 } 73 74 /** 75 * Allocate a slice of the given length. 76 * 77 * If the size is larger than the maximum size specified for this 78 * allocator, returns null. 79 */ 80 public Allocation allocateBytes(int size) { 81 Preconditions.checkArgument(size >= 0, "negative size"); 82 83 // Callers should satisfy large allocations directly from JVM since they 84 // don't cause fragmentation as badly. 85 if (size > maxAlloc) { 86 return null; 87 } 88 89 while (true) { 90 Chunk c = getOrMakeChunk(); 91 92 // Try to allocate from this chunk 93 int allocOffset = c.alloc(size); 94 if (allocOffset != -1) { 95 // We succeeded - this is the common case - small alloc 96 // from a big buffer 97 return new Allocation(c.data, allocOffset); 98 } 99 100 // not enough space! 101 // try to retire this chunk 102 tryRetireChunk(c); 103 } 104 } 105 106 /** 107 * Try to retire the current chunk if it is still 108 * <code>c</code>. Postcondition is that curChunk.get() 109 * != c 110 */ 111 private void tryRetireChunk(Chunk c) { 112 @SuppressWarnings("unused") 113 boolean weRetiredIt = curChunk.compareAndSet(c, null); 114 // If the CAS succeeds, that means that we won the race 115 // to retire the chunk. We could use this opportunity to 116 // update metrics on external fragmentation. 117 // 118 // If the CAS fails, that means that someone else already 119 // retired the chunk for us. 120 } 121 122 /** 123 * Get the current chunk, or, if there is no current chunk, 124 * allocate a new one from the JVM. 125 */ 126 private Chunk getOrMakeChunk() { 127 while (true) { 128 // Try to get the chunk 129 Chunk c = curChunk.get(); 130 if (c != null) { 131 return c; 132 } 133 134 // No current chunk, so we want to allocate one. We race 135 // against other allocators to CAS in an uninitialized chunk 136 // (which is cheap to allocate) 137 c = new Chunk(chunkSize); 138 if (curChunk.compareAndSet(null, c)) { 139 // we won race - now we need to actually do the expensive 140 // allocation step 141 c.init(); 142 return c; 143 } 144 // someone else won race - that's fine, we'll try to grab theirs 145 // in the next iteration of the loop. 146 } 147 } 148 149 /** 150 * A chunk of memory out of which allocations are sliced. 151 */ 152 private static class Chunk { 153 /** Actual underlying data */ 154 private byte[] data; 155 156 private static final int UNINITIALIZED = -1; 157 private static final int OOM = -2; 158 /** 159 * Offset for the next allocation, or the sentinel value -1 160 * which implies that the chunk is still uninitialized. 161 * */ 162 private AtomicInteger nextFreeOffset = new AtomicInteger(UNINITIALIZED); 163 164 /** Total number of allocations satisfied from this buffer */ 165 private AtomicInteger allocCount = new AtomicInteger(); 166 167 /** Size of chunk in bytes */ 168 private final int size; 169 170 /** 171 * Create an uninitialized chunk. Note that memory is not allocated yet, so 172 * this is cheap. 173 * @param size in bytes 174 */ 175 private Chunk(int size) { 176 this.size = size; 177 } 178 179 /** 180 * Actually claim the memory for this chunk. This should only be called from 181 * the thread that constructed the chunk. It is thread-safe against other 182 * threads calling alloc(), who will block until the allocation is complete. 183 */ 184 public void init() { 185 assert nextFreeOffset.get() == UNINITIALIZED; 186 try { 187 data = new byte[size]; 188 } catch (OutOfMemoryError e) { 189 boolean failInit = nextFreeOffset.compareAndSet(UNINITIALIZED, OOM); 190 assert failInit; // should be true. 191 throw e; 192 } 193 // Mark that it's ready for use 194 boolean initted = nextFreeOffset.compareAndSet( 195 UNINITIALIZED, 0); 196 // We should always succeed the above CAS since only one thread 197 // calls init()! 198 Preconditions.checkState(initted, 199 "Multiple threads tried to init same chunk"); 200 } 201 202 /** 203 * Try to allocate <code>size</code> bytes from the chunk. 204 * @return the offset of the successful allocation, or -1 to indicate not-enough-space 205 */ 206 public int alloc(int size) { 207 while (true) { 208 int oldOffset = nextFreeOffset.get(); 209 if (oldOffset == UNINITIALIZED) { 210 // The chunk doesn't have its data allocated yet. 211 // Since we found this in curChunk, we know that whoever 212 // CAS-ed it there is allocating it right now. So spin-loop 213 // shouldn't spin long! 214 Thread.yield(); 215 continue; 216 } 217 if (oldOffset == OOM) { 218 // doh we ran out of ram. return -1 to chuck this away. 219 return -1; 220 } 221 222 if (oldOffset + size > data.length) { 223 return -1; // alloc doesn't fit 224 } 225 226 // Try to atomically claim this chunk 227 if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size)) { 228 // we got the alloc 229 allocCount.incrementAndGet(); 230 return oldOffset; 231 } 232 // we raced and lost alloc, try again 233 } 234 } 235 236 @Override 237 public String toString() { 238 return "Chunk@" + System.identityHashCode(this) + 239 " allocs=" + allocCount.get() + "waste=" + 240 (data.length - nextFreeOffset.get()); 241 } 242 } 243 244 /** 245 * The result of a single allocation. Contains the chunk that the 246 * allocation points into, and the offset in this array where the 247 * slice begins. 248 */ 249 public static class Allocation { 250 private final byte[] data; 251 private final int offset; 252 253 private Allocation(byte[] data, int off) { 254 this.data = data; 255 this.offset = off; 256 } 257 258 @Override 259 public String toString() { 260 return "Allocation(data=" + data + 261 " with capacity=" + data.length + 262 ", off=" + offset + ")"; 263 } 264 265 byte[] getData() { 266 return data; 267 } 268 269 int getOffset() { 270 return offset; 271 } 272 } 273 }