View Javadoc

1   /**
2    * Copyright 2011 The Apache Software Foundation
3    *
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *     http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing, software
15   * distributed under the License is distributed on an "AS IS" BASIS,
16   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17   * See the License for the specific language governing permissions and
18   * limitations under the License.
19   */
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.util.concurrent.atomic.AtomicInteger;
23  import java.util.concurrent.atomic.AtomicReference;
24  
25  import org.apache.hadoop.conf.Configuration;
26  import com.google.common.base.Preconditions;
27  
28  /**
29   * A memstore-local allocation buffer.
30   * <p>
31   * The MemStoreLAB is basically a bump-the-pointer allocator that allocates
32   * big (2MB) byte[] chunks from and then doles it out to threads that request
33   * slices into the array.
34   * <p>
35   * The purpose of this class is to combat heap fragmentation in the
36   * regionserver. By ensuring that all KeyValues in a given memstore refer
37   * only to large chunks of contiguous memory, we ensure that large blocks
38   * get freed up when the memstore is flushed.
39   * <p>
40   * Without the MSLAB, the byte array allocated during insertion end up
41   * interleaved throughout the heap, and the old generation gets progressively
42   * more fragmented until a stop-the-world compacting collection occurs.
43   * <p>
44   * TODO: we should probably benchmark whether word-aligning the allocations
45   * would provide a performance improvement - probably would speed up the
46   * Bytes.toLong/Bytes.toInt calls in KeyValue, but some of those are cached
47   * anyway
48   */
49  public class MemStoreLAB {
50    private AtomicReference<Chunk> curChunk = new AtomicReference<Chunk>();
51  
52    final static String CHUNK_SIZE_KEY = "hbase.hregion.memstore.mslab.chunksize";
53    final static int CHUNK_SIZE_DEFAULT = 2048 * 1024;
54    final int chunkSize;
55  
56    final static String MAX_ALLOC_KEY = "hbase.hregion.memstore.mslab.max.allocation";
57    final static int MAX_ALLOC_DEFAULT = 256  * 1024; // allocs bigger than this don't go through allocator
58    final int maxAlloc;
59  
60    public MemStoreLAB() {
61      this(new Configuration());
62    }
63  
64    public MemStoreLAB(Configuration conf) {
65      chunkSize = conf.getInt(CHUNK_SIZE_KEY, CHUNK_SIZE_DEFAULT);
66      maxAlloc = conf.getInt(MAX_ALLOC_KEY, MAX_ALLOC_DEFAULT);
67  
68      // if we don't exclude allocations >CHUNK_SIZE, we'd infiniteloop on one!
69      Preconditions.checkArgument(
70        maxAlloc <= chunkSize,
71        MAX_ALLOC_KEY + " must be less than " + CHUNK_SIZE_KEY);
72    }
73  
74    /**
75     * Allocate a slice of the given length.
76     *
77     * If the size is larger than the maximum size specified for this
78     * allocator, returns null.
79     */
80    public Allocation allocateBytes(int size) {
81      Preconditions.checkArgument(size >= 0, "negative size");
82  
83      // Callers should satisfy large allocations directly from JVM since they
84      // don't cause fragmentation as badly.
85      if (size > maxAlloc) {
86        return null;
87      }
88  
89      while (true) {
90        Chunk c = getOrMakeChunk();
91  
92        // Try to allocate from this chunk
93        int allocOffset = c.alloc(size);
94        if (allocOffset != -1) {
95          // We succeeded - this is the common case - small alloc
96          // from a big buffer
97          return new Allocation(c.data, allocOffset);
98        }
99  
100       // not enough space!
101       // try to retire this chunk
102       tryRetireChunk(c);
103     }
104   }
105 
106   /**
107    * Try to retire the current chunk if it is still
108    * <code>c</code>. Postcondition is that curChunk.get()
109    * != c
110    */
111   private void tryRetireChunk(Chunk c) {
112     @SuppressWarnings("unused")
113     boolean weRetiredIt = curChunk.compareAndSet(c, null);
114     // If the CAS succeeds, that means that we won the race
115     // to retire the chunk. We could use this opportunity to
116     // update metrics on external fragmentation.
117     //
118     // If the CAS fails, that means that someone else already
119     // retired the chunk for us.
120   }
121 
122   /**
123    * Get the current chunk, or, if there is no current chunk,
124    * allocate a new one from the JVM.
125    */
126   private Chunk getOrMakeChunk() {
127     while (true) {
128       // Try to get the chunk
129       Chunk c = curChunk.get();
130       if (c != null) {
131         return c;
132       }
133 
134       // No current chunk, so we want to allocate one. We race
135       // against other allocators to CAS in an uninitialized chunk
136       // (which is cheap to allocate)
137       c = new Chunk(chunkSize);
138       if (curChunk.compareAndSet(null, c)) {
139         // we won race - now we need to actually do the expensive
140         // allocation step
141         c.init();
142         return c;
143       }
144       // someone else won race - that's fine, we'll try to grab theirs
145       // in the next iteration of the loop.
146     }
147   }
148 
149   /**
150    * A chunk of memory out of which allocations are sliced.
151    */
152   private static class Chunk {
153     /** Actual underlying data */
154     private byte[] data;
155 
156     private static final int UNINITIALIZED = -1;
157     /**
158      * Offset for the next allocation, or the sentinel value -1
159      * which implies that the chunk is still uninitialized.
160      * */
161     private AtomicInteger nextFreeOffset = new AtomicInteger(UNINITIALIZED);
162 
163     /** Total number of allocations satisfied from this buffer */
164     private AtomicInteger allocCount = new AtomicInteger();
165 
166     /** Size of chunk in bytes */
167     private final int size;
168 
169     /**
170      * Create an uninitialized chunk. Note that memory is not allocated yet, so
171      * this is cheap.
172      * @param size in bytes
173      */
174     private Chunk(int size) {
175       this.size = size;
176     }
177 
178     /**
179      * Actually claim the memory for this chunk. This should only be called from
180      * the thread that constructed the chunk. It is thread-safe against other
181      * threads calling alloc(), who will block until the allocation is complete.
182      */
183     public void init() {
184       assert nextFreeOffset.get() == UNINITIALIZED;
185       data = new byte[size];
186       // Mark that it's ready for use
187       boolean initted = nextFreeOffset.compareAndSet(
188           UNINITIALIZED, 0);
189       // We should always succeed the above CAS since only one thread
190       // calls init()!
191       Preconditions.checkState(initted,
192           "Multiple threads tried to init same chunk");
193     }
194 
195     /**
196      * Try to allocate <code>size</code> bytes from the chunk.
197      * @return the offset of the successful allocation, or -1 to indicate not-enough-space
198      */
199     public int alloc(int size) {
200       while (true) {
201         int oldOffset = nextFreeOffset.get();
202         if (oldOffset == UNINITIALIZED) {
203           // The chunk doesn't have its data allocated yet.
204           // Since we found this in curChunk, we know that whoever
205           // CAS-ed it there is allocating it right now. So spin-loop
206           // shouldn't spin long!
207           Thread.yield();
208           continue;
209         }
210 
211         if (oldOffset + size > data.length) {
212           return -1; // alloc doesn't fit
213         }
214 
215         // Try to atomically claim this chunk
216         if (nextFreeOffset.compareAndSet(oldOffset, oldOffset + size)) {
217           // we got the alloc
218           allocCount.incrementAndGet();
219           return oldOffset;
220         }
221         // we raced and lost alloc, try again
222       }
223     }
224 
225     @Override
226     public String toString() {
227       return "Chunk@" + System.identityHashCode(this) +
228         " allocs=" + allocCount.get() + "waste=" +
229         (data.length - nextFreeOffset.get());
230     }
231   }
232 
233   /**
234    * The result of a single allocation. Contains the chunk that the
235    * allocation points into, and the offset in this array where the
236    * slice begins.
237    */
238   public static class Allocation {
239     private final byte[] data;
240     private final int offset;
241 
242     private Allocation(byte[] data, int off) {
243       this.data = data;
244       this.offset = off;
245     }
246 
247     @Override
248     public String toString() {
249       return "Allocation(data=" + data +
250         " with capacity=" + data.length +
251         ", off=" + offset + ")";
252     }
253 
254     byte[] getData() {
255       return data;
256     }
257 
258     int getOffset() {
259       return offset;
260     }
261   }
262 }