
/**
 * Copyright The Apache Software Foundation
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.io.hfile.bucket;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.BlockCacheColumnFamilySummary;
import org.apache.hadoop.hbase.io.hfile.BlockCacheKey;
import org.apache.hadoop.hbase.io.hfile.CacheStats;
import org.apache.hadoop.hbase.io.hfile.Cacheable;
import org.apache.hadoop.hbase.io.hfile.CacheableDeserializer;
import org.apache.hadoop.hbase.io.hfile.CacheableDeserializerIdManager;
import org.apache.hadoop.hbase.io.hfile.CombinedBlockCache;
import org.apache.hadoop.hbase.io.hfile.HFileBlock;
import org.apache.hadoop.hbase.regionserver.StoreFile;
import org.apache.hadoop.hbase.util.ConcurrentIndex;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.HasThread;
import org.apache.hadoop.hbase.util.IdLock;
import org.apache.hadoop.util.StringUtils;

import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

/**
 * BucketCache uses {@link BucketAllocator} to allocate and free blocks, and
 * uses {@link BucketCache#ramCache} and {@link BucketCache#backingMap} to
 * determine whether a given element is a hit. It can use either memory
 * ({@link ByteBufferIOEngine}) or a file ({@link FileIOEngine}) to store/read
 * the block data.
 *
 * Eviction uses an algorithm similar to
 * {@link org.apache.hadoop.hbase.io.hfile.LruBlockCache}.
 *
 * BucketCache can be used mainly as a block cache (see
 * {@link CombinedBlockCache}), combined with LruBlockCache to decrease CMS
 * pauses and heap fragmentation caused by GC.
 *
 * It can also be used as a secondary cache (e.g. using FusionIO to store
 * blocks) to enlarge the cache space via
 * {@link org.apache.hadoop.hbase.io.hfile.LruBlockCache#setVictimCache}
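 *
 * <p>A minimal construction sketch (the engine name, capacity and thread
 * counts below are illustrative only, not recommended values):
 *
 * <pre>
 * // 1GB off-heap bucket cache, default writer setup, no persistence file
 * BucketCache cache = new BucketCache("offheap", 1024L * 1024 * 1024,
 *     BucketCache.DEFAULT_WRITER_THREADS,
 *     BucketCache.DEFAULT_WRITER_QUEUE_ITEMS, null);
 * </pre>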
 */
@InterfaceAudience.Private
public class BucketCache implements BlockCache, HeapSize {
  static final Log LOG = LogFactory.getLog(BucketCache.class);

  /** Priority buckets */
  private static final float DEFAULT_SINGLE_FACTOR = 0.25f;
  private static final float DEFAULT_MULTI_FACTOR = 0.50f;
  private static final float DEFAULT_MEMORY_FACTOR = 0.25f;
  private static final float DEFAULT_EXTRA_FREE_FACTOR = 0.10f;

  private static final float DEFAULT_ACCEPT_FACTOR = 0.95f;
  private static final float DEFAULT_MIN_FACTOR = 0.85f;

  /** Statistics thread period, in seconds */
  private static final int statThreadPeriod = 3 * 60;

  final static int DEFAULT_WRITER_THREADS = 3;
  final static int DEFAULT_WRITER_QUEUE_ITEMS = 64;

  // Store/read block data
  IOEngine ioEngine;

  // Blocks are held in this map until they have been written to the IOEngine
  private ConcurrentHashMap<BlockCacheKey, RAMQueueEntry> ramCache;
  // This map stores the block's metadata, such as offset and length
  private ConcurrentHashMap<BlockCacheKey, BucketEntry> backingMap;

  /**
   * Flag indicating whether the cache is enabled. We shut it off if there are
   * sustained IO errors, so that Bucket IO exceptions/errors don't bring down
   * the HBase server.
   */
  private volatile boolean cacheEnabled;

  private ArrayList<BlockingQueue<RAMQueueEntry>> writerQueues =
      new ArrayList<BlockingQueue<RAMQueueEntry>>();
  WriterThread[] writerThreads;

  /** Volatile boolean to track whether a free-space run is in progress */
  private volatile boolean freeInProgress = false;
  private Lock freeSpaceLock = new ReentrantLock();

  private UniqueIndexMap<Integer> deserialiserMap = new UniqueIndexMap<Integer>();

  private final AtomicLong realCacheSize = new AtomicLong(0);
  private final AtomicLong heapSize = new AtomicLong(0);
  /** Current number of cached elements */
  private final AtomicLong blockNumber = new AtomicLong(0);
  private final AtomicLong failedBlockAdditions = new AtomicLong(0);

  /** Cache access count (sequential ID) */
  private final AtomicLong accessCount = new AtomicLong(0);

  private final Object[] cacheWaitSignals;
  private static final int DEFAULT_CACHE_WAIT_TIME = 50;
  // Currently only used in tests. If the flag is false and blocks arrive
  // faster than the writer threads can drain them, the bucket cache will
  // skip caching some blocks. If the flag is true, we briefly wait for
  // blocks to be flushed to the IOEngine when the write queue is full.
  boolean wait_when_cache = false;

  private BucketCacheStats cacheStats = new BucketCacheStats();

  private String persistencePath;
  private long cacheCapacity;
  /** Approximate block size */
  private final long blockSize;

  /** Duration of IO errors tolerated before we disable the cache (1 min by default) */
  private final int ioErrorsTolerationDuration;
  // 1 min
  public static final int DEFAULT_ERROR_TOLERATION_DURATION = 60 * 1000;
  // Start time of the first IO error when reading from or writing to the IO
  // engine; reset after a successful read/write.
  private volatile long ioErrorStartTime = -1;

  /**
   * A "sparse lock" implementation that allows locking on a particular block
   * identified by offset. The purpose of this is to avoid freeing a block
   * while it is being read.
   *
   * TODO: We could extend IdLock to an IdReadWriteLock for better concurrency.
   */
  private IdLock offsetLock = new IdLock();

  private final ConcurrentIndex<String, BlockCacheKey> blocksByHFile =
      new ConcurrentIndex<String, BlockCacheKey>(new Comparator<BlockCacheKey>() {
        @Override
        public int compare(BlockCacheKey a, BlockCacheKey b) {
          if (a.getOffset() == b.getOffset()) {
            return 0;
          } else if (a.getOffset() < b.getOffset()) {
            return -1;
          }
          return 1;
        }
      });

  /** Statistics thread schedule pool (for heavy debugging, could remove) */
  private final ScheduledExecutorService scheduleThreadPool =
    Executors.newScheduledThreadPool(1,
      new ThreadFactoryBuilder()
        .setNameFormat("BucketCache Statistics #%d")
        .setDaemon(true)
        .build());

  // Allocate or free space for the block
  private BucketAllocator bucketAllocator;

  public BucketCache(String ioEngineName, long capacity, int writerThreadNum,
      int writerQLen, String persistencePath) throws FileNotFoundException,
      IOException {
    this(ioEngineName, capacity, writerThreadNum, writerQLen, persistencePath,
        DEFAULT_ERROR_TOLERATION_DURATION);
  }

  public BucketCache(String ioEngineName, long capacity, int writerThreadNum,
      int writerQLen, String persistencePath, int ioErrorsTolerationDuration)
      throws FileNotFoundException, IOException {
    this.ioEngine = getIOEngineFromName(ioEngineName, capacity);
    this.writerThreads = new WriterThread[writerThreadNum];
    this.cacheWaitSignals = new Object[writerThreadNum];
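    // 16384 here appears to be an assumed typical block size (16KB) used to
    // bound the number of blocks the cache can index: 2^31 blocks * 16KB is
    // about 32TB.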
    long blockNumCapacity = capacity / 16384;
    if (blockNumCapacity >= Integer.MAX_VALUE) {
      // Enough for about 32TB of cache!
      throw new IllegalArgumentException("Cache capacity is too large, only up to 32TB is supported now");
    }

    this.cacheCapacity = capacity;
    this.persistencePath = persistencePath;
    this.blockSize = StoreFile.DEFAULT_BLOCKSIZE_SMALL;
    this.ioErrorsTolerationDuration = ioErrorsTolerationDuration;

    bucketAllocator = new BucketAllocator(capacity);
    for (int i = 0; i < writerThreads.length; ++i) {
      writerQueues.add(new ArrayBlockingQueue<RAMQueueEntry>(writerQLen));
      this.cacheWaitSignals[i] = new Object();
    }

    assert writerQueues.size() == writerThreads.length;
    this.ramCache = new ConcurrentHashMap<BlockCacheKey, RAMQueueEntry>();

    this.backingMap = new ConcurrentHashMap<BlockCacheKey, BucketEntry>((int) blockNumCapacity);

    if (ioEngine.isPersistent() && persistencePath != null) {
      try {
        retrieveFromFile();
      } catch (IOException ioex) {
        LOG.error("Can't restore from file because of", ioex);
      } catch (ClassNotFoundException cnfe) {
        LOG.error("Can't restore from file in rebuild because we can't deserialise", cnfe);
        throw new RuntimeException(cnfe);
      }
    }
    final String threadName = Thread.currentThread().getName();
    this.cacheEnabled = true;
    for (int i = 0; i < writerThreads.length; ++i) {
      writerThreads[i] = new WriterThread(writerQueues.get(i), i);
      writerThreads[i].setName(threadName + "-BucketCacheWriter-" + i);
      writerThreads[i].start();
    }
    // Run the statistics thread periodically to print the cache statistics log
    this.scheduleThreadPool.scheduleAtFixedRate(new StatisticsThread(this),
        statThreadPeriod, statThreadPeriod, TimeUnit.SECONDS);
    LOG.info("Started bucket cache");
  }

  /**
   * Get the IOEngine from the IO engine name
   * @param ioEngineName engine name, prefixed with "file:", "offheap" or "heap"
   * @param capacity capacity of the engine, in bytes
   * @return the IOEngine
   * @throws IOException
   */
  private IOEngine getIOEngineFromName(String ioEngineName, long capacity)
      throws IOException {
    if (ioEngineName.startsWith("file:"))
      return new FileIOEngine(ioEngineName.substring(5), capacity);
    else if (ioEngineName.startsWith("offheap"))
      return new ByteBufferIOEngine(capacity, true);
    else if (ioEngineName.startsWith("heap"))
      return new ByteBufferIOEngine(capacity, false);
    else
      throw new IllegalArgumentException(
          "Don't understand io engine name for cache - prefix with file:, heap or offheap");
  }

  /**
   * Cache the block with the specified name and buffer.
   * @param cacheKey block's cache key
   * @param buf block buffer
   */
  @Override
  public void cacheBlock(BlockCacheKey cacheKey, Cacheable buf) {
    cacheBlock(cacheKey, buf, false);
  }

  /**
   * Cache the block with the specified name and buffer.
   * @param cacheKey block's cache key
   * @param cachedItem block buffer
   * @param inMemory if block is in-memory
   */
  @Override
  public void cacheBlock(BlockCacheKey cacheKey, Cacheable cachedItem, boolean inMemory) {
    cacheBlockWithWait(cacheKey, cachedItem, inMemory, wait_when_cache);
  }

  /**
   * Cache the block to ramCache
   * @param cacheKey block's cache key
   * @param cachedItem block buffer
   * @param inMemory if block is in-memory
   * @param wait if true, wait (blocking) when the write queue is full
   */
  public void cacheBlockWithWait(BlockCacheKey cacheKey, Cacheable cachedItem,
      boolean inMemory, boolean wait) {
    if (!cacheEnabled)
      return;

    if (backingMap.containsKey(cacheKey) || ramCache.containsKey(cacheKey))
      return;

    /*
     * Stuff the entry into the RAM cache so it can get drained to the
     * persistent store
     */
    RAMQueueEntry re = new RAMQueueEntry(cacheKey, cachedItem,
        accessCount.incrementAndGet(), inMemory);
    ramCache.put(cacheKey, re);
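    // Pick a writer queue by key hash; masking with 0x7FFFFFFF clears the
    // sign bit so the index is always non-negative.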
    int queueNum = (cacheKey.hashCode() & 0x7FFFFFFF) % writerQueues.size();
    BlockingQueue<RAMQueueEntry> bq = writerQueues.get(queueNum);
    boolean successfulAddition = bq.offer(re);
    if (!successfulAddition && wait) {
      synchronized (cacheWaitSignals[queueNum]) {
        try {
          cacheWaitSignals[queueNum].wait(DEFAULT_CACHE_WAIT_TIME);
        } catch (InterruptedException ie) {
          Thread.currentThread().interrupt();
        }
      }
      successfulAddition = bq.offer(re);
    }
    if (!successfulAddition) {
      ramCache.remove(cacheKey);
      failedBlockAdditions.incrementAndGet();
    } else {
      this.blockNumber.incrementAndGet();
      this.heapSize.addAndGet(cachedItem.heapSize());
      blocksByHFile.put(cacheKey.getHfileName(), cacheKey);
    }
  }

  /**
   * Get the buffer of the block with the specified key.
   * @param key block's cache key
   * @param caching true if the caller caches blocks on cache misses
   * @param repeat Whether this is a repeat lookup for the same block
   * @return buffer of specified cache key, or null if not in cache
   */
  @Override
  public Cacheable getBlock(BlockCacheKey key, boolean caching, boolean repeat) {
    if (!cacheEnabled)
      return null;
    RAMQueueEntry re = ramCache.get(key);
    if (re != null) {
      cacheStats.hit(caching);
      re.access(accessCount.incrementAndGet());
      return re.getData();
    }
    BucketEntry bucketEntry = backingMap.get(key);
    if (bucketEntry != null) {
      long start = System.nanoTime();
      IdLock.Entry lockEntry = null;
      try {
        lockEntry = offsetLock.getLockEntry(bucketEntry.offset());
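        // Re-check under the offset lock: the entry may have been evicted
        // (and its offset reused) while we were waiting for the lock.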
        if (bucketEntry.equals(backingMap.get(key))) {
          int len = bucketEntry.getLength();
          ByteBuffer bb = ByteBuffer.allocate(len);
          ioEngine.read(bb, bucketEntry.offset());
          Cacheable cachedBlock = bucketEntry.deserializerReference(
              deserialiserMap).deserialize(bb, true);
          long timeTaken = System.nanoTime() - start;
          cacheStats.hit(caching);
          cacheStats.ioHit(timeTaken);
          bucketEntry.access(accessCount.incrementAndGet());
          if (this.ioErrorStartTime > 0) {
            ioErrorStartTime = -1;
          }
          return cachedBlock;
        }
      } catch (IOException ioex) {
        LOG.error("Failed reading block " + key + " from bucket cache", ioex);
        checkIOErrorIsTolerated();
      } finally {
        if (lockEntry != null) {
          offsetLock.releaseLockEntry(lockEntry);
        }
      }
    }
    if (!repeat) cacheStats.miss(caching);
    return null;
  }

  @Override
  public boolean evictBlock(BlockCacheKey cacheKey) {
    if (!cacheEnabled) return false;
    RAMQueueEntry removedBlock = ramCache.remove(cacheKey);
    if (removedBlock != null) {
      this.blockNumber.decrementAndGet();
      this.heapSize.addAndGet(-1 * removedBlock.getData().heapSize());
    }
    BucketEntry bucketEntry = backingMap.get(cacheKey);
    if (bucketEntry != null) {
      IdLock.Entry lockEntry = null;
      try {
        lockEntry = offsetLock.getLockEntry(bucketEntry.offset());
        if (bucketEntry.equals(backingMap.remove(cacheKey))) {
          bucketAllocator.freeBlock(bucketEntry.offset());
          realCacheSize.addAndGet(-1 * bucketEntry.getLength());
          blocksByHFile.remove(cacheKey.getHfileName(), cacheKey);
          if (removedBlock == null) {
            this.blockNumber.decrementAndGet();
          }
        } else {
          return false;
        }
      } catch (IOException ie) {
        LOG.warn("Failed evicting block " + cacheKey, ie);
        return false;
      } finally {
        if (lockEntry != null) {
          offsetLock.releaseLockEntry(lockEntry);
        }
      }
    }
    cacheStats.evicted();
    return true;
  }

  /*
   * Statistics thread.  Periodically prints the cache statistics to the log.
   */
  private static class StatisticsThread extends Thread {
    BucketCache bucketCache;

    public StatisticsThread(BucketCache bucketCache) {
      super("BucketCache.StatisticsThread");
      setDaemon(true);
      this.bucketCache = bucketCache;
    }

    @Override
    public void run() {
      bucketCache.logStats();
    }
  }

  public void logStats() {
    if (!LOG.isDebugEnabled()) return;
    // Log size
    long totalSize = bucketAllocator.getTotalSize();
    long usedSize = bucketAllocator.getUsedSize();
    long freeSize = totalSize - usedSize;
    long cacheSize = this.realCacheSize.get();
    LOG.debug("BucketCache Stats: " +
        "failedBlockAdditions=" + this.failedBlockAdditions.get() + ", " +
        "total=" + StringUtils.byteDesc(totalSize) + ", " +
        "free=" + StringUtils.byteDesc(freeSize) + ", " +
        "usedSize=" + StringUtils.byteDesc(usedSize) + ", " +
        "cacheSize=" + StringUtils.byteDesc(cacheSize) + ", " +
        "accesses=" + cacheStats.getRequestCount() + ", " +
        "hits=" + cacheStats.getHitCount() + ", " +
        "IOhitsPerSecond=" + cacheStats.getIOHitsPerSecond() + ", " +
        "IOTimePerHit=" + String.format("%.2f", cacheStats.getIOTimePerHit()) + ", " +
        "hitRatio=" + (cacheStats.getHitCount() == 0 ? "0," :
          (StringUtils.formatPercent(cacheStats.getHitRatio(), 2) + ", ")) +
        "cachingAccesses=" + cacheStats.getRequestCachingCount() + ", " +
        "cachingHits=" + cacheStats.getHitCachingCount() + ", " +
        "cachingHitsRatio=" + (cacheStats.getHitCachingCount() == 0 ? "0," :
          (StringUtils.formatPercent(cacheStats.getHitCachingRatio(), 2) + ", ")) +
        "evictions=" + cacheStats.getEvictionCount() + ", " +
        "evicted=" + cacheStats.getEvictedCount() + ", " +
        "evictedPerRun=" + cacheStats.evictedPerEviction());
    cacheStats.reset();
  }

  private long acceptableSize() {
    return (long) Math.floor(bucketAllocator.getTotalSize() * DEFAULT_ACCEPT_FACTOR);
  }

  private long minSize() {
    return (long) Math.floor(bucketAllocator.getTotalSize() * DEFAULT_MIN_FACTOR);
  }

  private long singleSize() {
    return (long) Math.floor(bucketAllocator.getTotalSize()
        * DEFAULT_SINGLE_FACTOR * DEFAULT_MIN_FACTOR);
  }

  private long multiSize() {
    return (long) Math.floor(bucketAllocator.getTotalSize() * DEFAULT_MULTI_FACTOR
        * DEFAULT_MIN_FACTOR);
  }

  private long memorySize() {
    return (long) Math.floor(bucketAllocator.getTotalSize() * DEFAULT_MEMORY_FACTOR
        * DEFAULT_MIN_FACTOR);
  }
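
  // With the default factors, a 10GB cache starts freeing space above 9.5GB
  // used (acceptableSize), frees down toward the 8.5GB floor (minSize), and
  // splits that floor roughly 25% single / 50% multi / 25% memory.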

  /**
   * Free space if the used size exceeds acceptableSize() or a block of a
   * given size couldn't be allocated. When freeing space, we use an LRU-style
   * algorithm and make sure some blocks do get evicted.
   */
  private void freeSpace() {
    // Ensure only one freeSpace run is in progress at a time
    if (!freeSpaceLock.tryLock()) return;
    try {
      freeInProgress = true;
      long bytesToFreeWithoutExtra = 0;
      /*
       * Calculate the bytes to free for each bucketSizeInfo
       */
      StringBuffer msgBuffer = new StringBuffer();
      BucketAllocator.IndexStatistics[] stats = bucketAllocator.getIndexStatistics();
      long[] bytesToFreeForBucket = new long[stats.length];
      for (int i = 0; i < stats.length; i++) {
        bytesToFreeForBucket[i] = 0;
        long freeGoal = (long) Math.floor(stats[i].totalCount()
            * (1 - DEFAULT_MIN_FACTOR));
        freeGoal = Math.max(freeGoal, 1);
        if (stats[i].freeCount() < freeGoal) {
          bytesToFreeForBucket[i] = stats[i].itemSize()
              * (freeGoal - stats[i].freeCount());
          bytesToFreeWithoutExtra += bytesToFreeForBucket[i];
          msgBuffer.append("Free for bucketSize(" + stats[i].itemSize() + ")="
              + StringUtils.byteDesc(bytesToFreeForBucket[i]) + ", ");
        }
      }
      msgBuffer.append("Free for total="
          + StringUtils.byteDesc(bytesToFreeWithoutExtra) + ", ");

      if (bytesToFreeWithoutExtra <= 0) {
        return;
      }
      long currentSize = bucketAllocator.getUsedSize();
      long totalSize = bucketAllocator.getTotalSize();
      LOG.debug("Bucket cache free space started; attempting to " + msgBuffer.toString()
          + " of current used=" + StringUtils.byteDesc(currentSize)
          + ", actual cacheSize=" + StringUtils.byteDesc(realCacheSize.get())
          + ", total=" + StringUtils.byteDesc(totalSize));

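      // Free a little more than strictly required so we don't immediately
      // have to free again; e.g. with DEFAULT_EXTRA_FREE_FACTOR = 0.10,
      // needing 100MB frees up to 110MB.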
      long bytesToFreeWithExtra = (long) Math.floor(bytesToFreeWithoutExtra
          * (1 + DEFAULT_EXTRA_FREE_FACTOR));

      // Instantiate priority buckets
      BucketEntryGroup bucketSingle = new BucketEntryGroup(bytesToFreeWithExtra,
          blockSize, singleSize());
      BucketEntryGroup bucketMulti = new BucketEntryGroup(bytesToFreeWithExtra,
          blockSize, multiSize());
      BucketEntryGroup bucketMemory = new BucketEntryGroup(bytesToFreeWithExtra,
          blockSize, memorySize());

      // Scan the entire map, putting each bucket entry into the appropriate
      // bucket entry group
      for (Map.Entry<BlockCacheKey, BucketEntry> bucketEntryWithKey : backingMap.entrySet()) {
        switch (bucketEntryWithKey.getValue().getPriority()) {
          case SINGLE: {
            bucketSingle.add(bucketEntryWithKey);
            break;
          }
          case MULTI: {
            bucketMulti.add(bucketEntryWithKey);
            break;
          }
          case MEMORY: {
            bucketMemory.add(bucketEntryWithKey);
            break;
          }
        }
      }

      PriorityQueue<BucketEntryGroup> bucketQueue = new PriorityQueue<BucketEntryGroup>(3);

      bucketQueue.add(bucketSingle);
      bucketQueue.add(bucketMulti);
      bucketQueue.add(bucketMemory);

      int remainingBuckets = 3;
      long bytesFreed = 0;

      BucketEntryGroup bucketGroup;
      while ((bucketGroup = bucketQueue.poll()) != null) {
        long overflow = bucketGroup.overflow();
        if (overflow > 0) {
          long bucketBytesToFree = Math.min(overflow,
              (bytesToFreeWithoutExtra - bytesFreed) / remainingBuckets);
          bytesFreed += bucketGroup.free(bucketBytesToFree);
        }
        remainingBuckets--;
      }

      /*
       * Check whether an extra free pass is needed because some bucketSizeInfo
       * still needs free space
       */
      stats = bucketAllocator.getIndexStatistics();
      boolean needFreeForExtra = false;
      for (int i = 0; i < stats.length; i++) {
        long freeGoal = (long) Math.floor(stats[i].totalCount()
            * (1 - DEFAULT_MIN_FACTOR));
        freeGoal = Math.max(freeGoal, 1);
        if (stats[i].freeCount() < freeGoal) {
          needFreeForExtra = true;
          break;
        }
      }

      if (needFreeForExtra) {
        bucketQueue.clear();
        remainingBuckets = 2;

        bucketQueue.add(bucketSingle);
        bucketQueue.add(bucketMulti);

        while ((bucketGroup = bucketQueue.poll()) != null) {
          long bucketBytesToFree = (bytesToFreeWithExtra - bytesFreed)
              / remainingBuckets;
          bytesFreed += bucketGroup.free(bucketBytesToFree);
          remainingBuckets--;
        }
      }

      if (LOG.isDebugEnabled()) {
        long single = bucketSingle.totalSize();
        long multi = bucketMulti.totalSize();
        long memory = bucketMemory.totalSize();
        LOG.debug("Bucket cache free space completed; " + "freed="
            + StringUtils.byteDesc(bytesFreed) + ", " + "total="
            + StringUtils.byteDesc(totalSize) + ", " + "single="
            + StringUtils.byteDesc(single) + ", " + "multi="
            + StringUtils.byteDesc(multi) + ", " + "memory="
            + StringUtils.byteDesc(memory));
      }

    } finally {
      cacheStats.evict();
      freeInProgress = false;
      freeSpaceLock.unlock();
    }
  }

  // This handles flushing the RAM cache to the IOEngine.
  private class WriterThread extends HasThread {
    BlockingQueue<RAMQueueEntry> inputQueue;
    final int threadNO;
    boolean writerEnabled = true;

    WriterThread(BlockingQueue<RAMQueueEntry> queue, int threadNO) {
      super();
      this.inputQueue = queue;
      this.threadNO = threadNO;
      setDaemon(true);
    }

    // Used for tests
    void disableWriter() {
      this.writerEnabled = false;
    }

    public void run() {
      List<RAMQueueEntry> entries = new ArrayList<RAMQueueEntry>();
      try {
        while (cacheEnabled && writerEnabled) {
          try {
            // Blocks until at least one entry is available
            entries.add(inputQueue.take());
            inputQueue.drainTo(entries);
            synchronized (cacheWaitSignals[threadNO]) {
              cacheWaitSignals[threadNO].notifyAll();
            }
          } catch (InterruptedException ie) {
            if (!cacheEnabled) break;
          }
          doDrain(entries);
        }
      } catch (Throwable t) {
        LOG.warn("Failed doing drain", t);
      }
      LOG.info(this.getName() + " exiting, cacheEnabled=" + cacheEnabled);
    }

    /**
     * Flush the entries in ramCache to the IOEngine and add bucket entries to
     * the backingMap
     * @param entries
     * @throws InterruptedException
     */
    private void doDrain(List<RAMQueueEntry> entries)
        throws InterruptedException {
      BucketEntry[] bucketEntries = new BucketEntry[entries.size()];
      RAMQueueEntry[] ramEntries = new RAMQueueEntry[entries.size()];
      int done = 0;
      while (entries.size() > 0 && cacheEnabled) {
        // Keep going in case we throw...
        RAMQueueEntry ramEntry = null;
        try {
          ramEntry = entries.remove(entries.size() - 1);
          if (ramEntry == null) {
            LOG.warn("Couldn't get an entry from the RAM queue; was it stolen?");
            continue;
          }
          BucketEntry bucketEntry = ramEntry.writeToCache(ioEngine,
              bucketAllocator, deserialiserMap, realCacheSize);
          ramEntries[done] = ramEntry;
          bucketEntries[done++] = bucketEntry;
          if (ioErrorStartTime > 0) {
            ioErrorStartTime = -1;
          }
        } catch (BucketAllocatorException fle) {
          LOG.warn("Failed allocating for block "
              + (ramEntry == null ? "" : ramEntry.getKey()), fle);
        } catch (CacheFullException cfe) {
          if (!freeInProgress) {
            freeSpace();
          } else {
            Thread.sleep(50);
          }
        } catch (IOException ioex) {
          LOG.error("Failed writing to bucket cache", ioex);
          checkIOErrorIsTolerated();
        }
      }

      // Make sure that the data pages we have written are on the media before
      // we update the map.
      try {
        ioEngine.sync();
      } catch (IOException ioex) {
        LOG.error("Failed syncing IO engine", ioex);
        checkIOErrorIsTolerated();
        // Since we failed to sync, free the blocks in the bucket allocator
        for (int i = 0; i < done; ++i) {
          if (bucketEntries[i] != null) {
            bucketAllocator.freeBlock(bucketEntries[i].offset());
          }
        }
        done = 0;
      }

      for (int i = 0; i < done; ++i) {
        if (bucketEntries[i] != null) {
          backingMap.put(ramEntries[i].getKey(), bucketEntries[i]);
        }
        RAMQueueEntry ramCacheEntry = ramCache.remove(ramEntries[i].getKey());
        if (ramCacheEntry != null) {
          heapSize.addAndGet(-1 * ramEntries[i].getData().heapSize());
        }
      }

      if (bucketAllocator.getUsedSize() > acceptableSize()) {
        freeSpace();
      }
    }
  }

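  /*
   * Persistence file layout, in write order: cache capacity (long), IOEngine
   * class name (UTF), backing map class name (UTF), the deserialiser map
   * object, then the backing map object. retrieveFromFile() reads the same
   * sequence back and validates the first three fields.
   */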
  private void persistToFile() throws IOException {
    assert !cacheEnabled;
    FileOutputStream fos = null;
    ObjectOutputStream oos = null;
    try {
      if (!ioEngine.isPersistent())
        throw new IOException(
            "Attempt to persist non-persistent cache mappings!");
      fos = new FileOutputStream(persistencePath, false);
      oos = new ObjectOutputStream(fos);
      oos.writeLong(cacheCapacity);
      oos.writeUTF(ioEngine.getClass().getName());
      oos.writeUTF(backingMap.getClass().getName());
      oos.writeObject(deserialiserMap);
      oos.writeObject(backingMap);
    } finally {
      if (oos != null) oos.close();
      if (fos != null) fos.close();
    }
  }

  @SuppressWarnings("unchecked")
  private void retrieveFromFile() throws IOException, BucketAllocatorException,
      ClassNotFoundException {
    File persistenceFile = new File(persistencePath);
    if (!persistenceFile.exists()) {
      return;
    }
    assert !cacheEnabled;
    FileInputStream fis = null;
    ObjectInputStream ois = null;
    try {
      if (!ioEngine.isPersistent())
        throw new IOException(
            "Attempt to restore non-persistent cache mappings!");
      fis = new FileInputStream(persistencePath);
      ois = new ObjectInputStream(fis);
      long capacitySize = ois.readLong();
      if (capacitySize != cacheCapacity)
        throw new IOException("Mismatched cache capacity: "
            + StringUtils.byteDesc(capacitySize) + ", expected: "
            + StringUtils.byteDesc(cacheCapacity));
      String ioclass = ois.readUTF();
      String mapclass = ois.readUTF();
      if (!ioEngine.getClass().getName().equals(ioclass))
        throw new IOException("Class name for IO engine mismatch: " + ioclass
            + ", expected: " + ioEngine.getClass().getName());
      if (!backingMap.getClass().getName().equals(mapclass))
        throw new IOException("Class name for cache map mismatch: " + mapclass
            + ", expected: " + backingMap.getClass().getName());
      UniqueIndexMap<Integer> deserMap = (UniqueIndexMap<Integer>) ois
          .readObject();
      // Read the persisted map first, then rebuild the allocator from it so
      // the allocator marks the restored blocks as in use.
      backingMap = (ConcurrentHashMap<BlockCacheKey, BucketEntry>) ois
          .readObject();
      bucketAllocator = new BucketAllocator(cacheCapacity, backingMap,
          this.realCacheSize);
      deserialiserMap = deserMap;
    } finally {
      if (ois != null) ois.close();
      if (fis != null) fis.close();
      if (!persistenceFile.delete()) {
        throw new IOException("Failed deleting persistence file "
            + persistenceFile.getAbsolutePath());
      }
    }
  }

  /**
   * Check whether we tolerate the IO error this time. If the duration of the
   * IOEngine throwing errors exceeds ioErrorsTolerationDuration, we will
   * disable the cache
   */
  private void checkIOErrorIsTolerated() {
    long now = EnvironmentEdgeManager.currentTimeMillis();
    if (this.ioErrorStartTime > 0) {
      if (cacheEnabled
          && (now - ioErrorStartTime) > this.ioErrorsTolerationDuration) {
        LOG.error("IO errors duration time has exceeded "
            + ioErrorsTolerationDuration
            + "ms, disabling cache, please check your IOEngine");
        disableCache();
      }
    } else {
      this.ioErrorStartTime = now;
    }
  }

  /**
   * Used to shut down the cache -or- turn it off in the case of something
   * broken.
   */
  private void disableCache() {
    if (!cacheEnabled)
      return;
    cacheEnabled = false;
    ioEngine.shutdown();
    this.scheduleThreadPool.shutdown();
    for (int i = 0; i < writerThreads.length; ++i)
      writerThreads[i].interrupt();
    this.ramCache.clear();
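    // Keep the backing map when the IOEngine is persistent and a persistence
    // path is set, so shutdown() can still write it out via persistToFile().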
    if (!ioEngine.isPersistent() || persistencePath == null) {
      this.backingMap.clear();
    }
  }

  private void join() throws InterruptedException {
    for (int i = 0; i < writerThreads.length; ++i)
      writerThreads[i].join();
  }

  @Override
  public void shutdown() {
    disableCache();
    LOG.info("Shutdown bucket cache: IO persistent=" + ioEngine.isPersistent()
        + "; path to write=" + persistencePath);
    if (ioEngine.isPersistent() && persistencePath != null) {
      try {
        join();
        persistToFile();
      } catch (IOException ex) {
        LOG.error("Unable to persist data on exit: " + ex.toString(), ex);
      } catch (InterruptedException e) {
        LOG.warn("Failed to persist data on exit", e);
      }
    }
  }

  @Override
  public CacheStats getStats() {
    return cacheStats;
  }

  BucketAllocator getAllocator() {
    return this.bucketAllocator;
  }

  @Override
  public long heapSize() {
    return this.heapSize.get();
  }

  /**
   * Returns the total size of the block cache, in bytes.
   * @return size of cache, in bytes
   */
  @Override
  public long size() {
    return this.realCacheSize.get();
  }

  @Override
  public long getFreeSize() {
    return this.bucketAllocator.getFreeSize();
  }

  @Override
  public long getBlockCount() {
    return this.blockNumber.get();
  }

  /**
   * Returns the occupied size of the block cache, in bytes.
   * @return occupied space in cache, in bytes
   */
  @Override
  public long getCurrentSize() {
    return this.bucketAllocator.getUsedSize();
  }

  @Override
  public long getEvictedCount() {
    return cacheStats.getEvictedCount();
  }

  /**
   * Evicts all blocks for a specific HFile.
   * <p>
   * This is used for evict-on-close to remove all blocks of a specific HFile.
   *
   * @return the number of blocks evicted
   */
  @Override
  public int evictBlocksByHfileName(String hfileName) {
    // Copy the list to avoid ConcurrentModificationException,
    // as evictBlock removes the key from the index
    Set<BlockCacheKey> keySet = blocksByHFile.values(hfileName);
    if (keySet == null) {
      return 0;
    }
    int numEvicted = 0;
    List<BlockCacheKey> keysForHFile = ImmutableList.copyOf(keySet);
    for (BlockCacheKey key : keysForHFile) {
      if (evictBlock(key)) {
        ++numEvicted;
      }
    }

    return numEvicted;
  }

  @Override
  public List<BlockCacheColumnFamilySummary> getBlockCacheColumnFamilySummaries(
      Configuration conf) {
    throw new UnsupportedOperationException();
  }

  static enum BlockPriority {
    /**
     * Accessed a single time (used for scan-resistance)
     */
    SINGLE,
    /**
     * Accessed multiple times
     */
    MULTI,
    /**
     * Block from in-memory store
     */
    MEMORY
  }

  /**
   * Item in cache. We expect this to be where most memory goes. Java uses 8
   * bytes just for object headers; after this, we want to use as little as
   * possible - so we only use 8 bytes, but in order to do so we end up messing
   * around with all this Java casting stuff. The offset is stored as 5 bytes
   * that make up the long. Doubt we'll see devices this big for ages. Offsets
   * are divided by 256. So 5 bytes gives us 256TB or so.
   */
  static class BucketEntry implements Serializable, Comparable<BucketEntry> {
    private static final long serialVersionUID = -6741504807982257534L;
    private int offsetBase;
    private int length;
    private byte offset1;
    byte deserialiserIndex;
    private volatile long accessTime;
    private BlockPriority priority;

    BucketEntry(long offset, int length, long accessTime, boolean inMemory) {
      setOffset(offset);
      this.length = length;
      this.accessTime = accessTime;
      if (inMemory) {
        this.priority = BlockPriority.MEMORY;
      } else {
        this.priority = BlockPriority.SINGLE;
      }
    }

    long offset() { // Java has no unsigned numbers
      // Note the L suffix: masking with the int literal 0xFFFFFFFF would
      // sign-extend to -1L and leave a negative offsetBase unmasked.
      long o = ((long) offsetBase) & 0xFFFFFFFFL;
      o += (((long) (offset1)) & 0xFF) << 32;
      return o << 8;
    }

    private void setOffset(long value) {
      assert (value & 0xFF) == 0;
      value >>= 8;
      offsetBase = (int) value;
      offset1 = (byte) (value >> 32);
    }
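
    // Worked example: offset = 0x12345678900L (256-byte aligned).
    //   setOffset: value >> 8 = 0x123456789; offsetBase = 0x23456789
    //   (low 32 bits), offset1 = 0x01 (bits 32-39).
    //   offset(): 0x23456789 + (0x01L << 32) = 0x123456789, << 8 = 0x12345678900.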

    public int getLength() {
      return length;
    }

    protected CacheableDeserializer<Cacheable> deserializerReference(
        UniqueIndexMap<Integer> deserialiserMap) {
      return CacheableDeserializerIdManager.getDeserializer(deserialiserMap
          .unmap(deserialiserIndex));
    }

    protected void setDeserialiserReference(
        CacheableDeserializer<Cacheable> deserializer,
        UniqueIndexMap<Integer> deserialiserMap) {
      this.deserialiserIndex = ((byte) deserialiserMap.map(deserializer
          .getDeserialiserIdentifier()));
    }

    /**
     * Block has been accessed. Update its local access time.
     */
    public void access(long accessTime) {
      this.accessTime = accessTime;
      if (this.priority == BlockPriority.SINGLE) {
        this.priority = BlockPriority.MULTI;
      }
    }

    public BlockPriority getPriority() {
      return this.priority;
    }

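    // Orders entries by access time, most recently used first, so an eviction
    // queue that polls from its tail removes the least recently used entry.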
    @Override
    public int compareTo(BucketEntry that) {
      if (this.accessTime == that.accessTime) return 0;
      return this.accessTime < that.accessTime ? 1 : -1;
    }

    @Override
    public boolean equals(Object that) {
      return this == that;
    }
  }

  /**
   * Used to group bucket entries into priority buckets. There will be a
   * BucketEntryGroup for each priority (single, multi, memory). Once bucketed,
   * the eviction algorithm takes the appropriate number of elements out of each
   * according to configuration parameters and their relative sizes.
   */
  private class BucketEntryGroup implements Comparable<BucketEntryGroup> {

    private CachedEntryQueue queue;
    private long totalSize = 0;
    private long bucketSize;

    public BucketEntryGroup(long bytesToFree, long blockSize, long bucketSize) {
      this.bucketSize = bucketSize;
      queue = new CachedEntryQueue(bytesToFree, blockSize);
      totalSize = 0;
    }

    public void add(Map.Entry<BlockCacheKey, BucketEntry> block) {
      totalSize += block.getValue().getLength();
      queue.add(block);
    }

    public long free(long toFree) {
      Map.Entry<BlockCacheKey, BucketEntry> entry;
      long freedBytes = 0;
      while ((entry = queue.pollLast()) != null) {
        evictBlock(entry.getKey());
        freedBytes += entry.getValue().getLength();
        if (freedBytes >= toFree) {
          return freedBytes;
        }
      }
      return freedBytes;
    }

    public long overflow() {
      return totalSize - bucketSize;
    }

    public long totalSize() {
      return totalSize;
    }

    @Override
    public int compareTo(BucketEntryGroup that) {
      if (this.overflow() == that.overflow())
        return 0;
      return this.overflow() > that.overflow() ? 1 : -1;
    }

    @Override
    public boolean equals(Object that) {
      return this == that;
    }

  }

  /**
   * Block entry stored in the RAM queue, holding the key, the data and
   * related bookkeeping.
   */
  private static class RAMQueueEntry {
    private BlockCacheKey key;
    private Cacheable data;
    private long accessTime;
    private boolean inMemory;

    public RAMQueueEntry(BlockCacheKey bck, Cacheable data, long accessTime,
        boolean inMemory) {
      this.key = bck;
      this.data = data;
      this.accessTime = accessTime;
      this.inMemory = inMemory;
    }

    public Cacheable getData() {
      return data;
    }

    public BlockCacheKey getKey() {
      return key;
    }

    public void access(long accessTime) {
      this.accessTime = accessTime;
    }

    public BucketEntry writeToCache(final IOEngine ioEngine,
        final BucketAllocator bucketAllocator,
        final UniqueIndexMap<Integer> deserialiserMap,
        final AtomicLong realCacheSize) throws CacheFullException, IOException,
        BucketAllocatorException {
      int len = data.getSerializedLength();
      // This cacheable thing can't be serialized...
      if (len == 0) return null;
      long offset = bucketAllocator.allocateBlock(len);
      BucketEntry bucketEntry = new BucketEntry(offset, len, accessTime,
          inMemory);
      bucketEntry.setDeserialiserReference(data.getDeserializer(), deserialiserMap);
      try {
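        // For an HFileBlock we can write its backing buffer directly and
        // append the extra serialization info, avoiding a full serialize-copy;
        // other Cacheables are serialized into a temporary buffer first.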
        if (data instanceof HFileBlock) {
          ByteBuffer sliceBuf = ((HFileBlock) data).getBufferReadOnlyWithHeader();
          sliceBuf.rewind();
          assert len == sliceBuf.limit() + HFileBlock.EXTRA_SERIALIZATION_SPACE;
          ByteBuffer extraInfoBuffer = ByteBuffer.allocate(HFileBlock.EXTRA_SERIALIZATION_SPACE);
          ((HFileBlock) data).serializeExtraInfo(extraInfoBuffer);
          ioEngine.write(sliceBuf, offset);
          ioEngine.write(extraInfoBuffer, offset + len - HFileBlock.EXTRA_SERIALIZATION_SPACE);
        } else {
          ByteBuffer bb = ByteBuffer.allocate(len);
          data.serialize(bb);
          ioEngine.write(bb, offset);
        }
      } catch (IOException ioe) {
        // free it in bucket allocator
        bucketAllocator.freeBlock(offset);
        throw ioe;
      }

      realCacheSize.addAndGet(len);
      return bucketEntry;
    }
  }

  /**
   * Only used in tests
   * @throws InterruptedException
   */
  void stopWriterThreads() throws InterruptedException {
    for (WriterThread writerThread : writerThreads) {
      writerThread.disableWriter();
      writerThread.interrupt();
      writerThread.join();
    }
  }

}