View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  package org.apache.hadoop.hbase.regionserver;
20  
21  import java.util.LinkedList;
22  
23  import org.apache.hadoop.classification.InterfaceAudience;
24  import org.apache.hadoop.hbase.util.Bytes;
25  import org.apache.hadoop.hbase.util.ClassSize;
26  
27  import org.apache.commons.logging.LogFactory;
28  import org.apache.commons.logging.Log;
29  
30  /**
31   * Manages the read/write consistency within memstore. This provides
32   * an interface for readers to determine what entries to ignore, and
33   * a mechanism for writers to obtain new write numbers, then "commit"
34   * the new writes for readers to read (thus forming atomic transactions).
35   */
36  @InterfaceAudience.Private
37  public class MultiVersionConsistencyControl {
38    private volatile long memstoreRead = 0;
39    private volatile long memstoreWrite = 0;
40  
41    private final Object readWaiters = new Object();
42  
43    // This is the pending queue of writes.
44    private final LinkedList<WriteEntry> writeQueue =
45        new LinkedList<WriteEntry>();
46  
47    private static final ThreadLocal<Long> perThreadReadPoint =
48        new ThreadLocal<Long>() {
49         @Override
50        protected
51         Long initialValue() {
52           return Long.MAX_VALUE;
53         }
54    };
55  
56    /**
57     * Default constructor. Initializes the memstoreRead/Write points to 0.
58     */
59    public MultiVersionConsistencyControl() {
60      this.memstoreRead = this.memstoreWrite = 0;
61    }
62  
63    /**
64     * Initializes the memstoreRead/Write points appropriately.
65     * @param startPoint
66     */
67    public void initialize(long startPoint) {
68      synchronized (writeQueue) {
69        if (this.memstoreWrite != this.memstoreRead) {
70          throw new RuntimeException("Already used this mvcc. Too late to initialize");
71        }
72  
73        this.memstoreRead = this.memstoreWrite = startPoint;
74      }
75    }
76  
77    /**
78     * Get this thread's read point. Used primarily by the memstore scanner to
79     * know which values to skip (ie: have not been completed/committed to
80     * memstore).
81     */
82    public static long getThreadReadPoint() {
83        return perThreadReadPoint.get();
84    }
85  
86    /**
87     * Set the thread read point to the given value. The thread MVCC
88     * is used by the Memstore scanner so it knows which values to skip.
89     * Give it a value of 0 if you want everything.
90     */
91    public static void setThreadReadPoint(long readPoint) {
92      perThreadReadPoint.set(readPoint);
93    }
94  
95    /**
96     * Set the thread MVCC read point to whatever the current read point is in
97     * this particular instance of MVCC.  Returns the new thread read point value.
98     */
99    public static long resetThreadReadPoint(MultiVersionConsistencyControl mvcc) {
100     perThreadReadPoint.set(mvcc.memstoreReadPoint());
101     return getThreadReadPoint();
102   }
103 
104   /**
105    * Set the thread MVCC read point to 0 (include everything).
106    */
107   public static void resetThreadReadPoint() {
108     perThreadReadPoint.set(0L);
109   }
110 
111   /**
112    * Generate and return a {@link WriteEntry} with a new write number.
113    * To complete the WriteEntry and wait for it to be visible,
114    * call {@link #completeMemstoreInsert(WriteEntry)}.
115    */
116   public WriteEntry beginMemstoreInsert() {
117     synchronized (writeQueue) {
118       long nextWriteNumber = ++memstoreWrite;
119       WriteEntry e = new WriteEntry(nextWriteNumber);
120       writeQueue.add(e);
121       return e;
122     }
123   }
124 
125   /**
126    * Complete a {@link WriteEntry} that was created by {@link #beginMemstoreInsert()}.
127    *
128    * At the end of this call, the global read point is at least as large as the write point
129    * of the passed in WriteEntry.  Thus, the write is visible to MVCC readers.
130    */
131   public void completeMemstoreInsert(WriteEntry e) {
132     advanceMemstore(e);
133     waitForRead(e);
134   }
135 
136   /**
137    * Mark the {@link WriteEntry} as complete and advance the read point as
138    * much as possible.
139    *
140    * How much is the read point advanced?
141    * Let S be the set of all write numbers that are completed and where all previous write numbers
142    * are also completed.  Then, the read point is advanced to the supremum of S.
143    *
144    * @param e
145    * @return true if e is visible to MVCC readers (that is, readpoint >= e.writeNumber)
146    */
147   boolean advanceMemstore(WriteEntry e) {
148     synchronized (writeQueue) {
149       e.markCompleted();
150 
151       long nextReadValue = -1;
152       boolean ranOnce=false;
153       while (!writeQueue.isEmpty()) {
154         ranOnce=true;
155         WriteEntry queueFirst = writeQueue.getFirst();
156 
157         if (nextReadValue > 0) {
158           if (nextReadValue+1 != queueFirst.getWriteNumber()) {
159             throw new RuntimeException("invariant in completeMemstoreInsert violated, prev: "
160                 + nextReadValue + " next: " + queueFirst.getWriteNumber());
161           }
162         }
163 
164         if (queueFirst.isCompleted()) {
165           nextReadValue = queueFirst.getWriteNumber();
166           writeQueue.removeFirst();
167         } else {
168           break;
169         }
170       }
171 
172       if (!ranOnce) {
173         throw new RuntimeException("never was a first");
174       }
175 
176       if (nextReadValue > 0) {
177         synchronized (readWaiters) {
178           memstoreRead = nextReadValue;
179           readWaiters.notifyAll();
180         }
181       }
182       if (memstoreRead >= e.getWriteNumber()) {
183         return true;
184       }
185       return false;
186     }
187   }
188 
189   /**
190    * Wait for the global readPoint to advance upto
191    * the specified transaction number.
192    */
193   public void waitForRead(WriteEntry e) {
194     boolean interrupted = false;
195     synchronized (readWaiters) {
196       while (memstoreRead < e.getWriteNumber()) {
197         try {
198           readWaiters.wait(0);
199         } catch (InterruptedException ie) {
200           // We were interrupted... finish the loop -- i.e. cleanup --and then
201           // on our way out, reset the interrupt flag.
202           interrupted = true;
203         }
204       }
205     }
206     if (interrupted) Thread.currentThread().interrupt();
207   }
208 
209   public long memstoreReadPoint() {
210     return memstoreRead;
211   }
212 
213 
214   public static class WriteEntry {
215     private long writeNumber;
216     private boolean completed = false;
217     WriteEntry(long writeNumber) {
218       this.writeNumber = writeNumber;
219     }
220     void markCompleted() {
221       this.completed = true;
222     }
223     boolean isCompleted() {
224       return this.completed;
225     }
226     long getWriteNumber() {
227       return this.writeNumber;
228     }
229   }
230 
231   public static final long FIXED_SIZE = ClassSize.align(
232       ClassSize.OBJECT +
233       2 * Bytes.SIZEOF_LONG +
234       2 * ClassSize.REFERENCE);
235 
236 }