View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.regionserver;
21  
22  import java.io.IOException;
23  import java.util.NavigableSet;
24  
25  import org.apache.hadoop.classification.InterfaceAudience;
26  import org.apache.hadoop.hbase.HConstants;
27  import org.apache.hadoop.hbase.KeyValue;
28  import org.apache.hadoop.hbase.client.Scan;
29  import org.apache.hadoop.hbase.filter.Filter;
30  import org.apache.hadoop.hbase.filter.Filter.ReturnCode;
31  import org.apache.hadoop.hbase.io.TimeRange;
32  import org.apache.hadoop.hbase.regionserver.DeleteTracker.DeleteResult;
33  import org.apache.hadoop.hbase.util.Bytes;
34  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
35  
36  import com.google.common.base.Preconditions;
37  
38  /**
39   * A query matcher that is specifically designed for the scan case.
40   */
41  @InterfaceAudience.Private
42  public class ScanQueryMatcher {
43    // Optimization so we can skip lots of compares when we decide to skip
44    // to the next row.
45    private boolean stickyNextRow;
46    private final byte[] stopRow;
47  
48    private final TimeRange tr;
49  
50    private final Filter filter;
51  
52    /** Keeps track of deletes */
53    private final DeleteTracker deletes;
54  
55    /*
56     * The following three booleans define how we deal with deletes.
57     * There are three different aspects:
58     * 1. Whether to keep delete markers. This is used in compactions.
59     *    Minor compactions always keep delete markers.
60     * 2. Whether to keep deleted rows. This is also used in compactions,
61     *    if the store is set to keep deleted rows. This implies keeping
62     *    the delete markers as well.
63     *    In this case deleted rows are subject to the normal max version
64     *    and TTL/min version rules just like "normal" rows.
65     * 3. Whether a scan can do time travel queries even before deleted
66     *    marker to reach deleted rows.
67     */
68    /** whether to retain delete markers */
69    private boolean retainDeletesInOutput;
70  
71    /** whether to return deleted rows */
72    private final boolean keepDeletedCells;
73    /** whether time range queries can see rows "behind" a delete */
74    private final boolean seePastDeleteMarkers;
75  
76  
77    /** Keeps track of columns and versions */
78    private final ColumnTracker columns;
79  
80    /** Key to seek to in memstore and StoreFiles */
81    private final KeyValue startKey;
82  
83    /** Row comparator for the region this query is for */
84    private final KeyValue.KeyComparator rowComparator;
85  
86    /* row is not private for tests */
87    /** Row the query is on */
88    byte [] row;
89    int rowOffset;
90    short rowLength;
91    
92    /**
93     * Oldest put in any of the involved store files.
94     * Used to decide whether it is OK to delete a
95     * family delete marker if this store keeps
96     * deleted KVs.
97     */
98    private final long earliestPutTs;
99  
100   /** readPoint over which the KVs are unconditionally included */
101   protected long maxReadPointToTrackVersions;
102 
103   private byte[] dropDeletesFromRow = null, dropDeletesToRow = null;
104 
105   /**
106    * This variable shows whether there is a null column in the query. A
107    * null column always exists in a wildcard column query.
108    * A null column may exist in an explicit column query, depending on the
109    * first column.
110    */
111   private boolean hasNullColumn = true;
112 
113   // By default, when hbase.hstore.time.to.purge.deletes is 0ms, a delete
114   // marker is always removed during a major compaction. If set to non-zero
115   // value then major compaction will try to keep a delete marker around for
116   // the given number of milliseconds. We want to keep the delete markers
117   // around a bit longer because old puts might appear out-of-order. For
118   // example, during log replication between two clusters.
119   //
120   // If the delete marker has lived longer than its column-family's TTL then
121   // the delete marker will be removed even if time.to.purge.deletes has not
122   // passed. This is because all the Puts that this delete marker can influence
123   // would have also expired. (Removing of delete markers on col family TTL will
124   // not happen if min-versions is set to non-zero)
125   //
126   // But, if time.to.purge.deletes has not expired then a delete
127   // marker will not be removed just because there are no Puts that it is
128   // currently influencing. This is because Puts that this delete can
129   // influence may appear out of order.
130   private final long timeToPurgeDeletes;
131   
132   private final boolean isUserScan;
133 
134   /**
135    * Construct a QueryMatcher for a scan
136    * @param scan
137    * @param scanInfo The store's immutable scan info
138    * @param columns
139    * @param scanType Type of the scan
140    * @param earliestPutTs Earliest put seen in any of the store files.
141    * @param oldestUnexpiredTS the oldest timestamp we are interested in,
142    *  based on TTL
143    */
144   public ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
145       NavigableSet<byte[]> columns, ScanType scanType,
146       long readPointToUse, long earliestPutTs, long oldestUnexpiredTS) {
147     this.tr = scan.getTimeRange();
148     this.rowComparator = scanInfo.getComparator().getRawComparator();
149     this.deletes =  new ScanDeleteTracker();
150     this.stopRow = scan.getStopRow();
151     this.startKey = KeyValue.createFirstDeleteFamilyOnRow(scan.getStartRow(),
152         scanInfo.getFamily());
153     this.filter = scan.getFilter();
154     this.earliestPutTs = earliestPutTs;
155     this.maxReadPointToTrackVersions = readPointToUse;
156     this.timeToPurgeDeletes = scanInfo.getTimeToPurgeDeletes();
157 
158     /* how to deal with deletes */
159     this.isUserScan = scanType == ScanType.USER_SCAN;
160     // keep deleted cells: if compaction or raw scan
161     this.keepDeletedCells = (scanInfo.getKeepDeletedCells() && !isUserScan) || scan.isRaw();
162     // retain deletes: if minor compaction or raw scan
163     this.retainDeletesInOutput = scanType == ScanType.COMPACT_RETAIN_DELETES || scan.isRaw();
164     // seePastDeleteMarker: user initiated scans
165     this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() && isUserScan;
166 
167     int maxVersions =
168         scan.isRaw() ? scan.getMaxVersions() : Math.min(scan.getMaxVersions(),
169           scanInfo.getMaxVersions());
170 
171     // Single branch to deal with two types of reads (columns vs all in family)
172     if (columns == null || columns.size() == 0) {
173       // there is always a null column in the wildcard column query.
174       hasNullColumn = true;
175 
176       // use a specialized scan for wildcard column tracker.
177       this.columns = new ScanWildcardColumnTracker(
178           scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
179     } else {
180       // whether there is null column in the explicit column query
181       hasNullColumn = (columns.first().length == 0);
182 
183       // We can share the ExplicitColumnTracker, diff is we reset
184       // between rows, not between storefiles.
185       this.columns = new ExplicitColumnTracker(columns,
186           scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
187     }
188   }
189 
190   /**
191    * Construct a QueryMatcher for a scan that drop deletes from a limited range of rows.
192    * @param scan
193    * @param scanInfo The store's immutable scan info
194    * @param columns
195    * @param earliestPutTs Earliest put seen in any of the store files.
196    * @param oldestUnexpiredTS the oldest timestamp we are interested in,
197    *  based on TTL
198    * @param dropDeletesFromRow The inclusive left bound of the range; can be EMPTY_START_ROW.
199    * @param dropDeletesToRow The exclusive right bound of the range; can be EMPTY_END_ROW.
200    */
201   public ScanQueryMatcher(Scan scan, ScanInfo scanInfo, NavigableSet<byte[]> columns,
202       long readPointToUse, long earliestPutTs, long oldestUnexpiredTS,
203       byte[] dropDeletesFromRow, byte[] dropDeletesToRow) {
204     this(scan, scanInfo, columns, ScanType.COMPACT_RETAIN_DELETES, readPointToUse, earliestPutTs,
205         oldestUnexpiredTS);
206     Preconditions.checkArgument((dropDeletesFromRow != null) && (dropDeletesToRow != null));
207     this.dropDeletesFromRow = dropDeletesFromRow;
208     this.dropDeletesToRow = dropDeletesToRow;
209   }
210 
211   /*
212    * Constructor for tests
213    */
214   ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
215       NavigableSet<byte[]> columns, long oldestUnexpiredTS) {
216     this(scan, scanInfo, columns, ScanType.USER_SCAN,
217           Long.MAX_VALUE, /* max Readpoint to track versions */
218         HConstants.LATEST_TIMESTAMP, oldestUnexpiredTS);
219   }
220 
221   /**
222    *
223    * @return  whether there is an null column in the query
224    */
225   public boolean hasNullColumnInQuery() {
226     return hasNullColumn;
227   }
228 
229   /**
230    * Determines if the caller should do one of several things:
231    * - seek/skip to the next row (MatchCode.SEEK_NEXT_ROW)
232    * - seek/skip to the next column (MatchCode.SEEK_NEXT_COL)
233    * - include the current KeyValue (MatchCode.INCLUDE)
234    * - ignore the current KeyValue (MatchCode.SKIP)
235    * - got to the next row (MatchCode.DONE)
236    *
237    * @param kv KeyValue to check
238    * @return The match code instance.
239    * @throws IOException in case there is an internal consistency problem
240    *      caused by a data corruption.
241    */
242   public MatchCode match(KeyValue kv) throws IOException {
243     if (filter != null && filter.filterAllRemaining()) {
244       return MatchCode.DONE_SCAN;
245     }
246 
247     byte [] bytes = kv.getBuffer();
248     int offset = kv.getOffset();
249     int initialOffset = offset;
250 
251     int keyLength = Bytes.toInt(bytes, offset, Bytes.SIZEOF_INT);
252     offset += KeyValue.ROW_OFFSET;
253 
254     short rowLength = Bytes.toShort(bytes, offset, Bytes.SIZEOF_SHORT);
255     offset += Bytes.SIZEOF_SHORT;
256 
257     int ret = this.rowComparator.compareRows(row, this.rowOffset, this.rowLength,
258         bytes, offset, rowLength);
259     if (ret <= -1) {
260       return MatchCode.DONE;
261     } else if (ret >= 1) {
262       // could optimize this, if necessary?
263       // Could also be called SEEK_TO_CURRENT_ROW, but this
264       // should be rare/never happens.
265       return MatchCode.SEEK_NEXT_ROW;
266     }
267 
268     // optimize case.
269     if (this.stickyNextRow)
270         return MatchCode.SEEK_NEXT_ROW;
271 
272     if (this.columns.done()) {
273       stickyNextRow = true;
274       return MatchCode.SEEK_NEXT_ROW;
275     }
276 
277     //Passing rowLength
278     offset += rowLength;
279 
280     //Skipping family
281     byte familyLength = bytes [offset];
282     offset += familyLength + 1;
283 
284     int qualLength = keyLength + KeyValue.ROW_OFFSET -
285       (offset - initialOffset) - KeyValue.TIMESTAMP_TYPE_SIZE;
286 
287     long timestamp = kv.getTimestamp();
288     // check for early out based on timestamp alone
289     if (columns.isDone(timestamp)) {
290         return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
291     }
292 
293     /*
294      * The delete logic is pretty complicated now.
295      * This is corroborated by the following:
296      * 1. The store might be instructed to keep deleted rows around.
297      * 2. A scan can optionally see past a delete marker now.
298      * 3. If deleted rows are kept, we have to find out when we can
299      *    remove the delete markers.
300      * 4. Family delete markers are always first (regardless of their TS)
301      * 5. Delete markers should not be counted as version
302      * 6. Delete markers affect puts of the *same* TS
303      * 7. Delete marker need to be version counted together with puts
304      *    they affect
305      */
306     byte type = kv.getType();
307     if (kv.isDelete()) {
308       if (!keepDeletedCells) {
309         // first ignore delete markers if the scanner can do so, and the
310         // range does not include the marker
311         //
312         // during flushes and compactions also ignore delete markers newer
313         // than the readpoint of any open scanner, this prevents deleted
314         // rows that could still be seen by a scanner from being collected
315         boolean includeDeleteMarker = seePastDeleteMarkers ?
316             tr.withinTimeRange(timestamp) :
317             tr.withinOrAfterTimeRange(timestamp);
318         if (includeDeleteMarker
319             && kv.getMvccVersion() <= maxReadPointToTrackVersions) {
320           this.deletes.add(bytes, offset, qualLength, timestamp, type);
321         }
322         // Can't early out now, because DelFam come before any other keys
323       }
324       if (retainDeletesInOutput
325           || (!isUserScan && (EnvironmentEdgeManager.currentTimeMillis() - timestamp) <= timeToPurgeDeletes)
326           || kv.getMvccVersion() > maxReadPointToTrackVersions) {
327         // always include or it is not time yet to check whether it is OK
328         // to purge deltes or not
329         if (!isUserScan) {
330           // if this is not a user scan (compaction), we can filter this deletemarker right here
331           // otherwise (i.e. a "raw" scan) we fall through to normal version and timerange checking
332           return MatchCode.INCLUDE;
333         }
334       } else if (keepDeletedCells) {
335         if (timestamp < earliestPutTs) {
336           // keeping delete rows, but there are no puts older than
337           // this delete in the store files.
338           return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
339         }
340         // else: fall through and do version counting on the
341         // delete markers
342       } else {
343         return MatchCode.SKIP;
344       }
345       // note the following next else if...
346       // delete marker are not subject to other delete markers
347     } else if (!this.deletes.isEmpty()) {
348       DeleteResult deleteResult = deletes.isDeleted(bytes, offset, qualLength,
349           timestamp);
350       switch (deleteResult) {
351         case FAMILY_DELETED:
352         case COLUMN_DELETED:
353           return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
354         case VERSION_DELETED:
355         case FAMILY_VERSION_DELETED:
356           return MatchCode.SKIP;
357         case NOT_DELETED:
358           break;
359         default:
360           throw new RuntimeException("UNEXPECTED");
361         }
362     }
363 
364     int timestampComparison = tr.compare(timestamp);
365     if (timestampComparison >= 1) {
366       return MatchCode.SKIP;
367     } else if (timestampComparison <= -1) {
368       return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
369     }
370 
371     /**
372      * Filters should be checked before checking column trackers. If we do
373      * otherwise, as was previously being done, ColumnTracker may increment its
374      * counter for even that KV which may be discarded later on by Filter. This
375      * would lead to incorrect results in certain cases.
376      */
377     ReturnCode filterResponse = ReturnCode.SKIP;
378     if (filter != null) {
379       filterResponse = filter.filterKeyValue(kv);
380       if (filterResponse == ReturnCode.SKIP) {
381         return MatchCode.SKIP;
382       } else if (filterResponse == ReturnCode.NEXT_COL) {
383         return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
384       } else if (filterResponse == ReturnCode.NEXT_ROW) {
385         stickyNextRow = true;
386         return MatchCode.SEEK_NEXT_ROW;
387       } else if (filterResponse == ReturnCode.SEEK_NEXT_USING_HINT) {
388         return MatchCode.SEEK_NEXT_USING_HINT;
389       }
390     }
391 
392     MatchCode colChecker = columns.checkColumn(bytes, offset, qualLength,
393         timestamp, type, kv.getMvccVersion() > maxReadPointToTrackVersions);
394     /*
395      * According to current implementation, colChecker can only be
396      * SEEK_NEXT_COL, SEEK_NEXT_ROW, SKIP or INCLUDE. Therefore, always return
397      * the MatchCode. If it is SEEK_NEXT_ROW, also set stickyNextRow.
398      */
399     if (colChecker == MatchCode.SEEK_NEXT_ROW) {
400       stickyNextRow = true;
401     } else if (filter != null && colChecker == MatchCode.INCLUDE &&
402                filterResponse == ReturnCode.INCLUDE_AND_NEXT_COL) {
403       return MatchCode.INCLUDE_AND_SEEK_NEXT_COL;
404     }
405     return colChecker;
406 
407   }
408 
409   /** Handle partial-drop-deletes. As we match keys in order, when we have a range from which
410    * we can drop deletes, we can set retainDeletesInOutput to false for the duration of this
411    * range only, and maintain consistency. */
412   private void checkPartialDropDeleteRange(byte [] row, int offset, short length) {
413     // If partial-drop-deletes are used, initially, dropDeletesFromRow and dropDeletesToRow
414     // are both set, and the matcher is set to retain deletes. We assume ordered keys. When
415     // dropDeletesFromRow is leq current kv, we start dropping deletes and reset
416     // dropDeletesFromRow; thus the 2nd "if" starts to apply.
417     if ((dropDeletesFromRow != null)
418         && ((dropDeletesFromRow == HConstants.EMPTY_START_ROW)
419           || (Bytes.compareTo(row, offset, length,
420               dropDeletesFromRow, 0, dropDeletesFromRow.length) >= 0))) {
421       retainDeletesInOutput = false;
422       dropDeletesFromRow = null;
423     }
424     // If dropDeletesFromRow is null and dropDeletesToRow is set, we are inside the partial-
425     // drop-deletes range. When dropDeletesToRow is leq current kv, we stop dropping deletes,
426     // and reset dropDeletesToRow so that we don't do any more compares.
427     if ((dropDeletesFromRow == null)
428         && (dropDeletesToRow != null) && (dropDeletesToRow != HConstants.EMPTY_END_ROW)
429         && (Bytes.compareTo(row, offset, length,
430             dropDeletesToRow, 0, dropDeletesToRow.length) >= 0)) {
431       retainDeletesInOutput = true;
432       dropDeletesToRow = null;
433     }
434   }
435 
436   public boolean moreRowsMayExistAfter(KeyValue kv) {
437     if (!Bytes.equals(stopRow , HConstants.EMPTY_END_ROW) &&
438         rowComparator.compareRows(kv.getBuffer(),kv.getRowOffset(),
439             kv.getRowLength(), stopRow, 0, stopRow.length) >= 0) {
440       // KV >= STOPROW
441       // then NO there is nothing left.
442       return false;
443     } else {
444       return true;
445     }
446   }
447 
448   /**
449    * Set current row
450    * @param row
451    */
452   public void setRow(byte [] row, int offset, short length) {
453     checkPartialDropDeleteRange(row, offset, length);
454     this.row = row;
455     this.rowOffset = offset;
456     this.rowLength = length;
457     reset();
458   }
459 
460   public void reset() {
461     this.deletes.reset();
462     this.columns.reset();
463 
464     stickyNextRow = false;
465   }
466 
467   /**
468    *
469    * @return the start key
470    */
471   public KeyValue getStartKey() {
472     return this.startKey;
473   }
474 
475   /**
476    *
477    * @return the Filter
478    */
479   Filter getFilter() {
480     return this.filter;
481   }
482 
483   public KeyValue getNextKeyHint(KeyValue kv) throws IOException {
484     if (filter == null) {
485       return null;
486     } else {
487       return filter.getNextKeyHint(kv);
488     }
489   }
490 
491   public KeyValue getKeyForNextColumn(KeyValue kv) {
492     ColumnCount nextColumn = columns.getColumnHint();
493     if (nextColumn == null) {
494       return KeyValue.createLastOnRow(
495           kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
496           kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
497           kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
498     } else {
499       return KeyValue.createFirstOnRow(
500           kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
501           kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
502           nextColumn.getBuffer(), nextColumn.getOffset(), nextColumn.getLength());
503     }
504   }
505 
506   public KeyValue getKeyForNextRow(KeyValue kv) {
507     return KeyValue.createLastOnRow(
508         kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
509         null, 0, 0,
510         null, 0, 0);
511   }
512 
513   /**
514    * {@link #match} return codes.  These instruct the scanner moving through
515    * memstores and StoreFiles what to do with the current KeyValue.
516    * <p>
517    * Additionally, this contains "early-out" language to tell the scanner to
518    * move on to the next File (memstore or Storefile), or to return immediately.
519    */
520   public static enum MatchCode {
521     /**
522      * Include KeyValue in the returned result
523      */
524     INCLUDE,
525 
526     /**
527      * Do not include KeyValue in the returned result
528      */
529     SKIP,
530 
531     /**
532      * Do not include, jump to next StoreFile or memstore (in time order)
533      */
534     NEXT,
535 
536     /**
537      * Do not include, return current result
538      */
539     DONE,
540 
541     /**
542      * These codes are used by the ScanQueryMatcher
543      */
544 
545     /**
546      * Done with the row, seek there.
547      */
548     SEEK_NEXT_ROW,
549     /**
550      * Done with column, seek to next.
551      */
552     SEEK_NEXT_COL,
553 
554     /**
555      * Done with scan, thanks to the row filter.
556      */
557     DONE_SCAN,
558 
559     /*
560      * Seek to next key which is given as hint.
561      */
562     SEEK_NEXT_USING_HINT,
563 
564     /**
565      * Include KeyValue and done with column, seek to next.
566      */
567     INCLUDE_AND_SEEK_NEXT_COL,
568 
569     /**
570      * Include KeyValue and done with row, seek to next.
571      */
572     INCLUDE_AND_SEEK_NEXT_ROW,
573   }
574 }