1 /**
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 package org.apache.hadoop.hbase.regionserver;
21
22 import java.io.IOException;
23 import java.util.NavigableSet;
24
25 import org.apache.hadoop.classification.InterfaceAudience;
26 import org.apache.hadoop.hbase.Cell;
27 import org.apache.hadoop.hbase.HConstants;
28 import org.apache.hadoop.hbase.KeyValue;
29 import org.apache.hadoop.hbase.client.Scan;
30 import org.apache.hadoop.hbase.filter.Filter;
31 import org.apache.hadoop.hbase.filter.Filter.ReturnCode;
32 import org.apache.hadoop.hbase.io.TimeRange;
33 import org.apache.hadoop.hbase.regionserver.DeleteTracker.DeleteResult;
34 import org.apache.hadoop.hbase.util.Bytes;
35 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
36
37 import com.google.common.base.Preconditions;
38
39 /**
40 * A query matcher that is specifically designed for the scan case.
41 */
42 @InterfaceAudience.Private
43 public class ScanQueryMatcher {
44 // Optimization so we can skip lots of compares when we decide to skip
45 // to the next row.
46 private boolean stickyNextRow;
47 private final byte[] stopRow;
48
49 private final TimeRange tr;
50
51 private final Filter filter;
52
53 /** Keeps track of deletes */
54 private final DeleteTracker deletes;
55
56 /*
57 * The following three booleans define how we deal with deletes.
58 * There are three different aspects:
59 * 1. Whether to keep delete markers. This is used in compactions.
60 * Minor compactions always keep delete markers.
61 * 2. Whether to keep deleted rows. This is also used in compactions,
62 * if the store is set to keep deleted rows. This implies keeping
63 * the delete markers as well.
64 * In this case deleted rows are subject to the normal max version
65 * and TTL/min version rules just like "normal" rows.
66 * 3. Whether a scan can do time travel queries even before deleted
67 * marker to reach deleted rows.
68 */
69 /** whether to retain delete markers */
70 private boolean retainDeletesInOutput;
71
72 /** whether to return deleted rows */
73 private final boolean keepDeletedCells;
74 /** whether time range queries can see rows "behind" a delete */
75 private final boolean seePastDeleteMarkers;
76
77
78 /** Keeps track of columns and versions */
79 private final ColumnTracker columns;
80
81 /** Key to seek to in memstore and StoreFiles */
82 private final KeyValue startKey;
83
84 /** Row comparator for the region this query is for */
85 private final KeyValue.KVComparator rowComparator;
86
87 /* row is not private for tests */
88 /** Row the query is on */
89 byte [] row;
90 int rowOffset;
91 short rowLength;
92
93 /**
94 * Oldest put in any of the involved store files
95 * Used to decide whether it is ok to delete
96 * family delete marker of this store keeps
97 * deleted KVs.
98 */
99 private final long earliestPutTs;
100
101 /** readPoint over which the KVs are unconditionally included */
102 protected long maxReadPointToTrackVersions;
103
104 private byte[] dropDeletesFromRow = null, dropDeletesToRow = null;
105
106 /**
107 * This variable shows whether there is an null column in the query. There
108 * always exists a null column in the wildcard column query.
109 * There maybe exists a null column in the explicit column query based on the
110 * first column.
111 * */
112 private boolean hasNullColumn = true;
113
114 // By default, when hbase.hstore.time.to.purge.deletes is 0ms, a delete
115 // marker is always removed during a major compaction. If set to non-zero
116 // value then major compaction will try to keep a delete marker around for
117 // the given number of milliseconds. We want to keep the delete markers
118 // around a bit longer because old puts might appear out-of-order. For
119 // example, during log replication between two clusters.
120 //
121 // If the delete marker has lived longer than its column-family's TTL then
122 // the delete marker will be removed even if time.to.purge.deletes has not
123 // passed. This is because all the Puts that this delete marker can influence
124 // would have also expired. (Removing of delete markers on col family TTL will
125 // not happen if min-versions is set to non-zero)
126 //
127 // But, if time.to.purge.deletes has not expired then a delete
128 // marker will not be removed just because there are no Puts that it is
129 // currently influencing. This is because Puts, that this delete can
130 // influence. may appear out of order.
131 private final long timeToPurgeDeletes;
132
133 private final boolean isUserScan;
134
135 /**
136 * Construct a QueryMatcher for a scan
137 * @param scan
138 * @param scanInfo The store's immutable scan info
139 * @param columns
140 * @param scanType Type of the scan
141 * @param earliestPutTs Earliest put seen in any of the store files.
142 * @param oldestUnexpiredTS the oldest timestamp we are interested in,
143 * based on TTL
144 */
145 public ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
146 NavigableSet<byte[]> columns, ScanType scanType,
147 long readPointToUse, long earliestPutTs, long oldestUnexpiredTS) {
148 this.tr = scan.getTimeRange();
149 this.rowComparator = scanInfo.getComparator();
150 this.deletes = new ScanDeleteTracker();
151 this.stopRow = scan.getStopRow();
152 this.startKey = KeyValue.createFirstDeleteFamilyOnRow(scan.getStartRow(),
153 scanInfo.getFamily());
154 this.filter = scan.getFilter();
155 this.earliestPutTs = earliestPutTs;
156 this.maxReadPointToTrackVersions = readPointToUse;
157 this.timeToPurgeDeletes = scanInfo.getTimeToPurgeDeletes();
158
159 /* how to deal with deletes */
160 this.isUserScan = scanType == ScanType.USER_SCAN;
161 // keep deleted cells: if compaction or raw scan
162 this.keepDeletedCells = (scanInfo.getKeepDeletedCells() && !isUserScan) || scan.isRaw();
163 // retain deletes: if minor compaction or raw scan
164 this.retainDeletesInOutput = scanType == ScanType.COMPACT_RETAIN_DELETES || scan.isRaw();
165 // seePastDeleteMarker: user initiated scans
166 this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() && isUserScan;
167
168 int maxVersions =
169 scan.isRaw() ? scan.getMaxVersions() : Math.min(scan.getMaxVersions(),
170 scanInfo.getMaxVersions());
171
172 // Single branch to deal with two types of reads (columns vs all in family)
173 if (columns == null || columns.size() == 0) {
174 // there is always a null column in the wildcard column query.
175 hasNullColumn = true;
176
177 // use a specialized scan for wildcard column tracker.
178 this.columns = new ScanWildcardColumnTracker(
179 scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
180 } else {
181 // whether there is null column in the explicit column query
182 hasNullColumn = (columns.first().length == 0);
183
184 // We can share the ExplicitColumnTracker, diff is we reset
185 // between rows, not between storefiles.
186 byte[] attr = scan.getAttribute(Scan.HINT_LOOKAHEAD);
187 this.columns = new ExplicitColumnTracker(columns, scanInfo.getMinVersions(), maxVersions,
188 oldestUnexpiredTS, attr == null ? 0 : Bytes.toInt(attr));
189 }
190 }
191
192 /**
193 * Construct a QueryMatcher for a scan that drop deletes from a limited range of rows.
194 * @param scan
195 * @param scanInfo The store's immutable scan info
196 * @param columns
197 * @param earliestPutTs Earliest put seen in any of the store files.
198 * @param oldestUnexpiredTS the oldest timestamp we are interested in,
199 * based on TTL
200 * @param dropDeletesFromRow The inclusive left bound of the range; can be EMPTY_START_ROW.
201 * @param dropDeletesToRow The exclusive right bound of the range; can be EMPTY_END_ROW.
202 */
203 public ScanQueryMatcher(Scan scan, ScanInfo scanInfo, NavigableSet<byte[]> columns,
204 long readPointToUse, long earliestPutTs, long oldestUnexpiredTS,
205 byte[] dropDeletesFromRow, byte[] dropDeletesToRow) {
206 this(scan, scanInfo, columns, ScanType.COMPACT_RETAIN_DELETES, readPointToUse, earliestPutTs,
207 oldestUnexpiredTS);
208 Preconditions.checkArgument((dropDeletesFromRow != null) && (dropDeletesToRow != null));
209 this.dropDeletesFromRow = dropDeletesFromRow;
210 this.dropDeletesToRow = dropDeletesToRow;
211 }
212
/*
 * Constructor for tests: a user scan with the read point set to Long.MAX_VALUE
 * and the earliest put timestamp set to HConstants.LATEST_TIMESTAMP.
 */
ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
    NavigableSet<byte[]> columns, long oldestUnexpiredTS) {
  this(scan,
      scanInfo,
      columns,
      ScanType.USER_SCAN,
      Long.MAX_VALUE, /* max Readpoint to track versions */
      HConstants.LATEST_TIMESTAMP, /* earliest put timestamp */
      oldestUnexpiredTS);
}
222
223 /**
224 *
225 * @return whether there is an null column in the query
226 */
227 public boolean hasNullColumnInQuery() {
228 return hasNullColumn;
229 }
230
231 /**
232 * Determines if the caller should do one of several things:
233 * - seek/skip to the next row (MatchCode.SEEK_NEXT_ROW)
234 * - seek/skip to the next column (MatchCode.SEEK_NEXT_COL)
235 * - include the current KeyValue (MatchCode.INCLUDE)
236 * - ignore the current KeyValue (MatchCode.SKIP)
237 * - got to the next row (MatchCode.DONE)
238 *
239 * @param kv KeyValue to check
240 * @return The match code instance.
241 * @throws IOException in case there is an internal consistency problem
242 * caused by a data corruption.
243 */
244 public MatchCode match(KeyValue kv) throws IOException {
245 if (filter != null && filter.filterAllRemaining()) {
246 return MatchCode.DONE_SCAN;
247 }
248
249 byte [] bytes = kv.getBuffer();
250 int offset = kv.getOffset();
251
252 int keyLength = Bytes.toInt(bytes, offset, Bytes.SIZEOF_INT);
253 offset += KeyValue.ROW_OFFSET;
254
255 int initialOffset = offset;
256
257 short rowLength = Bytes.toShort(bytes, offset, Bytes.SIZEOF_SHORT);
258 offset += Bytes.SIZEOF_SHORT;
259
260 int ret = this.rowComparator.compareRows(row, this.rowOffset, this.rowLength,
261 bytes, offset, rowLength);
262 if (ret <= -1) {
263 return MatchCode.DONE;
264 } else if (ret >= 1) {
265 // could optimize this, if necessary?
266 // Could also be called SEEK_TO_CURRENT_ROW, but this
267 // should be rare/never happens.
268 return MatchCode.SEEK_NEXT_ROW;
269 }
270
271 // optimize case.
272 if (this.stickyNextRow)
273 return MatchCode.SEEK_NEXT_ROW;
274
275 if (this.columns.done()) {
276 stickyNextRow = true;
277 return MatchCode.SEEK_NEXT_ROW;
278 }
279
280 //Passing rowLength
281 offset += rowLength;
282
283 //Skipping family
284 byte familyLength = bytes [offset];
285 offset += familyLength + 1;
286
287 int qualLength = keyLength -
288 (offset - initialOffset) - KeyValue.TIMESTAMP_TYPE_SIZE;
289
290 long timestamp = Bytes.toLong(bytes, initialOffset + keyLength - KeyValue.TIMESTAMP_TYPE_SIZE);
291 // check for early out based on timestamp alone
292 if (columns.isDone(timestamp)) {
293 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
294 }
295
296 /*
297 * The delete logic is pretty complicated now.
298 * This is corroborated by the following:
299 * 1. The store might be instructed to keep deleted rows around.
300 * 2. A scan can optionally see past a delete marker now.
301 * 3. If deleted rows are kept, we have to find out when we can
302 * remove the delete markers.
303 * 4. Family delete markers are always first (regardless of their TS)
304 * 5. Delete markers should not be counted as version
305 * 6. Delete markers affect puts of the *same* TS
306 * 7. Delete marker need to be version counted together with puts
307 * they affect
308 */
309 byte type = bytes[initialOffset + keyLength - 1];
310 if (kv.isDelete()) {
311 if (!keepDeletedCells) {
312 // first ignore delete markers if the scanner can do so, and the
313 // range does not include the marker
314 //
315 // during flushes and compactions also ignore delete markers newer
316 // than the readpoint of any open scanner, this prevents deleted
317 // rows that could still be seen by a scanner from being collected
318 boolean includeDeleteMarker = seePastDeleteMarkers ?
319 tr.withinTimeRange(timestamp) :
320 tr.withinOrAfterTimeRange(timestamp);
321 if (includeDeleteMarker
322 && kv.getMvccVersion() <= maxReadPointToTrackVersions) {
323 this.deletes.add(bytes, offset, qualLength, timestamp, type);
324 }
325 // Can't early out now, because DelFam come before any other keys
326 }
327 if (retainDeletesInOutput
328 || (!isUserScan && (EnvironmentEdgeManager.currentTimeMillis() - timestamp) <= timeToPurgeDeletes)
329 || kv.getMvccVersion() > maxReadPointToTrackVersions) {
330 // always include or it is not time yet to check whether it is OK
331 // to purge deltes or not
332 if (!isUserScan) {
333 // if this is not a user scan (compaction), we can filter this deletemarker right here
334 // otherwise (i.e. a "raw" scan) we fall through to normal version and timerange checking
335 return MatchCode.INCLUDE;
336 }
337 } else if (keepDeletedCells) {
338 if (timestamp < earliestPutTs) {
339 // keeping delete rows, but there are no puts older than
340 // this delete in the store files.
341 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
342 }
343 // else: fall through and do version counting on the
344 // delete markers
345 } else {
346 return MatchCode.SKIP;
347 }
348 // note the following next else if...
349 // delete marker are not subject to other delete markers
350 } else if (!this.deletes.isEmpty()) {
351 DeleteResult deleteResult = deletes.isDeleted(bytes, offset, qualLength,
352 timestamp);
353 switch (deleteResult) {
354 case FAMILY_DELETED:
355 case COLUMN_DELETED:
356 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
357 case VERSION_DELETED:
358 case FAMILY_VERSION_DELETED:
359 return MatchCode.SKIP;
360 case NOT_DELETED:
361 break;
362 default:
363 throw new RuntimeException("UNEXPECTED");
364 }
365 }
366
367 int timestampComparison = tr.compare(timestamp);
368 if (timestampComparison >= 1) {
369 return MatchCode.SKIP;
370 } else if (timestampComparison <= -1) {
371 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
372 }
373
374 // STEP 1: Check if the column is part of the requested columns
375 MatchCode colChecker = columns.checkColumn(bytes, offset, qualLength, type);
376 if (colChecker == MatchCode.INCLUDE) {
377 ReturnCode filterResponse = ReturnCode.SKIP;
378 // STEP 2: Yes, the column is part of the requested columns. Check if filter is present
379 if (filter != null) {
380 // STEP 3: Filter the key value and return if it filters out
381 filterResponse = filter.filterKeyValue(kv);
382 switch (filterResponse) {
383 case SKIP:
384 return MatchCode.SKIP;
385 case NEXT_COL:
386 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
387 case NEXT_ROW:
388 stickyNextRow = true;
389 return MatchCode.SEEK_NEXT_ROW;
390 case SEEK_NEXT_USING_HINT:
391 return MatchCode.SEEK_NEXT_USING_HINT;
392 default:
393 //It means it is either include or include and seek next
394 break;
395 }
396 }
397 /*
398 * STEP 4: Reaching this step means the column is part of the requested columns and either
399 * the filter is null or the filter has returned INCLUDE or INCLUDE_AND_NEXT_COL response.
400 * Now check the number of versions needed. This method call returns SKIP, INCLUDE,
401 * INCLUDE_AND_SEEK_NEXT_ROW, INCLUDE_AND_SEEK_NEXT_COL.
402 *
403 * FilterResponse ColumnChecker Desired behavior
404 * INCLUDE SKIP row has already been included, SKIP.
405 * INCLUDE INCLUDE INCLUDE
406 * INCLUDE INCLUDE_AND_SEEK_NEXT_COL INCLUDE_AND_SEEK_NEXT_COL
407 * INCLUDE INCLUDE_AND_SEEK_NEXT_ROW INCLUDE_AND_SEEK_NEXT_ROW
408 * INCLUDE_AND_SEEK_NEXT_COL SKIP row has already been included, SKIP.
409 * INCLUDE_AND_SEEK_NEXT_COL INCLUDE INCLUDE_AND_SEEK_NEXT_COL
410 * INCLUDE_AND_SEEK_NEXT_COL INCLUDE_AND_SEEK_NEXT_COL INCLUDE_AND_SEEK_NEXT_COL
411 * INCLUDE_AND_SEEK_NEXT_COL INCLUDE_AND_SEEK_NEXT_ROW INCLUDE_AND_SEEK_NEXT_ROW
412 *
413 * In all the above scenarios, we return the column checker return value except for
414 * FilterResponse (INCLUDE_AND_SEEK_NEXT_COL) and ColumnChecker(INCLUDE)
415 */
416 colChecker =
417 columns.checkVersions(bytes, offset, qualLength, timestamp, type,
418 kv.getMvccVersion() > maxReadPointToTrackVersions);
419 //Optimize with stickyNextRow
420 stickyNextRow = colChecker == MatchCode.INCLUDE_AND_SEEK_NEXT_ROW ? true : stickyNextRow;
421 return (filterResponse == ReturnCode.INCLUDE_AND_NEXT_COL &&
422 colChecker == MatchCode.INCLUDE) ? MatchCode.INCLUDE_AND_SEEK_NEXT_COL
423 : colChecker;
424 }
425 stickyNextRow = (colChecker == MatchCode.SEEK_NEXT_ROW) ? true
426 : stickyNextRow;
427 return colChecker;
428 }
429
430 /** Handle partial-drop-deletes. As we match keys in order, when we have a range from which
431 * we can drop deletes, we can set retainDeletesInOutput to false for the duration of this
432 * range only, and maintain consistency. */
433 private void checkPartialDropDeleteRange(byte [] row, int offset, short length) {
434 // If partial-drop-deletes are used, initially, dropDeletesFromRow and dropDeletesToRow
435 // are both set, and the matcher is set to retain deletes. We assume ordered keys. When
436 // dropDeletesFromRow is leq current kv, we start dropping deletes and reset
437 // dropDeletesFromRow; thus the 2nd "if" starts to apply.
438 if ((dropDeletesFromRow != null)
439 && ((dropDeletesFromRow == HConstants.EMPTY_START_ROW)
440 || (Bytes.compareTo(row, offset, length,
441 dropDeletesFromRow, 0, dropDeletesFromRow.length) >= 0))) {
442 retainDeletesInOutput = false;
443 dropDeletesFromRow = null;
444 }
445 // If dropDeletesFromRow is null and dropDeletesToRow is set, we are inside the partial-
446 // drop-deletes range. When dropDeletesToRow is leq current kv, we stop dropping deletes,
447 // and reset dropDeletesToRow so that we don't do any more compares.
448 if ((dropDeletesFromRow == null)
449 && (dropDeletesToRow != null) && (dropDeletesToRow != HConstants.EMPTY_END_ROW)
450 && (Bytes.compareTo(row, offset, length,
451 dropDeletesToRow, 0, dropDeletesToRow.length) >= 0)) {
452 retainDeletesInOutput = true;
453 dropDeletesToRow = null;
454 }
455 }
456
457 public boolean moreRowsMayExistAfter(KeyValue kv) {
458 if (!Bytes.equals(stopRow , HConstants.EMPTY_END_ROW) &&
459 rowComparator.compareRows(kv.getBuffer(),kv.getRowOffset(),
460 kv.getRowLength(), stopRow, 0, stopRow.length) >= 0) {
461 // KV >= STOPROW
462 // then NO there is nothing left.
463 return false;
464 } else {
465 return true;
466 }
467 }
468
469 /**
470 * Set current row
471 * @param row
472 */
473 public void setRow(byte [] row, int offset, short length) {
474 checkPartialDropDeleteRange(row, offset, length);
475 this.row = row;
476 this.rowOffset = offset;
477 this.rowLength = length;
478 reset();
479 }
480
481 public void reset() {
482 this.deletes.reset();
483 this.columns.reset();
484
485 stickyNextRow = false;
486 }
487
488 /**
489 *
490 * @return the start key
491 */
492 public KeyValue getStartKey() {
493 return this.startKey;
494 }
495
496 /**
497 *
498 * @return the Filter
499 */
500 Filter getFilter() {
501 return this.filter;
502 }
503
504 public Cell getNextKeyHint(Cell kv) throws IOException {
505 if (filter == null) {
506 return null;
507 } else {
508 return filter.getNextCellHint(kv);
509 }
510 }
511
512 public KeyValue getKeyForNextColumn(KeyValue kv) {
513 ColumnCount nextColumn = columns.getColumnHint();
514 if (nextColumn == null) {
515 return KeyValue.createLastOnRow(
516 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
517 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
518 kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
519 } else {
520 return KeyValue.createFirstOnRow(
521 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
522 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
523 nextColumn.getBuffer(), nextColumn.getOffset(), nextColumn.getLength());
524 }
525 }
526
527 public KeyValue getKeyForNextRow(KeyValue kv) {
528 return KeyValue.createLastOnRow(
529 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
530 null, 0, 0,
531 null, 0, 0);
532 }
533
534 //Used only for testing purposes
535 static MatchCode checkColumn(ColumnTracker columnTracker, byte[] bytes, int offset,
536 int length, long ttl, byte type, boolean ignoreCount) throws IOException {
537 MatchCode matchCode = columnTracker.checkColumn(bytes, offset, length, type);
538 if (matchCode == MatchCode.INCLUDE) {
539 return columnTracker.checkVersions(bytes, offset, length, ttl, type, ignoreCount);
540 }
541 return matchCode;
542 }
543
/**
 * {@link ScanQueryMatcher#match} return codes. These instruct the scanner moving through
 * memstores and StoreFiles what to do with the current KeyValue.
 * <p>
 * Additionally, this contains "early-out" language to tell the scanner to
 * move on to the next File (memstore or Storefile), or to return immediately.
 */
public enum MatchCode {
  /** Include KeyValue in the returned result. */
  INCLUDE,

  /** Do not include KeyValue in the returned result. */
  SKIP,

  /** Do not include, jump to next StoreFile or memstore (in time order). */
  NEXT,

  /** Do not include, return current result. */
  DONE,

  // The remaining codes are used by the ScanQueryMatcher.

  /** Done with the row, seek there. */
  SEEK_NEXT_ROW,

  /** Done with column, seek to next. */
  SEEK_NEXT_COL,

  /** Done with scan, thanks to the row filter. */
  DONE_SCAN,

  /** Seek to next key which is given as hint. */
  SEEK_NEXT_USING_HINT,

  /** Include KeyValue and done with column, seek to next. */
  INCLUDE_AND_SEEK_NEXT_COL,

  /** Include KeyValue and done with row, seek to next. */
  INCLUDE_AND_SEEK_NEXT_ROW
}
605 }