1 /**
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 package org.apache.hadoop.hbase.regionserver;
21
22 import java.io.IOException;
23 import java.util.NavigableSet;
24
25 import org.apache.hadoop.classification.InterfaceAudience;
26 import org.apache.hadoop.hbase.Cell;
27 import org.apache.hadoop.hbase.HConstants;
28 import org.apache.hadoop.hbase.KeyValue;
29 import org.apache.hadoop.hbase.client.Scan;
30 import org.apache.hadoop.hbase.filter.Filter;
31 import org.apache.hadoop.hbase.filter.Filter.ReturnCode;
32 import org.apache.hadoop.hbase.io.TimeRange;
33 import org.apache.hadoop.hbase.regionserver.DeleteTracker.DeleteResult;
34 import org.apache.hadoop.hbase.util.Bytes;
35 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
36
37 import com.google.common.base.Preconditions;
38
39 /**
40 * A query matcher that is specifically designed for the scan case.
41 */
42 @InterfaceAudience.Private
43 public class ScanQueryMatcher {
44 // Optimization so we can skip lots of compares when we decide to skip
45 // to the next row.
46 private boolean stickyNextRow;
47 private final byte[] stopRow;
48
49 private final TimeRange tr;
50
51 private final Filter filter;
52
53 /** Keeps track of deletes */
54 private final DeleteTracker deletes;
55
56 /*
57 * The following three booleans define how we deal with deletes.
58 * There are three different aspects:
59 * 1. Whether to keep delete markers. This is used in compactions.
60 * Minor compactions always keep delete markers.
61 * 2. Whether to keep deleted rows. This is also used in compactions,
62 * if the store is set to keep deleted rows. This implies keeping
63 * the delete markers as well.
64 * In this case deleted rows are subject to the normal max version
65 * and TTL/min version rules just like "normal" rows.
66 * 3. Whether a scan can do time travel queries even before deleted
67 * marker to reach deleted rows.
68 */
69 /** whether to retain delete markers */
70 private boolean retainDeletesInOutput;
71
72 /** whether to return deleted rows */
73 private final boolean keepDeletedCells;
74 /** whether time range queries can see rows "behind" a delete */
75 private final boolean seePastDeleteMarkers;
76
77
78 /** Keeps track of columns and versions */
79 private final ColumnTracker columns;
80
81 /** Key to seek to in memstore and StoreFiles */
82 private final KeyValue startKey;
83
84 /** Row comparator for the region this query is for */
85 private final KeyValue.KVComparator rowComparator;
86
87 /* row is not private for tests */
88 /** Row the query is on */
89 byte [] row;
90 int rowOffset;
91 short rowLength;
92
93 /**
94 * Oldest put in any of the involved store files
95 * Used to decide whether it is ok to delete
96 * family delete marker of this store keeps
97 * deleted KVs.
98 */
99 private final long earliestPutTs;
100
101 /** readPoint over which the KVs are unconditionally included */
102 protected long maxReadPointToTrackVersions;
103
104 private byte[] dropDeletesFromRow = null, dropDeletesToRow = null;
105
106 /**
107 * This variable shows whether there is an null column in the query. There
108 * always exists a null column in the wildcard column query.
109 * There maybe exists a null column in the explicit column query based on the
110 * first column.
111 * */
112 private boolean hasNullColumn = true;
113
114 // By default, when hbase.hstore.time.to.purge.deletes is 0ms, a delete
115 // marker is always removed during a major compaction. If set to non-zero
116 // value then major compaction will try to keep a delete marker around for
117 // the given number of milliseconds. We want to keep the delete markers
118 // around a bit longer because old puts might appear out-of-order. For
119 // example, during log replication between two clusters.
120 //
121 // If the delete marker has lived longer than its column-family's TTL then
122 // the delete marker will be removed even if time.to.purge.deletes has not
123 // passed. This is because all the Puts that this delete marker can influence
124 // would have also expired. (Removing of delete markers on col family TTL will
125 // not happen if min-versions is set to non-zero)
126 //
127 // But, if time.to.purge.deletes has not expired then a delete
128 // marker will not be removed just because there are no Puts that it is
129 // currently influencing. This is because Puts, that this delete can
130 // influence. may appear out of order.
131 private final long timeToPurgeDeletes;
132
133 private final boolean isUserScan;
134
135 /**
136 * Construct a QueryMatcher for a scan
137 * @param scan
138 * @param scanInfo The store's immutable scan info
139 * @param columns
140 * @param scanType Type of the scan
141 * @param earliestPutTs Earliest put seen in any of the store files.
142 * @param oldestUnexpiredTS the oldest timestamp we are interested in,
143 * based on TTL
144 */
145 public ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
146 NavigableSet<byte[]> columns, ScanType scanType,
147 long readPointToUse, long earliestPutTs, long oldestUnexpiredTS) {
148 this.tr = scan.getTimeRange();
149 this.rowComparator = scanInfo.getComparator();
150 this.deletes = new ScanDeleteTracker();
151 this.stopRow = scan.getStopRow();
152 this.startKey = KeyValue.createFirstDeleteFamilyOnRow(scan.getStartRow(),
153 scanInfo.getFamily());
154 this.filter = scan.getFilter();
155 this.earliestPutTs = earliestPutTs;
156 this.maxReadPointToTrackVersions = readPointToUse;
157 this.timeToPurgeDeletes = scanInfo.getTimeToPurgeDeletes();
158
159 /* how to deal with deletes */
160 this.isUserScan = scanType == ScanType.USER_SCAN;
161 // keep deleted cells: if compaction or raw scan
162 this.keepDeletedCells = (scanInfo.getKeepDeletedCells() && !isUserScan) || scan.isRaw();
163 // retain deletes: if minor compaction or raw scan
164 this.retainDeletesInOutput = scanType == ScanType.COMPACT_RETAIN_DELETES || scan.isRaw();
165 // seePastDeleteMarker: user initiated scans
166 this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() && isUserScan;
167
168 int maxVersions =
169 scan.isRaw() ? scan.getMaxVersions() : Math.min(scan.getMaxVersions(),
170 scanInfo.getMaxVersions());
171
172 // Single branch to deal with two types of reads (columns vs all in family)
173 if (columns == null || columns.size() == 0) {
174 // there is always a null column in the wildcard column query.
175 hasNullColumn = true;
176
177 // use a specialized scan for wildcard column tracker.
178 this.columns = new ScanWildcardColumnTracker(
179 scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
180 } else {
181 // whether there is null column in the explicit column query
182 hasNullColumn = (columns.first().length == 0);
183
184 // We can share the ExplicitColumnTracker, diff is we reset
185 // between rows, not between storefiles.
186 this.columns = new ExplicitColumnTracker(columns,
187 scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
188 }
189 }
190
/**
 * Builds a query matcher for a scan that drops deletes only within a limited range of rows.
 * The matcher starts out retaining deletes (scan type COMPACT_RETAIN_DELETES).
 *
 * @param scan the scan being served
 * @param scanInfo The store's immutable scan info
 * @param columns the explicitly requested qualifiers; null or empty means all columns
 * @param readPointToUse read point stored as the max read point to track versions
 * @param earliestPutTs Earliest put seen in any of the store files.
 * @param oldestUnexpiredTS the oldest timestamp we are interested in,
 *          based on TTL
 * @param dropDeletesFromRow The inclusive left bound of the range; can be EMPTY_START_ROW.
 *          Must not be null.
 * @param dropDeletesToRow The exclusive right bound of the range; can be EMPTY_END_ROW.
 *          Must not be null.
 */
public ScanQueryMatcher(Scan scan, ScanInfo scanInfo, NavigableSet<byte[]> columns,
    long readPointToUse, long earliestPutTs, long oldestUnexpiredTS,
    byte[] dropDeletesFromRow, byte[] dropDeletesToRow) {
  this(scan, scanInfo, columns, ScanType.COMPACT_RETAIN_DELETES,
      readPointToUse, earliestPutTs, oldestUnexpiredTS);
  // Both bounds are mandatory.
  Preconditions.checkArgument(!(dropDeletesFromRow == null || dropDeletesToRow == null));
  this.dropDeletesToRow = dropDeletesToRow;
  this.dropDeletesFromRow = dropDeletesFromRow;
}
211
/*
 * Test-only constructor: a user scan with no read point limit
 * (Long.MAX_VALUE) and HConstants.LATEST_TIMESTAMP as the earliest put.
 */
ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
    NavigableSet<byte[]> columns, long oldestUnexpiredTS) {
  this(scan,
      scanInfo,
      columns,
      ScanType.USER_SCAN,
      Long.MAX_VALUE, // max read point to track versions
      HConstants.LATEST_TIMESTAMP,
      oldestUnexpiredTS);
}
221
222 /**
223 *
224 * @return whether there is an null column in the query
225 */
226 public boolean hasNullColumnInQuery() {
227 return hasNullColumn;
228 }
229
230 /**
231 * Determines if the caller should do one of several things:
232 * - seek/skip to the next row (MatchCode.SEEK_NEXT_ROW)
233 * - seek/skip to the next column (MatchCode.SEEK_NEXT_COL)
234 * - include the current KeyValue (MatchCode.INCLUDE)
235 * - ignore the current KeyValue (MatchCode.SKIP)
236 * - got to the next row (MatchCode.DONE)
237 *
238 * @param kv KeyValue to check
239 * @return The match code instance.
240 * @throws IOException in case there is an internal consistency problem
241 * caused by a data corruption.
242 */
243 public MatchCode match(KeyValue kv) throws IOException {
244 if (filter != null && filter.filterAllRemaining()) {
245 return MatchCode.DONE_SCAN;
246 }
247
248 byte [] bytes = kv.getBuffer();
249 int offset = kv.getOffset();
250
251 int keyLength = Bytes.toInt(bytes, offset, Bytes.SIZEOF_INT);
252 offset += KeyValue.ROW_OFFSET;
253
254 int initialOffset = offset;
255
256 short rowLength = Bytes.toShort(bytes, offset, Bytes.SIZEOF_SHORT);
257 offset += Bytes.SIZEOF_SHORT;
258
259 int ret = this.rowComparator.compareRows(row, this.rowOffset, this.rowLength,
260 bytes, offset, rowLength);
261 if (ret <= -1) {
262 return MatchCode.DONE;
263 } else if (ret >= 1) {
264 // could optimize this, if necessary?
265 // Could also be called SEEK_TO_CURRENT_ROW, but this
266 // should be rare/never happens.
267 return MatchCode.SEEK_NEXT_ROW;
268 }
269
270 // optimize case.
271 if (this.stickyNextRow)
272 return MatchCode.SEEK_NEXT_ROW;
273
274 if (this.columns.done()) {
275 stickyNextRow = true;
276 return MatchCode.SEEK_NEXT_ROW;
277 }
278
279 //Passing rowLength
280 offset += rowLength;
281
282 //Skipping family
283 byte familyLength = bytes [offset];
284 offset += familyLength + 1;
285
286 int qualLength = keyLength -
287 (offset - initialOffset) - KeyValue.TIMESTAMP_TYPE_SIZE;
288
289 long timestamp = Bytes.toLong(bytes, initialOffset + keyLength - KeyValue.TIMESTAMP_TYPE_SIZE);
290 // check for early out based on timestamp alone
291 if (columns.isDone(timestamp)) {
292 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
293 }
294
295 /*
296 * The delete logic is pretty complicated now.
297 * This is corroborated by the following:
298 * 1. The store might be instructed to keep deleted rows around.
299 * 2. A scan can optionally see past a delete marker now.
300 * 3. If deleted rows are kept, we have to find out when we can
301 * remove the delete markers.
302 * 4. Family delete markers are always first (regardless of their TS)
303 * 5. Delete markers should not be counted as version
304 * 6. Delete markers affect puts of the *same* TS
305 * 7. Delete marker need to be version counted together with puts
306 * they affect
307 */
308 byte type = bytes[initialOffset + keyLength - 1];
309 if (kv.isDelete()) {
310 if (!keepDeletedCells) {
311 // first ignore delete markers if the scanner can do so, and the
312 // range does not include the marker
313 //
314 // during flushes and compactions also ignore delete markers newer
315 // than the readpoint of any open scanner, this prevents deleted
316 // rows that could still be seen by a scanner from being collected
317 boolean includeDeleteMarker = seePastDeleteMarkers ?
318 tr.withinTimeRange(timestamp) :
319 tr.withinOrAfterTimeRange(timestamp);
320 if (includeDeleteMarker
321 && kv.getMvccVersion() <= maxReadPointToTrackVersions) {
322 this.deletes.add(bytes, offset, qualLength, timestamp, type);
323 }
324 // Can't early out now, because DelFam come before any other keys
325 }
326 if (retainDeletesInOutput
327 || (!isUserScan && (EnvironmentEdgeManager.currentTimeMillis() - timestamp) <= timeToPurgeDeletes)
328 || kv.getMvccVersion() > maxReadPointToTrackVersions) {
329 // always include or it is not time yet to check whether it is OK
330 // to purge deltes or not
331 if (!isUserScan) {
332 // if this is not a user scan (compaction), we can filter this deletemarker right here
333 // otherwise (i.e. a "raw" scan) we fall through to normal version and timerange checking
334 return MatchCode.INCLUDE;
335 }
336 } else if (keepDeletedCells) {
337 if (timestamp < earliestPutTs) {
338 // keeping delete rows, but there are no puts older than
339 // this delete in the store files.
340 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
341 }
342 // else: fall through and do version counting on the
343 // delete markers
344 } else {
345 return MatchCode.SKIP;
346 }
347 // note the following next else if...
348 // delete marker are not subject to other delete markers
349 } else if (!this.deletes.isEmpty()) {
350 DeleteResult deleteResult = deletes.isDeleted(bytes, offset, qualLength,
351 timestamp);
352 switch (deleteResult) {
353 case FAMILY_DELETED:
354 case COLUMN_DELETED:
355 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
356 case VERSION_DELETED:
357 case FAMILY_VERSION_DELETED:
358 return MatchCode.SKIP;
359 case NOT_DELETED:
360 break;
361 default:
362 throw new RuntimeException("UNEXPECTED");
363 }
364 }
365
366 int timestampComparison = tr.compare(timestamp);
367 if (timestampComparison >= 1) {
368 return MatchCode.SKIP;
369 } else if (timestampComparison <= -1) {
370 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
371 }
372
373 // STEP 1: Check if the column is part of the requested columns
374 MatchCode colChecker = columns.checkColumn(bytes, offset, qualLength, type);
375 if (colChecker == MatchCode.INCLUDE) {
376 ReturnCode filterResponse = ReturnCode.SKIP;
377 // STEP 2: Yes, the column is part of the requested columns. Check if filter is present
378 if (filter != null) {
379 // STEP 3: Filter the key value and return if it filters out
380 filterResponse = filter.filterKeyValue(kv);
381 switch (filterResponse) {
382 case SKIP:
383 return MatchCode.SKIP;
384 case NEXT_COL:
385 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
386 case NEXT_ROW:
387 stickyNextRow = true;
388 return MatchCode.SEEK_NEXT_ROW;
389 case SEEK_NEXT_USING_HINT:
390 return MatchCode.SEEK_NEXT_USING_HINT;
391 default:
392 //It means it is either include or include and seek next
393 break;
394 }
395 }
396 /*
397 * STEP 4: Reaching this step means the column is part of the requested columns and either
398 * the filter is null or the filter has returned INCLUDE or INCLUDE_AND_NEXT_COL response.
399 * Now check the number of versions needed. This method call returns SKIP, INCLUDE,
400 * INCLUDE_AND_SEEK_NEXT_ROW, INCLUDE_AND_SEEK_NEXT_COL.
401 *
402 * FilterResponse ColumnChecker Desired behavior
403 * INCLUDE SKIP row has already been included, SKIP.
404 * INCLUDE INCLUDE INCLUDE
405 * INCLUDE INCLUDE_AND_SEEK_NEXT_COL INCLUDE_AND_SEEK_NEXT_COL
406 * INCLUDE INCLUDE_AND_SEEK_NEXT_ROW INCLUDE_AND_SEEK_NEXT_ROW
407 * INCLUDE_AND_SEEK_NEXT_COL SKIP row has already been included, SKIP.
408 * INCLUDE_AND_SEEK_NEXT_COL INCLUDE INCLUDE_AND_SEEK_NEXT_COL
409 * INCLUDE_AND_SEEK_NEXT_COL INCLUDE_AND_SEEK_NEXT_COL INCLUDE_AND_SEEK_NEXT_COL
410 * INCLUDE_AND_SEEK_NEXT_COL INCLUDE_AND_SEEK_NEXT_ROW INCLUDE_AND_SEEK_NEXT_ROW
411 *
412 * In all the above scenarios, we return the column checker return value except for
413 * FilterResponse (INCLUDE_AND_SEEK_NEXT_COL) and ColumnChecker(INCLUDE)
414 */
415 colChecker =
416 columns.checkVersions(bytes, offset, qualLength, timestamp, type,
417 kv.getMvccVersion() > maxReadPointToTrackVersions);
418 //Optimize with stickyNextRow
419 stickyNextRow = colChecker == MatchCode.INCLUDE_AND_SEEK_NEXT_ROW ? true : stickyNextRow;
420 return (filterResponse == ReturnCode.INCLUDE_AND_NEXT_COL &&
421 colChecker == MatchCode.INCLUDE) ? MatchCode.INCLUDE_AND_SEEK_NEXT_COL
422 : colChecker;
423 }
424 stickyNextRow = (colChecker == MatchCode.SEEK_NEXT_ROW) ? true
425 : stickyNextRow;
426 return colChecker;
427 }
428
429 /** Handle partial-drop-deletes. As we match keys in order, when we have a range from which
430 * we can drop deletes, we can set retainDeletesInOutput to false for the duration of this
431 * range only, and maintain consistency. */
432 private void checkPartialDropDeleteRange(byte [] row, int offset, short length) {
433 // If partial-drop-deletes are used, initially, dropDeletesFromRow and dropDeletesToRow
434 // are both set, and the matcher is set to retain deletes. We assume ordered keys. When
435 // dropDeletesFromRow is leq current kv, we start dropping deletes and reset
436 // dropDeletesFromRow; thus the 2nd "if" starts to apply.
437 if ((dropDeletesFromRow != null)
438 && ((dropDeletesFromRow == HConstants.EMPTY_START_ROW)
439 || (Bytes.compareTo(row, offset, length,
440 dropDeletesFromRow, 0, dropDeletesFromRow.length) >= 0))) {
441 retainDeletesInOutput = false;
442 dropDeletesFromRow = null;
443 }
444 // If dropDeletesFromRow is null and dropDeletesToRow is set, we are inside the partial-
445 // drop-deletes range. When dropDeletesToRow is leq current kv, we stop dropping deletes,
446 // and reset dropDeletesToRow so that we don't do any more compares.
447 if ((dropDeletesFromRow == null)
448 && (dropDeletesToRow != null) && (dropDeletesToRow != HConstants.EMPTY_END_ROW)
449 && (Bytes.compareTo(row, offset, length,
450 dropDeletesToRow, 0, dropDeletesToRow.length) >= 0)) {
451 retainDeletesInOutput = true;
452 dropDeletesToRow = null;
453 }
454 }
455
456 public boolean moreRowsMayExistAfter(KeyValue kv) {
457 if (!Bytes.equals(stopRow , HConstants.EMPTY_END_ROW) &&
458 rowComparator.compareRows(kv.getBuffer(),kv.getRowOffset(),
459 kv.getRowLength(), stopRow, 0, stopRow.length) >= 0) {
460 // KV >= STOPROW
461 // then NO there is nothing left.
462 return false;
463 } else {
464 return true;
465 }
466 }
467
468 /**
469 * Set current row
470 * @param row
471 */
472 public void setRow(byte [] row, int offset, short length) {
473 checkPartialDropDeleteRange(row, offset, length);
474 this.row = row;
475 this.rowOffset = offset;
476 this.rowLength = length;
477 reset();
478 }
479
480 public void reset() {
481 this.deletes.reset();
482 this.columns.reset();
483
484 stickyNextRow = false;
485 }
486
487 /**
488 *
489 * @return the start key
490 */
491 public KeyValue getStartKey() {
492 return this.startKey;
493 }
494
495 /**
496 *
497 * @return the Filter
498 */
499 Filter getFilter() {
500 return this.filter;
501 }
502
503 public Cell getNextKeyHint(Cell kv) throws IOException {
504 if (filter == null) {
505 return null;
506 } else {
507 return filter.getNextCellHint(kv);
508 }
509 }
510
511 public KeyValue getKeyForNextColumn(KeyValue kv) {
512 ColumnCount nextColumn = columns.getColumnHint();
513 if (nextColumn == null) {
514 return KeyValue.createLastOnRow(
515 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
516 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
517 kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
518 } else {
519 return KeyValue.createFirstOnRow(
520 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
521 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
522 nextColumn.getBuffer(), nextColumn.getOffset(), nextColumn.getLength());
523 }
524 }
525
526 public KeyValue getKeyForNextRow(KeyValue kv) {
527 return KeyValue.createLastOnRow(
528 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
529 null, 0, 0,
530 null, 0, 0);
531 }
532
533 //Used only for testing purposes
534 static MatchCode checkColumn(ColumnTracker columnTracker, byte[] bytes, int offset,
535 int length, long ttl, byte type, boolean ignoreCount) throws IOException {
536 MatchCode matchCode = columnTracker.checkColumn(bytes, offset, length, type);
537 if (matchCode == MatchCode.INCLUDE) {
538 return columnTracker.checkVersions(bytes, offset, length, ttl, type, ignoreCount);
539 }
540 return matchCode;
541 }
542
/**
 * {@link ScanQueryMatcher#match} return codes. These instruct the scanner moving through
 * memstores and StoreFiles what to do with the current KeyValue.
 * <p>
 * Additionally, this contains "early-out" language to tell the scanner to
 * move on to the next File (memstore or Storefile), or to return immediately.
 * <p>
 * The declaration order of the constants is kept as is.
 */
public enum MatchCode {
  /**
   * Include KeyValue in the returned result
   */
  INCLUDE,

  /**
   * Do not include KeyValue in the returned result
   */
  SKIP,

  /**
   * Do not include, jump to next StoreFile or memstore (in time order)
   */
  NEXT,

  /**
   * Do not include, return current result
   */
  DONE,

  // The codes below are used by the ScanQueryMatcher.

  /**
   * Done with the row, seek there.
   */
  SEEK_NEXT_ROW,

  /**
   * Done with column, seek to next.
   */
  SEEK_NEXT_COL,

  /**
   * Done with scan, thanks to the row filter.
   */
  DONE_SCAN,

  /**
   * Seek to next key which is given as hint.
   */
  SEEK_NEXT_USING_HINT,

  /**
   * Include KeyValue and done with column, seek to next.
   */
  INCLUDE_AND_SEEK_NEXT_COL,

  /**
   * Include KeyValue and done with row, seek to next.
   */
  INCLUDE_AND_SEEK_NEXT_ROW
}
604 }