1 /**
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 package org.apache.hadoop.hbase.regionserver;
21
22 import java.io.IOException;
23 import java.util.NavigableSet;
24
25 import org.apache.hadoop.classification.InterfaceAudience;
26 import org.apache.hadoop.hbase.HConstants;
27 import org.apache.hadoop.hbase.KeyValue;
28 import org.apache.hadoop.hbase.client.Scan;
29 import org.apache.hadoop.hbase.filter.Filter;
30 import org.apache.hadoop.hbase.filter.Filter.ReturnCode;
31 import org.apache.hadoop.hbase.io.TimeRange;
32 import org.apache.hadoop.hbase.regionserver.DeleteTracker.DeleteResult;
33 import org.apache.hadoop.hbase.util.Bytes;
34 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
35
36 import com.google.common.base.Preconditions;
37
38 /**
39 * A query matcher that is specifically designed for the scan case.
40 */
41 @InterfaceAudience.Private
42 public class ScanQueryMatcher {
// Optimization so we can skip lots of compares when we decide to skip
// to the next row. Set by match() and cleared by reset().
private boolean stickyNextRow;
/** Stop row taken from the Scan; consulted by moreRowsMayExistAfter(). */
private final byte[] stopRow;

/** Time range taken from the Scan. */
private final TimeRange tr;

/** Filter taken from the Scan; may be null (every use is null-checked). */
private final Filter filter;

/** Keeps track of deletes */
private final DeleteTracker deletes;

/*
 * The following three booleans define how we deal with deletes.
 * There are three different aspects:
 * 1. Whether to keep delete markers. This is used in compactions.
 *    Minor compactions always keep delete markers.
 * 2. Whether to keep deleted rows. This is also used in compactions,
 *    if the store is set to keep deleted rows. This implies keeping
 *    the delete markers as well.
 *    In this case deleted rows are subject to the normal max version
 *    and TTL/min version rules just like "normal" rows.
 * 3. Whether a scan can do time travel queries even before a delete
 *    marker to reach deleted rows.
 */
/** whether to retain delete markers */
private boolean retainDeletesInOutput;

/** whether to return deleted rows */
private final boolean keepDeletedCells;
/** whether time range queries can see rows "behind" a delete */
private final boolean seePastDeleteMarkers;


/** Keeps track of columns and versions */
private final ColumnTracker columns;

/** Key to seek to in memstore and StoreFiles */
private final KeyValue startKey;

/** Row comparator for the region this query is for */
private final KeyValue.KeyComparator rowComparator;

/* row is not private for tests */
/** Row the query is on; assigned together with rowOffset and rowLength by setRow(). */
byte [] row;
int rowOffset;
short rowLength;

/**
 * Oldest put in any of the involved store files.
 * Used to decide whether it is ok to drop a delete marker
 * when this store keeps deleted KVs: match() does not retain a
 * delete marker whose timestamp is older than this value.
 */
private final long earliestPutTs;

/** readPoint over which the KVs are unconditionally included */
protected long maxReadPointToTrackVersions;

// Bounds of the partial-drop-deletes row range; both stay null unless the
// range constructor is used. Consumed by checkPartialDropDeleteRange().
private byte[] dropDeletesFromRow = null, dropDeletesToRow = null;

/**
 * This variable shows whether there is a null column in the query. There
 * always exists a null column in the wildcard column query.
 * There may exist a null column in the explicit column query, based on the
 * first column.
 */
private boolean hasNullColumn = true;

// By default, when hbase.hstore.time.to.purge.deletes is 0ms, a delete
// marker is always removed during a major compaction. If set to non-zero
// value then major compaction will try to keep a delete marker around for
// the given number of milliseconds. We want to keep the delete markers
// around a bit longer because old puts might appear out-of-order. For
// example, during log replication between two clusters.
//
// If the delete marker has lived longer than its column-family's TTL then
// the delete marker will be removed even if time.to.purge.deletes has not
// passed. This is because all the Puts that this delete marker can influence
// would have also expired. (Removing of delete markers on col family TTL will
// not happen if min-versions is set to non-zero)
//
// But, if time.to.purge.deletes has not expired then a delete
// marker will not be removed just because there are no Puts that it is
// currently influencing. This is because Puts that this delete can
// influence may appear out of order.
private final long timeToPurgeDeletes;

/** True when the matcher was built with ScanType.USER_SCAN. */
private final boolean isUserScan;
133
134 /**
135 * Construct a QueryMatcher for a scan
136 * @param scan
137 * @param scanInfo The store's immutable scan info
138 * @param columns
139 * @param scanType Type of the scan
140 * @param earliestPutTs Earliest put seen in any of the store files.
141 * @param oldestUnexpiredTS the oldest timestamp we are interested in,
142 * based on TTL
143 */
144 public ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
145 NavigableSet<byte[]> columns, ScanType scanType,
146 long readPointToUse, long earliestPutTs, long oldestUnexpiredTS) {
147 this.tr = scan.getTimeRange();
148 this.rowComparator = scanInfo.getComparator().getRawComparator();
149 this.deletes = new ScanDeleteTracker();
150 this.stopRow = scan.getStopRow();
151 this.startKey = KeyValue.createFirstDeleteFamilyOnRow(scan.getStartRow(),
152 scanInfo.getFamily());
153 this.filter = scan.getFilter();
154 this.earliestPutTs = earliestPutTs;
155 this.maxReadPointToTrackVersions = readPointToUse;
156 this.timeToPurgeDeletes = scanInfo.getTimeToPurgeDeletes();
157
158 /* how to deal with deletes */
159 this.isUserScan = scanType == ScanType.USER_SCAN;
160 // keep deleted cells: if compaction or raw scan
161 this.keepDeletedCells = (scanInfo.getKeepDeletedCells() && !isUserScan) || scan.isRaw();
162 // retain deletes: if minor compaction or raw scan
163 this.retainDeletesInOutput = scanType == ScanType.COMPACT_RETAIN_DELETES || scan.isRaw();
164 // seePastDeleteMarker: user initiated scans
165 this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() && isUserScan;
166
167 int maxVersions =
168 scan.isRaw() ? scan.getMaxVersions() : Math.min(scan.getMaxVersions(),
169 scanInfo.getMaxVersions());
170
171 // Single branch to deal with two types of reads (columns vs all in family)
172 if (columns == null || columns.size() == 0) {
173 // there is always a null column in the wildcard column query.
174 hasNullColumn = true;
175
176 // use a specialized scan for wildcard column tracker.
177 this.columns = new ScanWildcardColumnTracker(
178 scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
179 } else {
180 // whether there is null column in the explicit column query
181 hasNullColumn = (columns.first().length == 0);
182
183 // We can share the ExplicitColumnTracker, diff is we reset
184 // between rows, not between storefiles.
185 this.columns = new ExplicitColumnTracker(columns,
186 scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
187 }
188 }
189
190 /**
191 * Construct a QueryMatcher for a scan that drop deletes from a limited range of rows.
192 * @param scan
193 * @param scanInfo The store's immutable scan info
194 * @param columns
195 * @param earliestPutTs Earliest put seen in any of the store files.
196 * @param oldestUnexpiredTS the oldest timestamp we are interested in,
197 * based on TTL
198 * @param dropDeletesFromRow The inclusive left bound of the range; can be EMPTY_START_ROW.
199 * @param dropDeletesToRow The exclusive right bound of the range; can be EMPTY_END_ROW.
200 */
201 public ScanQueryMatcher(Scan scan, ScanInfo scanInfo, NavigableSet<byte[]> columns,
202 long readPointToUse, long earliestPutTs, long oldestUnexpiredTS,
203 byte[] dropDeletesFromRow, byte[] dropDeletesToRow) {
204 this(scan, scanInfo, columns, ScanType.COMPACT_RETAIN_DELETES, readPointToUse, earliestPutTs,
205 oldestUnexpiredTS);
206 Preconditions.checkArgument((dropDeletesFromRow != null) && (dropDeletesToRow != null));
207 this.dropDeletesFromRow = dropDeletesFromRow;
208 this.dropDeletesToRow = dropDeletesToRow;
209 }
210
/**
 * Constructor for tests: a user scan with no read-point restriction
 * ({@code Long.MAX_VALUE}) and {@code HConstants.LATEST_TIMESTAMP} as the
 * earliest put timestamp.
 */
ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
    NavigableSet<byte[]> columns, long oldestUnexpiredTS) {
  this(scan,
      scanInfo,
      columns,
      ScanType.USER_SCAN,
      Long.MAX_VALUE,              // max read point to track versions
      HConstants.LATEST_TIMESTAMP, // earliest put timestamp
      oldestUnexpiredTS);
}
220
221 /**
222 *
223 * @return whether there is an null column in the query
224 */
225 public boolean hasNullColumnInQuery() {
226 return hasNullColumn;
227 }
228
/**
 * Determines if the caller should do one of several things:
 * - seek/skip to the next row (MatchCode.SEEK_NEXT_ROW)
 * - seek/skip to the next column (MatchCode.SEEK_NEXT_COL)
 * - include the current KeyValue (MatchCode.INCLUDE)
 * - ignore the current KeyValue (MatchCode.SKIP)
 * - go to the next row (MatchCode.DONE)
 * - stop the scan because the filter reports that nothing remains
 *   (MatchCode.DONE_SCAN)
 * - seek to the key hinted by the filter (MatchCode.SEEK_NEXT_USING_HINT)
 * - include the current KeyValue and seek to the next column
 *   (MatchCode.INCLUDE_AND_SEEK_NEXT_COL)
 * <p>
 * The KeyValue's row is compared against the current row held in
 * {@code row}/{@code rowOffset}/{@code rowLength}, which are assigned by
 * {@link #setRow(byte[], int, short)}.
 *
 * @param kv KeyValue to check
 * @return The match code instance.
 * @throws IOException in case there is an internal consistency problem
 *      caused by a data corruption.
 */
public MatchCode match(KeyValue kv) throws IOException {
  // The filter may declare the whole scan finished before we look at the KV.
  if (filter != null && filter.filterAllRemaining()) {
    return MatchCode.DONE_SCAN;
  }

  // Walk the serialized key by hand: key length, row length, row, family, qualifier.
  byte [] bytes = kv.getBuffer();
  int offset = kv.getOffset();
  int initialOffset = offset;

  int keyLength = Bytes.toInt(bytes, offset, Bytes.SIZEOF_INT);
  offset += KeyValue.ROW_OFFSET;

  short rowLength = Bytes.toShort(bytes, offset, Bytes.SIZEOF_SHORT);
  offset += Bytes.SIZEOF_SHORT;

  // Compare the current row (first argument) with the row of this KeyValue.
  int ret = this.rowComparator.compareRows(row, this.rowOffset, this.rowLength,
      bytes, offset, rowLength);
  if (ret <= -1) {
    // the KeyValue belongs to a row other than the current one: current row is finished
    return MatchCode.DONE;
  } else if (ret >= 1) {
    // could optimize this, if necessary?
    // Could also be called SEEK_TO_CURRENT_ROW, but this
    // should be rare/never happens.
    return MatchCode.SEEK_NEXT_ROW;
  }

  // optimize case.
  if (this.stickyNextRow)
    return MatchCode.SEEK_NEXT_ROW;

  if (this.columns.done()) {
    stickyNextRow = true;
    return MatchCode.SEEK_NEXT_ROW;
  }

  //Passing rowLength
  offset += rowLength;

  //Skipping family
  byte familyLength = bytes [offset];
  offset += familyLength + 1;

  // offset now points at the qualifier; what is left of the key after the
  // qualifier is the timestamp and the type byte.
  int qualLength = keyLength + KeyValue.ROW_OFFSET -
      (offset - initialOffset) - KeyValue.TIMESTAMP_TYPE_SIZE;

  long timestamp = kv.getTimestamp();
  // check for early out based on timestamp alone
  if (columns.isDone(timestamp)) {
    return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
  }

  /*
   * The delete logic is pretty complicated now.
   * This is corroborated by the following:
   * 1. The store might be instructed to keep deleted rows around.
   * 2. A scan can optionally see past a delete marker now.
   * 3. If deleted rows are kept, we have to find out when we can
   *    remove the delete markers.
   * 4. Family delete markers are always first (regardless of their TS)
   * 5. Delete markers should not be counted as version
   * 6. Delete markers affect puts of the *same* TS
   * 7. Delete marker need to be version counted together with puts
   *    they affect
   */
  byte type = kv.getType();
  if (kv.isDelete()) {
    if (!keepDeletedCells) {
      // first ignore delete markers if the scanner can do so, and the
      // range does not include the marker
      //
      // during flushes and compactions also ignore delete markers newer
      // than the readpoint of any open scanner, this prevents deleted
      // rows that could still be seen by a scanner from being collected
      boolean includeDeleteMarker = seePastDeleteMarkers ?
          tr.withinTimeRange(timestamp) :
          tr.withinOrAfterTimeRange(timestamp);
      if (includeDeleteMarker
          && kv.getMvccVersion() <= maxReadPointToTrackVersions) {
        this.deletes.add(bytes, offset, qualLength, timestamp, type);
      }
      // Can't early out now, because DelFam come before any other keys
    }
    if (retainDeletesInOutput
        || (!isUserScan && (EnvironmentEdgeManager.currentTimeMillis() - timestamp) <= timeToPurgeDeletes)
        || kv.getMvccVersion() > maxReadPointToTrackVersions) {
      // always include or it is not time yet to check whether it is OK
      // to purge deletes or not
      if (!isUserScan) {
        // if this is not a user scan (compaction), we can filter this deletemarker right here
        // otherwise (i.e. a "raw" scan) we fall through to normal version and timerange checking
        return MatchCode.INCLUDE;
      }
    } else if (keepDeletedCells) {
      if (timestamp < earliestPutTs) {
        // keeping delete rows, but there are no puts older than
        // this delete in the store files.
        return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
      }
      // else: fall through and do version counting on the
      // delete markers
    } else {
      return MatchCode.SKIP;
    }
    // note the following next else if...
    // delete marker are not subject to other delete markers
  } else if (!this.deletes.isEmpty()) {
    // A non-delete KeyValue: check it against the delete markers tracked so far.
    DeleteResult deleteResult = deletes.isDeleted(bytes, offset, qualLength,
        timestamp);
    switch (deleteResult) {
      case FAMILY_DELETED:
      case COLUMN_DELETED:
        return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
      case VERSION_DELETED:
      case FAMILY_VERSION_DELETED:
        return MatchCode.SKIP;
      case NOT_DELETED:
        break;
      default:
        throw new RuntimeException("UNEXPECTED");
    }
  }

  // Time range check: a positive comparison skips just this KeyValue, a
  // negative one moves on to the next column or row.
  int timestampComparison = tr.compare(timestamp);
  if (timestampComparison >= 1) {
    return MatchCode.SKIP;
  } else if (timestampComparison <= -1) {
    return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
  }

  /*
   * Filters should be checked before checking column trackers. If we do
   * otherwise, as was previously being done, ColumnTracker may increment its
   * counter for even that KV which may be discarded later on by Filter. This
   * would lead to incorrect results in certain cases.
   */
  // Initial value is only a placeholder; it is read again below only when filter != null.
  ReturnCode filterResponse = ReturnCode.SKIP;
  if (filter != null) {
    filterResponse = filter.filterKeyValue(kv);
    if (filterResponse == ReturnCode.SKIP) {
      return MatchCode.SKIP;
    } else if (filterResponse == ReturnCode.NEXT_COL) {
      return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
    } else if (filterResponse == ReturnCode.NEXT_ROW) {
      stickyNextRow = true;
      return MatchCode.SEEK_NEXT_ROW;
    } else if (filterResponse == ReturnCode.SEEK_NEXT_USING_HINT) {
      return MatchCode.SEEK_NEXT_USING_HINT;
    }
  }

  // Any other filter response falls through to the column/version check.
  MatchCode colChecker = columns.checkColumn(bytes, offset, qualLength,
      timestamp, type, kv.getMvccVersion() > maxReadPointToTrackVersions);
  /*
   * According to current implementation, colChecker can only be
   * SEEK_NEXT_COL, SEEK_NEXT_ROW, SKIP or INCLUDE. Therefore, always return
   * the MatchCode. If it is SEEK_NEXT_ROW, also set stickyNextRow.
   */
  if (colChecker == MatchCode.SEEK_NEXT_ROW) {
    stickyNextRow = true;
  } else if (filter != null && colChecker == MatchCode.INCLUDE &&
      filterResponse == ReturnCode.INCLUDE_AND_NEXT_COL) {
    // the filter asked to include this KeyValue and then move to the next column
    return MatchCode.INCLUDE_AND_SEEK_NEXT_COL;
  }
  return colChecker;

}
408
409 /** Handle partial-drop-deletes. As we match keys in order, when we have a range from which
410 * we can drop deletes, we can set retainDeletesInOutput to false for the duration of this
411 * range only, and maintain consistency. */
412 private void checkPartialDropDeleteRange(byte [] row, int offset, short length) {
413 // If partial-drop-deletes are used, initially, dropDeletesFromRow and dropDeletesToRow
414 // are both set, and the matcher is set to retain deletes. We assume ordered keys. When
415 // dropDeletesFromRow is leq current kv, we start dropping deletes and reset
416 // dropDeletesFromRow; thus the 2nd "if" starts to apply.
417 if ((dropDeletesFromRow != null)
418 && ((dropDeletesFromRow == HConstants.EMPTY_START_ROW)
419 || (Bytes.compareTo(row, offset, length,
420 dropDeletesFromRow, 0, dropDeletesFromRow.length) >= 0))) {
421 retainDeletesInOutput = false;
422 dropDeletesFromRow = null;
423 }
424 // If dropDeletesFromRow is null and dropDeletesToRow is set, we are inside the partial-
425 // drop-deletes range. When dropDeletesToRow is leq current kv, we stop dropping deletes,
426 // and reset dropDeletesToRow so that we don't do any more compares.
427 if ((dropDeletesFromRow == null)
428 && (dropDeletesToRow != null) && (dropDeletesToRow != HConstants.EMPTY_END_ROW)
429 && (Bytes.compareTo(row, offset, length,
430 dropDeletesToRow, 0, dropDeletesToRow.length) >= 0)) {
431 retainDeletesInOutput = true;
432 dropDeletesToRow = null;
433 }
434 }
435
436 public boolean moreRowsMayExistAfter(KeyValue kv) {
437 if (!Bytes.equals(stopRow , HConstants.EMPTY_END_ROW) &&
438 rowComparator.compareRows(kv.getBuffer(),kv.getRowOffset(),
439 kv.getRowLength(), stopRow, 0, stopRow.length) >= 0) {
440 // KV >= STOPROW
441 // then NO there is nothing left.
442 return false;
443 } else {
444 return true;
445 }
446 }
447
448 /**
449 * Set current row
450 * @param row
451 */
452 public void setRow(byte [] row, int offset, short length) {
453 checkPartialDropDeleteRange(row, offset, length);
454 this.row = row;
455 this.rowOffset = offset;
456 this.rowLength = length;
457 reset();
458 }
459
460 public void reset() {
461 this.deletes.reset();
462 this.columns.reset();
463
464 stickyNextRow = false;
465 }
466
467 /**
468 *
469 * @return the start key
470 */
471 public KeyValue getStartKey() {
472 return this.startKey;
473 }
474
475 /**
476 *
477 * @return the Filter
478 */
479 Filter getFilter() {
480 return this.filter;
481 }
482
483 public KeyValue getNextKeyHint(KeyValue kv) throws IOException {
484 if (filter == null) {
485 return null;
486 } else {
487 return filter.getNextKeyHint(kv);
488 }
489 }
490
491 public KeyValue getKeyForNextColumn(KeyValue kv) {
492 ColumnCount nextColumn = columns.getColumnHint();
493 if (nextColumn == null) {
494 return KeyValue.createLastOnRow(
495 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
496 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
497 kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
498 } else {
499 return KeyValue.createFirstOnRow(
500 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
501 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
502 nextColumn.getBuffer(), nextColumn.getOffset(), nextColumn.getLength());
503 }
504 }
505
506 public KeyValue getKeyForNextRow(KeyValue kv) {
507 return KeyValue.createLastOnRow(
508 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
509 null, 0, 0,
510 null, 0, 0);
511 }
512
/**
 * {@link ScanQueryMatcher#match} return codes. These instruct the scanner moving through
 * memstores and StoreFiles what to do with the current KeyValue.
 * <p>
 * Additionally, this contains "early-out" language to tell the scanner to
 * move on to the next File (memstore or Storefile), or to return immediately.
 */
public enum MatchCode {
  /** Include KeyValue in the returned result. */
  INCLUDE,

  /** Do not include KeyValue in the returned result. */
  SKIP,

  /** Do not include, jump to next StoreFile or memstore (in time order). */
  NEXT,

  /** Do not include, return current result. */
  DONE,

  // The codes below are used by the ScanQueryMatcher.

  /** Done with the row, seek there. */
  SEEK_NEXT_ROW,

  /** Done with column, seek to next. */
  SEEK_NEXT_COL,

  /** Done with scan, thanks to the row filter. */
  DONE_SCAN,

  /** Seek to next key which is given as hint. */
  SEEK_NEXT_USING_HINT,

  /** Include KeyValue and done with column, seek to next. */
  INCLUDE_AND_SEEK_NEXT_COL,

  /** Include KeyValue and done with row, seek to next. */
  INCLUDE_AND_SEEK_NEXT_ROW
}
574 }