1 /**
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 package org.apache.hadoop.hbase.regionserver;
21
22 import java.io.IOException;
23 import java.util.NavigableSet;
24
25 import org.apache.hadoop.classification.InterfaceAudience;
26 import org.apache.hadoop.hbase.HConstants;
27 import org.apache.hadoop.hbase.KeyValue;
28 import org.apache.hadoop.hbase.client.Scan;
29 import org.apache.hadoop.hbase.filter.Filter;
30 import org.apache.hadoop.hbase.filter.Filter.ReturnCode;
31 import org.apache.hadoop.hbase.io.TimeRange;
32 import org.apache.hadoop.hbase.regionserver.DeleteTracker.DeleteResult;
33 import org.apache.hadoop.hbase.util.Bytes;
34 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
35
36 import com.google.common.base.Preconditions;
37
38 /**
39 * A query matcher that is specifically designed for the scan case.
40 */
41 @InterfaceAudience.Private
42 public class ScanQueryMatcher {
43 // Optimization so we can skip lots of compares when we decide to skip
44 // to the next row.
45 private boolean stickyNextRow;
46 private final byte[] stopRow;
47
48 private final TimeRange tr;
49
50 private final Filter filter;
51
52 /** Keeps track of deletes */
53 private final DeleteTracker deletes;
54
55 /*
56 * The following three booleans define how we deal with deletes.
57 * There are three different aspects:
58 * 1. Whether to keep delete markers. This is used in compactions.
59 * Minor compactions always keep delete markers.
60 * 2. Whether to keep deleted rows. This is also used in compactions,
61 * if the store is set to keep deleted rows. This implies keeping
62 * the delete markers as well.
63 * In this case deleted rows are subject to the normal max version
64 * and TTL/min version rules just like "normal" rows.
65 * 3. Whether a scan can do time travel queries even before deleted
66 * marker to reach deleted rows.
67 */
68 /** whether to retain delete markers */
69 private boolean retainDeletesInOutput;
70
71 /** whether to return deleted rows */
72 private final boolean keepDeletedCells;
73 /** whether time range queries can see rows "behind" a delete */
74 private final boolean seePastDeleteMarkers;
75
76
77 /** Keeps track of columns and versions */
78 private final ColumnTracker columns;
79
80 /** Key to seek to in memstore and StoreFiles */
81 private final KeyValue startKey;
82
83 /** Row comparator for the region this query is for */
84 private final KeyValue.KeyComparator rowComparator;
85
86 /* row is not private for tests */
87 /** Row the query is on */
88 byte [] row;
89 int rowOffset;
90 short rowLength;
91
92 /**
93 * Oldest put in any of the involved store files
94 * Used to decide whether it is ok to delete
95 * family delete marker of this store keeps
96 * deleted KVs.
97 */
98 private final long earliestPutTs;
99
100 /** readPoint over which the KVs are unconditionally included */
101 protected long maxReadPointToTrackVersions;
102
103 private byte[] dropDeletesFromRow = null, dropDeletesToRow = null;
104
105 /**
106 * This variable shows whether there is an null column in the query. There
107 * always exists a null column in the wildcard column query.
108 * There maybe exists a null column in the explicit column query based on the
109 * first column.
110 * */
111 private boolean hasNullColumn = true;
112
113 // By default, when hbase.hstore.time.to.purge.deletes is 0ms, a delete
114 // marker is always removed during a major compaction. If set to non-zero
115 // value then major compaction will try to keep a delete marker around for
116 // the given number of milliseconds. We want to keep the delete markers
117 // around a bit longer because old puts might appear out-of-order. For
118 // example, during log replication between two clusters.
119 //
120 // If the delete marker has lived longer than its column-family's TTL then
121 // the delete marker will be removed even if time.to.purge.deletes has not
122 // passed. This is because all the Puts that this delete marker can influence
123 // would have also expired. (Removing of delete markers on col family TTL will
124 // not happen if min-versions is set to non-zero)
125 //
126 // But, if time.to.purge.deletes has not expired then a delete
127 // marker will not be removed just because there are no Puts that it is
128 // currently influencing. This is because Puts, that this delete can
129 // influence. may appear out of order.
130 private final long timeToPurgeDeletes;
131
132 private final boolean isUserScan;
133
134 /**
135 * Construct a QueryMatcher for a scan
136 * @param scan
137 * @param scanInfo The store's immutable scan info
138 * @param columns
139 * @param scanType Type of the scan
140 * @param earliestPutTs Earliest put seen in any of the store files.
141 * @param oldestUnexpiredTS the oldest timestamp we are interested in,
142 * based on TTL
143 */
144 public ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
145 NavigableSet<byte[]> columns, ScanType scanType,
146 long readPointToUse, long earliestPutTs, long oldestUnexpiredTS) {
147 this.tr = scan.getTimeRange();
148 this.rowComparator = scanInfo.getComparator().getRawComparator();
149 this.deletes = new ScanDeleteTracker();
150 this.stopRow = scan.getStopRow();
151 this.startKey = KeyValue.createFirstDeleteFamilyOnRow(scan.getStartRow(),
152 scanInfo.getFamily());
153 this.filter = scan.getFilter();
154 this.earliestPutTs = earliestPutTs;
155 this.maxReadPointToTrackVersions = readPointToUse;
156 this.timeToPurgeDeletes = scanInfo.getTimeToPurgeDeletes();
157
158 /* how to deal with deletes */
159 this.isUserScan = scanType == ScanType.USER_SCAN;
160 // keep deleted cells: if compaction or raw scan
161 this.keepDeletedCells = (scanInfo.getKeepDeletedCells() && !isUserScan) || scan.isRaw();
162 // retain deletes: if minor compaction or raw scan
163 this.retainDeletesInOutput = scanType == ScanType.COMPACT_RETAIN_DELETES || scan.isRaw();
164 // seePastDeleteMarker: user initiated scans
165 this.seePastDeleteMarkers = scanInfo.getKeepDeletedCells() && isUserScan;
166
167 int maxVersions = Math.min(scan.getMaxVersions(), scanInfo.getMaxVersions());
168 // Single branch to deal with two types of reads (columns vs all in family)
169 if (columns == null || columns.size() == 0) {
170 // there is always a null column in the wildcard column query.
171 hasNullColumn = true;
172
173 // use a specialized scan for wildcard column tracker.
174 this.columns = new ScanWildcardColumnTracker(
175 scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
176 } else {
177 // whether there is null column in the explicit column query
178 hasNullColumn = (columns.first().length == 0);
179
180 // We can share the ExplicitColumnTracker, diff is we reset
181 // between rows, not between storefiles.
182 this.columns = new ExplicitColumnTracker(columns,
183 scanInfo.getMinVersions(), maxVersions, oldestUnexpiredTS);
184 }
185 }
186
187 /**
188 * Construct a QueryMatcher for a scan that drop deletes from a limited range of rows.
189 * @param scan
190 * @param scanInfo The store's immutable scan info
191 * @param columns
192 * @param earliestPutTs Earliest put seen in any of the store files.
193 * @param oldestUnexpiredTS the oldest timestamp we are interested in,
194 * based on TTL
195 * @param dropDeletesFromRow The inclusive left bound of the range; can be EMPTY_START_ROW.
196 * @param dropDeletesToRow The exclusive right bound of the range; can be EMPTY_END_ROW.
197 */
198 public ScanQueryMatcher(Scan scan, ScanInfo scanInfo, NavigableSet<byte[]> columns,
199 long readPointToUse, long earliestPutTs, long oldestUnexpiredTS,
200 byte[] dropDeletesFromRow, byte[] dropDeletesToRow) {
201 this(scan, scanInfo, columns, ScanType.COMPACT_RETAIN_DELETES, readPointToUse, earliestPutTs,
202 oldestUnexpiredTS);
203 Preconditions.checkArgument((dropDeletesFromRow != null) && (dropDeletesToRow != null));
204 this.dropDeletesFromRow = dropDeletesFromRow;
205 this.dropDeletesToRow = dropDeletesToRow;
206 }
207
208 /*
209 * Constructor for tests
210 */
211 ScanQueryMatcher(Scan scan, ScanInfo scanInfo,
212 NavigableSet<byte[]> columns, long oldestUnexpiredTS) {
213 this(scan, scanInfo, columns, ScanType.USER_SCAN,
214 Long.MAX_VALUE, /* max Readpoint to track versions */
215 HConstants.LATEST_TIMESTAMP, oldestUnexpiredTS);
216 }
217
218 /**
219 *
220 * @return whether there is an null column in the query
221 */
222 public boolean hasNullColumnInQuery() {
223 return hasNullColumn;
224 }
225
226 /**
227 * Determines if the caller should do one of several things:
228 * - seek/skip to the next row (MatchCode.SEEK_NEXT_ROW)
229 * - seek/skip to the next column (MatchCode.SEEK_NEXT_COL)
230 * - include the current KeyValue (MatchCode.INCLUDE)
231 * - ignore the current KeyValue (MatchCode.SKIP)
232 * - got to the next row (MatchCode.DONE)
233 *
234 * @param kv KeyValue to check
235 * @return The match code instance.
236 * @throws IOException in case there is an internal consistency problem
237 * caused by a data corruption.
238 */
239 public MatchCode match(KeyValue kv) throws IOException {
240 if (filter != null && filter.filterAllRemaining()) {
241 return MatchCode.DONE_SCAN;
242 }
243
244 byte [] bytes = kv.getBuffer();
245 int offset = kv.getOffset();
246 int initialOffset = offset;
247
248 int keyLength = Bytes.toInt(bytes, offset, Bytes.SIZEOF_INT);
249 offset += KeyValue.ROW_OFFSET;
250
251 short rowLength = Bytes.toShort(bytes, offset, Bytes.SIZEOF_SHORT);
252 offset += Bytes.SIZEOF_SHORT;
253
254 int ret = this.rowComparator.compareRows(row, this.rowOffset, this.rowLength,
255 bytes, offset, rowLength);
256 if (ret <= -1) {
257 return MatchCode.DONE;
258 } else if (ret >= 1) {
259 // could optimize this, if necessary?
260 // Could also be called SEEK_TO_CURRENT_ROW, but this
261 // should be rare/never happens.
262 return MatchCode.SEEK_NEXT_ROW;
263 }
264
265 // optimize case.
266 if (this.stickyNextRow)
267 return MatchCode.SEEK_NEXT_ROW;
268
269 if (this.columns.done()) {
270 stickyNextRow = true;
271 return MatchCode.SEEK_NEXT_ROW;
272 }
273
274 //Passing rowLength
275 offset += rowLength;
276
277 //Skipping family
278 byte familyLength = bytes [offset];
279 offset += familyLength + 1;
280
281 int qualLength = keyLength + KeyValue.ROW_OFFSET -
282 (offset - initialOffset) - KeyValue.TIMESTAMP_TYPE_SIZE;
283
284 long timestamp = kv.getTimestamp();
285 // check for early out based on timestamp alone
286 if (columns.isDone(timestamp)) {
287 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
288 }
289
290 /*
291 * The delete logic is pretty complicated now.
292 * This is corroborated by the following:
293 * 1. The store might be instructed to keep deleted rows around.
294 * 2. A scan can optionally see past a delete marker now.
295 * 3. If deleted rows are kept, we have to find out when we can
296 * remove the delete markers.
297 * 4. Family delete markers are always first (regardless of their TS)
298 * 5. Delete markers should not be counted as version
299 * 6. Delete markers affect puts of the *same* TS
300 * 7. Delete marker need to be version counted together with puts
301 * they affect
302 */
303 byte type = kv.getType();
304 if (kv.isDelete()) {
305 if (!keepDeletedCells) {
306 // first ignore delete markers if the scanner can do so, and the
307 // range does not include the marker
308 //
309 // during flushes and compactions also ignore delete markers newer
310 // than the readpoint of any open scanner, this prevents deleted
311 // rows that could still be seen by a scanner from being collected
312 boolean includeDeleteMarker = seePastDeleteMarkers ?
313 tr.withinTimeRange(timestamp) :
314 tr.withinOrAfterTimeRange(timestamp);
315 if (includeDeleteMarker
316 && kv.getMemstoreTS() <= maxReadPointToTrackVersions) {
317 this.deletes.add(bytes, offset, qualLength, timestamp, type);
318 }
319 // Can't early out now, because DelFam come before any other keys
320 }
321 if (retainDeletesInOutput
322 || (!isUserScan && (EnvironmentEdgeManager.currentTimeMillis() - timestamp) <= timeToPurgeDeletes)
323 || kv.getMemstoreTS() > maxReadPointToTrackVersions) {
324 // always include or it is not time yet to check whether it is OK
325 // to purge deltes or not
326 return MatchCode.INCLUDE;
327 } else if (keepDeletedCells) {
328 if (timestamp < earliestPutTs) {
329 // keeping delete rows, but there are no puts older than
330 // this delete in the store files.
331 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
332 }
333 // else: fall through and do version counting on the
334 // delete markers
335 } else {
336 return MatchCode.SKIP;
337 }
338 // note the following next else if...
339 // delete marker are not subject to other delete markers
340 } else if (!this.deletes.isEmpty()) {
341 DeleteResult deleteResult = deletes.isDeleted(bytes, offset, qualLength,
342 timestamp);
343 switch (deleteResult) {
344 case FAMILY_DELETED:
345 case COLUMN_DELETED:
346 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
347 case VERSION_DELETED:
348 return MatchCode.SKIP;
349 case NOT_DELETED:
350 break;
351 default:
352 throw new RuntimeException("UNEXPECTED");
353 }
354 }
355
356 int timestampComparison = tr.compare(timestamp);
357 if (timestampComparison >= 1) {
358 return MatchCode.SKIP;
359 } else if (timestampComparison <= -1) {
360 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
361 }
362
363 /**
364 * Filters should be checked before checking column trackers. If we do
365 * otherwise, as was previously being done, ColumnTracker may increment its
366 * counter for even that KV which may be discarded later on by Filter. This
367 * would lead to incorrect results in certain cases.
368 */
369 ReturnCode filterResponse = ReturnCode.SKIP;
370 if (filter != null) {
371 filterResponse = filter.filterKeyValue(kv);
372 if (filterResponse == ReturnCode.SKIP) {
373 return MatchCode.SKIP;
374 } else if (filterResponse == ReturnCode.NEXT_COL) {
375 return columns.getNextRowOrNextColumn(bytes, offset, qualLength);
376 } else if (filterResponse == ReturnCode.NEXT_ROW) {
377 stickyNextRow = true;
378 return MatchCode.SEEK_NEXT_ROW;
379 } else if (filterResponse == ReturnCode.SEEK_NEXT_USING_HINT) {
380 return MatchCode.SEEK_NEXT_USING_HINT;
381 }
382 }
383
384 MatchCode colChecker = columns.checkColumn(bytes, offset, qualLength,
385 timestamp, type, kv.getMemstoreTS() > maxReadPointToTrackVersions);
386 /*
387 * According to current implementation, colChecker can only be
388 * SEEK_NEXT_COL, SEEK_NEXT_ROW, SKIP or INCLUDE. Therefore, always return
389 * the MatchCode. If it is SEEK_NEXT_ROW, also set stickyNextRow.
390 */
391 if (colChecker == MatchCode.SEEK_NEXT_ROW) {
392 stickyNextRow = true;
393 } else if (filter != null && colChecker == MatchCode.INCLUDE &&
394 filterResponse == ReturnCode.INCLUDE_AND_NEXT_COL) {
395 return MatchCode.INCLUDE_AND_SEEK_NEXT_COL;
396 }
397 return colChecker;
398
399 }
400
401 /** Handle partial-drop-deletes. As we match keys in order, when we have a range from which
402 * we can drop deletes, we can set retainDeletesInOutput to false for the duration of this
403 * range only, and maintain consistency. */
404 private void checkPartialDropDeleteRange(byte [] row, int offset, short length) {
405 // If partial-drop-deletes are used, initially, dropDeletesFromRow and dropDeletesToRow
406 // are both set, and the matcher is set to retain deletes. We assume ordered keys. When
407 // dropDeletesFromRow is leq current kv, we start dropping deletes and reset
408 // dropDeletesFromRow; thus the 2nd "if" starts to apply.
409 if ((dropDeletesFromRow != null)
410 && ((dropDeletesFromRow == HConstants.EMPTY_START_ROW)
411 || (Bytes.compareTo(row, offset, length,
412 dropDeletesFromRow, 0, dropDeletesFromRow.length) >= 0))) {
413 retainDeletesInOutput = false;
414 dropDeletesFromRow = null;
415 }
416 // If dropDeletesFromRow is null and dropDeletesToRow is set, we are inside the partial-
417 // drop-deletes range. When dropDeletesToRow is leq current kv, we stop dropping deletes,
418 // and reset dropDeletesToRow so that we don't do any more compares.
419 if ((dropDeletesFromRow == null)
420 && (dropDeletesToRow != null) && (dropDeletesToRow != HConstants.EMPTY_END_ROW)
421 && (Bytes.compareTo(row, offset, length,
422 dropDeletesToRow, 0, dropDeletesToRow.length) >= 0)) {
423 retainDeletesInOutput = true;
424 dropDeletesToRow = null;
425 }
426 }
427
428 public boolean moreRowsMayExistAfter(KeyValue kv) {
429 if (!Bytes.equals(stopRow , HConstants.EMPTY_END_ROW) &&
430 rowComparator.compareRows(kv.getBuffer(),kv.getRowOffset(),
431 kv.getRowLength(), stopRow, 0, stopRow.length) >= 0) {
432 // KV >= STOPROW
433 // then NO there is nothing left.
434 return false;
435 } else {
436 return true;
437 }
438 }
439
440 /**
441 * Set current row
442 * @param row
443 */
444 public void setRow(byte [] row, int offset, short length) {
445 checkPartialDropDeleteRange(row, offset, length);
446 this.row = row;
447 this.rowOffset = offset;
448 this.rowLength = length;
449 reset();
450 }
451
452 public void reset() {
453 this.deletes.reset();
454 this.columns.reset();
455
456 stickyNextRow = false;
457 }
458
459 /**
460 *
461 * @return the start key
462 */
463 public KeyValue getStartKey() {
464 return this.startKey;
465 }
466
467 /**
468 *
469 * @return the Filter
470 */
471 Filter getFilter() {
472 return this.filter;
473 }
474
475 public KeyValue getNextKeyHint(KeyValue kv) throws IOException {
476 if (filter == null) {
477 return null;
478 } else {
479 return filter.getNextKeyHint(kv);
480 }
481 }
482
483 public KeyValue getKeyForNextColumn(KeyValue kv) {
484 ColumnCount nextColumn = columns.getColumnHint();
485 if (nextColumn == null) {
486 return KeyValue.createLastOnRow(
487 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
488 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
489 kv.getBuffer(), kv.getQualifierOffset(), kv.getQualifierLength());
490 } else {
491 return KeyValue.createFirstOnRow(
492 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
493 kv.getBuffer(), kv.getFamilyOffset(), kv.getFamilyLength(),
494 nextColumn.getBuffer(), nextColumn.getOffset(), nextColumn.getLength());
495 }
496 }
497
498 public KeyValue getKeyForNextRow(KeyValue kv) {
499 return KeyValue.createLastOnRow(
500 kv.getBuffer(), kv.getRowOffset(), kv.getRowLength(),
501 null, 0, 0,
502 null, 0, 0);
503 }
504
505 /**
506 * {@link #match} return codes. These instruct the scanner moving through
507 * memstores and StoreFiles what to do with the current KeyValue.
508 * <p>
509 * Additionally, this contains "early-out" language to tell the scanner to
510 * move on to the next File (memstore or Storefile), or to return immediately.
511 */
512 public static enum MatchCode {
513 /**
514 * Include KeyValue in the returned result
515 */
516 INCLUDE,
517
518 /**
519 * Do not include KeyValue in the returned result
520 */
521 SKIP,
522
523 /**
524 * Do not include, jump to next StoreFile or memstore (in time order)
525 */
526 NEXT,
527
528 /**
529 * Do not include, return current result
530 */
531 DONE,
532
533 /**
534 * These codes are used by the ScanQueryMatcher
535 */
536
537 /**
538 * Done with the row, seek there.
539 */
540 SEEK_NEXT_ROW,
541 /**
542 * Done with column, seek to next.
543 */
544 SEEK_NEXT_COL,
545
546 /**
547 * Done with scan, thanks to the row filter.
548 */
549 DONE_SCAN,
550
551 /*
552 * Seek to next key which is given as hint.
553 */
554 SEEK_NEXT_USING_HINT,
555
556 /**
557 * Include KeyValue and done with column, seek to next.
558 */
559 INCLUDE_AND_SEEK_NEXT_COL,
560
561 /**
562 * Include KeyValue and done with row, seek to next.
563 */
564 INCLUDE_AND_SEEK_NEXT_ROW,
565 }
566 }