001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, 013 * software distributed under the License is distributed on an 014 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 * KIND, either express or implied. See the License for the 016 * specific language governing permissions and limitations 017 * under the License. 018 */ 019package org.apache.commons.compress.compressors.lz77support; 020 021import java.io.IOException; 022import java.util.Arrays; 023 024/** 025 * Helper class for compression algorithms that use the ideas of LZ77. 026 * 027 * <p>Most LZ77 derived algorithms split input data into blocks of 028 * uncompressed data (called literal blocks) and back-references 029 * (pairs of offsets and lengths) that state "add <code>length</code> 030 * bytes that are the same as those already written starting 031 * <code>offset</code> bytes before the current position. The details 032 * of how those blocks and back-references are encoded are quite 033 * different between the algorithms and some algorithms perform 034 * additional steps (Huffman encoding in the case of DEFLATE for 035 * example).</p> 036 * 037 * <p>This class attempts to extract the core logic - finding 038 * back-references - so it can be re-used. It follows the algorithm 039 * explained in section 4 of RFC 1951 (DEFLATE) and currently doesn't 040 * implement the "lazy match" optimization. The three-byte hash 041 * function used in this class is the same as the one used by zlib and 042 * InfoZIP's ZIP implementation of DEFLATE. The whole class is 043 * strongly inspired by InfoZIP's implementation.</p> 044 * 045 * <p>LZ77 is used vaguely here (as well as many other places that 046 * talk about it :-), LZSS would likely be closer to the truth but 047 * LZ77 has become the synonym for a whole family of algorithms.</p> 048 * 049 * <p>The API consists of a compressor that is fed <code>byte</code>s 050 * and emits {@link Block}s to a registered callback where the blocks 051 * represent either {@link LiteralBlock literal blocks}, {@link 052 * BackReference back-references} or {@link EOD end of data 053 * markers}. In order to ensure the callback receives all information, 054 * the {@code #finish} method must be used once all data has been fed 055 * into the compressor.</p> 056 * 057 * <p>Several parameters influence the outcome of the "compression":</p> 058 * <dl> 059 * 060 * <dt><code>windowSize</code></dt> <dd>the size of the sliding 061 * window, must be a power of two - this determines the maximum 062 * offset a back-reference can take. The compressor maintains a 063 * buffer of twice of <code>windowSize</code> - real world values are 064 * in the area of 32k.</dd> 065 * 066 * <dt><code>minBackReferenceLength</code></dt> 067 * <dd>Minimal length of a back-reference found. A true minimum of 3 is 068 * hard-coded inside of this implemention but bigger lengths can be 069 * configured.</dd> 070 * 071 * <dt><code>maxBackReferenceLength</code></dt> 072 * <dd>Maximal length of a back-reference found.</dd> 073 * 074 * <dt><code>maxOffset</code></dt> 075 * <dd>Maximal offset of a back-reference.</dd> 076 * 077 * <dt><code>maxLiteralLength</code></dt> 078 * <dd>Maximal length of a literal block.</dd> 079 * </dl> 080 * 081 * @see "https://tools.ietf.org/html/rfc1951#section-4" 082 * @since 1.14 083 * @NotThreadSafe 084 */ 085public class LZ77Compressor { 086 087 /** Base class representing blocks the compressor may emit. */ 088 public static abstract class Block { 089 /** Enumeration of the block types the compressor may emit. */ 090 public enum BlockType { 091 LITERAL, BACK_REFERENCE, EOD 092 } 093 public abstract BlockType getType(); 094 } 095 096 /** 097 * Represents a literal block of data. 098 * 099 * <p>For performance reasons this encapsulates the real data, not 100 * a copy of it. Don't modify the data and process it inside of 101 * {@link Callback#accept} immediately as it will get overwritten 102 * sooner or later.</p> 103 */ 104 public static final class LiteralBlock extends Block { 105 private final byte[] data; 106 private final int offset, length; 107 public LiteralBlock(byte[] data, int offset, int length) { 108 this.data = data; 109 this.offset = offset; 110 this.length = length; 111 } 112 /** 113 * The literal data. 114 * 115 * <p>This returns a life view of the actual data in order to 116 * avoid copying, modify the array at your own risk.</p> 117 * @return the data 118 */ 119 public byte[] getData() { 120 return data; 121 } 122 /** 123 * Offset into data where the literal block starts. 124 * @return the offset 125 */ 126 public int getOffset() { 127 return offset; 128 } 129 /** 130 * Length of literal block. 131 * @return the length 132 */ 133 public int getLength() { 134 return length; 135 } 136 @Override 137 public BlockType getType() { 138 return BlockType.LITERAL; 139 } 140 @Override 141 public String toString() { 142 return "LiteralBlock starting at " + offset + " with length " + length; 143 } 144 } 145 146 /** 147 * Represents a back-reference. 148 */ 149 public static final class BackReference extends Block { 150 private final int offset, length; 151 public BackReference(int offset, int length) { 152 this.offset = offset; 153 this.length = length; 154 } 155 /** 156 * Provides the offset of the back-reference. 157 * @return the offset 158 */ 159 public int getOffset() { 160 return offset; 161 } 162 /** 163 * Provides the length of the back-reference. 164 * @return the length 165 */ 166 public int getLength() { 167 return length; 168 } 169 @Override 170 public BlockType getType() { 171 return BlockType.BACK_REFERENCE; 172 } 173 @Override 174 public String toString() { 175 return "BackReference with offset " + offset + " and length " + length; 176 } 177 } 178 179 /** A simple "we are done" marker. */ 180 public static final class EOD extends Block { 181 @Override 182 public BlockType getType() { 183 return BlockType.EOD; 184 } 185 } 186 187 private static final Block THE_EOD = new EOD(); 188 189 /** 190 * Callback invoked while the compressor processes data. 191 * 192 * <p>The callback is invoked on the same thread that receives the 193 * bytes to compress and may be invoked multiple times during the 194 * execution of {@link #compress} or {@link #finish}.</p> 195 */ 196 public interface Callback { 197 /** 198 * Consumes a block. 199 * @param b the block to consume 200 * @throws IOException in case of an error 201 */ 202 void accept(Block b) throws IOException; 203 } 204 205 static final int NUMBER_OF_BYTES_IN_HASH = 3; 206 private static final int NO_MATCH = -1; 207 208 private final Parameters params; 209 private final Callback callback; 210 211 // the sliding window, twice as big as "windowSize" parameter 212 private final byte[] window; 213 // the head of hash-chain - indexed by hash-code, points to the 214 // location inside of window of the latest sequence of bytes with 215 // the given hash. 216 private final int[] head; 217 // for each window-location points to the latest earlier location 218 // with the same hash. Only stores values for the latest 219 // "windowSize" elements, the index is "window location modulo 220 // windowSize". 221 private final int[] prev; 222 223 // bit mask used when indexing into prev 224 private final int wMask; 225 226 private boolean initialized = false; 227 // the position inside of window that shall be encoded right now 228 private int currentPosition; 229 // the number of bytes available to compress including the one at 230 // currentPosition 231 private int lookahead = 0; 232 // the hash of the three bytes stating at the current position 233 private int insertHash = 0; 234 // the position inside of the window where the current literal 235 // block starts (in case we are inside of a literal block). 236 private int blockStart = 0; 237 // position of the current match 238 private int matchStart = NO_MATCH; 239 // number of missed insertString calls for the up to three last 240 // bytes of the last match that can only be performed once more 241 // data has been read 242 private int missedInserts = 0; 243 244 /** 245 * Initializes a compressor with parameters and a callback. 246 * @param params the parameters 247 * @param callback the callback 248 * @throws NullPointerException if either parameter is <code>null</code> 249 */ 250 public LZ77Compressor(Parameters params, Callback callback) { 251 if (params == null) { 252 throw new NullPointerException("params must not be null"); 253 } 254 if (callback == null) { 255 throw new NullPointerException("callback must not be null"); 256 } 257 this.params = params; 258 this.callback = callback; 259 260 final int wSize = params.getWindowSize(); 261 window = new byte[wSize * 2]; 262 wMask = wSize - 1; 263 head = new int[HASH_SIZE]; 264 Arrays.fill(head, NO_MATCH); 265 prev = new int[wSize]; 266 } 267 268 /** 269 * Feeds bytes into the compressor which in turn may emit zero or 270 * more blocks to the callback during the execution of this 271 * method. 272 * @param data the data to compress - must not be null 273 * @throws IOException if the callback throws an exception 274 */ 275 public void compress(byte[] data) throws IOException { 276 compress(data, 0, data.length); 277 } 278 279 /** 280 * Feeds bytes into the compressor which in turn may emit zero or 281 * more blocks to the callback during the execution of this 282 * method. 283 * @param data the data to compress - must not be null 284 * @param off the start offset of the data 285 * @param len the number of bytes to compress 286 * @throws IOException if the callback throws an exception 287 */ 288 public void compress(byte[] data, int off, int len) throws IOException { 289 final int wSize = params.getWindowSize(); 290 while (len > wSize) { // chop into windowSize sized chunks 291 doCompress(data, off, wSize); 292 off += wSize; 293 len -= wSize; 294 } 295 if (len > 0) { 296 doCompress(data, off, len); 297 } 298 } 299 300 /** 301 * Tells the compressor to process all remaining data and signal 302 * end of data to the callback. 303 * 304 * <p>The compressor will in turn emit at least one block ({@link 305 * EOD}) but potentially multiple blocks to the callback during 306 * the execution of this method.</p> 307 * @throws IOException if the callback throws an exception 308 */ 309 public void finish() throws IOException { 310 if (blockStart != currentPosition || lookahead > 0) { 311 currentPosition += lookahead; 312 flushLiteralBlock(); 313 } 314 callback.accept(THE_EOD); 315 } 316 317 /** 318 * Adds some initial data to fill the window with. 319 * 320 * <p>This is used if the stream has been cut into blocks and 321 * back-references of one block may refer to data of the previous 322 * block(s). One such example is the LZ4 frame format using block 323 * dependency.</p> 324 * 325 * @param data the data to fill the window with. 326 * @throws IllegalStateException if the compressor has already started to accept data 327 */ 328 public void prefill(byte[] data) { 329 if (currentPosition != 0 || lookahead != 0) { 330 throw new IllegalStateException("the compressor has already started to accept data, can't prefill anymore"); 331 } 332 333 // don't need more than windowSize for back-references 334 final int len = Math.min(params.getWindowSize(), data.length); 335 System.arraycopy(data, data.length - len, window, 0, len); 336 337 if (len >= NUMBER_OF_BYTES_IN_HASH) { 338 initialize(); 339 final int stop = len - NUMBER_OF_BYTES_IN_HASH + 1; 340 for (int i = 0; i < stop; i++) { 341 insertString(i); 342 } 343 missedInserts = NUMBER_OF_BYTES_IN_HASH - 1; 344 } else { // not enough data to hash anything 345 missedInserts = len; 346 } 347 blockStart = currentPosition = len; 348 } 349 350 // we use a 15 bit hashcode as calculated in updateHash 351 private static final int HASH_SIZE = 1 << 15; 352 private static final int HASH_MASK = HASH_SIZE - 1; 353 private static final int H_SHIFT = 5; 354 355 /** 356 * Assumes we are calculating the hash for three consecutive bytes 357 * as a rolling hash, i.e. for bytes ABCD if H is the hash of ABC 358 * the new hash for BCD is nextHash(H, D). 359 * 360 * <p>The hash is shifted by five bits on each update so all 361 * effects of A have been swapped after the third update.</p> 362 */ 363 private int nextHash(int oldHash, byte nextByte) { 364 final int nextVal = nextByte & 0xFF; 365 return ((oldHash << H_SHIFT) ^ nextVal) & HASH_MASK; 366 } 367 368 // performs the actual algorithm with the pre-condition len <= windowSize 369 private void doCompress(byte[] data, int off, int len) throws IOException { 370 int spaceLeft = window.length - currentPosition - lookahead; 371 if (len > spaceLeft) { 372 slide(); 373 } 374 System.arraycopy(data, off, window, currentPosition + lookahead, len); 375 lookahead += len; 376 if (!initialized && lookahead >= params.getMinBackReferenceLength()) { 377 initialize(); 378 } 379 if (initialized) { 380 compress(); 381 } 382 } 383 384 private void slide() throws IOException { 385 final int wSize = params.getWindowSize(); 386 if (blockStart != currentPosition && blockStart < wSize) { 387 flushLiteralBlock(); 388 blockStart = currentPosition; 389 } 390 System.arraycopy(window, wSize, window, 0, wSize); 391 currentPosition -= wSize; 392 matchStart -= wSize; 393 blockStart -= wSize; 394 for (int i = 0; i < HASH_SIZE; i++) { 395 int h = head[i]; 396 head[i] = h >= wSize ? h - wSize : NO_MATCH; 397 } 398 for (int i = 0; i < wSize; i++) { 399 int p = prev[i]; 400 prev[i] = p >= wSize ? p - wSize : NO_MATCH; 401 } 402 } 403 404 private void initialize() { 405 for (int i = 0; i < NUMBER_OF_BYTES_IN_HASH - 1; i++) { 406 insertHash = nextHash(insertHash, window[i]); 407 } 408 initialized = true; 409 } 410 411 private void compress() throws IOException { 412 final int minMatch = params.getMinBackReferenceLength(); 413 final boolean lazy = params.getLazyMatching(); 414 final int lazyThreshold = params.getLazyMatchingThreshold(); 415 416 while (lookahead >= minMatch) { 417 catchUpMissedInserts(); 418 int matchLength = 0; 419 int hashHead = insertString(currentPosition); 420 if (hashHead != NO_MATCH && hashHead - currentPosition <= params.getMaxOffset()) { 421 // sets matchStart as a side effect 422 matchLength = longestMatch(hashHead); 423 424 if (lazy && matchLength <= lazyThreshold && lookahead > minMatch) { 425 // try to find a longer match using the next position 426 matchLength = longestMatchForNextPosition(matchLength); 427 } 428 } 429 if (matchLength >= minMatch) { 430 if (blockStart != currentPosition) { 431 // emit preceeding literal block 432 flushLiteralBlock(); 433 blockStart = NO_MATCH; 434 } 435 flushBackReference(matchLength); 436 insertStringsInMatch(matchLength); 437 lookahead -= matchLength; 438 currentPosition += matchLength; 439 blockStart = currentPosition; 440 } else { 441 // no match, append to current or start a new literal 442 lookahead--; 443 currentPosition++; 444 if (currentPosition - blockStart >= params.getMaxLiteralLength()) { 445 flushLiteralBlock(); 446 blockStart = currentPosition; 447 } 448 } 449 } 450 } 451 452 /** 453 * Inserts the current three byte sequence into the dictionary and 454 * returns the previous head of the hash-chain. 455 * 456 * <p>Updates <code>insertHash</code> and <code>prev</code> as a 457 * side effect.</p> 458 */ 459 private int insertString(int pos) { 460 insertHash = nextHash(insertHash, window[pos - 1 + NUMBER_OF_BYTES_IN_HASH]); 461 int hashHead = head[insertHash]; 462 prev[pos & wMask] = hashHead; 463 head[insertHash] = pos; 464 return hashHead; 465 } 466 467 private int longestMatchForNextPosition(final int prevMatchLength) { 468 // save a bunch of values to restore them if the next match isn't better than the current one 469 final int prevMatchStart = matchStart; 470 final int prevInsertHash = insertHash; 471 472 lookahead--; 473 currentPosition++; 474 int hashHead = insertString(currentPosition); 475 final int prevHashHead = prev[currentPosition & wMask]; 476 int matchLength = longestMatch(hashHead); 477 478 if (matchLength <= prevMatchLength) { 479 // use the first match, as the next one isn't any better 480 matchLength = prevMatchLength; 481 matchStart = prevMatchStart; 482 483 // restore modified values 484 head[insertHash] = prevHashHead; 485 insertHash = prevInsertHash; 486 currentPosition--; 487 lookahead++; 488 } 489 return matchLength; 490 } 491 492 private void insertStringsInMatch(int matchLength) { 493 // inserts strings contained in current match 494 // insertString inserts the byte 2 bytes after position, which may not yet be available -> missedInserts 495 final int stop = Math.min(matchLength - 1, lookahead - NUMBER_OF_BYTES_IN_HASH); 496 // currentPosition has been inserted already 497 for (int i = 1; i <= stop; i++) { 498 insertString(currentPosition + i); 499 } 500 missedInserts = matchLength - stop - 1; 501 } 502 503 private void catchUpMissedInserts() { 504 while (missedInserts > 0) { 505 insertString(currentPosition - missedInserts--); 506 } 507 } 508 509 private void flushBackReference(int matchLength) throws IOException { 510 callback.accept(new BackReference(currentPosition - matchStart, matchLength)); 511 } 512 513 private void flushLiteralBlock() throws IOException { 514 callback.accept(new LiteralBlock(window, blockStart, currentPosition - blockStart)); 515 } 516 517 /** 518 * Searches the hash chain for real matches and returns the length 519 * of the longest match (0 if none were found) that isn't too far 520 * away (WRT maxOffset). 521 * 522 * <p>Sets matchStart to the index of the start position of the 523 * longest match as a side effect.</p> 524 */ 525 private int longestMatch(int matchHead) { 526 final int minLength = params.getMinBackReferenceLength(); 527 int longestMatchLength = minLength - 1; 528 final int maxPossibleLength = Math.min(params.getMaxBackReferenceLength(), lookahead); 529 final int minIndex = Math.max(0, currentPosition - params.getMaxOffset()); 530 final int niceBackReferenceLength = Math.min(maxPossibleLength, params.getNiceBackReferenceLength()); 531 final int maxCandidates = params.getMaxCandidates(); 532 for (int candidates = 0; candidates < maxCandidates && matchHead >= minIndex; candidates++) { 533 int currentLength = 0; 534 for (int i = 0; i < maxPossibleLength; i++) { 535 if (window[matchHead + i] != window[currentPosition + i]) { 536 break; 537 } 538 currentLength++; 539 } 540 if (currentLength > longestMatchLength) { 541 longestMatchLength = currentLength; 542 matchStart = matchHead; 543 if (currentLength >= niceBackReferenceLength) { 544 // no need to search any further 545 break; 546 } 547 } 548 matchHead = prev[matchHead & wMask]; 549 } 550 return longestMatchLength; // < minLength if no matches have been found, will be ignored in compress() 551 } 552}