001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.digest; 019 020import java.nio.charset.Charset; 021import java.nio.charset.StandardCharsets; 022 023import org.apache.commons.codec.binary.StringUtils; 024 025/** 026 * Implementation of the MurmurHash3 32-bit and 128-bit hash functions. 027 * 028 * <p> 029 * MurmurHash is a non-cryptographic hash function suitable for general hash-based lookup. The name comes from two basic 030 * operations, multiply (MU) and rotate (R), used in its inner loop. Unlike cryptographic hash functions, it is not 031 * specifically designed to be difficult to reverse by an adversary, making it unsuitable for cryptographic purposes. 032 * </p> 033 * 034 * <p> 035 * This contains a Java port of the 32-bit hash function {@code MurmurHash3_x86_32} and the 128-bit hash function 036 * {@code MurmurHash3_x64_128} from Austin Applyby's original {@code c++} code in SMHasher. 037 * </p> 038 * 039 * <p> 040 * This is public domain code with no copyrights. From home page of 041 * <a href="https://github.com/aappleby/smhasher">SMHasher</a>: 042 * </p> 043 * 044 * <blockquote> "All MurmurHash versions are public domain software, and the author disclaims all copyright to their 045 * code." </blockquote> 046 * 047 * <p> 048 * Original adaption from Apache Hive. That adaption contains a {@code hash64} method that is not part of the original 049 * MurmurHash3 code. It is not recommended to use these methods. They will be removed in a future release. To obtain a 050 * 64-bit hash use half of the bits from the {@code hash128x64} methods using the input data converted to bytes. 051 * <p> 052 * 053 * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> 054 * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp"> Original MurmurHash3 c++ 055 * code</a> 056 * @see <a href= 057 * "https://github.com/apache/hive/blob/master/storage-api/src/java/org/apache/hive/common/util/Murmur3.java"> 058 * Apache Hive Murmer3</a> 059 * @since 1.13 060 */ 061public final class MurmurHash3 { 062 063 /** 064 * A random number to use for a hash code. 065 * 066 * @deprecated This is not used internally and will be removed in a future release. 067 */ 068 @Deprecated 069 public static final long NULL_HASHCODE = 2862933555777941757L; 070 071 /** 072 * A default seed to use for the murmur hash algorithm. 073 * Has the value {@code 104729}. 074 */ 075 public static final int DEFAULT_SEED = 104729; 076 077 /** TODO Replace on Java 8 with Long.BYTES. */ 078 static final int LONG_BYTES = Long.SIZE / Byte.SIZE; 079 080 /** TODO Replace on Java 8 with Integer.BYTES. */ 081 static final int INTEGER_BYTES = Integer.SIZE / Byte.SIZE; 082 083 /** TODO Replace on Java 8 with Short.BYTES. */ 084 static final int SHORT_BYTES = Short.SIZE / Byte.SIZE; 085 086 // Constants for 32-bit variant 087 private static final int C1_32 = 0xcc9e2d51; 088 private static final int C2_32 = 0x1b873593; 089 private static final int R1_32 = 15; 090 private static final int R2_32 = 13; 091 private static final int M_32 = 5; 092 private static final int N_32 = 0xe6546b64; 093 094 // Constants for 128-bit variant 095 private static final long C1 = 0x87c37b91114253d5L; 096 private static final long C2 = 0x4cf5ad432745937fL; 097 private static final int R1 = 31; 098 private static final int R2 = 27; 099 private static final int R3 = 33; 100 private static final int M = 5; 101 private static final int N1 = 0x52dce729; 102 private static final int N2 = 0x38495ab5; 103 104 /** No instance methods. */ 105 private MurmurHash3() { 106 } 107 108 /** 109 * Generates 32-bit hash from two longs with a default seed value. 110 * This is a helper method that will produce the same result as: 111 * 112 * <pre> 113 * int offset = 0; 114 * int seed = 104729; 115 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16) 116 * .putLong(data1) 117 * .putLong(data2) 118 * .array(), offset, 16, seed); 119 * </pre> 120 * 121 * @param data1 The first long to hash 122 * @param data2 The second long to hash 123 * @return The 32-bit hash 124 * @see #hash32x86(byte[], int, int, int) 125 */ 126 public static int hash32(final long data1, final long data2) { 127 return hash32(data1, data2, DEFAULT_SEED); 128 } 129 130 /** 131 * Generates 32-bit hash from two longs with the given seed. 132 * This is a helper method that will produce the same result as: 133 * 134 * <pre> 135 * int offset = 0; 136 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(16) 137 * .putLong(data1) 138 * .putLong(data2) 139 * .array(), offset, 16, seed); 140 * </pre> 141 * 142 * @param data1 The first long to hash 143 * @param data2 The second long to hash 144 * @param seed The initial seed value 145 * @return The 32-bit hash 146 * @see #hash32x86(byte[], int, int, int) 147 */ 148 public static int hash32(final long data1, final long data2, final int seed) { 149 int hash = seed; 150 final long r0 = Long.reverseBytes(data1); 151 final long r1 = Long.reverseBytes(data2); 152 153 hash = mix32((int) r0, hash); 154 hash = mix32((int) (r0 >>> 32), hash); 155 hash = mix32((int) (r1), hash); 156 hash = mix32((int) (r1 >>> 32), hash); 157 158 hash ^= LONG_BYTES * 2; 159 return fmix32(hash); 160 } 161 162 /** 163 * Generates 32-bit hash from a long with a default seed value. 164 * This is a helper method that will produce the same result as: 165 * 166 * <pre> 167 * int offset = 0; 168 * int seed = 104729; 169 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8) 170 * .putLong(data) 171 * .array(), offset, 8, seed); 172 * </pre> 173 * 174 * @param data The long to hash 175 * @return The 32-bit hash 176 * @see #hash32x86(byte[], int, int, int) 177 */ 178 public static int hash32(final long data) { 179 return hash32(data, DEFAULT_SEED); 180 } 181 182 /** 183 * Generates 32-bit hash from a long with the given seed. 184 * This is a helper method that will produce the same result as: 185 * 186 * <pre> 187 * int offset = 0; 188 * int hash = MurmurHash3.hash32x86(ByteBuffer.allocate(8) 189 * .putLong(data) 190 * .array(), offset, 8, seed); 191 * </pre> 192 * 193 * @param data The long to hash 194 * @param seed The initial seed value 195 * @return The 32-bit hash 196 * @see #hash32x86(byte[], int, int, int) 197 */ 198 public static int hash32(final long data, final int seed) { 199 int hash = seed; 200 final long r0 = Long.reverseBytes(data); 201 202 hash = mix32((int) r0, hash); 203 hash = mix32((int) (r0 >>> 32), hash); 204 205 hash ^= LONG_BYTES; 206 return fmix32(hash); 207 } 208 209 /** 210 * Generates 32-bit hash from the byte array with a default seed. 211 * This is a helper method that will produce the same result as: 212 * 213 * <pre> 214 * int offset = 0; 215 * int seed = 104729; 216 * int hash = MurmurHash3.hash32(data, offset, data.length, seed); 217 * </pre> 218 * 219 * <p>This implementation contains a sign-extension bug in the finalization step of 220 * any bytes left over from dividing the length by 4. This manifests if any of these 221 * bytes are negative.<p> 222 * 223 * @param data The input byte array 224 * @return The 32-bit hash 225 * @see #hash32(byte[], int, int, int) 226 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 227 */ 228 @Deprecated 229 public static int hash32(final byte[] data) { 230 return hash32(data, 0, data.length, DEFAULT_SEED); 231 } 232 233 /** 234 * Generates 32-bit hash from a string with a default seed. 235 * <p> 236 * Before 1.14 the string was converted using default encoding. 237 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 238 * </p> 239 * This is a helper method that will produce the same result as: 240 * 241 * <pre> 242 * int offset = 0; 243 * int seed = 104729; 244 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 245 * int hash = MurmurHash3.hash32(bytes, offset, bytes.length, seed); 246 * </pre> 247 * 248 * <p>This implementation contains a sign-extension bug in the finalization step of 249 * any bytes left over from dividing the length by 4. This manifests if any of these 250 * bytes are negative.<p> 251 * 252 * @param data The input string 253 * @return The 32-bit hash 254 * @see #hash32(byte[], int, int, int) 255 * @deprecated Use {@link #hash32x86(byte[], int, int, int)} with the bytes returned from 256 * {@link String#getBytes(java.nio.charset.Charset)}. This corrects the processing of trailing bytes. 257 */ 258 @Deprecated 259 public static int hash32(final String data) { 260 final byte[] bytes = StringUtils.getBytesUtf8(data); 261 return hash32(bytes, 0, bytes.length, DEFAULT_SEED); 262 } 263 264 /** 265 * Generates 32-bit hash from the byte array with the given length and a default seed. 266 * This is a helper method that will produce the same result as: 267 * 268 * <pre> 269 * int offset = 0; 270 * int seed = 104729; 271 * int hash = MurmurHash3.hash32(data, offset, length, seed); 272 * </pre> 273 * 274 * <p>This implementation contains a sign-extension bug in the finalization step of 275 * any bytes left over from dividing the length by 4. This manifests if any of these 276 * bytes are negative.<p> 277 * 278 * @param data The input byte array 279 * @param length The length of array 280 * @return The 32-bit hash 281 * @see #hash32(byte[], int, int, int) 282 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 283 */ 284 @Deprecated 285 public static int hash32(final byte[] data, final int length) { 286 return hash32(data, length, DEFAULT_SEED); 287 } 288 289 /** 290 * Generates 32-bit hash from the byte array with the given length and seed. This is a 291 * helper method that will produce the same result as: 292 * 293 * <pre> 294 * int offset = 0; 295 * int hash = MurmurHash3.hash32(data, offset, length, seed); 296 * </pre> 297 * 298 * <p>This implementation contains a sign-extension bug in the finalization step of 299 * any bytes left over from dividing the length by 4. This manifests if any of these 300 * bytes are negative.<p> 301 * 302 * @param data The input byte array 303 * @param length The length of array 304 * @param seed The initial seed value 305 * @return The 32-bit hash 306 * @see #hash32(byte[], int, int, int) 307 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 308 */ 309 @Deprecated 310 public static int hash32(final byte[] data, final int length, final int seed) { 311 return hash32(data, 0, length, seed); 312 } 313 314 /** 315 * Generates 32-bit hash from the byte array with the given offset, length and seed. 316 * 317 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 318 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 319 * 320 * <p>This implementation contains a sign-extension bug in the finalization step of 321 * any bytes left over from dividing the length by 4. This manifests if any of these 322 * bytes are negative.<p> 323 * 324 * @param data The input byte array 325 * @param offset The offset of data 326 * @param length The length of array 327 * @param seed The initial seed value 328 * @return The 32-bit hash 329 * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. 330 */ 331 @Deprecated 332 public static int hash32(final byte[] data, final int offset, final int length, final int seed) { 333 int hash = seed; 334 final int nblocks = length >> 2; 335 336 // body 337 for (int i = 0; i < nblocks; i++) { 338 final int index = offset + (i << 2); 339 final int k = getLittleEndianInt(data, index); 340 hash = mix32(k, hash); 341 } 342 343 // tail 344 // ************ 345 // Note: This fails to apply masking using 0xff to the 3 remaining bytes. 346 // ************ 347 final int index = offset + (nblocks << 2); 348 int k1 = 0; 349 switch (offset + length - index) { 350 case 3: 351 k1 ^= data[index + 2] << 16; 352 case 2: 353 k1 ^= data[index + 1] << 8; 354 case 1: 355 k1 ^= data[index]; 356 357 // mix functions 358 k1 *= C1_32; 359 k1 = Integer.rotateLeft(k1, R1_32); 360 k1 *= C2_32; 361 hash ^= k1; 362 } 363 364 hash ^= length; 365 return fmix32(hash); 366 } 367 368 /** 369 * Generates 32-bit hash from the byte array with a seed of zero. 370 * This is a helper method that will produce the same result as: 371 * 372 * <pre> 373 * int offset = 0; 374 * int seed = 0; 375 * int hash = MurmurHash3.hash32x86(data, offset, data.length, seed); 376 * </pre> 377 * 378 * @param data The input byte array 379 * @return The 32-bit hash 380 * @see #hash32x86(byte[], int, int, int) 381 * @since 1.14 382 */ 383 public static int hash32x86(final byte[] data) { 384 return hash32x86(data, 0, data.length, 0); 385 } 386 387 /** 388 * Generates 32-bit hash from the byte array with the given offset, length and seed. 389 * 390 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 391 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 392 * 393 * @param data The input byte array 394 * @param offset The offset of data 395 * @param length The length of array 396 * @param seed The initial seed value 397 * @return The 32-bit hash 398 * @since 1.14 399 */ 400 public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) { 401 int hash = seed; 402 final int nblocks = length >> 2; 403 404 // body 405 for (int i = 0; i < nblocks; i++) { 406 final int index = offset + (i << 2); 407 final int k = getLittleEndianInt(data, index); 408 hash = mix32(k, hash); 409 } 410 411 // tail 412 final int index = offset + (nblocks << 2); 413 int k1 = 0; 414 switch (offset + length - index) { 415 case 3: 416 k1 ^= (data[index + 2] & 0xff) << 16; 417 case 2: 418 k1 ^= (data[index + 1] & 0xff) << 8; 419 case 1: 420 k1 ^= (data[index] & 0xff); 421 422 // mix functions 423 k1 *= C1_32; 424 k1 = Integer.rotateLeft(k1, R1_32); 425 k1 *= C2_32; 426 hash ^= k1; 427 } 428 429 hash ^= length; 430 return fmix32(hash); 431 } 432 433 /** 434 * Generates 64-bit hash from a long with a default seed. 435 * 436 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 437 * 438 * <p>This is a Murmur3-like 64-bit variant. 439 * The method does not produce the same result as either half of the hash bytes from 440 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code long}. 441 * This method will be removed in a future release.</p> 442 * 443 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 444 * this result as the default seed is positive.</p> 445 * 446 * <p>This is a helper method that will produce the same result as:</p> 447 * 448 * <pre> 449 * int offset = 0; 450 * int seed = 104729; 451 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(8) 452 * .putLong(data) 453 * .array(), offset, 8, seed); 454 * </pre> 455 * 456 * @param data The long to hash 457 * @return The 64-bit hash 458 * @see #hash64(byte[], int, int, int) 459 * @deprecated Not part of the MurmurHash3 implementation. 460 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code long}. 461 */ 462 @Deprecated 463 public static long hash64(final long data) { 464 long hash = DEFAULT_SEED; 465 long k = Long.reverseBytes(data); 466 final int length = LONG_BYTES; 467 // mix functions 468 k *= C1; 469 k = Long.rotateLeft(k, R1); 470 k *= C2; 471 hash ^= k; 472 hash = Long.rotateLeft(hash, R2) * M + N1; 473 // finalization 474 hash ^= length; 475 hash = fmix64(hash); 476 return hash; 477 } 478 479 /** 480 * Generates 64-bit hash from an int with a default seed. 481 * 482 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 483 * 484 * <p>This is a Murmur3-like 64-bit variant. 485 * The method does not produce the same result as either half of the hash bytes from 486 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code int}. 487 * This method will be removed in a future release.</p> 488 * 489 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 490 * this result as the default seed is positive.</p> 491 * 492 * <p>This is a helper method that will produce the same result as:</p> 493 * 494 * <pre> 495 * int offset = 0; 496 * int seed = 104729; 497 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(4) 498 * .putInt(data) 499 * .array(), offset, 4, seed); 500 * </pre> 501 * 502 * @param data The int to hash 503 * @return The 64-bit hash 504 * @see #hash64(byte[], int, int, int) 505 * @deprecated Not part of the MurmurHash3 implementation. 506 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code int}. 507 */ 508 @Deprecated 509 public static long hash64(final int data) { 510 long k1 = Integer.reverseBytes(data) & (-1L >>> 32); 511 final int length = INTEGER_BYTES; 512 long hash = DEFAULT_SEED; 513 k1 *= C1; 514 k1 = Long.rotateLeft(k1, R1); 515 k1 *= C2; 516 hash ^= k1; 517 // finalization 518 hash ^= length; 519 hash = fmix64(hash); 520 return hash; 521 } 522 523 /** 524 * Generates 64-bit hash from a short with a default seed. 525 * 526 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 527 * 528 * <p>This is a Murmur3-like 64-bit variant. 529 * The method does not produce the same result as either half of the hash bytes from 530 * {@linkplain #hash128x64(byte[])} with the same byte data from the {@code short}. 531 * This method will be removed in a future release.</p> 532 * 533 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 534 * this result as the default seed is positive.</p> 535 * 536 * <p>This is a helper method that will produce the same result as:</p> 537 * 538 * <pre> 539 * int offset = 0; 540 * int seed = 104729; 541 * long hash = MurmurHash3.hash64(ByteBuffer.allocate(2) 542 * .putShort(data) 543 * .array(), offset, 2, seed); 544 * </pre> 545 * 546 * @param data The short to hash 547 * @return The 64-bit hash 548 * @see #hash64(byte[], int, int, int) 549 * @deprecated Not part of the MurmurHash3 implementation. 550 * Use half of the hash bytes from {@link #hash128x64(byte[])} with the bytes from the {@code short}. 551 */ 552 @Deprecated 553 public static long hash64(final short data) { 554 long hash = DEFAULT_SEED; 555 long k1 = 0; 556 k1 ^= ((long) data & 0xff) << 8; 557 k1 ^= ((long) ((data & 0xFF00) >> 8) & 0xff); 558 k1 *= C1; 559 k1 = Long.rotateLeft(k1, R1); 560 k1 *= C2; 561 hash ^= k1; 562 563 // finalization 564 hash ^= SHORT_BYTES; 565 hash = fmix64(hash); 566 return hash; 567 } 568 569 /** 570 * Generates 64-bit hash from a byte array with a default seed. 571 * 572 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 573 * 574 * <p>This is a Murmur3-like 64-bit variant. 575 * The method does not produce the same result as either half of the hash bytes from 576 * {@linkplain #hash128x64(byte[])} with the same byte data. 577 * This method will be removed in a future release.</p> 578 * 579 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 580 * this result as the default seed is positive.</p> 581 * 582 * <p>This is a helper method that will produce the same result as:</p> 583 * 584 * <pre> 585 * int offset = 0; 586 * int seed = 104729; 587 * long hash = MurmurHash3.hash64(data, offset, data.length, seed); 588 * </pre> 589 * 590 * @param data The input byte array 591 * @return The 64-bit hash 592 * @see #hash64(byte[], int, int, int) 593 * @deprecated Not part of the MurmurHash3 implementation. 594 * Use half of the hash bytes from {@link #hash128x64(byte[])}. 595 */ 596 @Deprecated 597 public static long hash64(final byte[] data) { 598 return hash64(data, 0, data.length, DEFAULT_SEED); 599 } 600 601 /** 602 * Generates 64-bit hash from a byte array with the given offset and length and a default seed. 603 * 604 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 605 * 606 * <p>This is a Murmur3-like 64-bit variant. 607 * The method does not produce the same result as either half of the hash bytes from 608 * {@linkplain #hash128x64(byte[])} with the same byte data. 609 * This method will be removed in a future release.</p> 610 * 611 * <p>Note: The sign extension bug in {@link #hash64(byte[], int, int, int)} does not effect 612 * this result as the default seed is positive.</p> 613 * 614 * <p>This is a helper method that will produce the same result as:</p> 615 * 616 * <pre> 617 * int seed = 104729; 618 * long hash = MurmurHash3.hash64(data, offset, length, seed); 619 * </pre> 620 * 621 * @param data The input byte array 622 * @param offset The offset of data 623 * @param length The length of array 624 * @return The 64-bit hash 625 * @see #hash64(byte[], int, int, int) 626 * @deprecated Not part of the MurmurHash3 implementation. 627 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}. 628 */ 629 @Deprecated 630 public static long hash64(final byte[] data, final int offset, final int length) { 631 return hash64(data, offset, length, DEFAULT_SEED); 632 } 633 634 /** 635 * Generates 64-bit hash from a byte array with the given offset, length and seed. 636 * 637 * <p><strong>This is not part of the original MurmurHash3 {@code c++} implementation.</strong></p> 638 * 639 * <p>This is a Murmur3-like 64-bit variant. 640 * This method will be removed in a future release.</p> 641 * 642 * <p>This implementation contains a sign-extension bug in the seed initialization. 643 * This manifests if the seed is negative.</p> 644 * 645 * <p>This algorithm processes 8 bytes chunks of data in a manner similar to the 16 byte chunks 646 * of data processed in the MurmurHash3 {@code MurmurHash3_x64_128} method. However the hash 647 * is not mixed with a hash chunk from the next 8 bytes of data. The method will not return 648 * the same value as the first or second 64-bits of the function 649 * {@link #hash128(byte[], int, int, int)}.</p> 650 * 651 * <p>Use of this method is not advised. Use the first long returned from 652 * {@link #hash128x64(byte[], int, int, int)}.<p> 653 * 654 * @param data The input byte array 655 * @param offset The offset of data 656 * @param length The length of array 657 * @param seed The initial seed value 658 * @return The 64-bit hash 659 * @deprecated Not part of the MurmurHash3 implementation. 660 * Use half of the hash bytes from {@link #hash128x64(byte[], int, int, int)}. 661 */ 662 @Deprecated 663 public static long hash64(final byte[] data, final int offset, final int length, final int seed) { 664 // ************ 665 // Note: This fails to apply masking using 0xffffffffL to the seed. 666 // ************ 667 long hash = seed; 668 final int nblocks = length >> 3; 669 670 // body 671 for (int i = 0; i < nblocks; i++) { 672 final int index = offset + (i << 3); 673 long k = getLittleEndianLong(data, index); 674 675 // mix functions 676 k *= C1; 677 k = Long.rotateLeft(k, R1); 678 k *= C2; 679 hash ^= k; 680 hash = Long.rotateLeft(hash, R2) * M + N1; 681 } 682 683 // tail 684 long k1 = 0; 685 final int index = offset + (nblocks << 3); 686 switch (offset + length - index) { 687 case 7: 688 k1 ^= ((long) data[index + 6] & 0xff) << 48; 689 case 6: 690 k1 ^= ((long) data[index + 5] & 0xff) << 40; 691 case 5: 692 k1 ^= ((long) data[index + 4] & 0xff) << 32; 693 case 4: 694 k1 ^= ((long) data[index + 3] & 0xff) << 24; 695 case 3: 696 k1 ^= ((long) data[index + 2] & 0xff) << 16; 697 case 2: 698 k1 ^= ((long) data[index + 1] & 0xff) << 8; 699 case 1: 700 k1 ^= ((long) data[index] & 0xff); 701 k1 *= C1; 702 k1 = Long.rotateLeft(k1, R1); 703 k1 *= C2; 704 hash ^= k1; 705 } 706 707 // finalization 708 hash ^= length; 709 hash = fmix64(hash); 710 711 return hash; 712 } 713 714 /** 715 * Generates 128-bit hash from the byte array with a default seed. 716 * This is a helper method that will produce the same result as: 717 * 718 * <pre> 719 * int offset = 0; 720 * int seed = 104729; 721 * int hash = MurmurHash3.hash128(data, offset, data.length, seed); 722 * </pre> 723 * 724 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect 725 * this result as the default seed is positive.</p> 726 * 727 * @param data The input byte array 728 * @return The 128-bit hash (2 longs) 729 * @see #hash128(byte[], int, int, int) 730 */ 731 public static long[] hash128(final byte[] data) { 732 return hash128(data, 0, data.length, DEFAULT_SEED); 733 } 734 735 /** 736 * Generates 128-bit hash from the byte array with a seed of zero. 737 * This is a helper method that will produce the same result as: 738 * 739 * <pre> 740 * int offset = 0; 741 * int seed = 0; 742 * int hash = MurmurHash3.hash128x64(data, offset, data.length, seed); 743 * </pre> 744 * 745 * @param data The input byte array 746 * @return The 128-bit hash (2 longs) 747 * @see #hash128x64(byte[], int, int, int) 748 * @since 1.14 749 */ 750 public static long[] hash128x64(final byte[] data) { 751 return hash128x64(data, 0, data.length, 0); 752 } 753 754 /** 755 * Generates 128-bit hash from a string with a default seed. 756 * <p> 757 * Before 1.14 the string was converted using default encoding. 758 * Since 1.14 the string is converted to bytes using UTF-8 encoding. 759 * </p> 760 * This is a helper method that will produce the same result as: 761 * 762 * <pre> 763 * int offset = 0; 764 * int seed = 104729; 765 * byte[] bytes = data.getBytes(StandardCharsets.UTF_8); 766 * int hash = MurmurHash3.hash128(bytes, offset, bytes.length, seed); 767 * </pre> 768 * 769 * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect 770 * this result as the default seed is positive.</p> 771 * 772 * @param data The input String 773 * @return The 128-bit hash (2 longs) 774 * @see #hash128(byte[], int, int, int) 775 * @deprecated Use {@link #hash128x64(byte[])} using the bytes returned from 776 * {@link String#getBytes(java.nio.charset.Charset)}. 777 */ 778 @Deprecated 779 public static long[] hash128(final String data) { 780 final byte[] bytes = StringUtils.getBytesUtf8(data); 781 return hash128(bytes, 0, bytes.length, DEFAULT_SEED); 782 } 783 784 /** 785 * Generates 128-bit hash from the byte array with the given offset, length and seed. 786 * 787 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 788 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 789 * 790 * <p>This implementation contains a sign-extension bug in the seed initialization. 791 * This manifests if the seed is negative.<p> 792 * 793 * @param data The input byte array 794 * @param offset The first element of array 795 * @param length The length of array 796 * @param seed The initial seed value 797 * @return The 128-bit hash (2 longs) 798 * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialization. 799 */ 800 @Deprecated 801 public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) { 802 // ************ 803 // Note: This fails to apply masking using 0xffffffffL to the seed. 804 // ************ 805 return hash128x64(data, offset, length, seed); 806 } 807 808 /** 809 * Generates 128-bit hash from the byte array with the given offset, length and seed. 810 * 811 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 812 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 813 * 814 * @param data The input byte array 815 * @param offset The first element of array 816 * @param length The length of array 817 * @param seed The initial seed value 818 * @return The 128-bit hash (2 longs) 819 * @since 1.14 820 */ 821 public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) { 822 // Use an unsigned 32-bit integer as the seed 823 return hash128x64(data, offset, length, seed & 0xffffffffL); 824 } 825 826 /** 827 * Generates 128-bit hash from the byte array with the given offset, length and seed. 828 * 829 * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} 830 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 831 * 832 * @param data The input byte array 833 * @param offset The first element of array 834 * @param length The length of array 835 * @param seed The initial seed value 836 * @return The 128-bit hash (2 longs) 837 */ 838 private static long[] hash128x64(final byte[] data, final int offset, final int length, final long seed) { 839 long h1 = seed; 840 long h2 = seed; 841 final int nblocks = length >> 4; 842 843 // body 844 for (int i = 0; i < nblocks; i++) { 845 final int index = offset + (i << 4); 846 long k1 = getLittleEndianLong(data, index); 847 long k2 = getLittleEndianLong(data, index + 8); 848 849 // mix functions for k1 850 k1 *= C1; 851 k1 = Long.rotateLeft(k1, R1); 852 k1 *= C2; 853 h1 ^= k1; 854 h1 = Long.rotateLeft(h1, R2); 855 h1 += h2; 856 h1 = h1 * M + N1; 857 858 // mix functions for k2 859 k2 *= C2; 860 k2 = Long.rotateLeft(k2, R3); 861 k2 *= C1; 862 h2 ^= k2; 863 h2 = Long.rotateLeft(h2, R1); 864 h2 += h1; 865 h2 = h2 * M + N2; 866 } 867 868 // tail 869 long k1 = 0; 870 long k2 = 0; 871 final int index = offset + (nblocks << 4); 872 switch (offset + length - index) { 873 case 15: 874 k2 ^= ((long) data[index + 14] & 0xff) << 48; 875 case 14: 876 k2 ^= ((long) data[index + 13] & 0xff) << 40; 877 case 13: 878 k2 ^= ((long) data[index + 12] & 0xff) << 32; 879 case 12: 880 k2 ^= ((long) data[index + 11] & 0xff) << 24; 881 case 11: 882 k2 ^= ((long) data[index + 10] & 0xff) << 16; 883 case 10: 884 k2 ^= ((long) data[index + 9] & 0xff) << 8; 885 case 9: 886 k2 ^= data[index + 8] & 0xff; 887 k2 *= C2; 888 k2 = Long.rotateLeft(k2, R3); 889 k2 *= C1; 890 h2 ^= k2; 891 892 case 8: 893 k1 ^= ((long) data[index + 7] & 0xff) << 56; 894 case 7: 895 k1 ^= ((long) data[index + 6] & 0xff) << 48; 896 case 6: 897 k1 ^= ((long) data[index + 5] & 0xff) << 40; 898 case 5: 899 k1 ^= ((long) data[index + 4] & 0xff) << 32; 900 case 4: 901 k1 ^= ((long) data[index + 3] & 0xff) << 24; 902 case 3: 903 k1 ^= ((long) data[index + 2] & 0xff) << 16; 904 case 2: 905 k1 ^= ((long) data[index + 1] & 0xff) << 8; 906 case 1: 907 k1 ^= data[index] & 0xff; 908 k1 *= C1; 909 k1 = Long.rotateLeft(k1, R1); 910 k1 *= C2; 911 h1 ^= k1; 912 } 913 914 // finalization 915 h1 ^= length; 916 h2 ^= length; 917 918 h1 += h2; 919 h2 += h1; 920 921 h1 = fmix64(h1); 922 h2 = fmix64(h2); 923 924 h1 += h2; 925 h2 += h1; 926 927 return new long[] { h1, h2 }; 928 } 929 930 /** 931 * Gets the little-endian long from 8 bytes starting at the specified index. 932 * 933 * @param data The data 934 * @param index The index 935 * @return The little-endian long 936 */ 937 private static long getLittleEndianLong(final byte[] data, final int index) { 938 return (((long) data[index ] & 0xff) ) | 939 (((long) data[index + 1] & 0xff) << 8) | 940 (((long) data[index + 2] & 0xff) << 16) | 941 (((long) data[index + 3] & 0xff) << 24) | 942 (((long) data[index + 4] & 0xff) << 32) | 943 (((long) data[index + 5] & 0xff) << 40) | 944 (((long) data[index + 6] & 0xff) << 48) | 945 (((long) data[index + 7] & 0xff) << 56); 946 } 947 948 /** 949 * Gets the little-endian int from 4 bytes starting at the specified index. 950 * 951 * @param data The data 952 * @param index The index 953 * @return The little-endian int 954 */ 955 private static int getLittleEndianInt(final byte[] data, final int index) { 956 return ((data[index ] & 0xff) ) | 957 ((data[index + 1] & 0xff) << 8) | 958 ((data[index + 2] & 0xff) << 16) | 959 ((data[index + 3] & 0xff) << 24); 960 } 961 962 /** 963 * Performs the intermediate mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. 964 * 965 * @param k The data to add to the hash 966 * @param hash The current hash 967 * @return The new hash 968 */ 969 private static int mix32(int k, int hash) { 970 k *= C1_32; 971 k = Integer.rotateLeft(k, R1_32); 972 k *= C2_32; 973 hash ^= k; 974 return Integer.rotateLeft(hash, R2_32) * M_32 + N_32; 975 } 976 977 /** 978 * Performs the final avalanche mix step of the 32-bit hash function {@code MurmurHash3_x86_32}. 979 * 980 * @param hash The current hash 981 * @return The final hash 982 */ 983 private static int fmix32(int hash) { 984 hash ^= (hash >>> 16); 985 hash *= 0x85ebca6b; 986 hash ^= (hash >>> 13); 987 hash *= 0xc2b2ae35; 988 hash ^= (hash >>> 16); 989 return hash; 990 } 991 992 /** 993 * Performs the final avalanche mix step of the 64-bit hash function {@code MurmurHash3_x64_128}. 994 * 995 * @param hash The current hash 996 * @return The final hash 997 */ 998 private static long fmix64(long hash) { 999 hash ^= (hash >>> 33); 1000 hash *= 0xff51afd7ed558ccdL; 1001 hash ^= (hash >>> 33); 1002 hash *= 0xc4ceb9fe1a85ec53L; 1003 hash ^= (hash >>> 33); 1004 return hash; 1005 } 1006 1007 /** 1008 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new 1009 * hash computed. 1010 * 1011 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 1012 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 1013 * 1014 * @since 1.14 1015 */ 1016 public static class IncrementalHash32x86 { 1017 1018 /** The size of byte blocks that are processed together. */ 1019 private static final int BLOCK_SIZE = 4; 1020 1021 /** Up to 3 unprocessed bytes from input data. */ 1022 private final byte[] unprocessed = new byte[3]; 1023 1024 /** The number of unprocessed bytes in the tail data. */ 1025 private int unprocessedLength; 1026 1027 /** The total number of input bytes added since the start. */ 1028 private int totalLen; 1029 1030 /** 1031 * The current running hash. 1032 * This must be finalised to generate the 32-bit hash value. 1033 */ 1034 private int hash; 1035 1036 /** 1037 * Starts a new incremental hash. 1038 * 1039 * @param seed The initial seed value 1040 */ 1041 public final void start(final int seed) { 1042 // Reset 1043 unprocessedLength = totalLen = 0; 1044 this.hash = seed; 1045 } 1046 1047 /** 1048 * Adds the byte array to the current incremental hash. 1049 * 1050 * @param data The input byte array 1051 * @param offset The offset of data 1052 * @param length The length of array 1053 */ 1054 public final void add(final byte[] data, final int offset, final int length) { 1055 if (length <= 0) { 1056 // Nothing to add 1057 return; 1058 } 1059 totalLen += length; 1060 1061 // Process the bytes in blocks of 4. 1062 // New bytes must be added to any current unprocessed bytes, 1063 // then processed in blocks of 4 and the remaining bytes saved: 1064 // 1065 // |--|---------------------------|--| 1066 // unprocessed 1067 // main block 1068 // remaining 1069 1070 // Check if the unprocessed bytes and new bytes can fill a block of 4. 1071 // Make this overflow safe in the event that length is Integer.MAX_VALUE. 1072 // Equivalent to: (unprocessedLength + length < BLOCK_SIZE) 1073 if (unprocessedLength + length - BLOCK_SIZE < 0) { 1074 // Not enough so add to the unprocessed bytes 1075 System.arraycopy(data, offset, unprocessed, unprocessedLength, length); 1076 unprocessedLength += length; 1077 return; 1078 } 1079 1080 // Combine unprocessed bytes with new bytes. 1081 int newOffset; 1082 int newLength; 1083 if (unprocessedLength > 0) { 1084 int k = -1; 1085 switch (unprocessedLength) { 1086 case 1: 1087 k = orBytes(unprocessed[0], data[offset], data[offset + 1], data[offset + 2]); 1088 break; 1089 case 2: 1090 k = orBytes(unprocessed[0], unprocessed[1], data[offset], data[offset + 1]); 1091 break; 1092 case 3: 1093 k = orBytes(unprocessed[0], unprocessed[1], unprocessed[2], data[offset]); 1094 break; 1095 default: 1096 throw new IllegalStateException("Unprocessed length should be 1, 2, or 3: " + unprocessedLength); 1097 } 1098 hash = mix32(k, hash); 1099 // Update the offset and length 1100 final int consumed = BLOCK_SIZE - unprocessedLength; 1101 newOffset = offset + consumed; 1102 newLength = length - consumed; 1103 } else { 1104 newOffset = offset; 1105 newLength = length; 1106 } 1107 1108 // Main processing of blocks of 4 bytes 1109 final int nblocks = newLength >> 2; 1110 1111 for (int i = 0; i < nblocks; i++) { 1112 final int index = newOffset + (i << 2); 1113 final int k = getLittleEndianInt(data, index); 1114 hash = mix32(k, hash); 1115 } 1116 1117 // Save left-over unprocessed bytes 1118 final int consumed = (nblocks << 2); 1119 unprocessedLength = newLength - consumed; 1120 if (unprocessedLength != 0) { 1121 System.arraycopy(data, newOffset + consumed, unprocessed, 0, unprocessedLength); 1122 } 1123 } 1124 1125 /** 1126 * Generate the 32-bit hash value. Repeat calls to this method with no additional data 1127 * will generate the same hash value. 1128 * 1129 * @return The 32-bit hash 1130 */ 1131 public final int end() { 1132 // Allow calling end() again after adding no data to return the same result. 1133 return finalise(hash, unprocessedLength, unprocessed, totalLen); 1134 } 1135 1136 /** 1137 * Finalize the running hash to the output 32-bit hash by processing remaining bytes 1138 * and performing final mixing. 1139 * 1140 * @param hash The running hash 1141 * @param unprocessedLength The number of unprocessed bytes in the tail data. 1142 * @param unprocessed Up to 3 unprocessed bytes from input data. 1143 * @param totalLen The total number of input bytes added since the start. 1144 * @return The 32-bit hash 1145 */ 1146 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) { 1147 int result = hash; 1148 int k1 = 0; 1149 switch (unprocessedLength) { 1150 case 3: 1151 k1 ^= (unprocessed[2] & 0xff) << 16; 1152 case 2: 1153 k1 ^= (unprocessed[1] & 0xff) << 8; 1154 case 1: 1155 k1 ^= (unprocessed[0] & 0xff); 1156 1157 // mix functions 1158 k1 *= C1_32; 1159 k1 = Integer.rotateLeft(k1, R1_32); 1160 k1 *= C2_32; 1161 result ^= k1; 1162 } 1163 1164 // finalization 1165 result ^= totalLen; 1166 return fmix32(result); 1167 } 1168 1169 /** 1170 * Combines the bytes using an Or operation ({@code | } in a little-endian representation 1171 * of a 32-bit integer; byte 1 will be the least significant byte, byte 4 the most 1172 * significant. 1173 * 1174 * @param b1 The first byte 1175 * @param b2 The second byte 1176 * @param b3 The third byte 1177 * @param b4 The fourth byte 1178 * @return The 32-bit integer 1179 */ 1180 private static int orBytes(final byte b1, final byte b2, final byte b3, final byte b4) { 1181 return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24); 1182 } 1183 } 1184 1185 /** 1186 * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new 1187 * hash computed. 1188 * 1189 * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} 1190 * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> 1191 * 1192 * <p>This implementation contains a sign-extension bug in the finalization step of 1193 * any bytes left over from dividing the length by 4. This manifests if any of these 1194 * bytes are negative.<p> 1195 * 1196 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. 1197 */ 1198 @Deprecated 1199 public static class IncrementalHash32 extends IncrementalHash32x86 { 1200 1201 /** 1202 * {@inheritDoc} 1203 * 1204 * <p>This implementation contains a sign-extension bug in the finalization step of 1205 * any bytes left over from dividing the length by 4. This manifests if any of these 1206 * bytes are negative.<p> 1207 * 1208 * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. 1209 */ 1210 @Override 1211 @Deprecated 1212 int finalise(final int hash, final int unprocessedLength, final byte[] unprocessed, final int totalLen) { 1213 int result = hash; 1214 // ************ 1215 // Note: This fails to apply masking using 0xff to the 3 remaining bytes. 1216 // ************ 1217 int k1 = 0; 1218 switch (unprocessedLength) { 1219 case 3: 1220 k1 ^= unprocessed[2] << 16; 1221 case 2: 1222 k1 ^= unprocessed[1] << 8; 1223 case 1: 1224 k1 ^= unprocessed[0]; 1225 1226 // mix functions 1227 k1 *= C1_32; 1228 k1 = Integer.rotateLeft(k1, R1_32); 1229 k1 *= C2_32; 1230 result ^= k1; 1231 } 1232 1233 // finalization 1234 result ^= totalLen; 1235 return fmix32(result); 1236 } 1237 } 1238}