View Javadoc
1   /*
2    * Copyright (C) 2011 The Guava Authors
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5    * in compliance with the License. You may obtain a copy of the License at
6    *
7    * http://www.apache.org/licenses/LICENSE-2.0
8    *
9    * Unless required by applicable law or agreed to in writing, software distributed under the License
10   * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11   * or implied. See the License for the specific language governing permissions and limitations under
12   * the License.
13   */
14  
15  /*
16   * MurmurHash3 was written by Austin Appleby, and is placed in the public
17   * domain. The author hereby disclaims copyright to this source code.
18   */
19  
20  /*
21   * Source:
22   * http://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp
23   * (Modified to adapt to Guava coding conventions and to use the HashFunction interface)
24   */
25  
26  package com.google.common.hash;
27  
28  import static com.google.common.base.Preconditions.checkPositionIndexes;
29  import static com.google.common.base.Preconditions.checkState;
30  import static com.google.common.primitives.UnsignedBytes.toInt;
31  
32  import com.google.common.base.Charsets;
33  import com.google.common.primitives.Chars;
34  import com.google.common.primitives.Ints;
35  import com.google.common.primitives.Longs;
36  import com.google.errorprone.annotations.CanIgnoreReturnValue;
37  import java.io.Serializable;
38  import java.nio.ByteBuffer;
39  import java.nio.ByteOrder;
40  import java.nio.charset.Charset;
41  import javax.annotation.Nullable;
42  
43  /**
44   * See MurmurHash3_x86_32 in <a
45   * href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp">the C++
46   * implementation</a>.
47   *
48   * @author Austin Appleby
49   * @author Dimitris Andreou
50   * @author Kurt Alfred Kluever
51   */
52  final class Murmur3_32HashFunction extends AbstractHashFunction implements Serializable {
53    static final HashFunction MURMUR3_32 = new Murmur3_32HashFunction(0);
54  
55    static final HashFunction GOOD_FAST_HASH_32 =
56        new Murmur3_32HashFunction(Hashing.GOOD_FAST_HASH_SEED);
57  
58    private static final int CHUNK_SIZE = 4;
59  
60    private static final int C1 = 0xcc9e2d51;
61    private static final int C2 = 0x1b873593;
62  
63    private final int seed;
64  
65    Murmur3_32HashFunction(int seed) {
66      this.seed = seed;
67    }
68  
69    @Override
70    public int bits() {
71      return 32;
72    }
73  
74    @Override
75    public Hasher newHasher() {
76      return new Murmur3_32Hasher(seed);
77    }
78  
79    @Override
80    public String toString() {
81      return "Hashing.murmur3_32(" + seed + ")";
82    }
83  
84    @Override
85    public boolean equals(@Nullable Object object) {
86      if (object instanceof Murmur3_32HashFunction) {
87        Murmur3_32HashFunction other = (Murmur3_32HashFunction) object;
88        return seed == other.seed;
89      }
90      return false;
91    }
92  
93    @Override
94    public int hashCode() {
95      return getClass().hashCode() ^ seed;
96    }
97  
98    @Override
99    public HashCode hashInt(int input) {
100     int k1 = mixK1(input);
101     int h1 = mixH1(seed, k1);
102 
103     return fmix(h1, Ints.BYTES);
104   }
105 
106   @Override
107   public HashCode hashLong(long input) {
108     int low = (int) input;
109     int high = (int) (input >>> 32);
110 
111     int k1 = mixK1(low);
112     int h1 = mixH1(seed, k1);
113 
114     k1 = mixK1(high);
115     h1 = mixH1(h1, k1);
116 
117     return fmix(h1, Longs.BYTES);
118   }
119 
120   @Override
121   public HashCode hashUnencodedChars(CharSequence input) {
122     int h1 = seed;
123 
124     // step through the CharSequence 2 chars at a time
125     for (int i = 1; i < input.length(); i += 2) {
126       int k1 = input.charAt(i - 1) | (input.charAt(i) << 16);
127       k1 = mixK1(k1);
128       h1 = mixH1(h1, k1);
129     }
130 
131     // deal with any remaining characters
132     if ((input.length() & 1) == 1) {
133       int k1 = input.charAt(input.length() - 1);
134       k1 = mixK1(k1);
135       h1 ^= k1;
136     }
137 
138     return fmix(h1, Chars.BYTES * input.length());
139   }
140 
141   @SuppressWarnings("deprecation") // need to use Charsets for Android tests to pass
142   @Override
143   public HashCode hashString(CharSequence input, Charset charset) {
144     if (Charsets.UTF_8.equals(charset)) {
145       int utf16Length = input.length();
146       int h1 = seed;
147       int i = 0;
148       int len = 0;
149 
150       // This loop optimizes for pure ASCII.
151       while (i + 4 <= utf16Length) {
152         char c0 = input.charAt(i);
153         char c1 = input.charAt(i + 1);
154         char c2 = input.charAt(i + 2);
155         char c3 = input.charAt(i + 3);
156         if (c0 < 0x80 && c1 < 0x80 && c2 < 0x80 && c3 < 0x80) {
157           int k1 = c0 | (c1 << 8) | (c2 << 16) | (c3 << 24);
158           k1 = mixK1(k1);
159           h1 = mixH1(h1, k1);
160           i += 4;
161           len += 4;
162         } else {
163           break;
164         }
165       }
166 
167       long buffer = 0;
168       int shift = 0;
169       for (; i < utf16Length; i++) {
170         char c = input.charAt(i);
171         if (c < 0x80) {
172           buffer |= (long) c << shift;
173           shift += 8;
174           len++;
175         } else if (c < 0x800) {
176           buffer |= charToTwoUtf8Bytes(c) << shift;
177           shift += 16;
178           len += 2;
179         } else if (c < Character.MIN_SURROGATE || c > Character.MAX_SURROGATE) {
180           buffer |= charToThreeUtf8Bytes(c) << shift;
181           shift += 24;
182           len += 3;
183         } else {
184           int codePoint = Character.codePointAt(input, i);
185           if (codePoint == c) {
186             // not a valid code point; let the JDK handle invalid Unicode
187             return hashBytes(input.toString().getBytes(charset));
188           }
189           i++;
190           buffer |= codePointToFourUtf8Bytes(codePoint) << shift;
191           len += 4;
192         }
193 
194         if (shift >= 32) {
195           int k1 = mixK1((int) buffer);
196           h1 = mixH1(h1, k1);
197           buffer = buffer >>> 32;
198           shift -= 32;
199         }
200       }
201 
202       int k1 = mixK1((int) buffer);
203       h1 ^= k1;
204       return fmix(h1, len);
205     } else {
206       return hashBytes(input.toString().getBytes(charset));
207     }
208   }
209   
210   @Override
211   public HashCode hashBytes(byte[] input, int off, int len) {
212     checkPositionIndexes(off, off + len, input.length);
213     int h1 = seed;
214     int i;
215     for (i = 0; i + CHUNK_SIZE <= len; i += CHUNK_SIZE) {
216       int k1 = mixK1(getIntLittleEndian(input, off + i));
217       h1 = mixH1(h1, k1);
218     }
219 
220     int k1 = 0;
221     for (int shift = 0; i < len; i++, shift += 8) {
222       k1 ^= toInt(input[off + i]) << shift;
223     }
224     h1 ^= mixK1(k1);
225     return fmix(h1, len);
226   }
227 
228   private static int getIntLittleEndian(byte[] input, int offset) {
229     return Ints.fromBytes(input[offset + 3], input[offset + 2], input[offset + 1], input[offset]);
230   }
231 
232   private static int mixK1(int k1) {
233     k1 *= C1;
234     k1 = Integer.rotateLeft(k1, 15);
235     k1 *= C2;
236     return k1;
237   }
238 
239   private static int mixH1(int h1, int k1) {
240     h1 ^= k1;
241     h1 = Integer.rotateLeft(h1, 13);
242     h1 = h1 * 5 + 0xe6546b64;
243     return h1;
244   }
245 
246   // Finalization mix - force all bits of a hash block to avalanche
247   private static HashCode fmix(int h1, int length) {
248     h1 ^= length;
249     h1 ^= h1 >>> 16;
250     h1 *= 0x85ebca6b;
251     h1 ^= h1 >>> 13;
252     h1 *= 0xc2b2ae35;
253     h1 ^= h1 >>> 16;
254     return HashCode.fromInt(h1);
255   }
256 
257   @CanIgnoreReturnValue
258   private static final class Murmur3_32Hasher extends AbstractHasher {
259     private int h1;
260     private long buffer;
261     private int shift;
262     private int length;
263     private boolean isDone;
264 
265     Murmur3_32Hasher(int seed) {
266       this.h1 = seed;
267       this.length = 0;
268       isDone = false;
269     }
270 
271     private void update(int nBytes, long update) {
272       // 1 <= nBytes <= 4
273       buffer |= (update & 0xFFFFFFFFL) << shift;
274       shift += nBytes * 8;
275       length += nBytes;
276 
277       if (shift >= 32) {
278         h1 = mixH1(h1, mixK1((int) buffer));
279         buffer >>>= 32;
280         shift -= 32;
281       }
282     }
283 
284     @Override
285     public Hasher putByte(byte b) {
286       update(1, b & 0xFF);
287       return this;
288     }
289 
290     @Override
291     public Hasher putBytes(byte[] bytes, int off, int len) {
292       checkPositionIndexes(off, off + len, bytes.length);
293       int i;
294       for (i = 0; i + 4 <= len; i += 4) {
295         update(4, getIntLittleEndian(bytes, off + i));
296       }
297       for (; i < len; i++) {
298         putByte(bytes[off + i]);
299       }
300       return this;
301     }
302 
303     @Override
304     public Hasher putBytes(ByteBuffer buffer) {
305       ByteOrder bo = buffer.order();
306       buffer.order(ByteOrder.LITTLE_ENDIAN);
307       while (buffer.remaining() >= 4) {
308         putInt(buffer.getInt());
309       }
310       while (buffer.hasRemaining()) {
311         putByte(buffer.get());
312       }
313       buffer.order(bo);
314       return this;
315     }
316 
317     @Override
318     public Hasher putInt(int i) {
319       update(4, i);
320       return this;
321     }
322 
323     @Override
324     public Hasher putLong(long l) {
325       update(4, (int) l);
326       update(4, l >>> 32);
327       return this;
328     }
329 
330     @Override
331     public Hasher putChar(char c) {
332       update(2, c);
333       return this;
334     }
335 
336     @SuppressWarnings("deprecation") // need to use Charsets for Android tests to pass
337     @Override
338     public Hasher putString(CharSequence input, Charset charset) {
339       if (Charsets.UTF_8.equals(charset)) {
340         int utf16Length = input.length();
341         int i = 0;
342 
343         // This loop optimizes for pure ASCII.
344         while (i + 4 <= utf16Length) {
345           char c0 = input.charAt(i);
346           char c1 = input.charAt(i + 1);
347           char c2 = input.charAt(i + 2);
348           char c3 = input.charAt(i + 3);
349           if (c0 < 0x80 && c1 < 0x80 && c2 < 0x80 && c3 < 0x80) {
350             update(4, c0 | (c1 << 8) | (c2 << 16) | (c3 << 24));
351             i += 4;
352           } else {
353             break;
354           }
355         }
356 
357         for (; i < utf16Length; i++) {
358           char c = input.charAt(i);
359           if (c < 0x80) {
360             update(1, c);
361           } else if (c < 0x800) {
362             update(2, charToTwoUtf8Bytes(c));
363           } else if (c < Character.MIN_SURROGATE || c > Character.MAX_SURROGATE) {
364             update(3, charToThreeUtf8Bytes(c));
365           } else {
366             int codePoint = Character.codePointAt(input, i);
367             if (codePoint == c) {
368               // fall back to JDK getBytes instead of trying to handle invalid surrogates ourselves
369               putBytes(input.subSequence(i, utf16Length).toString().getBytes(charset));
370               return this;
371             }
372             i++;
373             update(4, codePointToFourUtf8Bytes(codePoint));
374           }
375         }
376         return this;
377       } else {
378         return super.putString(input, charset);
379       }
380     }
381 
382     @Override
383     public HashCode hash() {
384       checkState(!isDone);
385       isDone = true;
386       h1 ^= mixK1((int) buffer);
387       return fmix(h1, length);
388     }
389   }
390 
391   private static long codePointToFourUtf8Bytes(int codePoint) {
392     return (((0xFL << 4) | (codePoint >>> 18)) & 0xFF)
393         | ((0x80L | (0x3F & (codePoint >>> 12))) << 8)
394         | ((0x80L | (0x3F & (codePoint >>> 6))) << 16)
395         | ((0x80L | (0x3F & codePoint)) << 24);
396   }
397 
398   private static long charToThreeUtf8Bytes(char c) {
399     return (((0xF << 5) | (c >>> 12)) & 0xFF)
400         | ((0x80 | (0x3F & (c >>> 6))) << 8)
401         | ((0x80 | (0x3F & c)) << 16);
402   }
403 
404   private static long charToTwoUtf8Bytes(char c) {
405     return (((0xF << 6) | (c >>> 6)) & 0xFF) | ((0x80 | (0x3F & c)) << 8);
406   }
407 
408   private static final long serialVersionUID = 0L;
409 }