Source file src/hash/maphash/maphash.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package maphash provides hash functions on byte sequences.
     6  // These hash functions are intended to be used to implement hash tables or
     7  // other data structures that need to map arbitrary strings or byte
     8  // sequences to a uniform distribution on unsigned 64-bit integers.
     9  // Each different instance of a hash table or data structure should use its own Seed.
    10  //
    11  // The hash functions are not cryptographically secure.
    12  // (See crypto/sha256 and crypto/sha512 for cryptographic use.)
    13  package maphash
    14  
    15  import (
    16  	"unsafe"
    17  )
    18  
    19  // A Seed is a random value that selects the specific hash function
    20  // computed by a Hash. If two Hashes use the same Seeds, they
    21  // will compute the same hash values for any given input.
    22  // If two Hashes use different Seeds, they are very likely to compute
    23  // distinct hash values for any given input.
    24  //
    25  // A Seed must be initialized by calling MakeSeed.
    26  // The zero seed is uninitialized and not valid for use with Hash's SetSeed method.
    27  //
    28  // Each Seed value is local to a single process and cannot be serialized
    29  // or otherwise recreated in a different process.
    30  type Seed struct {
    31  	s uint64
    32  }
    33  
    34  // Bytes returns the hash of b with the given seed.
    35  //
    36  // Bytes is equivalent to, but more convenient and efficient than:
    37  //
    38  //	var h Hash
    39  //	h.SetSeed(seed)
    40  //	h.Write(b)
    41  //	return h.Sum64()
    42  func Bytes(seed Seed, b []byte) uint64 {
    43  	state := seed.s
    44  	if state == 0 {
    45  		panic("maphash: use of uninitialized Seed")
    46  	}
    47  	if len(b) == 0 {
    48  		return rthash(nil, 0, state) // avoid &b[0] index panic below
    49  	}
    50  	if len(b) > bufSize {
    51  		b = b[:len(b):len(b)] // merge len and cap calculations when reslicing
    52  		for len(b) > bufSize {
    53  			state = rthash(&b[0], bufSize, state)
    54  			b = b[bufSize:]
    55  		}
    56  	}
    57  	return rthash(&b[0], len(b), state)
    58  }
    59  
    60  // String returns the hash of s with the given seed.
    61  //
    62  // String is equivalent to, but more convenient and efficient than:
    63  //
    64  //	var h Hash
    65  //	h.SetSeed(seed)
    66  //	h.WriteString(s)
    67  //	return h.Sum64()
    68  func String(seed Seed, s string) uint64 {
    69  	state := seed.s
    70  	if state == 0 {
    71  		panic("maphash: use of uninitialized Seed")
    72  	}
    73  	for len(s) > bufSize {
    74  		p := (*byte)(unsafe.StringData(s))
    75  		state = rthash(p, bufSize, state)
    76  		s = s[bufSize:]
    77  	}
    78  	p := (*byte)(unsafe.StringData(s))
    79  	return rthash(p, len(s), state)
    80  }
    81  
    82  // A Hash computes a seeded hash of a byte sequence.
    83  //
    84  // The zero Hash is a valid Hash ready to use.
    85  // A zero Hash chooses a random seed for itself during
    86  // the first call to a Reset, Write, Seed, or Sum64 method.
    87  // For control over the seed, use SetSeed.
    88  //
    89  // The computed hash values depend only on the initial seed and
    90  // the sequence of bytes provided to the Hash object, not on the way
    91  // in which the bytes are provided. For example, the three sequences
    92  //
    93  //	h.Write([]byte{'f','o','o'})
    94  //	h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
    95  //	h.WriteString("foo")
    96  //
    97  // all have the same effect.
    98  //
    99  // Hashes are intended to be collision-resistant, even for situations
   100  // where an adversary controls the byte sequences being hashed.
   101  //
   102  // A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
   103  // If multiple goroutines must compute the same seeded hash,
   104  // each can declare its own Hash and call SetSeed with a common Seed.
   105  type Hash struct {
   106  	_     [0]func()     // not comparable
   107  	seed  Seed          // initial seed used for this hash
   108  	state Seed          // current hash of all flushed bytes
   109  	buf   [bufSize]byte // unflushed byte buffer
   110  	n     int           // number of unflushed bytes
   111  }
   112  
   113  // bufSize is the size of the Hash write buffer.
   114  // The buffer ensures that writes depend only on the sequence of bytes,
   115  // not the sequence of WriteByte/Write/WriteString calls,
   116  // by always calling rthash with a full buffer (except for the tail).
   117  const bufSize = 128
   118  
   119  // initSeed seeds the hash if necessary.
   120  // initSeed is called lazily before any operation that actually uses h.seed/h.state.
   121  // Note that this does not include Write/WriteByte/WriteString in the case
   122  // where they only add to h.buf. (If they write too much, they call h.flush,
   123  // which does call h.initSeed.)
   124  func (h *Hash) initSeed() {
   125  	if h.seed.s == 0 {
   126  		seed := MakeSeed()
   127  		h.seed = seed
   128  		h.state = seed
   129  	}
   130  }
   131  
   132  // WriteByte adds b to the sequence of bytes hashed by h.
   133  // It never fails; the error result is for implementing io.ByteWriter.
   134  func (h *Hash) WriteByte(b byte) error {
   135  	if h.n == len(h.buf) {
   136  		h.flush()
   137  	}
   138  	h.buf[h.n] = b
   139  	h.n++
   140  	return nil
   141  }
   142  
   143  // Write adds b to the sequence of bytes hashed by h.
   144  // It always writes all of b and never fails; the count and error result are for implementing io.Writer.
   145  func (h *Hash) Write(b []byte) (int, error) {
   146  	size := len(b)
   147  	// Deal with bytes left over in h.buf.
   148  	// h.n <= bufSize is always true.
   149  	// Checking it is ~free and it lets the compiler eliminate a bounds check.
   150  	if h.n > 0 && h.n <= bufSize {
   151  		k := copy(h.buf[h.n:], b)
   152  		h.n += k
   153  		if h.n < bufSize {
   154  			// Copied the entirety of b to h.buf.
   155  			return size, nil
   156  		}
   157  		b = b[k:]
   158  		h.flush()
   159  		// No need to set h.n = 0 here; it happens just before exit.
   160  	}
   161  	// Process as many full buffers as possible, without copying, and calling initSeed only once.
   162  	if len(b) > bufSize {
   163  		h.initSeed()
   164  		for len(b) > bufSize {
   165  			h.state.s = rthash(&b[0], bufSize, h.state.s)
   166  			b = b[bufSize:]
   167  		}
   168  	}
   169  	// Copy the tail.
   170  	copy(h.buf[:], b)
   171  	h.n = len(b)
   172  	return size, nil
   173  }
   174  
   175  // WriteString adds the bytes of s to the sequence of bytes hashed by h.
   176  // It always writes all of s and never fails; the count and error result are for implementing io.StringWriter.
   177  func (h *Hash) WriteString(s string) (int, error) {
   178  	// WriteString mirrors Write. See Write for comments.
   179  	size := len(s)
   180  	if h.n > 0 && h.n <= bufSize {
   181  		k := copy(h.buf[h.n:], s)
   182  		h.n += k
   183  		if h.n < bufSize {
   184  			return size, nil
   185  		}
   186  		s = s[k:]
   187  		h.flush()
   188  	}
   189  	if len(s) > bufSize {
   190  		h.initSeed()
   191  		for len(s) > bufSize {
   192  			ptr := (*byte)(unsafe.StringData(s))
   193  			h.state.s = rthash(ptr, bufSize, h.state.s)
   194  			s = s[bufSize:]
   195  		}
   196  	}
   197  	copy(h.buf[:], s)
   198  	h.n = len(s)
   199  	return size, nil
   200  }
   201  
   202  // Seed returns h's seed value.
   203  func (h *Hash) Seed() Seed {
   204  	h.initSeed()
   205  	return h.seed
   206  }
   207  
   208  // SetSeed sets h to use seed, which must have been returned by MakeSeed
   209  // or by another Hash's Seed method.
   210  // Two Hash objects with the same seed behave identically.
   211  // Two Hash objects with different seeds will very likely behave differently.
   212  // Any bytes added to h before this call will be discarded.
   213  func (h *Hash) SetSeed(seed Seed) {
   214  	if seed.s == 0 {
   215  		panic("maphash: use of uninitialized Seed")
   216  	}
   217  	h.seed = seed
   218  	h.state = seed
   219  	h.n = 0
   220  }
   221  
   222  // Reset discards all bytes added to h.
   223  // (The seed remains the same.)
   224  func (h *Hash) Reset() {
   225  	h.initSeed()
   226  	h.state = h.seed
   227  	h.n = 0
   228  }
   229  
   230  // precondition: buffer is full.
   231  func (h *Hash) flush() {
   232  	if h.n != len(h.buf) {
   233  		panic("maphash: flush of partially full buffer")
   234  	}
   235  	h.initSeed()
   236  	h.state.s = rthash(&h.buf[0], h.n, h.state.s)
   237  	h.n = 0
   238  }
   239  
   240  // Sum64 returns h's current 64-bit value, which depends on
   241  // h's seed and the sequence of bytes added to h since the
   242  // last call to Reset or SetSeed.
   243  //
   244  // All bits of the Sum64 result are close to uniformly and
   245  // independently distributed, so it can be safely reduced
   246  // by using bit masking, shifting, or modular arithmetic.
   247  func (h *Hash) Sum64() uint64 {
   248  	h.initSeed()
   249  	return rthash(&h.buf[0], h.n, h.state.s)
   250  }
   251  
   252  // MakeSeed returns a new random seed.
   253  func MakeSeed() Seed {
   254  	var s uint64
   255  	for {
   256  		s = runtime_fastrand64()
   257  		// We use seed 0 to indicate an uninitialized seed/hash,
   258  		// so keep trying until we get a non-zero seed.
   259  		if s != 0 {
   260  			break
   261  		}
   262  	}
   263  	return Seed{s: s}
   264  }
   265  
   266  //go:linkname runtime_fastrand64 runtime.fastrand64
   267  func runtime_fastrand64() uint64
   268  
   269  func rthash(ptr *byte, len int, seed uint64) uint64 {
   270  	if len == 0 {
   271  		return seed
   272  	}
   273  	// The runtime hasher only works on uintptr. For 64-bit
   274  	// architectures, we use the hasher directly. Otherwise,
   275  	// we use two parallel hashers on the lower and upper 32 bits.
   276  	if unsafe.Sizeof(uintptr(0)) == 8 {
   277  		return uint64(runtime_memhash(unsafe.Pointer(ptr), uintptr(seed), uintptr(len)))
   278  	}
   279  	lo := runtime_memhash(unsafe.Pointer(ptr), uintptr(seed), uintptr(len))
   280  	hi := runtime_memhash(unsafe.Pointer(ptr), uintptr(seed>>32), uintptr(len))
   281  	return uint64(hi)<<32 | uint64(lo)
   282  }
   283  
   284  //go:linkname runtime_memhash runtime.memhash
   285  //go:noescape
   286  func runtime_memhash(p unsafe.Pointer, seed, s uintptr) uintptr
   287  
   288  // Sum appends the hash's current 64-bit value to b.
   289  // It exists for implementing hash.Hash.
   290  // For direct calls, it is more efficient to use Sum64.
   291  func (h *Hash) Sum(b []byte) []byte {
   292  	x := h.Sum64()
   293  	return append(b,
   294  		byte(x>>0),
   295  		byte(x>>8),
   296  		byte(x>>16),
   297  		byte(x>>24),
   298  		byte(x>>32),
   299  		byte(x>>40),
   300  		byte(x>>48),
   301  		byte(x>>56))
   302  }
   303  
   304  // Size returns h's hash value size, 8 bytes.
   305  func (h *Hash) Size() int { return 8 }
   306  
   307  // BlockSize returns h's block size.
   308  func (h *Hash) BlockSize() int { return len(h.buf) }
   309  

View as plain text