maphash.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package maphash provides hash functions on byte sequences and comparable values.
     6  // These hash functions are intended to be used to implement hash tables or
     7  // other data structures that need to map arbitrary strings or byte
     8  // sequences to a uniform distribution on unsigned 64-bit integers.
     9  // Each different instance of a hash table or data structure should use its own [Seed].
    10  //
    11  // The hash functions are not cryptographically secure.
    12  // (See crypto/sha256 and crypto/sha512 for cryptographic use.)
    13  package maphash
    14  
    15  import (
    16  	"internal/byteorder"
    17  	"math"
    18  )
    19  
    20  // A Seed is a random value that selects the specific hash function
    21  // computed by a [Hash]. If two Hashes use the same Seeds, they
    22  // will compute the same hash values for any given input.
    23  // If two Hashes use different Seeds, they are very likely to compute
    24  // distinct hash values for any given input.
    25  //
    26  // A Seed must be initialized by calling [MakeSeed].
    27  // The zero seed is uninitialized and not valid for use with [Hash]'s SetSeed method.
    28  //
    29  // Each Seed value is local to a single process and cannot be serialized
    30  // or otherwise recreated in a different process.
    31  type Seed struct {
    32  	s uint64
    33  }
    34  
    35  // Bytes returns the hash of b with the given seed.
    36  //
    37  // Bytes is equivalent to, but more convenient and efficient than:
    38  //
    39  //	var h Hash
    40  //	h.SetSeed(seed)
    41  //	h.Write(b)
    42  //	return h.Sum64()
    43  func Bytes(seed Seed, b []byte) uint64 {
    44  	state := seed.s
    45  	if state == 0 {
    46  		panic("maphash: use of uninitialized Seed")
    47  	}
    48  
    49  	if len(b) > bufSize {
    50  		b = b[:len(b):len(b)] // merge len and cap calculations when reslicing
    51  		for len(b) > bufSize {
    52  			state = rthash(b[:bufSize], state)
    53  			b = b[bufSize:]
    54  		}
    55  	}
    56  	return rthash(b, state)
    57  }
    58  
    59  // String returns the hash of s with the given seed.
    60  //
    61  // String is equivalent to, but more convenient and efficient than:
    62  //
    63  //	var h Hash
    64  //	h.SetSeed(seed)
    65  //	h.WriteString(s)
    66  //	return h.Sum64()
    67  func String(seed Seed, s string) uint64 {
    68  	state := seed.s
    69  	if state == 0 {
    70  		panic("maphash: use of uninitialized Seed")
    71  	}
    72  	for len(s) > bufSize {
    73  		state = rthashString(s[:bufSize], state)
    74  		s = s[bufSize:]
    75  	}
    76  	return rthashString(s, state)
    77  }
    78  
    79  // A Hash computes a seeded hash of a byte sequence.
    80  //
    81  // The zero Hash is a valid Hash ready to use.
    82  // A zero Hash chooses a random seed for itself during
    83  // the first call to a Reset, Write, Seed, or Sum64 method.
    84  // For control over the seed, use SetSeed.
    85  //
    86  // The computed hash values depend only on the initial seed and
    87  // the sequence of bytes provided to the Hash object, not on the way
    88  // in which the bytes are provided. For example, the three sequences
    89  //
    90  //	h.Write([]byte{'f','o','o'})
    91  //	h.WriteByte('f'); h.WriteByte('o'); h.WriteByte('o')
    92  //	h.WriteString("foo")
    93  //
    94  // all have the same effect.
    95  //
    96  // Hashes are intended to be collision-resistant, even for situations
    97  // where an adversary controls the byte sequences being hashed.
    98  //
    99  // A Hash is not safe for concurrent use by multiple goroutines, but a Seed is.
   100  // If multiple goroutines must compute the same seeded hash,
   101  // each can declare its own Hash and call SetSeed with a common Seed.
   102  type Hash struct {
   103  	_     [0]func()     // not comparable
   104  	seed  Seed          // initial seed used for this hash
   105  	state Seed          // current hash of all flushed bytes
   106  	buf   [bufSize]byte // unflushed byte buffer
   107  	n     int           // number of unflushed bytes
   108  }
   109  
   110  // bufSize is the size of the Hash write buffer.
   111  // The buffer ensures that writes depend only on the sequence of bytes,
   112  // not the sequence of WriteByte/Write/WriteString calls,
   113  // by always calling rthash with a full buffer (except for the tail).
   114  const bufSize = 128
   115  
   116  // initSeed seeds the hash if necessary.
   117  // initSeed is called lazily before any operation that actually uses h.seed/h.state.
   118  // Note that this does not include Write/WriteByte/WriteString in the case
   119  // where they only add to h.buf. (If they write too much, they call h.flush,
   120  // which does call h.initSeed.)
   121  func (h *Hash) initSeed() {
   122  	if h.seed.s == 0 {
   123  		seed := MakeSeed()
   124  		h.seed = seed
   125  		h.state = seed
   126  	}
   127  }
   128  
   129  // WriteByte adds b to the sequence of bytes hashed by h.
   130  // It never fails; the error result is for implementing [io.ByteWriter].
   131  func (h *Hash) WriteByte(b byte) error {
   132  	if h.n == len(h.buf) {
   133  		h.flush()
   134  	}
   135  	h.buf[h.n] = b
   136  	h.n++
   137  	return nil
   138  }
   139  
   140  // Write adds b to the sequence of bytes hashed by h.
   141  // It always writes all of b and never fails; the count and error result are for implementing [io.Writer].
   142  func (h *Hash) Write(b []byte) (int, error) {
   143  	size := len(b)
   144  	// Deal with bytes left over in h.buf.
   145  	// h.n <= bufSize is always true.
   146  	// Checking it is ~free and it lets the compiler eliminate a bounds check.
   147  	if h.n > 0 && h.n <= bufSize {
   148  		k := copy(h.buf[h.n:], b)
   149  		h.n += k
   150  		if h.n < bufSize {
   151  			// Copied the entirety of b to h.buf.
   152  			return size, nil
   153  		}
   154  		b = b[k:]
   155  		h.flush()
   156  		// No need to set h.n = 0 here; it happens just before exit.
   157  	}
   158  	// Process as many full buffers as possible, without copying, and calling initSeed only once.
   159  	if len(b) > bufSize {
   160  		h.initSeed()
   161  		for len(b) > bufSize {
   162  			h.state.s = rthash(b[:bufSize], h.state.s)
   163  			b = b[bufSize:]
   164  		}
   165  	}
   166  	// Copy the tail.
   167  	copy(h.buf[:], b)
   168  	h.n = len(b)
   169  	return size, nil
   170  }
   171  
   172  // WriteString adds the bytes of s to the sequence of bytes hashed by h.
   173  // It always writes all of s and never fails; the count and error result are for implementing [io.StringWriter].
   174  func (h *Hash) WriteString(s string) (int, error) {
   175  	// WriteString mirrors Write. See Write for comments.
   176  	size := len(s)
   177  	if h.n > 0 && h.n <= bufSize {
   178  		k := copy(h.buf[h.n:], s)
   179  		h.n += k
   180  		if h.n < bufSize {
   181  			return size, nil
   182  		}
   183  		s = s[k:]
   184  		h.flush()
   185  	}
   186  	if len(s) > bufSize {
   187  		h.initSeed()
   188  		for len(s) > bufSize {
   189  			h.state.s = rthashString(s[:bufSize], h.state.s)
   190  			s = s[bufSize:]
   191  		}
   192  	}
   193  	copy(h.buf[:], s)
   194  	h.n = len(s)
   195  	return size, nil
   196  }
   197  
   198  // Seed returns h's seed value.
   199  func (h *Hash) Seed() Seed {
   200  	h.initSeed()
   201  	return h.seed
   202  }
   203  
   204  // SetSeed sets h to use seed, which must have been returned by [MakeSeed]
   205  // or by another [Hash.Seed] method.
   206  // Two [Hash] objects with the same seed behave identically.
   207  // Two [Hash] objects with different seeds will very likely behave differently.
   208  // Any bytes added to h before this call will be discarded.
   209  func (h *Hash) SetSeed(seed Seed) {
   210  	if seed.s == 0 {
   211  		panic("maphash: use of uninitialized Seed")
   212  	}
   213  	h.seed = seed
   214  	h.state = seed
   215  	h.n = 0
   216  }
   217  
   218  // Reset discards all bytes added to h.
   219  // (The seed remains the same.)
   220  func (h *Hash) Reset() {
   221  	h.initSeed()
   222  	h.state = h.seed
   223  	h.n = 0
   224  }
   225  
   226  // precondition: buffer is full.
   227  func (h *Hash) flush() {
   228  	if h.n != len(h.buf) {
   229  		panic("maphash: flush of partially full buffer")
   230  	}
   231  	h.initSeed()
   232  	h.state.s = rthash(h.buf[:h.n], h.state.s)
   233  	h.n = 0
   234  }
   235  
   236  // Sum64 returns h's current 64-bit value, which depends on
   237  // h's seed and the sequence of bytes added to h since the
   238  // last call to [Hash.Reset] or [Hash.SetSeed].
   239  //
   240  // All bits of the Sum64 result are close to uniformly and
   241  // independently distributed, so it can be safely reduced
   242  // by using bit masking, shifting, or modular arithmetic.
   243  func (h *Hash) Sum64() uint64 {
   244  	h.initSeed()
   245  	return rthash(h.buf[:h.n], h.state.s)
   246  }
   247  
   248  // MakeSeed returns a new random seed.
   249  func MakeSeed() Seed {
   250  	var s uint64
   251  	for {
   252  		s = randUint64()
   253  		// We use seed 0 to indicate an uninitialized seed/hash,
   254  		// so keep trying until we get a non-zero seed.
   255  		if s != 0 {
   256  			break
   257  		}
   258  	}
   259  	return Seed{s: s}
   260  }
   261  
   262  // Sum appends the hash's current 64-bit value to b.
   263  // It exists for implementing [hash.Hash].
   264  // For direct calls, it is more efficient to use [Hash.Sum64].
   265  func (h *Hash) Sum(b []byte) []byte {
   266  	x := h.Sum64()
   267  	return append(b,
   268  		byte(x>>0),
   269  		byte(x>>8),
   270  		byte(x>>16),
   271  		byte(x>>24),
   272  		byte(x>>32),
   273  		byte(x>>40),
   274  		byte(x>>48),
   275  		byte(x>>56))
   276  }
   277  
   278  // Size returns h's hash value size, 8 bytes.
   279  func (h *Hash) Size() int { return 8 }
   280  
   281  // BlockSize returns h's block size.
   282  func (h *Hash) BlockSize() int { return len(h.buf) }
   283  
   284  // Comparable returns the hash of comparable value v with the given seed
   285  // such that Comparable(s, v1) == Comparable(s, v2) if v1 == v2.
   286  // If v != v, then the resulting hash is randomly distributed.
   287  func Comparable[T comparable](seed Seed, v T) uint64 {
   288  	escapeForHash(v)
   289  	return comparableHash(v, seed)
   290  }
   291  
   292  // escapeForHash forces v to be on the heap, if v contains a
   293  // non-string pointer. We cannot hash pointers to local variables,
   294  // as the address of the local variable might change on stack growth.
   295  // Strings are okay as the hash depends on only the content, not
   296  // the pointer.
   297  //
   298  // This is essentially
   299  //
   300  //	if hasNonStringPointers(T) { abi.Escape(v) }
   301  //
   302  // Implemented as a compiler intrinsic.
   303  func escapeForHash[T comparable](v T) { panic("intrinsic") }
   304  
   305  // WriteComparable adds x to the data hashed by h.
   306  func WriteComparable[T comparable](h *Hash, x T) {
   307  	escapeForHash(x)
   308  	// writeComparable (not in purego mode) directly operates on h.state
   309  	// without using h.buf. Mix in the buffer length so it won't
   310  	// commute with a buffered write, which either changes h.n or changes
   311  	// h.state.
   312  	if h.n != 0 {
   313  		writeComparable(h, h.n)
   314  	}
   315  	writeComparable(h, x)
   316  }
   317  
   318  func (h *Hash) float64(f float64) {
   319  	if f == 0 {
   320  		h.WriteByte(0)
   321  		return
   322  	}
   323  	var buf [8]byte
   324  	if f != f {
   325  		byteorder.LEPutUint64(buf[:], randUint64())
   326  		h.Write(buf[:])
   327  		return
   328  	}
   329  	byteorder.LEPutUint64(buf[:], math.Float64bits(f))
   330  	h.Write(buf[:])
   331  }
   332  
   333  func btoi(b bool) byte {
   334  	if b {
   335  		return 1
   336  	}
   337  	return 0
   338  }
   339
View as plain text