Source file test/fixedbugs/issue69434.go

     1  // run
     2  
     3  // Copyright 2024 The Go Authors. All rights reserved.
     4  // Use of this source code is governed by a BSD-style
     5  // license that can be found in the LICENSE file.
     6  
     7  package main
     8  
     9  import (
    10  	"bufio"
    11  	"fmt"
    12  	"io"
    13  	"iter"
    14  	"math/rand"
    15  	"os"
    16  	"strings"
    17  	"unicode"
    18  )
    19  
    20  // WordReader is the struct that implements io.Reader
    21  type WordReader struct {
    22  	scanner *bufio.Scanner
    23  }
    24  
    25  // NewWordReader creates a new WordReader from an io.Reader
    26  func NewWordReader(r io.Reader) *WordReader {
    27  	scanner := bufio.NewScanner(r)
    28  	scanner.Split(bufio.ScanWords)
    29  	return &WordReader{
    30  		scanner: scanner,
    31  	}
    32  }
    33  
    34  // Read reads data from the input stream and returns a single lowercase word at a time
    35  func (wr *WordReader) Read(p []byte) (n int, err error) {
    36  	if !wr.scanner.Scan() {
    37  		if err := wr.scanner.Err(); err != nil {
    38  			return 0, err
    39  		}
    40  		return 0, io.EOF
    41  	}
    42  	word := wr.scanner.Text()
    43  	cleanedWord := removeNonAlphabetic(word)
    44  	if len(cleanedWord) == 0 {
    45  		return wr.Read(p)
    46  	}
    47  	n = copy(p, []byte(cleanedWord))
    48  	return n, nil
    49  }
    50  
    51  // All returns an iterator allowing the caller to iterate over the WordReader using for/range.
    52  func (wr *WordReader) All() iter.Seq[string] {
    53  	word := make([]byte, 1024)
    54  	return func(yield func(string) bool) {
    55  		var err error
    56  		var n int
    57  		for n, err = wr.Read(word); err == nil; n, err = wr.Read(word) {
    58  			if !yield(string(word[:n])) {
    59  				return
    60  			}
    61  		}
    62  		if err != io.EOF {
    63  			fmt.Fprintf(os.Stderr, "error reading word: %v\n", err)
    64  		}
    65  	}
    66  }
    67  
    68  // removeNonAlphabetic removes non-alphabetic characters from a word using strings.Map
    69  func removeNonAlphabetic(word string) string {
    70  	return strings.Map(func(r rune) rune {
    71  		if unicode.IsLetter(r) {
    72  			return unicode.ToLower(r)
    73  		}
    74  		return -1
    75  	}, word)
    76  }
    77  
    78  // ProbabilisticSkipper determines if an item should be retained with probability 1/(1<<n)
    79  type ProbabilisticSkipper struct {
    80  	n       int
    81  	counter uint64
    82  	bitmask uint64
    83  }
    84  
    85  // NewProbabilisticSkipper initializes the ProbabilisticSkipper
    86  func NewProbabilisticSkipper(n int) *ProbabilisticSkipper {
    87  	pr := &ProbabilisticSkipper{n: n}
    88  	pr.refreshCounter()
    89  	return pr
    90  }
    91  
    92  // check panics if pr.n is not the expected value
    93  func (pr *ProbabilisticSkipper) check(n int) {
    94  	if pr.n != n {
    95  		panic(fmt.Sprintf("check: pr.n != n  %d != %d", pr.n, n))
    96  	}
    97  }
    98  
    99  // refreshCounter refreshes the counter with a new random value
   100  func (pr *ProbabilisticSkipper) refreshCounter() {
   101  	if pr.n == 0 {
   102  		pr.bitmask = ^uint64(0) // All bits set to 1
   103  	} else {
   104  		pr.bitmask = rand.Uint64()
   105  		for i := 0; i < pr.n-1; i++ {
   106  			pr.bitmask &= rand.Uint64()
   107  		}
   108  	}
   109  	pr.counter = 64
   110  }
   111  
   112  // ShouldSkip returns true with probability 1/(1<<n)
   113  func (pr *ProbabilisticSkipper) ShouldSkip() bool {
   114  	remove := pr.bitmask&1 == 0
   115  	pr.bitmask >>= 1
   116  	pr.counter--
   117  	if pr.counter == 0 {
   118  		pr.refreshCounter()
   119  	}
   120  	return remove
   121  }
   122  
   123  // EstimateUniqueWordsIter estimates the number of unique words using a probabilistic counting method
   124  func EstimateUniqueWordsIter(reader io.Reader, memorySize int) int {
   125  	wordReader := NewWordReader(reader)
   126  	words := make(map[string]struct{}, memorySize)
   127  
   128  	rounds := 0
   129  	roundRemover := NewProbabilisticSkipper(1)
   130  	wordSkipper := NewProbabilisticSkipper(rounds)
   131  	wordSkipper.check(rounds)
   132  
   133  	for word := range wordReader.All() {
   134  		wordSkipper.check(rounds)
   135  		if wordSkipper.ShouldSkip() {
   136  			delete(words, word)
   137  		} else {
   138  			words[word] = struct{}{}
   139  
   140  			if len(words) >= memorySize {
   141  				rounds++
   142  
   143  				wordSkipper = NewProbabilisticSkipper(rounds)
   144  				for w := range words {
   145  					if roundRemover.ShouldSkip() {
   146  						delete(words, w)
   147  					}
   148  				}
   149  			}
   150  		}
   151  		wordSkipper.check(rounds)
   152  	}
   153  
   154  	if len(words) == 0 {
   155  		return 0
   156  	}
   157  
   158  	invProbability := 1 << rounds
   159  	estimatedUniqueWords := len(words) * invProbability
   160  	return estimatedUniqueWords
   161  }
   162  
   163  func main() {
   164  	input := "Hello, world! This is a test. Hello, world, hello!"
   165  	expectedUniqueWords := 6 // "hello", "world", "this", "is", "a", "test" (but "hello" and "world" are repeated)
   166  	memorySize := 6
   167  
   168  	reader := strings.NewReader(input)
   169  	estimatedUniqueWords := EstimateUniqueWordsIter(reader, memorySize)
   170  	if estimatedUniqueWords != expectedUniqueWords {
   171  		// ...
   172  	}
   173  }
   174  

View as plain text