Source file
test/fixedbugs/issue69434.go
1
2
3
4
5
6
7 package main
8
9 import (
10 "bufio"
11 "fmt"
12 "io"
13 "iter"
14 "math/rand"
15 "os"
16 "strings"
17 "unicode"
18 )
19
20
21 type WordReader struct {
22 scanner *bufio.Scanner
23 }
24
25
26 func NewWordReader(r io.Reader) *WordReader {
27 scanner := bufio.NewScanner(r)
28 scanner.Split(bufio.ScanWords)
29 return &WordReader{
30 scanner: scanner,
31 }
32 }
33
34
35 func (wr *WordReader) Read(p []byte) (n int, err error) {
36 if !wr.scanner.Scan() {
37 if err := wr.scanner.Err(); err != nil {
38 return 0, err
39 }
40 return 0, io.EOF
41 }
42 word := wr.scanner.Text()
43 cleanedWord := removeNonAlphabetic(word)
44 if len(cleanedWord) == 0 {
45 return wr.Read(p)
46 }
47 n = copy(p, []byte(cleanedWord))
48 return n, nil
49 }
50
51
52 func (wr *WordReader) All() iter.Seq[string] {
53 word := make([]byte, 1024)
54 return func(yield func(string) bool) {
55 var err error
56 var n int
57 for n, err = wr.Read(word); err == nil; n, err = wr.Read(word) {
58 if !yield(string(word[:n])) {
59 return
60 }
61 }
62 if err != io.EOF {
63 fmt.Fprintf(os.Stderr, "error reading word: %v\n", err)
64 }
65 }
66 }
67
68
69 func removeNonAlphabetic(word string) string {
70 return strings.Map(func(r rune) rune {
71 if unicode.IsLetter(r) {
72 return unicode.ToLower(r)
73 }
74 return -1
75 }, word)
76 }
77
78
79 type ProbabilisticSkipper struct {
80 n int
81 counter uint64
82 bitmask uint64
83 }
84
85
86 func NewProbabilisticSkipper(n int) *ProbabilisticSkipper {
87 pr := &ProbabilisticSkipper{n: n}
88 pr.refreshCounter()
89 return pr
90 }
91
92
93 func (pr *ProbabilisticSkipper) check(n int) {
94 if pr.n != n {
95 panic(fmt.Sprintf("check: pr.n != n %d != %d", pr.n, n))
96 }
97 }
98
99
100 func (pr *ProbabilisticSkipper) refreshCounter() {
101 if pr.n == 0 {
102 pr.bitmask = ^uint64(0)
103 } else {
104 pr.bitmask = rand.Uint64()
105 for i := 0; i < pr.n-1; i++ {
106 pr.bitmask &= rand.Uint64()
107 }
108 }
109 pr.counter = 64
110 }
111
112
113 func (pr *ProbabilisticSkipper) ShouldSkip() bool {
114 remove := pr.bitmask&1 == 0
115 pr.bitmask >>= 1
116 pr.counter--
117 if pr.counter == 0 {
118 pr.refreshCounter()
119 }
120 return remove
121 }
122
123
124 func EstimateUniqueWordsIter(reader io.Reader, memorySize int) int {
125 wordReader := NewWordReader(reader)
126 words := make(map[string]struct{}, memorySize)
127
128 rounds := 0
129 roundRemover := NewProbabilisticSkipper(1)
130 wordSkipper := NewProbabilisticSkipper(rounds)
131 wordSkipper.check(rounds)
132
133 for word := range wordReader.All() {
134 wordSkipper.check(rounds)
135 if wordSkipper.ShouldSkip() {
136 delete(words, word)
137 } else {
138 words[word] = struct{}{}
139
140 if len(words) >= memorySize {
141 rounds++
142
143 wordSkipper = NewProbabilisticSkipper(rounds)
144 for w := range words {
145 if roundRemover.ShouldSkip() {
146 delete(words, w)
147 }
148 }
149 }
150 }
151 wordSkipper.check(rounds)
152 }
153
154 if len(words) == 0 {
155 return 0
156 }
157
158 invProbability := 1 << rounds
159 estimatedUniqueWords := len(words) * invProbability
160 return estimatedUniqueWords
161 }
162
163 func main() {
164 input := "Hello, world! This is a test. Hello, world, hello!"
165 expectedUniqueWords := 6
166 memorySize := 6
167
168 reader := strings.NewReader(input)
169 estimatedUniqueWords := EstimateUniqueWordsIter(reader, memorySize)
170 if estimatedUniqueWords != expectedUniqueWords {
171
172 }
173 }
174
View as plain text