// The linkcheck command finds missing links in the godoc website.
// It crawls a URL recursively and notes URLs and URL fragments
// that it's seen and prints a report of missing links at exit.
package main

import (
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"regexp"
	"strings"
	"sync"
)

var (
	root    = flag.String("root", "http://localhost:6060", "Root to crawl")
	verbose = flag.Bool("verbose", false, "verbose")
)

var wg sync.WaitGroup        // outstanding fetches
var urlq = make(chan string) // URLs to crawl
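// urlFrag is a URL and its optional #fragment (without the #).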
type urlFrag struct {
	url, frag string
}

var (
	mu          sync.Mutex
	crawled     = make(map[string]bool)      // URL without fragment -> true
	neededFrags = make(map[urlFrag][]string) // URL#frag -> who needs it
)

var aRx = regexp.MustCompile(`<a href=['"]?(/[^\s'">]+)`)

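// Owned by the crawlLoop goroutine: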
var (
	linkSources = make(map[string][]string) // URL without fragment -> pages that link to it
	fragExists  = make(map[urlFrag]bool)
	problems    []string
)

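// localLinks returns the deduplicated list of site-local links
// (href values beginning with "/") found in body, skipping /src/ paths.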
func localLinks(body string) (links []string) {
	seen := map[string]bool{}
	mv := aRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ref := m[1]
		if strings.HasPrefix(ref, "/src/") {
			continue
		}
		if !seen[ref] {
			seen[ref] = true
			links = append(links, ref)
		}
	}
	return
}

var idRx = regexp.MustCompile(`\bid=['"]?([^\s'">]+)`)

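// pageIDs returns the values of all id attributes found in body.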
func pageIDs(body string) (ids []string) {
	mv := idRx.FindAllStringSubmatch(body, -1)
	for _, m := range mv {
		ids = append(ids, m[1])
	}
	return
}

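// crawl splits any #fragment off url, records that sourceURL needs
// that fragment, and queues url for fetching if it hasn't been seen yet.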
func crawl(url string, sourceURL string) {
	if strings.Contains(url, "/devel/release") {
		return
	}
	mu.Lock()
	defer mu.Unlock()
	if u, frag, ok := strings.Cut(url, "#"); ok {
		url = u
		if frag != "" {
			uf := urlFrag{url, frag}
			neededFrags[uf] = append(neededFrags[uf], sourceURL)
		}
	}
	if crawled[url] {
		return
	}
	crawled[url] = true

	wg.Add(1)
	go func() {
		// Send on a separate goroutine: urlq is unbuffered and crawl
		// may be called from crawlLoop itself (via doCrawl), which
		// would otherwise deadlock.
		urlq <- url
	}()
}

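// addProblem records a problem with url, logging it immediately in verbose mode.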
func addProblem(url, errmsg string) {
	msg := fmt.Sprintf("Error on %s: %s (from %s)", url, errmsg, linkSources[url])
	if *verbose {
		log.Print(msg)
	}
	problems = append(problems, msg)
}

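// crawlLoop fetches URLs from urlq until the channel is closed.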
func crawlLoop() {
	for url := range urlq {
		if err := doCrawl(url); err != nil {
			addProblem(url, err.Error())
		}
	}
}

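// doCrawl fetches url, follows any on-site redirect, records the
// fragments the page defines, and queues its local links for crawling.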
func doCrawl(url string) error {
	defer wg.Done()

	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return err
	}
	// Use the transport directly so redirects are reported, not followed.
	res, err := http.DefaultTransport.RoundTrip(req)
	if err != nil {
		return err
	}
	defer res.Body.Close()

	if res.StatusCode/100 == 3 {
		newURL, err := res.Location()
		if err != nil {
			return fmt.Errorf("resolving redirect: %v", err)
		}
		if !strings.HasPrefix(newURL.String(), *root) {
			// Skip off-site redirects.
			return nil
		}
		crawl(newURL.String(), url)
		return nil
	}
	if res.StatusCode != 200 {
		return errors.New(res.Status)
	}
	slurp, err := io.ReadAll(res.Body)
	if err != nil {
		return fmt.Errorf("reading %s body: %v", url, err)
	}
	if *verbose {
		log.Printf("Len of %s: %d", url, len(slurp))
	}
	body := string(slurp)
	for _, ref := range localLinks(body) {
		if *verbose {
			log.Printf(" links to %s", ref)
		}
		dest := *root + ref
		linkSources[dest] = append(linkSources[dest], url)
		crawl(dest, url)
	}
	for _, id := range pageIDs(body) {
		if *verbose {
			log.Printf(" url %s has #%s", url, id)
		}
		fragExists[urlFrag{url, id}] = true
	}
	return nil
}

func main() {
	flag.Parse()

	go crawlLoop()
	crawl(*root, "")

	// Wait for all fetches to finish, then verify that every fragment
	// some page linked to was actually defined on the target page.
	wg.Wait()
	close(urlq)
	for uf, needers := range neededFrags {
		if !fragExists[uf] {
			problems = append(problems, fmt.Sprintf("Missing fragment for %+v from %v", uf, needers))
		}
	}

	for _, s := range problems {
		fmt.Println(s)
	}
	if len(problems) > 0 {
		os.Exit(1)
	}
}