-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.go
112 lines (83 loc) · 2.24 KB
/
scraper.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
package main
import (
"errors"
"fmt"
"github.com/PuerkitoBio/goquery"
"log"
"net/http"
"net/url"
"strings"
)
func normalizeUrl(rootUrl *url.URL, link string) *url.URL {
u, err := url.Parse(strings.TrimSpace(link))
if err != nil {
log.Println(err)
return nil
}
u = rootUrl.ResolveReference(u)
if u.Path == "" {
u.Path = "/"
}
u.Fragment = ""
return u
}
// Scrape fetches the document at link, records the visit in visitedLinks,
// stores the page's unique same-host links in siteMap, and queues unvisited
// links on pendingLinks for further crawling.
//
// Fetch failures are retried by re-queueing the link up to three attempts.
// Non-HTML responses and 404s are recorded as visited but are not errors;
// other non-200 statuses are returned as errors.
func Scrape(rootUrl *url.URL, visitedLinks *LinkHash, siteMap *SiteMap, pendingLinks chan<- string, link string) error {
	log.Printf("%d unique links visited", visitedLinks.Size())
	log.Printf("Attempting to visit %s", link)
	visitedLinks.Try(link)
	res, err := http.Get(link)
	if err != nil {
		visitedLinks.Failed(link)
		// Re-queue for a bounded number of retry attempts.
		if visitedLinks.Tries(link) < 3 {
			pendingLinks <- link
		}
		return err
	}
	defer res.Body.Close()
	contentType := res.Header.Get("Content-Type")
	if !strings.HasPrefix(contentType, "text/html") {
		// Not a page we can parse for links; mark visited and move on.
		visitedLinks.Add(link)
		log.Printf("%s is of Content-Type: %s", link, contentType)
		return nil
	}
	visitedLinks.Add(link)
	if res.StatusCode != 200 {
		if res.StatusCode == 404 {
			log.Printf("Document at %s not found", link)
			return nil
		}
		return fmt.Errorf("%s status code error: %d %s", link, res.StatusCode, res.Status)
	}
	log.Printf("Page fetched successfully %s", link)
	log.Printf("Parsing document at %s", link)
	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return err
	}
	log.Printf("Document at %s parsed successfully", link)
	linksCount := 0
	links := make([]string, 0)
	seenLinks := make(map[string]bool)
	// Collect the href of every anchor tag with goquery.
	doc.Find("a").Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		u := normalizeUrl(rootUrl, href)
		// normalizeUrl returns nil for an unparsable href; without this
		// guard the u.Scheme dereference below panics the crawler.
		if u == nil {
			return
		}
		// Only follow http(s) URLs on the same host as the crawl root.
		if !strings.HasPrefix(u.Scheme, "http") || u.Host != rootUrl.Host {
			return
		}
		resolvedLink := u.String()
		if !seenLinks[resolvedLink] {
			seenLinks[resolvedLink] = true
			links = append(links, resolvedLink)
		}
		if !visitedLinks.Visited(resolvedLink) {
			pendingLinks <- resolvedLink
		}
		linksCount++
	})
	siteMap.Add(link, links)
	log.Printf("%d eligible links in document at %s scraped successfully", linksCount, link)
	return nil
}