
Commit

feat: improve code
ashishb committed Aug 31, 2024
1 parent ffccabe commit 39a0880
Showing 4 changed files with 175 additions and 25 deletions.
8 changes: 8 additions & 0 deletions go.mod
@@ -1,3 +1,11 @@
module github.com/ashishb/outbound-link-checker

go 1.22.5

require github.com/rs/zerolog v1.33.0

require (
    github.com/mattn/go-colorable v0.1.13 // indirect
    github.com/mattn/go-isatty v0.0.20 // indirect
    golang.org/x/sys v0.24.0 // indirect
)
17 changes: 17 additions & 0 deletions go.sum
@@ -0,0 +1,17 @@
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg=
github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8=
github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.24.0 h1:Twjiwq9dn6R1fQcyiK+wQyHWfaz/BJB+YIpzU/Cv3Xg=
golang.org/x/sys v0.24.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
70 changes: 70 additions & 0 deletions internal/logger/zerlog_config.go
@@ -0,0 +1,70 @@
package logger

import (
    "encoding/json"
    "os"
    "strconv"
    "strings"
    "time"

    "github.com/rs/zerolog"
    "github.com/rs/zerolog/log"
)

const (
    _defaultLogLevel = zerolog.DebugLevel
)

// ConfigureLogging configures zerolog logging with good defaults
func ConfigureLogging(colorLogOutput bool) {
    // UNIX Time is faster and smaller than most timestamps
    zerolog.TimeFieldFormat = zerolog.TimeFormatUnix
    logLevel := getLogLevel()
    zerolog.SetGlobalLevel(logLevel)

    if colorLogOutput {
        // Pretty printing is a bit inefficient for production
        output := zerolog.ConsoleWriter{Out: os.Stderr}
        output.FormatTimestamp = func(t any) string {
            // The timestamp field holds Unix seconds (see TimeFieldFormat above)
            sec, err := t.(json.Number).Int64()
            if err != nil {
                panic(err)
            }
            return time.Unix(sec, 0).In(time.Local).Format("03:04:05PM")
        }
        log.Logger = log.Output(output)
        log.Logger = log.With().Caller().Logger()
    }

    zerolog.TimestampFunc = func() time.Time {
        return time.Now().In(time.Local)
    }
    zerolog.CallerMarshalFunc = func(_ uintptr, file string, line int) string {
        // Use just the filename and not the full file path for logging
        fields := strings.Split(file, "/")
        return fields[len(fields)-1] + ":" + strconv.Itoa(line)
    }
}

func getLogLevel() zerolog.Level {
    logLevelStr := strings.TrimSpace(os.Getenv("LOG_LEVEL"))
    if len(logLevelStr) == 0 {
        return _defaultLogLevel
    }
    switch strings.ToUpper(logLevelStr) {
    case "TRACE":
        return zerolog.TraceLevel
    case "DEBUG":
        return zerolog.DebugLevel
    case "INFO":
        return zerolog.InfoLevel
    case "ERROR":
        return zerolog.ErrorLevel
    case "WARN":
        return zerolog.WarnLevel
    case "FATAL":
        return zerolog.FatalLevel
    default:
        panic("Unexpected log level: " + logLevelStr)
    }
}
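
For reference, a minimal sketch (not part of the commit) of how this logger package is meant to be called, based only on the calls visible in this diff. ConfigureLogging and the LOG_LEVEL environment variable come from the file above; the URL and message below are purely illustrative.

package main

import (
    "github.com/ashishb/outbound-link-checker/internal/logger"
    "github.com/rs/zerolog/log"
)

func main() {
    // Passing true enables zerolog's colored ConsoleWriter on stderr.
    // LOG_LEVEL (TRACE, DEBUG, INFO, WARN, ERROR, FATAL) overrides the
    // default debug level; see getLogLevel above.
    logger.ConfigureLogging(true)

    // Structured logging, as used throughout outbound-link-checker.go below.
    log.Info().
        Str("url", "https://example.com").
        Msg("Crawling")
}
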
105 changes: 80 additions & 25 deletions outbound-link-checker.go
@@ -4,6 +4,8 @@ import (
"bufio"
"flag"
"fmt"
"github.com/ashishb/outbound-link-checker/internal/logger"
"github.com/rs/zerolog/log"
"io"
"net/http"
"net/url"
@@ -49,6 +51,7 @@ var domain = flag.String("domain", "",

func main() {
handleFlags()
logger.ConfigureLogging(true)

whitelistedDomains := initWhitelistedDomains()
knownDeadOrBlockedExternalUrls := initKnownDeadOrBlockedExternalUrls()
@@ -78,7 +81,9 @@ func handleFlags() {
func initWhitelistedDomains() map[string]bool {
dat, err := os.ReadFile(*domainWhitelistFile)
if err != nil {
fmt.Printf("Domain whitelist file does not exist, it will be created later: %s\n", *domainWhitelistFile)
log.Warn().
Str("file", *domainWhitelistFile).
Msg("Domain whitelist file does not exist, it will be created later")
}
whitelisted := make([]string, 0)
whitelistCount := 0
@@ -95,7 +100,10 @@ func initWhitelistedDomains() map[string]bool {
whitelistCount++
whitelisted = append(whitelisted, line)
}
fmt.Printf("Read %d domains in the domain whitelist\n", whitelistCount)
log.Info().
Str("file", *domainWhitelistFile).
Int("count", whitelistCount).
Msg("Domain whitelist file loaded")

whitelistedDomains := make(map[string]bool, 0)
for _, domain := range whitelisted {
@@ -147,7 +155,9 @@ func crawl(
knownDeadOrWhitelistedExternalUrls map[string]bool) {

if !recordNewVisit(url, visitedMap) {
// fmt.Printf("Skipping already visited url: %s\n", url)
log.Debug().
Str("url", url).
Msg("Skipping already visited url")
return
}

@@ -161,29 +171,42 @@
}
lock.Unlock()

if crawlPageLimit >= 0 {
fmt.Printf("Crawling %d (limit: %d) URL: \"%s\"\n", countValue, crawlPageLimit, url)
} else {
fmt.Printf("Crawling %d URL: \"%s\"\n", countValue, url)
}
log.Info().
Int("count", countValue).
Int("limit", crawlPageLimit).
Str("url", url).
Msg("Crawling")

// Fetch the body
body, err := getBody(url)
if err != nil {
fmt.Printf("Error %s while crawling url %s\n", err, url)
log.Error().
Str("url", url).
Err(err).
Msg("Error while fetching body")
return
}

// Extract the urls
urls := getUrls(body)
fmt.Printf("Found %d urls on \"%s\"\n", len(urls), url)
urls := getUrls(string(body))
log.Debug().
Str("url", url).
Int("count", len(urls)).
Msg("Found urls")

for _, url2 := range urls {
log.Info().
Str("url", url2).
Msg("Visiting url")
url2 = normalizeUrl(url2)
recordLink(url, url2, outboundLinkMap)
inDomainUrl, err := belongsToDomain(url2, domain)
if err != nil {
fmt.Printf("Error while parsing \"%s\" which came from the source \"%s\": \"%s\"\n", url2, url, err)
log.Error().
Str("url", url2).
Str("source", url).
Err(err).
Msg("Error while checking if url belongs to domain")
continue
}
if inDomainUrl {
@@ -202,7 +225,9 @@ func crawl(
value := runningCrawlCount
crawlCountLock.Unlock()
if value > 0 {
//fmt.Printf("Running count value is %d\n", value)
log.Debug().
Int("value", value).
Msg("Waiting for all crawls to finish")
time.Sleep(time.Second)
} else {
return
@@ -278,7 +303,7 @@ func belongsToDomain(url2 string, domain string) (bool, error) {
return false, nil
}

func getBody(url string) (string, error) {
func getBody(url string) ([]byte, error) {
waitForCrawlCountAvailability()
incrementRunningCrawlCount()
defer decrementRunningCrawlCount()
@@ -290,19 +315,28 @@ func getBody(url string) (string, error) {
time.Sleep(time.Duration((retryCount - 1) * 1000 * 1000 * 1000))
response, err1 := http.Get(url)
if err1 != nil {
fmt.Printf("Failed to fetch on %d try: %s\n", retryCount, url)
log.Warn().
Int("retryCount", retryCount).
Str("url", url).
Err(err1).
Msg("Failed to fetch")
err = err1
continue
}
defer response.Body.Close()
bodyBytes, err2 := io.ReadAll(response.Body)
if err2 != nil {
fmt.Printf("Failed to fetch on %d try: %s\n", retryCount, url)
log.Warn().
Int("retryCount", retryCount).
Str("url", url).
Err(err2).
Msg("Failed to read body")
err = err2
continue
}
return string(bodyBytes), nil
return bodyBytes, nil
}
return "", err
return nil, err
}

func checkIfAlive(externalUrl string, sourceUrl string) {
@@ -326,13 +360,21 @@ func checkIfAlive(externalUrl string, sourceUrl string) {
}

// Hacky way to get links from HTML page
var linkRegEx = regexp.MustCompile("<a.*?href=\"(.*?)\"")
var linkRegEx = regexp.MustCompile(`<a.*?href=(.*?)[\s>]`)

func getUrls(htmlBody string) []string {
links := linkRegEx.FindAllStringSubmatch(htmlBody, -1)
result := make([]string, 0, len(links))
for i := range links {
result = append(result, links[i][1])
link := links[i][1]
link = strings.Trim(link, "\"")
link = strings.Trim(link, "'")

// Internal links
if strings.HasPrefix(link, "#") {
continue
}
result = append(result, link)
}
return result
}
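
To make the new link-extraction behavior concrete, here is a small standalone sketch (not part of the commit) showing what the relaxed regex plus the quote-trimming and "#"-prefix check extract; the sample HTML is made up. Note that "." does not match newlines in Go regexps, so anchor tags split across lines are still missed, consistent with the "Hacky" caveat above.

package main

import (
    "fmt"
    "regexp"
    "strings"
)

// Same pattern as the commit: the href value may be quoted or bare,
// terminated by whitespace or '>'.
var linkRegEx = regexp.MustCompile(`<a.*?href=(.*?)[\s>]`)

func main() {
    html := `<a href="https://example.com/page">x</a> <a href=/about >y</a> <a href="#top">z</a>`
    for _, m := range linkRegEx.FindAllStringSubmatch(html, -1) {
        link := strings.Trim(strings.Trim(m[1], `"`), "'")
        if strings.HasPrefix(link, "#") {
            // In-page anchors are skipped, as in the commit
            continue
        }
        fmt.Println(link)
    }
    // Prints:
    // https://example.com/page
    // /about
}
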
@@ -356,12 +398,19 @@ func printResults(
}
}

fmt.Printf("Results:\n")
log.Info().
Int("count", len(link)).
Msg("Results")
count := 0
for url, sourceUrls := range link {
if len(sourceUrls) >= 1 {
count++
fmt.Printf("URL %d/%d: \"%s\"\ninbound pages: %s\n\n", count, len(link), url, sourceUrls[0])
log.Info().
Int("count", count).
Int("total", len(link)).
Str("url", url).
Str("sourceUrls", sourceUrls[0]).
Msg("URL")
if *interactive {
handleInteractively(url, whitelistedDomains)
}
@@ -382,7 +431,9 @@ func handleInteractively(url2 string, whitelistedDomains map[string]bool) {
url2 = strings.Trim(url2, " ")
parsedUrl, err := url.Parse(url2)
if err != nil {
fmt.Printf("Error parsing \"%s\" to extract domain\n", url2)
log.Error().
Str("url", url2).
Msg("Error parsing url")
return
}

@@ -411,8 +462,12 @@
w := bufio.NewWriter(file)
fmt.Fprintln(w, domain)
w.Flush()
fmt.Printf("Domain %s whitelisted\n\n", domain)
log.Info().
Str("domain", domain).
Msg("Domain whitelisted")
} else {
fmt.Printf("Domain %s not whitelisted\n\n", domain)
log.Info().
Str("domain", domain).
Msg("Domain not whitelisted")
}
}
