From 60bd2d9ceafa5b1ce0c05ac3a8790bd02c22da78 Mon Sep 17 00:00:00 2001 From: Petr Shevtsov Date: Tue, 22 Sep 2020 22:53:29 +0300 Subject: [PATCH 1/3] Use %w verb Use %w verb so the error returned by `fmt.Errorf` will have an `Unwrap` method. --- webanalyze.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webanalyze.go b/webanalyze.go index bf2a64d..f44bd5a 100644 --- a/webanalyze.go +++ b/webanalyze.go @@ -210,7 +210,7 @@ func (wa *WebAnalyzer) process(job *Job, appDefs *AppsDefinition) ([]Match, []st } else { resp, err := fetchHost(job.URL, wa.client) if err != nil { - return nil, links, fmt.Errorf("Failed to retrieve: %v", err) + return nil, links, fmt.Errorf("Failed to retrieve: %w", err) } defer resp.Body.Close() From 36acb80093733aaa8096a6158c08f95e9159e612 Mon Sep 17 00:00:00 2001 From: Paul Whiting Date: Wed, 30 Sep 2020 10:18:00 -0600 Subject: [PATCH 2/3] Optionally follow redirects (default: true) --- cmd/webanalyze/main.go | 7 +++++-- jobdesc.go | 5 ++++- webanalyze.go | 7 +++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/cmd/webanalyze/main.go b/cmd/webanalyze/main.go index 60c9dfb..e53940c 100644 --- a/cmd/webanalyze/main.go +++ b/cmd/webanalyze/main.go @@ -26,6 +26,7 @@ var ( crawlCount int searchSubdomain bool silent bool + redirect bool ) func init() { @@ -38,6 +39,7 @@ func init() { flag.IntVar(&crawlCount, "crawl", 0, "links to follow from the root page (default 0)") flag.BoolVar(&searchSubdomain, "search", true, "searches all urls with same base domain (i.e. 
example.com and sub.example.com)") flag.BoolVar(&silent, "silent", false, "avoid printing header (default false)") + flag.BoolVar(&redirect, "redirect", true, "follow http redirects (default true)") } func main() { @@ -111,12 +113,12 @@ func main() { go func() { for host := range hosts { - job := webanalyze.NewOnlineJob(host, "", nil, crawlCount, searchSubdomain) + job := webanalyze.NewOnlineJob(host, "", nil, crawlCount, searchSubdomain, redirect) result, links := wa.Process(job) if searchSubdomain { for _, v := range links { - crawlJob := webanalyze.NewOnlineJob(v, "", nil, 0, false) + crawlJob := webanalyze.NewOnlineJob(v, "", nil, 0, false, redirect) result, _ := wa.Process(crawlJob) output(result, wa, outWriter) } @@ -200,6 +202,7 @@ func printHeader() { printOption("apps", apps) printOption("crawl count", crawlCount) printOption("search subdomains", searchSubdomain) + printOption("follow redirects", redirect) fmt.Printf("\n") } diff --git a/jobdesc.go b/jobdesc.go index df60ffc..e240c0a 100644 --- a/jobdesc.go +++ b/jobdesc.go @@ -20,6 +20,7 @@ type Job struct { Crawl int SearchSubdomain bool forceNotDownload bool + followRedirect bool } // NewOfflineJob constructs a job out of the constituents of a @@ -35,6 +36,7 @@ func NewOfflineJob(url, body string, headers map[string][]string) *Job { Crawl: 0, SearchSubdomain: false, forceNotDownload: true, + followRedirect: false, } } @@ -42,7 +44,7 @@ func NewOfflineJob(url, body string, headers map[string][]string) *Job { // or a URL, Body and Headers. If it contains at least a URL and Body, // then webanalyzer will not re-download the data, but if a Body is // absent then downloading will be attempted. 
-func NewOnlineJob(url, body string, headers map[string][]string, crawlCount int, searchSubdomain bool) *Job { +func NewOnlineJob(url, body string, headers map[string][]string, crawlCount int, searchSubdomain bool, redirect bool) *Job { return &Job{ URL: url, Body: []byte(body), @@ -50,5 +52,6 @@ func NewOnlineJob(url, body string, headers map[string][]string, crawlCount int, Crawl: crawlCount, SearchSubdomain: searchSubdomain, forceNotDownload: false, + followRedirect: redirect, } } diff --git a/webanalyze.go b/webanalyze.go index f44bd5a..c5d8802 100644 --- a/webanalyze.go +++ b/webanalyze.go @@ -218,6 +218,13 @@ func (wa *WebAnalyzer) process(job *Job, appDefs *AppsDefinition) ([]Match, []st body, err = ioutil.ReadAll(resp.Body) if err == nil { headers = resp.Header + if job.followRedirect { + for k, v := range resp.Header { + if k == "Location" { + links = append(links, v[0]) + } + } + } cookies = resp.Cookies() } } From a59ff5448fbfa90a14a36ec8e037a04db2721e47 Mon Sep 17 00:00:00 2001 From: Paul Whiting Date: Wed, 30 Sep 2020 11:19:34 -0600 Subject: [PATCH 3/3] Ensure location redirect is sane --- webanalyze.go | 68 +++++++++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/webanalyze.go b/webanalyze.go index c5d8802..92ca483 100644 --- a/webanalyze.go +++ b/webanalyze.go @@ -143,45 +143,51 @@ func sameUrl(u1, u2 *url.URL) bool { u1.RequestURI() == u2.RequestURI() } -func parseLinks(doc *goquery.Document, base *url.URL, searchSubdomain bool) []string { - var links []string +func resolveLink( base *url.URL, val string, searchSubdomain bool ) string { + u, err := url.Parse(val) + if err != nil { + return "" + } - doc.Find("a").Each(func(i int, s *goquery.Selection) { - val, ok := s.Attr("href") - if !ok { - return - } + urlResolved := base.ResolveReference(u) - u, err := url.Parse(val) - if err != nil { - return - } + if !searchSubdomain && urlResolved.Hostname() != base.Hostname() { + return "" + } - 
urlResolved := base.ResolveReference(u)

    if !searchSubdomain && urlResolved.Hostname() != base.Hostname() {
        return
    }

    if searchSubdomain && !isSubdomain(base, u) {
        return
    }

    if urlResolved.RequestURI() == "" {
        urlResolved.Path = "/"
    }

    if sameUrl(base, urlResolved) {
        return
    }

    // only allow http/https
    if urlResolved.Scheme != "http" && urlResolved.Scheme != "https" {
+func parseLinks(doc *goquery.Document, base *url.URL, searchSubdomain bool) []string {
+    var links []string
+
+    doc.Find("a").Each(func(i int, s *goquery.Selection) {
+        val, ok := s.Attr("href")
+        if !ok {
        return
    }
-    links = append(links, urlResolved.String())
-
+    u := resolveLink(base, val, searchSubdomain)
+    if u != "" {
+        links = append(links, u)
+    }
    })

    return unique(links)
@@ -221,7 +227,11 @@ func (wa *WebAnalyzer) process(job *Job, appDefs *AppsDefinition) ([]Match, []st
            if job.followRedirect {
                for k, v := range resp.Header {
                    if k == "Location" {
-                        links = append(links, v[0])
+                        base, _ := url.Parse(job.URL)
+                        u := resolveLink(base, v[0], job.SearchSubdomain)
+                        if u != "" {
+                            links = append(links, u)
+                        }
                    }
                }
            }