diff --git a/webanalyze.go b/webanalyze.go
index c5d8802..92ca483 100644
--- a/webanalyze.go
+++ b/webanalyze.go
@@ -143,45 +143,58 @@ func sameUrl(u1, u2 *url.URL) bool {
 		u1.RequestURI() == u2.RequestURI()
 }
 
-func parseLinks(doc *goquery.Document, base *url.URL, searchSubdomain bool) []string {
-	var links []string
+// resolveLink resolves val against base and returns the absolute URL as a
+// string. It returns "" when val cannot be parsed, when the target is out of
+// scope (a different hostname, or not accepted by isSubdomain when
+// searchSubdomain is set), when it is the same URL as base, or when its
+// scheme is neither http nor https.
+func resolveLink(base *url.URL, val string, searchSubdomain bool) string {
+	u, err := url.Parse(val)
+	if err != nil {
+		return ""
+	}
 
-	doc.Find("a").Each(func(i int, s *goquery.Selection) {
-		val, ok := s.Attr("href")
-		if !ok {
-			return
-		}
+	urlResolved := base.ResolveReference(u)
 
-		u, err := url.Parse(val)
-		if err != nil {
-			return
-		}
+	if !searchSubdomain && urlResolved.Hostname() != base.Hostname() {
+		return ""
+	}
 
-		urlResolved := base.ResolveReference(u)
+	// NOTE(review): this passes the unresolved u, whose host is empty for
+	// relative links; confirm isSubdomain is meant to see u and not urlResolved.
+	if searchSubdomain && !isSubdomain(base, u) {
+		return ""
+	}
 
-		if !searchSubdomain && urlResolved.Hostname() != base.Hostname() {
-			return
-		}
+	if urlResolved.RequestURI() == "" {
+		urlResolved.Path = "/"
+	}
 
-		if searchSubdomain && !isSubdomain(base, u) {
-			return
-		}
+	if sameUrl(base, urlResolved) {
+		return ""
+	}
 
-		if urlResolved.RequestURI() == "" {
-			urlResolved.Path = "/"
-		}
+	// only allow http/https
+	if urlResolved.Scheme != "http" && urlResolved.Scheme != "https" {
+		return ""
+	}
 
-		if sameUrl(base, urlResolved) {
-			return
-		}
+	return urlResolved.String()
+}
 
-		// only allow http/https
-		if urlResolved.Scheme != "http" && urlResolved.Scheme != "https" {
+func parseLinks(doc *goquery.Document, base *url.URL, searchSubdomain bool) []string {
+	var links []string
+
+	doc.Find("a").Each(func(i int, s *goquery.Selection) {
+		val, ok := s.Attr("href")
+		if !ok {
 			return
 		}
 
-		links = append(links, urlResolved.String())
-
+		u := resolveLink(base, val, searchSubdomain)
+		if u != "" {
+			links = append(links, u)
+		}
 	})
 
 	return unique(links)
@@ -221,7 +234,12 @@ func (wa *WebAnalyzer) process(job *Job, appDefs *AppsDefinition) ([]Match, []st
 	if job.followRedirect {
 		for k, v := range resp.Header {
 			if k == "Location" {
-				links = append(links, v[0])
+				// Queue the resolved absolute URL; the raw header value may be relative.
+				if base, err := url.Parse(job.URL); err == nil {
+					if u := resolveLink(base, v[0], job.SearchSubdomain); u != "" {
+						links = append(links, u)
+					}
+				}
 			}
 		}
 	}