-
Notifications
You must be signed in to change notification settings - Fork 6
/
scrape.go
76 lines (68 loc) · 2.26 KB
/
scrape.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package safer
import (
	"errors"
	"net/http"
	"net/url"
	"strings"

	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
)
// Default SAFER endpoints and query-parameter values.
const (
// companySnapshotURL is the endpoint scrapeCompanySnapshot posts to unless
// the scraper overrides it.
companySnapshotURL = "https://safer.fmcsa.dot.gov/query.asp"
// searchURL is the endpoint scrapeCompanyNameSearch posts to unless the
// scraper overrides it.
searchURL = "https://safer.fmcsa.dot.gov/keywordx.asp"
// paramUSDOT and paramMCMX are not referenced in this part of the file;
// presumably they are the values passed as the queryParam argument of
// scrapeCompanySnapshot — verify against callers.
paramUSDOT = "USDOT"
paramMCMX = "MC_MX"
)
// headers holds the browser-like HTTP request headers used for requests to
// SAFER (see postRequestToHTMLNode).
//
// NOTE(review): net/http does not transparently decompress a response when the
// request sets Accept-Encoding itself, so sending that entry verbatim means a
// compressed body reaches the caller still encoded.
var headers = http.Header{
"Accept": {"text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"},
"Accept-Encoding": {"gzip, deflate, br"},
"Accept-Language": {"en-US,en;q=0.9"},
"Cache-Control": {"max-age=0"},
"Connection": {"keep-alive"},
"Host": {"safer.fmcsa.dot.gov"},
"Upgrade-Insecure-Requests": {"1"},
"User-Agent": {"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Mobile Safari/537.36"},
}
// scraper issues requests to SAFER and converts the returned HTML into result
// values. Each field optionally overrides the package-level default URL of the
// same name; an empty string means the default is used.
type scraper struct {
// companySnapshotURL, when non-empty, replaces the companySnapshotURL
// constant in scrapeCompanySnapshot.
companySnapshotURL string
// searchURL, when non-empty, replaces the searchURL constant in
// scrapeCompanyNameSearch.
searchURL string
}
// scrapeCompanySnapshot requests the carrier snapshot page and converts the
// returned HTML into a CompanySnapshot.
//
// queryParam is sent as the query_param value and queryString as the
// query_string value. Both are query-escaped before being placed in the URL.
//
// The request goes to s.companySnapshotURL when it is non-empty, otherwise to
// the package-level companySnapshotURL. Errors from the request or from
// htmlNodeToCompanySnapshot are returned unchanged.
func (s *scraper) scrapeCompanySnapshot(queryParam, queryString string) (*CompanySnapshot, error) {
	reqURL := companySnapshotURL
	if s.companySnapshotURL != "" {
		reqURL = s.companySnapshotURL
	}

	// Escape the caller-supplied values so characters such as '&', '#', '%',
	// '+' or spaces cannot truncate or corrupt the query string.
	params := "?searchType=ANY&query_type=queryCarrierSnapshot" +
		"&query_param=" + url.QueryEscape(queryParam) +
		"&query_string=" + url.QueryEscape(queryString)

	node, err := postRequestToHTMLNode(reqURL + params)
	if err != nil {
		return nil, err
	}
	return htmlNodeToCompanySnapshot(node)
}
// scrapeCompanyNameSearch runs a keyword search and converts the returned HTML
// into a slice of CompanyResult.
//
// queryString is upper-cased, query-escaped, and wrapped in literal '*'
// characters before being sent as the searchstring value.
//
// The request goes to s.searchURL when it is non-empty, otherwise to the
// package-level searchURL. Errors from the request or from
// htmlNodeToCompanyResults are returned unchanged.
func (s *scraper) scrapeCompanyNameSearch(queryString string) ([]CompanyResult, error) {
	reqURL := searchURL
	if s.searchURL != "" {
		reqURL = s.searchURL
	}

	// Escape only the caller-supplied term: names often contain spaces or '&',
	// which would otherwise produce a malformed or truncated query. The
	// surrounding '*' characters stay literal, as they were before.
	term := url.QueryEscape(strings.ToUpper(queryString))
	params := "?SEARCHTYPE=&searchstring=*" + term + "*"

	node, err := postRequestToHTMLNode(reqURL + params)
	if err != nil {
		return nil, err
	}
	return htmlNodeToCompanyResults(node)
}
// postRequestToHTMLNode sends a body-less POST to reqURL with the package's
// standard headers and parses the response body as HTML.
//
// It returns an error when the request cannot be built or sent, when the
// response status is anything other than 200 OK, or when parsing fails. The
// response body is always closed before returning.
//
// NOTE(review): http.DefaultClient has no timeout, so a stalled server can
// block this call indefinitely.
func postRequestToHTMLNode(reqURL string) (*html.Node, error) {
	req, err := http.NewRequest(http.MethodPost, reqURL, http.NoBody)
	if err != nil {
		return nil, err
	}

	// Give each request its own copy so the package-level header map is never
	// shared with (or mutated through) an in-flight request.
	req.Header = headers.Clone()
	// net/http only decompresses responses transparently when it negotiates
	// the encoding itself. Dropping the explicit Accept-Encoding lets the
	// transport request gzip and hand the parser plain HTML.
	req.Header.Del("Accept-Encoding")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil, errors.New(resp.Status + " Response from SAFER")
	}
	return htmlquery.Parse(resp.Body)
}