// Copyright 2015, Yahoo Inc. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

/*
Package gryffin is an application scanning infrastructure.
*/
package gryffin

import (
	"bytes"
	"encoding/json"
	"fmt"
	"hash/fnv"
	"io/ioutil"
	"net"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"strings"
	"time"

	distance "github.com/yahoo/gryffin/html-distance"
)

// A Scan consists of the job, target, request and response.
type Scan struct {
	// ID is a random ID to identify this particular scan.
	// If ID is empty, this scan should not be performed (but is still recorded for rate limiting).
	ID           string
	Job          *Job
	Request      *http.Request
	RequestBody  string
	Response     *http.Response
	ResponseBody string
	Cookies      []*http.Cookie
	Fingerprint  Fingerprint
	HitCount     int
}

// Job stores the job id and config (if any).
type Job struct {
	ID             string
	DomainsAllowed []string // domains that we would crawl
}

// Fingerprint contains all the different types of hash for the Scan (Request & Response).
type Fingerprint struct {
	Origin             uint64 // origin
	URL                uint64 // origin + path
	Request            uint64 // method, url, body
	RequestFull        uint64 // request + header
	ResponseSimilarity uint64
}

// HTTPDoer is the interface implemented by *http.Client.
type HTTPDoer interface {
	Do(*http.Request) (*http.Response, error)
}
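
// For tests, any stub satisfying HTTPDoer will do; a minimal sketch
// (stubDoer is hypothetical, not part of this package):
//
//	type stubDoer struct{ resp *http.Response }
//
//	func (d *stubDoer) Do(*http.Request) (*http.Response, error) {
//		return d.resp, nil
//	}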

// Fuzzer runs the fuzzing.
type Fuzzer interface {
	Fuzz(*Scan) (int, error)
}

// Renderer is the interface implemented by an HTML DOM renderer, used to
// obtain the response body and the extracted links. Since DOM construction
// is very likely to be asynchronous, implementations return channels on
// which responses and links are delivered.
type Renderer interface {
	Do(*Scan)
	GetRequestBody() <-chan *Scan
	GetLinks() <-chan *Scan
}
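
// A consumer typically calls Do and then drains both channels; a minimal
// sketch, assuming r is some Renderer implementation, s is a *Scan, and Do
// returns after kicking off the asynchronous rendering:
//
//	r.Do(s)
//	for rendered := range r.GetRequestBody() {
//		_ = rendered // a scan carrying the rendered response body
//	}
//	for link := range r.GetLinks() {
//		if link.ShouldCrawl() {
//			link.CrawlAsync(r)
//		}
//	}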

// LogMessage contains the data fields to be marshalled as JSON for
// forwarding to the log processor.
type LogMessage struct {
	Service string
	Msg     string
	Method  string
	Url     string
	JobID   string
	// Fingerprint Fingerprint
}

// NewScan creates a scan.
func NewScan(method, url, post string) *Scan {

	// ensure we have a memory store.
	memoryStoreMu.Lock()
	if memoryStore == nil {
		memoryStore = NewGryffinStore()
	}
	memoryStoreMu.Unlock()

	id := GenRandomID()
	job := &Job{ID: GenRandomID()}

	req, err := http.NewRequest(method, url, ioutil.NopCloser(strings.NewReader(post)))
	if err != nil {
		// s.Log("Invalid url for NewScan: %s", err)
		return nil
	}

	// use the host component of the URL as the allowed domain
	host, _, err := net.SplitHostPort(req.URL.Host)
	if err != nil {
		job.DomainsAllowed = []string{req.URL.Host}
	} else {
		job.DomainsAllowed = []string{host}
	}

	// add a Chrome user agent
	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36")

	return &Scan{
		ID:          id,
		Job:         job,
		Request:     req,
		RequestBody: post,
	}
}
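
// Minimal usage sketch (the URL and body are placeholders):
//
//	s := NewScan("POST", "http://example.com/login", "user=a&pass=b")
//	if s == nil {
//		// the method/URL combination could not be parsed into a request
//	}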

// getOrigin returns the origin of the URL (scheme, hostname, port).
func getOrigin(u *url.URL) string {
	return u.Scheme + "://" + u.Host
}

// MergeRequest merges a new request into the scan, replacing the existing one.
func (s *Scan) MergeRequest(req *http.Request) {

	// carry over cookies set by the previous response (if any)
	if s.Response != nil {
		s.Cookies = append(s.Cookies, s.Response.Cookies()...)
		// s.CookieJar.SetCookies(s.Request.URL, s.Response.Cookies())
	}

	// read the request body, and then reset the reader
	var post []byte
	if req.Body != nil {
		b, err := ioutil.ReadAll(req.Body)
		if err == nil {
			post = b
			req.Body = ioutil.NopCloser(bytes.NewReader(post))
		} else {
			// the only possible error is bytes.ErrTooLarge from the ioutil package.
			s.Error("MergeRequest", err)
		}
	}

	// resolve relative URLs.
	if !req.URL.IsAbs() {
		req.URL = s.Request.URL.ResolveReference(req.URL)
	}

	// TODO - drop if Method, URL, Body are the same.
	// if req == s.Request {
	// 	s.Logf("Result after merge generates the same request.", nil)
	// }

	// swap
	prevReq := s.Request
	s.Request = req
	s.RequestBody = string(post)

	// TODO - handle relative URL.
	// Create a cookie jar and add the cookie list (the jar rejects invalid cookies).
	jar, _ := cookiejar.New(nil)
	jar.SetCookies(req.URL, s.Cookies)

	// reset cookies
	s.Cookies = make([]*http.Cookie, 0)
	for _, c := range jar.Cookies(req.URL) {
		req.AddCookie(c)
		s.Cookies = append(s.Cookies, c)
	}

	// carry over the user agent
	req.Header.Set("User-Agent", prevReq.UserAgent())

	// Add referrer - TODO, perhaps we don't need this!

	// clear the response; it belongs to the previous request.
	s.Response = nil
	s.ResponseBody = ""
}

// Spawn spawns a new scan object with a different ID.
func (s *Scan) Spawn() *Scan {
	id := GenRandomID()
	job := *s.Job
	req := *s.Request // copy the value.
	post := s.RequestBody

	// give both the original and the spawned request a fresh body reader,
	// since the shared one may already be consumed.
	s.Request.Body = ioutil.NopCloser(strings.NewReader(post))
	req.Body = ioutil.NopCloser(strings.NewReader(post))

	// copy the cookies (avoiding a shared backing array), appending any set
	// by the response.
	// jar := s.CookieJar
	cookies := append([]*http.Cookie(nil), s.Cookies...)
	if s.Response != nil {
		cookies = append(cookies, s.Response.Cookies()...)
		// jar.SetCookies(s.Request.URL, s.Response.Cookies())
	}

	return &Scan{
		ID:          id,
		Job:         &job,
		Request:     &req,
		RequestBody: post,
		Cookies:     cookies,
	}
}

// Poke checks if the target is up.
func (s *Scan) Poke(client HTTPDoer) (err error) {
	s.Logm("Poke", "Poking")

	// add a 3s timeout if the client is an *http.Client
	if c, ok := client.(*http.Client); ok {
		c.Timeout = 3 * time.Second
	}

	// delete the similarity entries for the domain.
	// s.Session.DelPrefix("hash/unique/" + s.Request.URL.Host)

	// s.Request is the *http.Request carried by the Scan.
	s.Response, err = client.Do(s.Request)
	if err != nil {
		s.Logm("Poke", "Failed")
		return
	}

	s.ReadResponseBody()
	s.HitCount++

	return
}
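
// A typical poke-before-crawl flow, as a sketch:
//
//	s := NewScan("GET", "http://example.com/", "")
//	if err := s.Poke(&http.Client{}); err != nil {
//		// target is down or unreachable; abandon the scan
//	}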

// ReadResponseBody reads Response.Body and fills ResponseBody with it.
// It also reconstructs the io.ReadCloser stream so the body can be read again.
func (s *Scan) ReadResponseBody() {
	if s.ResponseBody == "" && s.Response != nil {
		if b, err := ioutil.ReadAll(s.Response.Body); err == nil {
			s.ResponseBody = string(b)
			s.Response.Body = ioutil.NopCloser(bytes.NewReader(b))
		}
	}
}

func hash(s string) uint64 {
	h := fnv.New64()
	h.Write([]byte(s))
	return h.Sum64()
}

// UpdateFingerprint updates the fingerprint field.
func (s *Scan) UpdateFingerprint() {
	f := &s.Fingerprint
	if s.Request != nil {
		if f.Origin == 0 {
			f.Origin = hash(getOrigin(s.Request.URL))
		}
		if f.URL == 0 {
			f.URL = hash(s.Request.URL.String())
		}
		if f.Request == 0 {
			f.Request = hash(s.Request.URL.String() + "\n" + s.RequestBody)
		}
		// if f.RequestFull == 0 {
		// 	TODO
		// }
	}
	if f.ResponseSimilarity == 0 && s.ResponseBody != "" {
		f.ResponseSimilarity = distance.Fingerprint(strings.NewReader(s.ResponseBody), 3)
		s.Logm("Fingerprint", "Computed")
	}
}

// RateLimit checks whether we are under the allowed rate for crawling the
// site. It returns the number of seconds to wait before checking again.
func (s *Scan) RateLimit() int {

	if memoryStore.Hit(s.Request.URL.Host) {
		return 0
	}
	return 5

	// store := s.Session
	// // for each 5 second epoch, we create a key and see how many crawls are done.
	// ts := time.Now().Truncate(5 * time.Second).Unix()
	// k := "rate/" + s.Request.URL.Host + "/" + strconv.FormatInt(ts, 10)
	// if v, ok := store.Get(k); ok {
	// 	if v.(int64) >= 5 {
	// 		// s.Logm("RateLimit", "Delay 5 second")
	// 		// s.Logf("Wait for 5 second for %s (v:%d)", s.Request.URL, v)
	// 		return 5
	// 	}
	// 	// ready to crawl.
	// 	// TODO - this is not atomic.
	// 	c, _ := store.Get(k)
	// 	store.Set(k, c.(int64)+1)
	// 	// s.Logm("RateLimit", "No Delay")
	// 	return 0
	// }
	// store.Set(k, 1)
	// // s.Logm("RateLimit", "No Delay")
	// return 0
}
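
// Callers typically spin on RateLimit before each crawl; a minimal sketch:
//
//	for delay := s.RateLimit(); delay != 0; delay = s.RateLimit() {
//		time.Sleep(time.Duration(delay) * time.Second)
//	}
//	// now under the allowed rate; proceed with the crawl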

// IsScanAllowed checks whether the request URL is allowed per Job.DomainsAllowed.
func (s *Scan) IsScanAllowed() bool {

	// relative URLs are always allowed
	if !s.Request.URL.IsAbs() {
		return true
	}

	host, _, err := net.SplitHostPort(s.Request.URL.Host)
	if err != nil {
		host = s.Request.URL.Host
	}

	for _, allowed := range s.Job.DomainsAllowed {
		if host == allowed {
			return true
		}
	}
	return false
}
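
// A quick illustration (hypothetical values): a scan created with
// NewScan("GET", "http://example.com:8080/a", "") records
// DomainsAllowed == []string{"example.com"}, so:
//
//	s.IsScanAllowed() // true while s.Request points at example.com
//	// after MergeRequest with a link to another host: false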

// CrawlAsync runs the crawl asynchronously.
func (s *Scan) CrawlAsync(r Renderer) {
	s.Logm("CrawlAsync", "Started")
	if s.IsScanAllowed() {
		r.Do(s)
	} else {
		s.Logm("CrawlAsync", "Scan Not Allowed")
	}
}

// IsDuplicatedPage checks whether we should proceed based on the response's
// similarity fingerprint.
func (s *Scan) IsDuplicatedPage() bool {
	s.UpdateFingerprint()
	f := s.Fingerprint.ResponseSimilarity
	if !memoryStore.Seen(s.Job.ID, "oracle", f, 2) {
		memoryStore.See(s.Job.ID, "oracle", f)
		s.Logm("IsDuplicatedPage", "Unique Page")
		return false
	}
	s.Logm("IsDuplicatedPage", "Duplicate Page")
	return true
}

// Fuzz runs the vulnerability fuzzer and returns the issue count.
func (s *Scan) Fuzz(fuzzer Fuzzer) (int, error) {
	return fuzzer.Fuzz(s)
}

// // ExtractLinks extracts the list of links found from the responseText in the Scan.
// func (s *Scan) ExtractLinks() (scans []Scan, err error) {
// 	return
// }

// ShouldCrawl checks whether the link should be queued for the next crawl.
func (s *Scan) ShouldCrawl() bool {

	s.UpdateFingerprint()
	f := s.Fingerprint.URL
	if !memoryStore.Seen(s.Job.ID, "hash", f, 0) {
		memoryStore.See(s.Job.ID, "hash", f)
		s.Logm("ShouldCrawl", "Unique Link")
		return true
	}
	s.Logm("ShouldCrawl", "Duplicate Link")
	return false
}
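
// Together, the two dedup checks shape the pipeline: ShouldCrawl gates which
// links get crawled, and IsDuplicatedPage gates which responses get fuzzed.
// A rough sketch, assuming f is some Fuzzer implementation:
//
//	if !s.IsDuplicatedPage() {
//		if count, err := s.Fuzz(f); err == nil && count > 0 {
//			// vulnerabilities found
//		}
//	}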

// TODO - LogFmt (fmt string)
// TODO - LogI (interface)

// Error logs the error for the given service.
func (s *Scan) Error(service string, err error) {
	s.Logm(service, fmt.Sprint(err))
}

// Logmf logs a formatted message for the given service.
func (s *Scan) Logmf(service, format string, a ...interface{}) {
	s.Logm(service, fmt.Sprintf(format, a...))
}

// Logm sends a LogMessage to the log processor.
func (s *Scan) Logm(service, msg string) {
	// TODO - improve the efficiency of this.
	m := &LogMessage{
		Service: service,
		Msg:     msg,
		// Fingerprint: s.Fingerprint,
		Method: s.Request.Method,
		Url:    s.Request.URL.String(),
		JobID:  s.Job.ID,
	}
	s.Log(m)
}

// Logf logs using the given format string.
func (s *Scan) Logf(format string, a ...interface{}) {
	s.Log(fmt.Sprintf(format, a...))
}

// Log encodes the given argument as JSON and writes it to the log writer.
func (s *Scan) Log(v interface{}) {
	if logWriter == nil {
		return
	}
	logWriterMu.Lock()
	json.NewEncoder(logWriter).Encode(v)
	logWriterMu.Unlock()
}
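
// With a log writer configured, each Logm call emits one JSON object per
// line, built from the LogMessage fields; for example (values illustrative):
//
//	{"Service":"Poke","Msg":"Poking","Method":"GET","Url":"http://example.com/","JobID":"..."}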