Skip to content

Commit

Permalink
Merge pull request #1245 from dogancanbakir/error_page_classifier
Browse files Browse the repository at this point in the history
Error page classifier
  • Loading branch information
Mzack9999 authored Jul 19, 2023
2 parents a3073df + 1c15cc3 commit b3de0bc
Show file tree
Hide file tree
Showing 12 changed files with 392 additions and 16 deletions.
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,14 @@ EXTRACTOR:

FILTERS:
-fc, -filter-code string filter response with specified status code (-fc 403,401)
-fep, -filter-error-page filter response with ML based error page detection
-fl, -filter-length string filter response with specified content length (-fl 23,33)
-flc, -filter-line-count string filter response body with specified line count (-flc 423,532)
-fwc, -filter-word-count string filter response body with specified word count (-fwc 423,532)
-ffc, -filter-favicon string[] filter response with specified favicon hash (-ffc 1494302000)
-fs, -filter-string string filter response with specified string (-fs admin)
-fe, -filter-regex string filter response with specified regex (-fe admin)
-fcdn, -filter-cdn string[] filter host with specified cdn provider (incapsula, oracle, google, azure, cloudflare, cloudfront, fastly, akamai, sucuri, leaseweb)
-fcdn, -filter-cdn string[] filter host with specified cdn provider (google, leaseweb, stackpath, cloudfront, fastly)
-frt, -filter-response-time string filter response with specified response time in seconds (-frt '> 1')
-fdc, -filter-condition string filter response with dsl expression condition

Expand Down Expand Up @@ -349,6 +350,28 @@ https://support.hackerone.com [301,302,301,200] [HackerOne] [Cloudflare,Ruby on
https://resources.hackerone.com [301,301,404] [Sorry, no Folders found.]
```

### Error Page Classifier and Filtering

The Error Page Classifier and Filtering feature aims to add intelligence to the tool by enabling it to classify and filter out common error pages returned by web applications. It is an enhancement to the existing httpx capabilities and is geared towards reducing the noise in the results and helping users focus on what matters most.

```console
httpx -l urls.txt -path /v1/api -fep

__ __ __ _ __
/ /_ / /_/ /_____ | |/ /
/ __ \/ __/ __/ __ \| /
/ / / / /_/ /_/ /_/ / |
/_/ /_/\__/\__/ .___/_/|_|
/_/

projectdiscovery.io

[INF] Current httpx version v1.3.3 (latest)
https://scanme.sh/v1/api
```

Filtered error pages are stored in the predefined file `filtered_error_page.json`, in JSON Lines format, when the `-filter-error-page` option is used.

### Favicon Hash


Expand Down
Binary file added common/errorpageclassifier/clf.gob
Binary file not shown.
201 changes: 201 additions & 0 deletions common/errorpageclassifier/dataset.txt

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions common/errorpageclassifier/errorpageclassifier.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package errorpageclassifier

import (
_ "embed"

"github.com/jaytaylor/html2text"
"github.com/projectdiscovery/utils/ml/naive_bayes"
)

// classifierData holds the contents of clf.gob, embedded into the binary at
// build time. New passes these bytes to naive_bayes.NewClassifierFromFileData.
//
//go:embed clf.gob
var classifierData []byte

// ErrorPageClassifier wraps a naive Bayes classifier; its Classify method
// labels the text content of an HTML page.
type ErrorPageClassifier struct {
// classifier is the underlying model, set by New from the embedded data.
classifier *naive_bayes.NaiveBayesClassifier
}

// New builds an ErrorPageClassifier from the classifier data embedded in the
// package.
//
// It panics if naive_bayes.NewClassifierFromFileData rejects the embedded
// data.
func New() *ErrorPageClassifier {
	model, loadErr := naive_bayes.NewClassifierFromFileData(classifierData)
	if loadErr != nil {
		panic(loadErr)
	}

	epc := new(ErrorPageClassifier)
	epc.classifier = model
	return epc
}

// Classify converts html to plain text and returns the label that the
// underlying classifier assigns to that text.
//
// When the conversion yields no text, the label "other" is returned without
// consulting the classifier.
func (n *ErrorPageClassifier) Classify(html string) string {
	if plain := htmlToText(html); plain != "" {
		return n.classifier.Classify(plain)
	}
	return "other"
}

// htmlToText renders html as plain text using html2text in text-only mode.
//
// It returns the empty string when html2text reports an error. The input is
// an arbitrary response body, so a document that cannot be converted must not
// panic and take the whole process down; Classify treats an empty result as
// "other".
func htmlToText(html string) string {
	text, err := html2text.FromString(html, html2text.Options{TextOnly: true})
	if err != nil {
		return ""
	}
	return text
}
53 changes: 53 additions & 0 deletions common/errorpageclassifier/errorpageclassifier_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package errorpageclassifier

import (
"testing"

"github.com/stretchr/testify/assert"
)

func TestErrorPageClassifier(t *testing.T) {
t.Run("test creation of new ErrorPageClassifier", func(t *testing.T) {
epc := New()
assert.NotNil(t, epc)
})

t.Run("test classification non error page text", func(t *testing.T) {
epc := New()
assert.Equal(t, "nonerror", epc.Classify(`<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Terms of Service</title>
</head>
<body>
<h1>Welcome to our Terms of Service page.</h1>
<p>Understand our conditions for providing services.</p>
</body>
</html>
`))
})

t.Run("test classification on error page text", func(t *testing.T) {
epc := New()
assert.Equal(t, "error", epc.Classify(`<!DOCTYPE html>
<html>
<head>
<title>Error 403: Forbidden</title>
<style>
.error-message {
text-align: center;
color: #333;
}
</style>
</head>
<body>
<div class="error-message">
<h1>Error 403: Forbidden</h1>
<p>Sorry you don't have access rights to this page.</p>
</div>
</body>
</html>
`))
})
}
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ require (
github.com/go-faker/faker/v4 v4.1.1
github.com/go-rod/rod v0.113.4
github.com/hdm/jarm-go v0.0.7
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056
github.com/mfonda/simhash v0.0.0-20151007195837-79f94a1100d6
github.com/mitchellh/mapstructure v1.5.0
github.com/projectdiscovery/asnmap v1.0.4
Expand Down Expand Up @@ -86,6 +87,7 @@ require (
github.com/json-iterator/go v1.1.12 // indirect
github.com/kataras/jwt v0.1.8 // indirect
github.com/klauspost/compress v1.15.15 // indirect
github.com/kljensen/snowball v0.8.0 // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
Expand All @@ -112,6 +114,7 @@ require (
github.com/sashabaranov/go-openai v1.13.0 // indirect
github.com/shirou/gopsutil/v3 v3.23.6 // indirect
github.com/shoenig/go-m1cpu v0.1.6 // indirect
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf // indirect
github.com/syndtr/goleveldb v1.0.0 // indirect
github.com/tidwall/btree v1.6.0 // indirect
github.com/tidwall/buntdb v1.3.0 // indirect
Expand Down
6 changes: 6 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ github.com/hbakhtiyor/strsim v0.0.0-20190107154042-4d2bbb273edf/go.mod h1:V99KdS
github.com/hdm/jarm-go v0.0.7 h1:Eq0geenHrBSYuKrdVhrBdMMzOmA+CAMLzN2WrF3eL6A=
github.com/hdm/jarm-go v0.0.7/go.mod h1:kinGoS0+Sdn1Rr54OtanET5E5n7AlD6T6CrJAKDjJSQ=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQyktQ5+f3dMVZfwD2KWJUgm7M0gdL9NGr8KA=
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/julienschmidt/httprouter v1.3.0 h1:U0609e9tgbseu3rBINet9P48AI/D3oJs4dN7jwJOQ1U=
Expand All @@ -114,6 +116,8 @@ github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0
github.com/klauspost/compress v1.15.15 h1:EF27CXIuDsYJ6mmvtBRlEuB2UVOqHG1tAXgZ7yIO+lw=
github.com/klauspost/compress v1.15.15/go.mod h1:ZcK2JAFqKOpnBlxcLsJzYfrS9X1akm9fHZNnD9+Vo/4=
github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/kljensen/snowball v0.8.0 h1:WU4cExxK6sNW33AiGdbn4e8RvloHrhkAssu2mVJ11kg=
github.com/kljensen/snowball v0.8.0/go.mod h1:OGo5gFWjaeXqCu4iIrMl5OYip9XUJHGOU5eSkPjVg2A=
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
Expand Down Expand Up @@ -247,6 +251,8 @@ github.com/sirupsen/logrus v1.3.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPx
github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf h1:pvbZ0lM0XWPBqUKqFU8cmavspvIl9nulOYwdy6IFRRo=
github.com/ssor/bom v0.0.0-20170718123548-6386211fdfcf/go.mod h1:RJID2RhlZKId02nZ62WenDCkgHFerpIOmW0iT7GKmXM=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
Expand Down
2 changes: 2 additions & 0 deletions resume.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
index=21
resume_from=www.hackerone.com
5 changes: 2 additions & 3 deletions runner/banner.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
package runner

import (
"github.com/projectdiscovery/gologger"
"github.com/projectdiscovery/gologger"
updateutils "github.com/projectdiscovery/utils/update"
)


const banner = `
__ __ __ _ __
/ /_ / /_/ /_____ | |/ /
Expand All @@ -30,4 +29,4 @@ func GetUpdateCallback() func() {
showBanner()
updateutils.GetUpdateToolCallback("httpx", version)()
}
}
}
2 changes: 2 additions & 0 deletions runner/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ type Options struct {
OutputMatchStatusCode string
OutputMatchContentLength string
OutputFilterStatusCode string
OutputFilterErrorPage bool
OutputFilterContentLength string
InputRawRequest string
rawRequest string
Expand Down Expand Up @@ -336,6 +337,7 @@ func ParseOptions() *Options {

flagSet.CreateGroup("filters", "Filters",
flagSet.StringVarP(&options.OutputFilterStatusCode, "filter-code", "fc", "", "filter response with specified status code (-fc 403,401)"),
flagSet.BoolVarP(&options.OutputFilterErrorPage, "filter-error-page", "fep", false, "filter response with ML based error page detection"),
flagSet.StringVarP(&options.OutputFilterContentLength, "filter-length", "fl", "", "filter response with specified content length (-fl 23,33)"),
flagSet.StringVarP(&options.OutputFilterLinesCount, "filter-line-count", "flc", "", "filter response body with specified line count (-flc 423,532)"),
flagSet.StringVarP(&options.OutputFilterWordsCount, "filter-word-count", "fwc", "", "filter response body with specified word count (-fwc 423,532)"),
Expand Down
69 changes: 58 additions & 11 deletions runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ import (
"github.com/PuerkitoBio/goquery"
asnmap "github.com/projectdiscovery/asnmap/libs"
dsl "github.com/projectdiscovery/dsl"
"github.com/projectdiscovery/fastdialer/fastdialer"
"github.com/projectdiscovery/httpx/common/customextract"
"github.com/projectdiscovery/httpx/common/errorpageclassifier"
"github.com/projectdiscovery/httpx/common/hashes/jarm"
"github.com/projectdiscovery/httpx/static"
"github.com/projectdiscovery/mapcidr/asn"
Expand Down Expand Up @@ -69,15 +71,16 @@ import (

// Runner is a client for running the enumeration process.
type Runner struct {
options *Options
hp *httpx.HTTPX
wappalyzer *wappalyzer.Wappalyze
scanopts ScanOptions
hm *hybrid.HybridMap
stats clistats.StatisticsClient
ratelimiter ratelimit.Limiter
HostErrorsCache gcache.Cache[string, int]
browser *Browser
options *Options
hp *httpx.HTTPX
wappalyzer *wappalyzer.Wappalyze
scanopts ScanOptions
hm *hybrid.HybridMap
stats clistats.StatisticsClient
ratelimiter ratelimit.Limiter
HostErrorsCache gcache.Cache[string, int]
browser *Browser
errorPageClassifier *errorpageclassifier.ErrorPageClassifier
}

// New creates a new client for running enumeration process.
Expand Down Expand Up @@ -311,6 +314,8 @@ func New(options *Options) (*Runner, error) {
runner.HostErrorsCache = gc
}

runner.errorPageClassifier = errorpageclassifier.New()

return runner, nil
}

Expand Down Expand Up @@ -742,8 +747,13 @@ func (r *Runner) RunEnumeration() {
gologger.Warning().Msgf("Could not decode response: %s\n", err)
continue
}
dslVars, _ := dslVariables()
dslVars, err := dslVariables()
if err != nil {
gologger.Warning().Msgf("Could not retrieve dsl variables: %s\n", err)
continue
}
flatMap := make(map[string]interface{})

for _, v := range dslVars {
flatMap[v] = rawMap[v]
}
Expand Down Expand Up @@ -772,6 +782,10 @@ func (r *Runner) RunEnumeration() {
}
}

if r.options.OutputFilterErrorPage && resp.KnowledgeBase["PageType"] == "error" {
logFilteredErrorPage(resp.URL)
continue
}
if len(r.options.filterStatusCode) > 0 && slice.IntSliceContains(r.options.filterStatusCode, resp.StatusCode) {
continue
}
Expand Down Expand Up @@ -1022,6 +1036,36 @@ func (r *Runner) RunEnumeration() {
wgoutput.Wait()
}

// logFilteredErrorPage appends one JSON record describing a filtered error
// page to the file filtered_error_page.json, resolved relative to the current
// working directory.
//
// Each record contains the filtered url and the time at which it was
// filtered, and is terminated by a newline (one JSON object per line). The
// file is opened in append mode and created with mode 0600 when missing.
//
// Failures to open or write the file are reported at gologger's Fatal level,
// as before. NOTE(review): Fatal is expected to terminate the process, which
// would make the returns after it defensive only; confirm against gologger.
func logFilteredErrorPage(url string) {
	fileName := "filtered_error_page.json"
	file, err := os.OpenFile(fileName, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0600)
	if err != nil {
		gologger.Fatal().Msgf("Could not open/create output file '%s': %s\n", fileName, err)
		return
	}
	defer file.Close()

	info := map[string]interface{}{
		"url":           url,
		"time_filtered": time.Now(),
	}

	data, err := json.Marshal(info)
	if err != nil {
		// Use the logger like the other diagnostics in this function instead
		// of fmt.Println, which would put the message on stdout.
		gologger.Warning().Msgf("Failed to marshal JSON: %s\n", err)
		return
	}

	// Write the record and its terminating newline in a single call so a
	// failure cannot leave a record in the file without its line terminator.
	if _, err := file.Write(append(data, '\n')); err != nil {
		gologger.Fatal().Msgf("Failed to write to '%s': %s\n", fileName, err)
		return
	}
}
func openOrCreateFile(resume bool, filename string) *os.File {
var err error
var f *os.File
Expand Down Expand Up @@ -1243,7 +1287,7 @@ retry:
} else {
requestIP = target.CustomIP
}
ctx := context.WithValue(context.Background(), "ip", requestIP) //nolint
ctx := context.WithValue(context.Background(), fastdialer.IP, requestIP)
req, err = hp.NewRequestWithContext(ctx, method, URL.String())
} else {
req, err = hp.NewRequest(method, URL.String())
Expand Down Expand Up @@ -1884,6 +1928,9 @@ retry:
ScreenshotBytes: screenshotBytes,
ScreenshotPath: screenshotPath,
HeadlessBody: headlessBody,
KnowledgeBase: map[string]interface{}{
"PageType": r.errorPageClassifier.Classify(respData),
},
}
return result
}
Expand Down
3 changes: 2 additions & 1 deletion runner/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,13 @@ type Result struct {
ScreenshotBytes []byte `json:"screenshot_bytes,omitempty" csv:"screenshot_bytes"`
StoredResponsePath string `json:"stored_response_path,omitempty" csv:"stored_response_path"`
ScreenshotPath string `json:"screenshot_path,omitempty" csv:"screenshot_path"`
KnowledgeBase map[string]interface{} `json:"knowledgebase,omitempty" csv:"knowledgebase"`
}

// function to get dsl variables from result struct
func dslVariables() ([]string, error) {
fakeResult := Result{}
fieldsToIgnore := []string{"Hashes", "ResponseHeader", "Err"}
fieldsToIgnore := []string{"Hashes", "ResponseHeader", "Err", "KnowledgeBase"}
if err := faker.FakeData(&fakeResult, options.WithFieldsToIgnore(fieldsToIgnore...)); err != nil {
return nil, err
}
Expand Down

0 comments on commit b3de0bc

Please sign in to comment.