limit TitleExtractor to allow only Remark42 whitelisted domains

Allowed domains consist of `REMARK_URL` second-level domain (or whole IP in case it's IP like `127.0.0.1`) and `ALLOWED_HOSTS`. That is needed to prevent Remark42 from asking arbitrary servers and storing the page title as the comment.PostTitle. Previous behaviour allowed the caller of the API to create a comment with an arbitrary URL and learn the title of the page, which might be accessible to the server Remark42 is installed on but not to the user outside that network (CWE-918).
umputun · Oct 10, 2023 · f0071ae · f0071ae
1 parent a5c4e68
commit f0071ae
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 10 deletions.
diff --git a/backend/app/cmd/server.go b/backend/app/cmd/server.go
@@ -11,7 +11,6 @@ import (
 	"os/signal"
 	"path"
 	"regexp"
-	"slices"
 	"strings"
 	"syscall"
 	"time"
@@ -646,7 +645,7 @@ func (s *ServerCommand) getAllowedDomains() []string {
 		if rawURL == "self" || rawURL == "'self'" || rawURL == "\"self\"" {
 			continue
 		}
-		// AllowedHosts usually don't have https:// prefix
+		// AllowedHosts usually don't have https:// prefix, so we're adding it just to make parsing below work the same way as for RemarkURL
 		if !strings.HasPrefix(rawURL, "http://") && !strings.HasPrefix(rawURL, "https://") {
 			rawURL = "https://" + rawURL
 		}
@@ -656,16 +655,19 @@ func (s *ServerCommand) getAllowedDomains() []string {
 			continue
 		}
 		domain := parsedURL.Hostname()
+
+		if domain == "" || // don't add empty domain as it will allow everything to be extracted
+			(len(strings.Split(domain, ".")) < 2 && // don't allow single-word domains like "com"
+				domain != "localhost") { // localhost is an exceptional single-word domain which is allowed
+			continue
+		}
+
 		// if domain is not IP and has more than two levels, extract second level domain
 		if net.ParseIP(domain) == nil && len(strings.Split(domain, ".")) > 2 {
 			domain = strings.Join(strings.Split(domain, ".")[len(strings.Split(domain, "."))-2:], ".")
 		}
 
-		if domain != "" && // don't add empty domain as it will allow everything to be extracted
-			!slices.Contains(allowedDomains, domain) && // don't duplicate domains
-			(domain == "localhost" || len(strings.Split(domain, ".")) > 1) { // don't allow single-word domains like "com" except localhost
-			allowedDomains = append(allowedDomains, domain)
-		}
+		allowedDomains = append(allowedDomains, domain)
 	}
 	return allowedDomains
 }

diff --git a/backend/app/cmd/server_test.go b/backend/app/cmd/server_test.go
@@ -733,12 +733,13 @@ func Test_getAllowedDomains(t *testing.T) {
 	}{
 		// correct example, parsed and returned as allowed domain
 		{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "https://remark42.example.org"}}, []string{"example.org"}},
+		{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "http://remark42.example.org"}}, []string{"example.org"}},
 		{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "http://localhost"}}, []string{"localhost"}},
 		// incorrect URLs, so Hostname is empty but returned list doesn't include empty string as it would allow any domain
 		{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "bad hostname"}}, []string{}},
 		{ServerCommand{AllowedHosts: []string{}, CommonOpts: CommonOpts{RemarkURL: "not_a_hostname"}}, []string{}},
-		// test removal of 'self', multiple AllowedHosts and deduplication
-		{ServerCommand{AllowedHosts: []string{"'self'", "example.org", "test.example.org", "remark42.com"}, CommonOpts: CommonOpts{RemarkURL: "https://example.org"}}, []string{"example.org", "remark42.com"}},
+		// test removal of 'self', multiple AllowedHosts. No deduplication is expected
+		{ServerCommand{AllowedHosts: []string{"'self'", "example.org", "test.example.org", "remark42.com"}, CommonOpts: CommonOpts{RemarkURL: "https://example.org"}}, []string{"example.org", "example.org", "remark42.com", "example.org"}},
 	}
 	for i, tt := range tbl {
 		t.Run(strconv.Itoa(i), func(t *testing.T) {

diff --git a/backend/app/store/service/title.go b/backend/app/store/service/title.go
@@ -27,6 +27,7 @@ type TitleExtractor struct {
 
 // NewTitleExtractor makes extractor with cache. If memory cache failed, switching to no-cache
 func NewTitleExtractor(client http.Client, allowedDomains []string) *TitleExtractor {
+	log.Printf("[DEBUG] creating extractor, allowed domains %+v", allowedDomains)
 	res := TitleExtractor{
 		client:         client,
 		allowedDomains: allowedDomains,
@@ -49,7 +50,9 @@ func (t *TitleExtractor) Get(pageURL string) (string, error) {
 	}
 	allowed := false
 	for _, domain := range t.allowedDomains {
-		if strings.HasSuffix(u.Hostname(), domain) {
+		if u.Hostname() == domain ||
+			(strings.HasSuffix(u.Hostname(), domain) && // suffix match, e.g. "example.com" matches "www.example.com"
+				u.Hostname()[len(u.Hostname())-len(domain)-1] == '.') { // but we should not match "notexample.com"
 			allowed = true
 			break
 		}