diff --git a/CHANGELOG.md b/CHANGELOG.md index bc7292c..123cf40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## Unreleased +## [1.0.31] - 2024-12-23 + +### Fixed + +- Validation of TLD domains [#63] ### Added - Calculation of checksum for filters and `! Checksum` string to the filter list meta [#76] +[1.0.31]: https://github.com/AdguardTeam/HostlistCompiler/compare/v1.0.29...v1.0.31 +[#63]: https://github.com/AdguardTeam/HostlistCompiler/issues/63 [#76]: https://github.com/AdguardTeam/FiltersCompiler/issues/76 ## [1.0.29] - 2024-09-26 diff --git a/README.md b/README.md index b47cd64..87d308e 100644 --- a/README.md +++ b/README.md @@ -310,6 +310,10 @@ So here's what it does: - Discards rules with unsupported modifiers. [Click here](https://github.com/AdguardTeam/AdGuardHome/wiki/Hosts-Blocklists#-adblock-style-syntax) to learn more about which modifiers are supported. - Discards rules that are too short. - Discards IP addresses. If you need to keep IP addresses, use [ValidateAllowIp](#validate-allow-ip) instead. +- Removes rules that block entire top-level domains (TLDs) like `||*.org^`, unless they have specific limiting modifiers such as `$denyallow`, `$badfilter`, or `$client`. + Examples: + - `||*.org^` - this rule will be removed + - `||*.org^$denyallow=example.com` - this rule will be kept because it has a limiting modifier If there are comments preceding the invalid rule, they will be removed as well. diff --git a/src/transformations/validate.js b/src/transformations/validate.js index 0f979ae..efa3a37 100644 --- a/src/transformations/validate.js +++ b/src/transformations/validate.js @@ -4,20 +4,36 @@ const tldts = require('tldts'); const utils = require('../utils'); const ruleUtils = require('../rule'); +const DOMAIN_PREFIX = '||'; +const DOMAIN_SEPARATOR = '^'; +const WILDCARD = '*'; +const WILDCARD_DOMAIN_PART = '*.'; + +// TODO: Remove lodash from the project if possible + +/** + * The list of modifiers that limit the rule for specific domains. + */ +const ANY_PATTERN_MODIFIER = [ + 'denyallow', + 'badfilter', + // DNS-related modifiers. + 'client', +]; + /** * The list of modifiers supported by hosts-level blockers. */ const SUPPORTED_MODIFIERS = [ 'important', '~important', - 'badfilter', 'ctag', - 'denyallow', // DNS-related modifiers. - 'client', 'dnstype', 'dnsrewrite', 'ctag', + // modifiers that limit the rule for specific domains + ...ANY_PATTERN_MODIFIER, ]; /** @@ -31,9 +47,10 @@ const MAX_PATTERN_LENGTH = 5; * @param {string} hostname - hostname to check * @param {string} ruleText - original rule text (for logging) * @param {boolean} allowedIP - flag to determine if IP validation is allowed + * @param {boolean} hasLimitModifier - flag to determine if the rule has a limit modifier, e.g. denyallow * @returns {boolean} true if the hostname is okay to be in the blocklist. */ -function validHostname(hostname, ruleText, allowedIP) { +function validHostname(hostname, ruleText, allowedIP, hasLimitModifier) { const result = tldts.parse(hostname); if (!result.hostname) { @@ -46,7 +63,7 @@ function validHostname(hostname, ruleText, allowedIP) { return false; } - if (result.hostname === result.publicSuffix) { + if (result.hostname === result.publicSuffix && !hasLimitModifier) { consola.debug(`matching the whole public suffix ${hostname} is not allowed: ${ruleText}`); return false; } @@ -108,6 +125,9 @@ function validAdblockRule(ruleText, allowedIP) { return false; } + // need to check if rules with TLD has limit modifier + let hasLimitModifier = false; + // 1. It checks if the rule contains only supported modifiers. if (props.options) { // eslint-disable-next-line no-restricted-syntax @@ -116,6 +136,10 @@ function validAdblockRule(ruleText, allowedIP) { consola.debug(`Contains unsupported modifier ${option.name}: ${ruleText}`); return false; } + // if the rule has a limit modifier, TLD as a hostname is allowed + if (ANY_PATTERN_MODIFIER.includes(option.name)) { + hasLimitModifier = true; + } } } @@ -151,30 +175,49 @@ function validAdblockRule(ruleText, allowedIP) { // 4. Validate domain name // Note that we don't check rules that contain wildcard characters - const sepIdx = props.pattern.indexOf('^'); - const wildcardIdx = props.pattern.indexOf('*'); + const sepIdx = props.pattern.indexOf(DOMAIN_SEPARATOR); + const wildcardIdx = props.pattern.indexOf(WILDCARD); if (sepIdx !== -1 && wildcardIdx !== -1 && wildcardIdx > sepIdx) { // Smth like ||example.org^test* -- invalid return false; } - if (_.startsWith(props.pattern, '||') - && sepIdx !== -1 - && wildcardIdx === -1) { - const hostname = utils.substringBetween(ruleText, '||', '^'); - if (!validHostname(hostname, ruleText, allowedIP)) { - return false; - } + // Check if the pattern does not start with the domain prefix and does not contain a domain separator + if (!_.startsWith(props.pattern, DOMAIN_PREFIX) || sepIdx === -1) { + return true; + } + + // Extract the domain to check from the rule text + const domainToCheck = utils.substringBetween(ruleText, DOMAIN_PREFIX, DOMAIN_SEPARATOR); - // If there's something after ^ in the pattern - something went wrong - // unless it's `^|` which is a rather often case - if (props.pattern.length > (sepIdx + 1) - && props.pattern[sepIdx + 1] !== '|') { - return false; + // If there are wildcard characters in the pattern + if (wildcardIdx !== -1) { + // Check if the rule has wildcard characters but includes only TLD (e.g., ||*.org^ or ||*.co.uk^) + const startsWithWildcard = domainToCheck.startsWith(WILDCARD_DOMAIN_PART); + // Get the TLD pattern to check (e.g. ||*.org^ --> org) + const TLDPattern = domainToCheck.replace(WILDCARD_DOMAIN_PART, ''); + // Compare the TLD pattern with the public suffix + const isOnlyTLD = tldts.getPublicSuffix(TLDPattern) === TLDPattern; + // If it's a wildcard with TLD, validate the cleaned TLD + if (startsWithWildcard && isOnlyTLD) { + const cleanedDomain = domainToCheck.replace(WILDCARD_DOMAIN_PART, ''); + return validHostname(cleanedDomain, ruleText, allowedIP, hasLimitModifier); } + // If the rule has wildcard characters but is not a TLD (e.g., ||*.example.org^) + return true; } - return true; + // Validate the domain + if (!validHostname(domainToCheck, ruleText, allowedIP, hasLimitModifier)) { + return false; + } + + // Ensure there's nothing after the domain separator unless it's `^|` + // If there's something after ^ in the pattern - something went wrong + // unless it's `^|` which is a rather often case + // ||example.org^| -- valid + // @@||example.org^|$important -- invalid + return !(props.pattern.length > (sepIdx + 1) && props.pattern[sepIdx + 1] !== '|'); } /** @@ -197,6 +240,7 @@ function valid(ruleText, allowedIP) { if (ruleUtils.isEtcHostsRule(ruleText)) { return validEtcHostsRule(ruleText, allowedIP); } + return validAdblockRule(ruleText, allowedIP); } diff --git a/test/transformations/validate.test.js b/test/transformations/validate.test.js index f839943..425a4d1 100644 --- a/test/transformations/validate.test.js +++ b/test/transformations/validate.test.js @@ -67,4 +67,70 @@ describe('Validate', () => { '@@||example.org^|$important', ]); }); + + it('adblock-style rules with wildcard and denyallow modifier', () => { + const rules = `||*.org^$denyallow=example.com +||*.asia^ +||*.example.org^ +||*.asia^$denyallow=fap.bar +||xyz^$denyallow=example.com +||xyz^`.split(/\r?\n/); + const filtered = validate(rules); + + expect(filtered).toEqual([ + '||*.org^$denyallow=example.com', + '||*.example.org^', + '||*.asia^$denyallow=fap.bar', + '||xyz^$denyallow=example.com', + ]); + }); + + it('adblock-style rules with wildcard and badfilter modifier', () => { + const rules = `||*.org^$badfilter +||*.asia^ +||*.example.org^ +||*.asia^$badfilter +||xyz^$badfilter +||xyz^`.split(/\r?\n/); + const filtered = validate(rules); + + expect(filtered).toEqual([ + '||*.org^$badfilter', + '||*.example.org^', + '||*.asia^$badfilter', + '||xyz^$badfilter', + ]); + }); + + it('adblock-style rules with wildcard and client modifier', () => { + const rules = `@@||*.org^$client=127.0.0.1 +||*.asia^ +||*.example.org^ +||*.asia^$client=192.168.0.0/24 +||xyz^$client=192.168.0.0/24 +||xyz^`.split(/\r?\n/); + const filtered = validate(rules); + + expect(filtered).toEqual([ + '@@||*.org^$client=127.0.0.1', + '||*.example.org^', + '||*.asia^$client=192.168.0.0/24', + '||xyz^$client=192.168.0.0/24', + ]); + }); + + it('check for composite TLDs', () => { + const rules = `||*.com.tr^$denyallow=example.com +||*.com.tr^ +||*.co.uk^$client=127.0.0.1 +||*.co.uk^ +||*.example.org^`.split(/\r?\n/); + const filtered = validate(rules); + + expect(filtered).toEqual([ + '||*.com.tr^$denyallow=example.com', + '||*.co.uk^$client=127.0.0.1', + '||*.example.org^', + ]); + }); });