diff --git a/lighthouse-cli/test/cli/__snapshots__/index-test.js.snap b/lighthouse-cli/test/cli/__snapshots__/index-test.js.snap index 41d7d171cbc3..8eb212ca80cc 100644 --- a/lighthouse-cli/test/cli/__snapshots__/index-test.js.snap +++ b/lighthouse-cli/test/cli/__snapshots__/index-test.js.snap @@ -348,6 +348,9 @@ Object { Object { "path": "dobetterweb/doctype", }, + Object { + "path": "dobetterweb/charset", + }, Object { "path": "dobetterweb/dom-size", }, @@ -704,6 +707,10 @@ Object { "id": "doctype", "weight": 1, }, + Object { + "id": "charset", + "weight": 1, + }, Object { "id": "no-vulnerable-libraries", "weight": 1, diff --git a/lighthouse-cli/test/smokehouse/test-definitions/dobetterweb/dbw-expectations.js b/lighthouse-cli/test/smokehouse/test-definitions/dobetterweb/dbw-expectations.js index 5889a1a872d1..06475854b00e 100644 --- a/lighthouse-cli/test/smokehouse/test-definitions/dobetterweb/dbw-expectations.js +++ b/lighthouse-cli/test/smokehouse/test-definitions/dobetterweb/dbw-expectations.js @@ -128,6 +128,7 @@ const expectations = [ { name: '', content: '', + charset: 'utf-8', }, { name: 'viewport', diff --git a/lighthouse-core/audits/dobetterweb/charset.js b/lighthouse-core/audits/dobetterweb/charset.js new file mode 100644 index 000000000000..266e890f31af --- /dev/null +++ b/lighthouse-core/audits/dobetterweb/charset.js @@ -0,0 +1,96 @@ +/** + * @license Copyright 2020 Google Inc. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
+ */ + +/** + * @fileoverview Audits a page to ensure its charset is configured properly. + * The charset must be declared within the first 1024 bytes of the HTML document, in the Content-Type HTTP header, or by a BOM at the start of the document source. + * + * @see: https://github.com/GoogleChrome/lighthouse/issues/10023 + */ +'use strict'; + +const Audit = require('../audit.js'); +const i18n = require('../../lib/i18n/i18n.js'); +const MainResource = require('../../computed/main-resource.js'); + +const UIStrings = { + /** Title of a Lighthouse audit that provides detail on if the charset is set properly for a page. This title is shown when the charset is defined correctly. Charset defines the character encoding (eg UTF-8) of the page content. */ + title: 'Properly defines charset', + /** Title of a Lighthouse audit that provides detail on if the charset is set properly for a page. This title is shown when the charset meta tag is missing or defined too late in the page. */ + failureTitle: 'Charset declaration is missing or occurs too late in the HTML', + /** Description of a Lighthouse audit that tells the user why the charset needs to be defined early on. */ + description: 'A character encoding declaration is required. It can be done with a tag' + + 'in the first 1024 bytes of the HTML or in the Content-Type HTTP response header. 
' + + '[Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).', +}; + +const str_ = i18n.createMessageInstanceIdFn(__filename, UIStrings); + +const CONTENT_TYPE_HEADER = 'content-type'; +// /^[a-zA-Z0-9-_:.()]{2,}$/ matches all known IANA charset names (https://www.iana.org/assignments/character-sets/character-sets.xhtml) +const IANA_REGEX = /^[a-zA-Z0-9-_:.()]{2,}$/; +const CHARSET_HTML_REGEX = /]+charset[^<]+>/; +const CHARSET_HTTP_REGEX = /charset\s*=\s*[a-zA-Z0-9-_:.()]{2,}/; + +class CharsetDefined extends Audit { + /** + * @return {LH.Audit.Meta} Audit id, localized titles and description, and the artifacts this audit requires. + */ + static get meta() { + return { + id: 'charset', + title: str_(UIStrings.title), + failureTitle: str_(UIStrings.failureTitle), + description: str_(UIStrings.description), + requiredArtifacts: ['MainDocumentContent', 'URL', 'devtoolsLogs', 'MetaElements'], + }; + } + + /** + * @param {LH.Artifacts} artifacts Reads devtoolsLogs, URL, MainDocumentContent and MetaElements. + * @param {LH.Audit.Context} context Passed through to the MainResource computed artifact. + * @return {Promise} Resolves to an object whose score is 1 if a charset declaration was found and 0 otherwise. + */ + static async audit(artifacts, context) { + const devtoolsLog = artifacts.devtoolsLogs[Audit.DEFAULT_PASS]; + const mainResource = await MainResource.request({devtoolsLog, URL: artifacts.URL}, context); + let isCharsetSet = false; + // Check the main resource's 'content-type' HTTP response header to see if a charset is declared there + if (mainResource.responseHeaders) { + const contentTypeHeader = mainResource.responseHeaders + .find(header => header.name.toLowerCase() === CONTENT_TYPE_HEADER); + + if (contentTypeHeader) { + isCharsetSet = CHARSET_HTTP_REGEX.test(contentTypeHeader.value); + } + } + + // Check if the document source starts with a byte order mark (BOM, code point 0xFEFF, i.e. 65279) + const BOM_FIRSTCHAR = 65279; + isCharsetSet = isCharsetSet || artifacts.MainDocumentContent.charCodeAt(0) === BOM_FIRSTCHAR; + + // Check if a charset-ish meta tag is defined within the first 1024 characters (~1024 bytes) of the HTML document + if (CHARSET_HTML_REGEX.test(artifacts.MainDocumentContent.slice(0, 1024))) { + // If so, double-check the DOM attributes, considering both legacy http-equiv and 
html5 charset styles. + isCharsetSet = isCharsetSet || artifacts.MetaElements.some(meta => { + return (meta.charset && IANA_REGEX.test(meta.charset)) || + (meta.httpEquiv === 'content-type' && + meta.content && + CHARSET_HTTP_REGEX.test(meta.content)); + }); + } + + return { + score: Number(isCharsetSet), + }; + } +} + +module.exports = CharsetDefined; +module.exports.UIStrings = UIStrings; +module.exports.CHARSET_HTML_REGEX = CHARSET_HTML_REGEX; +module.exports.CHARSET_HTTP_REGEX = CHARSET_HTTP_REGEX; +module.exports.IANA_REGEX = IANA_REGEX; diff --git a/lighthouse-core/config/default-config.js b/lighthouse-core/config/default-config.js index c479d2bff4a8..650d1176e21b 100644 --- a/lighthouse-core/config/default-config.js +++ b/lighthouse-core/config/default-config.js @@ -287,6 +287,7 @@ const defaultConfig = { 'byte-efficiency/efficient-animated-content', 'dobetterweb/appcache-manifest', 'dobetterweb/doctype', + 'dobetterweb/charset', 'dobetterweb/dom-size', 'dobetterweb/external-anchors-use-rel-noopener', 'dobetterweb/geolocation-on-start', @@ -505,6 +506,7 @@ const defaultConfig = { {id: 'external-anchors-use-rel-noopener', weight: 1}, {id: 'geolocation-on-start', weight: 1}, {id: 'doctype', weight: 1}, + {id: 'charset', weight: 1}, {id: 'no-vulnerable-libraries', weight: 1}, {id: 'js-libraries', weight: 0}, {id: 'notification-on-start', weight: 1}, diff --git a/lighthouse-core/gather/gatherers/meta-elements.js b/lighthouse-core/gather/gatherers/meta-elements.js index ad93d84e5c20..695f6526661f 100644 --- a/lighthouse-core/gather/gatherers/meta-elements.js +++ b/lighthouse-core/gather/gatherers/meta-elements.js @@ -26,6 +26,8 @@ class MetaElements extends Gatherer { name: meta.name.toLowerCase(), content: meta.content, property: meta.attributes.property ? meta.attributes.property.value : undefined, + httpEquiv: meta.httpEquiv ? meta.httpEquiv.toLowerCase() : undefined, + charset: meta.attributes.charset ? 
meta.attributes.charset.value : undefined, }; }); })()`, {useIsolation: true}); diff --git a/lighthouse-core/lib/i18n/locales/en-US.json b/lighthouse-core/lib/i18n/locales/en-US.json index 75358f4427b8..e0e2cae6a1d2 100644 --- a/lighthouse-core/lib/i18n/locales/en-US.json +++ b/lighthouse-core/lib/i18n/locales/en-US.json @@ -554,6 +554,15 @@ "lighthouse-core/audits/dobetterweb/appcache-manifest.js | title": { "message": "Avoids Application Cache" }, + "lighthouse-core/audits/dobetterweb/charset.js | description": { + "message": "A character encoding declaration is required. It can be done with a tagin the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations)." + }, + "lighthouse-core/audits/dobetterweb/charset.js | failureTitle": { + "message": "Charset declaration is missing or occurs too late in the HTML" + }, + "lighthouse-core/audits/dobetterweb/charset.js | title": { + "message": "Properly defines charset" + }, "lighthouse-core/audits/dobetterweb/doctype.js | description": { "message": "Specifying a doctype prevents the browser from switching to quirks-mode. [Learn more](https://web.dev/doctype)." }, diff --git a/lighthouse-core/lib/i18n/locales/en-XL.json b/lighthouse-core/lib/i18n/locales/en-XL.json index ceb72c12bbd0..63f69bc377d4 100644 --- a/lighthouse-core/lib/i18n/locales/en-XL.json +++ b/lighthouse-core/lib/i18n/locales/en-XL.json @@ -554,6 +554,15 @@ "lighthouse-core/audits/dobetterweb/appcache-manifest.js | title": { "message": "Âv́ôíd̂ś Âṕp̂ĺîćât́îón̂ Ćâćĥé" }, + "lighthouse-core/audits/dobetterweb/charset.js | description": { + "message": "Â ćĥár̂áĉt́êŕ êńĉód̂ín̂ǵ d̂éĉĺâŕât́îón̂ íŝ ŕêq́ûír̂éd̂. Ít̂ ćâń b̂é d̂ón̂é ŵít̂h́ â t́âǵîń t̂h́ê f́îŕŝt́ 1024 b̂ýt̂éŝ óf̂ t́ĥé ĤT́M̂Ĺ ôŕ îń t̂h́ê Ćôńt̂én̂t́-T̂ýp̂é ĤT́T̂Ṕ r̂éŝṕôńŝé ĥéâd́êŕ. [L̂éâŕn̂ ḿôŕê](https://www.w3.org/International/questions/qa-html-encoding-declarations)." 
+ }, + "lighthouse-core/audits/dobetterweb/charset.js | failureTitle": { + "message": "Ĉh́âŕŝét̂ d́êćl̂ár̂át̂íôń îś m̂íŝśîńĝ ór̂ óĉćûŕŝ t́ôó l̂át̂é îń t̂h́ê H́T̂ḾL̂" + }, + "lighthouse-core/audits/dobetterweb/charset.js | title": { + "message": "P̂ŕôṕêŕl̂ý d̂éf̂ín̂éŝ ćĥár̂śêt́" + }, "lighthouse-core/audits/dobetterweb/doctype.js | description": { "message": "Ŝṕêćîf́ŷín̂ǵ â d́ôćt̂ýp̂é p̂ŕêv́êńt̂ś t̂h́ê b́r̂óŵśêŕ f̂ŕôḿ ŝẃît́ĉh́îńĝ t́ô q́ûír̂ḱŝ-ḿôd́ê. [Ĺêár̂ń m̂ór̂é](https://web.dev/doctype)." }, diff --git a/lighthouse-core/test/audits/dobetterweb/charset-test.js b/lighthouse-core/test/audits/dobetterweb/charset-test.js new file mode 100644 index 000000000000..bc77ec36fb59 --- /dev/null +++ b/lighthouse-core/test/audits/dobetterweb/charset-test.js @@ -0,0 +1,165 @@ +/** + * @license Copyright 2020 Google Inc. All Rights Reserved. + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + */ +'use strict'; + +const CharsetDefinedAudit = require('../../../audits/dobetterweb/charset.js'); +const assert = require('assert'); +const networkRecordsToDevtoolsLog = require('../../network-records-to-devtools-log.js'); + +/* eslint-env jest */ + +const HTML_PRE = ''; +const HTML_POST = '

hello'; + +function generateArtifacts(htmlContent, contentTypeValue = 'text/html') { + const finalUrl = 'https://example.com/'; + const mainResource = { + url: finalUrl, + responseHeaders: [ + {name: 'content-type', value: contentTypeValue}, + ], + }; + const devtoolsLog = networkRecordsToDevtoolsLog([mainResource]); + const context = {computedCache: new Map()}; + return [{ + devtoolsLogs: {[CharsetDefinedAudit.DEFAULT_PASS]: devtoolsLog}, + URL: {finalUrl}, + MainDocumentContent: htmlContent, + MetaElements: [], + }, context]; +} + +describe('Charset defined audit', () => { + it('succeeds where the page contains the charset meta tag', async () => { + const htmlContent = HTML_PRE + '' + HTML_POST; + const [artifacts, context] = generateArtifacts(htmlContent); + artifacts.MetaElements = [{name: '', content: '', charset: 'utf-8'}]; + const auditResult = await CharsetDefinedAudit.audit(artifacts, context); + assert.equal(auditResult.score, 1); + }); + + it('succeeds when the page has the charset defined in the content-type meta tag', async () => { + const htmlContent = HTML_PRE + + '' + HTML_POST; + const [artifacts, context] = generateArtifacts(htmlContent); + artifacts.MetaElements = [ + {name: '', content: 'text/html; charset=utf-8', httpEquiv: 'content-type'}, + ]; + const auditResult = await CharsetDefinedAudit.audit(artifacts, context); + assert.equal(auditResult.score, 1); + }); + + it('succeeds when the page has the charset defined in the content-type http header', async () => { + const htmlContent = HTML_PRE + + '' + HTML_POST; + const contentTypeVal = 'text/html; charset=UTF-8'; + const [artifacts, context] = generateArtifacts(htmlContent, contentTypeVal); + const auditResult = await CharsetDefinedAudit.audit(artifacts, context); + assert.equal(auditResult.score, 1); + }); + + it('succeeds when the page has the charset defined via BOM', async () => { + const htmlContent = '\ufeff' + HTML_PRE + + '' + HTML_POST; + const [artifacts, context] = 
generateArtifacts(htmlContent); + const auditResult = await CharsetDefinedAudit.audit(artifacts, context); + assert.equal(auditResult.score, 1); + }); + + it('fails when the page does not have charset defined', async () => { + const htmlContent = HTML_PRE + ''; + const [artifacts, context] = generateArtifacts(htmlContent); + const auditResult = await CharsetDefinedAudit.audit(artifacts, context); + assert.equal(auditResult.score, 0); + }); + + it('fails when the page has charset defined too late in the page', async () => { + const bigString = new Array(1024).fill(' ').join(''); + const htmlContent = HTML_PRE + bigString + '' + HTML_POST; + const [artifacts, context] = generateArtifacts(htmlContent); + artifacts.MetaElements = [{name: '', content: '', charset: 'utf-8'}]; + const auditResult = await CharsetDefinedAudit.audit(artifacts, context); + assert.equal(auditResult.score, 0); + }); + + it('passes when the page has charset defined almost too late in the page', async () => { + const bigString = new Array(900).fill(' ').join(''); + const htmlContent = HTML_PRE + bigString + '' + HTML_POST; + const [artifacts, context] = generateArtifacts(htmlContent); + artifacts.MetaElements = [{name: '', content: '', charset: 'utf-8'}]; + const auditResult = await CharsetDefinedAudit.audit(artifacts, context); + assert.equal(auditResult.score, 1); + }); + + it('fails when charset only partially defined in the first 1024 bytes of the page', async () => { + const charsetHTML = ''; + // 1024 bytes should be halfway through the meta tag + const bigString = new Array(1024 - HTML_PRE.length - charsetHTML.length / 2).fill(' ').join(''); + const htmlContent = HTML_PRE + bigString + charsetHTML + HTML_POST; + const [artifacts, context] = generateArtifacts(htmlContent); + artifacts.MetaElements = [{name: '', content: '', charset: 'utf-8'}]; + const auditResult = await CharsetDefinedAudit.audit(artifacts, context); + assert.equal(auditResult.score, 0); + }); +}); + +describe('Charset 
regex check', () => { + const HTML_REGEX = CharsetDefinedAudit.CHARSET_HTML_REGEX; + const HTTP_REGEX = CharsetDefinedAudit.CHARSET_HTTP_REGEX; + const IANA_REGEX = CharsetDefinedAudit.IANA_REGEX; + + it('handles html correctly', () => { + // Positive cases + assert.equal(HTML_REGEX.test(''), true); + assert.equal(HTML_REGEX.test(``), true); + assert.equal(HTML_REGEX.test(``), true); + assert.equal(HTML_REGEX.test(``), true); + assert.equal(HTML_REGEX.test(``), true); + assert.equal(HTML_REGEX.test( + `'`), + true); + assert.equal(HTML_REGEX.test( + `'`), + true); + + // Negative cases + assert.equal(HTML_REGEX.test(``), false); + assert.equal(HTML_REGEX.test(``), false); + assert.equal(HTML_REGEX.test( + `'`), + false); + assert.equal(HTML_REGEX.test( + `'`), + false); + }); + + it('handles http header correctly', () => { + // Positive cases + assert.equal(HTTP_REGEX.test('text/html; charset=UTF-8'), true); + assert.equal(HTTP_REGEX.test('text/html; charset = UTF-8'), true); + + // Negative cases + assert.equal(HTTP_REGEX.test('text/html; charset='), false); + assert.equal(HTTP_REGEX.test('text/html; charset=x'), false); + assert.equal(HTTP_REGEX.test('text/html; charset= '), false); + }); + + it('handles charset name validation correctly', () => { + // Positive cases + assert.equal(IANA_REGEX.test('utf-8'), true); + assert.equal(IANA_REGEX.test('utf-16'), true); + assert.equal(IANA_REGEX.test('IT'), true); + assert.equal(IANA_REGEX.test('NS_4551-1'), true); + assert.equal(IANA_REGEX.test('ISO_646.basic:1983'), true); + assert.equal(IANA_REGEX.test('NF_Z_62-010_(1973)'), true); + + // Negative cases + assert.equal(IANA_REGEX.test('a'), false); + assert.equal(IANA_REGEX.test(''), false); + assert.equal(IANA_REGEX.test('utf+8'), false); + assert.equal(IANA_REGEX.test('utf-16*'), false); + }); +}); diff --git a/lighthouse-core/test/results/sample_v2.json b/lighthouse-core/test/results/sample_v2.json index 27861221cadc..7a64d89ec23c 100644 --- 
a/lighthouse-core/test/results/sample_v2.json +++ b/lighthouse-core/test/results/sample_v2.json @@ -2782,6 +2782,13 @@ "score": 1, "scoreDisplayMode": "binary" }, + "charset": { + "id": "charset", + "title": "Charset declaration is missing or occurs too late in the HTML", + "description": "A character encoding declaration is required. It can be done with a tagin the first 1024 bytes of the HTML or in the Content-Type HTTP response header. [Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).", + "score": 0, + "scoreDisplayMode": "binary" + }, "dom-size": { "id": "dom-size", "title": "Avoids an excessive DOM size", @@ -4042,6 +4049,10 @@ "id": "doctype", "weight": 1 }, + { + "id": "charset", + "weight": 1 + }, { "id": "no-vulnerable-libraries", "weight": 1 @@ -5223,6 +5234,12 @@ "duration": 100, "entryType": "measure" }, + { + "startTime": 0, + "name": "lh:audit:charset", + "duration": 100, + "entryType": "measure" + }, { "startTime": 0, "name": "lh:audit:dom-size", @@ -6349,6 +6366,12 @@ "lighthouse-core/audits/dobetterweb/doctype.js | description": [ "audits.doctype.description" ], + "lighthouse-core/audits/dobetterweb/charset.js | failureTitle": [ + "audits.charset.title" + ], + "lighthouse-core/audits/dobetterweb/charset.js | description": [ + "audits.charset.description" + ], "lighthouse-core/audits/dobetterweb/dom-size.js | title": [ "audits[dom-size].title" ], diff --git a/proto/sample_v2_round_trip.json b/proto/sample_v2_round_trip.json index b81c39a004ad..373e6b1d8ea4 100644 --- a/proto/sample_v2_round_trip.json +++ b/proto/sample_v2_round_trip.json @@ -225,6 +225,13 @@ "scoreDisplayMode": "notApplicable", "title": "Document has a valid `rel=canonical`" }, + "charset": { + "description": "A character encoding declaration is required. It can be done with a tagin the first 1024 bytes of the HTML or in the Content-Type HTTP response header. 
[Learn more](https://www.w3.org/International/questions/qa-html-encoding-declarations).", + "id": "charset", + "score": 0.0, + "scoreDisplayMode": "binary", + "title": "Charset declaration is missing or occurs too late in the HTML" + }, "color-contrast": { "description": "Low-contrast text is difficult or impossible for many users to read. [Learn more](https://web.dev/color-contrast/).", "details": { @@ -3667,6 +3674,10 @@ "id": "doctype", "weight": 1.0 }, + { + "id": "charset", + "weight": 1.0 + }, { "id": "no-vulnerable-libraries", "weight": 1.0 @@ -5146,6 +5157,12 @@ "name": "lh:audit:doctype", "startTime": 0.0 }, + { + "duration": 100.0, + "entryType": "measure", + "name": "lh:audit:charset", + "startTime": 0.0 + }, { "duration": 100.0, "entryType": "measure", diff --git a/types/artifacts.d.ts b/types/artifacts.d.ts index 8e75b1a7303a..3f1a1f3df2e4 100644 --- a/types/artifacts.d.ts +++ b/types/artifacts.d.ts @@ -72,7 +72,7 @@ declare global { /** All the link elements on the page or equivalently declared in `Link` headers. @see https://html.spec.whatwg.org/multipage/links.html */ LinkElements: Artifacts.LinkElement[]; /** The values of the elements in the head. */ - MetaElements: Array<{name: string, content?: string, property?: string}>; + MetaElements: Array<{name?: string, content?: string, property?: string, httpEquiv?: string, charset?: string}>; /** Set of exceptions thrown during page load. */ RuntimeExceptions: Crdp.Runtime.ExceptionThrownEvent[]; /** Information on all script elements in the page. Also contains the content of all requested scripts and the networkRecord requestId that contained their content. Note, HTML documents will have one entry per script tag, all with the same requestId. */