Skip to content

Commit

Permalink
Add support for parsing HTML numeric entities (#645)
Browse files Browse the repository at this point in the history
  • Loading branch information
DerZade authored Mar 16, 2024
1 parent 072b2b0 commit 391f24f
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 0 deletions.
2 changes: 2 additions & 0 deletions docs/v4/5.Entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ Following HTML entities are supported by the parser by default when `htmlEntitie
|| Indian Rupee | `&inr;` | `₹` |
---

In addition, [numeric character references](https://html.spec.whatwg.org/multipage/syntax.html#syntax-charref) are supported in both decimal (`num_dec`, e.g. `&#228;`) and hexadecimal (`num_hex`, e.g. `&#xE4;`) form.

In a future version of FXP, we'll support more DOCTYPE features, such as `ELEMENT` and reading the content of an entity from a file.

## External Entities
Expand Down
35 changes: 35 additions & 0 deletions spec/entities_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,41 @@ describe("XMLParser Entities", function() {
expect(result).toEqual(expected);
});


// Numeric character references, decimal (`&#NNN;`) and hexadecimal (`&#xHHH;`), must be
// decoded in attribute values as well as in text nodes once `htmlEntities` is enabled.
it("should parse HTML numeric entities when htmlEntities:true", function() {
    const xmlData = `
<?xml version="1.0" encoding="UTF-8"?>
<note>
<heading>Bear</heading>
<body face="&#x295;&#x2022;&#x1D25;&#x2022;&#x294;">Bears are called B&#228;ren in German!</body>
</note> `;

    // Attributes are kept without a prefix so `face` shows up as a plain key.
    const parser = new XMLParser({
        attributeNamePrefix: "",
        ignoreAttributes: false,
        processEntities: true,
        htmlEntities: true,
    });
    const parsed = parser.parse(xmlData);

    expect(parsed).toEqual({
        "?xml": {
            "version": "1.0",
            "encoding": "UTF-8"
        },
        "note": {
            "heading": "Bear",
            "body": {
                "#text": "Bears are called Bären in German!",
                "face": "ʕ•ᴥ•ʔ"
            }
        }
    });
});

it("should throw error if an entity name contains special char", function() {
const xmlData = `
<?xml version="1.0" encoding="UTF-8"?>
Expand Down
2 changes: 2 additions & 0 deletions src/v5/EntitiesParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ const htmlEntities = {
"copyright" : { regex: /&(copy|#169);/g, val: "©" },
"reg" : { regex: /&(reg|#174);/g, val: "®" },
"inr" : { regex: /&(inr|#8377);/g, val: "₹" },
"num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) },
"num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) },
};

class EntitiesParser{
Expand Down
2 changes: 2 additions & 0 deletions src/v5/valueParsers/EntitiesParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ const htmlEntities = {
"copyright" : { regex: /&(copy|#169);/g, val: "©" },
"reg" : { regex: /&(reg|#174);/g, val: "®" },
"inr" : { regex: /&(inr|#8377);/g, val: "₹" },
"num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) },
"num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) },
};

class EntitiesParser{
Expand Down
2 changes: 2 additions & 0 deletions src/xmlparser/OrderedObjParser.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ class OrderedObjParser{
"copyright" : { regex: /&(copy|#169);/g, val: "©" },
"reg" : { regex: /&(reg|#174);/g, val: "®" },
"inr" : { regex: /&(inr|#8377);/g, val: "₹" },
"num_dec": { regex: /&#([0-9]{1,7});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 10)) },
"num_hex": { regex: /&#x([0-9a-fA-F]{1,6});/g, val : (_, str) => String.fromCharCode(Number.parseInt(str, 16)) },
};
this.addExternalEntities = addExternalEntities;
this.parseXml = parseXml;
Expand Down

0 comments on commit 391f24f

Please sign in to comment.