From 470b3e6d3cb280145c2a00aa541865a3a0a00e45 Mon Sep 17 00:00:00 2001 From: Dmitry Soshnikov Date: Fri, 17 May 2019 15:23:15 -0700 Subject: [PATCH] Implement Unicode property escapes --- README.md | 51 +++ package-lock.json | 97 +++-- package.json | 2 +- .../parser-unicode-properties-test.js | 305 ++++++++++++++ src/parser/regexp.bnf | 61 +++ .../unicode/parser-unicode-properties.js | 395 ++++++++++++++++++ 6 files changed, 884 insertions(+), 27 deletions(-) create mode 100644 src/parser/__tests__/parser-unicode-properties-test.js create mode 100644 src/parser/unicode/parser-unicode-properties.js diff --git a/README.md b/README.md index 96fcf0d..af8cf66 100644 --- a/README.md +++ b/README.md @@ -1118,6 +1118,57 @@ The range value can be the same for `from` and `to`, and the special range `-` c [a-zA-Z0-9]+ ``` +#### Unicode properties + +Unicode property escapes are a new type of escape sequence available in regular expressions that have the `u` flag set. With this feature it is possible to write Unicode expressions as: + +```js +const greekSymbolRe = /\p{Script=Greek}/u; + +greekSymbolRe.test('π'); // true +``` + +The AST node for this expression is: + +```js +{ + type: 'RegExp', + body: { + type: 'UnicodeProperty', + name: 'Script', + value: 'Greek', + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script', + canonicalValue: 'Greek' + }, + 'flags': 'u' +} +``` + +All possible property names, values, and their aliases can be found at the [specification](https://tc39.github.io/proposal-regexp-unicode-property-escapes/#table-nonbinary-unicode-properties). 
+ +For `General_Category` it is possible to use a shorthand: + +```js +/\p{Letter}/u; // Shorthand + +/\p{General_Category=Letter}/u; // Full notation +``` + +Binary names use the single value as well: + +```js +/\p{ASCII_Hex_Digit}/u; // Same as: /[0-9A-Fa-f]/ +``` + +The capitalized `P` defines the negation of the expression: + +```js +/\P{ASCII_Hex_Digit}/u; // NOT an ASCII Hex digit +``` + #### Alternative An _alternative_ (or _concatenation_) defines a chain of patterns followed one after another: diff --git a/package-lock.json b/package-lock.json index c27159c..64c62b1 100644 --- a/package-lock.json +++ b/package-lock.json @@ -65,6 +65,7 @@ "resolved": "https://registry.npmjs.org/align-text/-/align-text-0.1.4.tgz", "integrity": "sha1-DNkKVhCT810KmSVsIrcGlDP60Rc=", "dev": true, + "optional": true, "requires": { "kind-of": "^3.0.2", "longest": "^1.0.1", @@ -2016,7 +2017,8 @@ "ansi-regex": { "version": "2.1.1", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "aproba": { "version": "1.1.1", @@ -2067,7 +2069,8 @@ "balanced-match": { "version": "0.4.2", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "bcrypt-pbkdf": { "version": "1.0.1", @@ -2082,6 +2085,7 @@ "version": "0.0.9", "bundled": true, "dev": true, + "optional": true, "requires": { "inherits": "~2.0.0" } @@ -2090,6 +2094,7 @@ "version": "2.10.1", "bundled": true, "dev": true, + "optional": true, "requires": { "hoek": "2.x.x" } @@ -2098,6 +2103,7 @@ "version": "1.1.7", "bundled": true, "dev": true, + "optional": true, "requires": { "balanced-match": "^0.4.1", "concat-map": "0.0.1" @@ -2106,7 +2112,8 @@ "buffer-shims": { "version": "1.0.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "caseless": { "version": "0.12.0", @@ -2123,12 +2130,14 @@ "code-point-at": { "version": "1.1.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "combined-stream": { "version": "1.0.5", "bundled": true, "dev": true, + "optional": true, 
"requires": { "delayed-stream": "~1.0.0" } @@ -2136,22 +2145,26 @@ "concat-map": { "version": "0.0.1", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "console-control-strings": { "version": "1.1.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "core-util-is": { "version": "1.0.2", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "cryptiles": { "version": "2.0.5", "bundled": true, "dev": true, + "optional": true, "requires": { "boom": "2.x.x" } @@ -2191,7 +2204,8 @@ "delayed-stream": { "version": "1.0.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "delegates": { "version": "1.0.0", @@ -2223,7 +2237,8 @@ "extsprintf": { "version": "1.0.2", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "forever-agent": { "version": "0.6.1", @@ -2245,12 +2260,14 @@ "fs.realpath": { "version": "1.0.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "fstream": { "version": "1.0.11", "bundled": true, "dev": true, + "optional": true, "requires": { "graceful-fs": "^4.1.2", "inherits": "~2.0.0", @@ -2306,6 +2323,7 @@ "version": "7.1.2", "bundled": true, "dev": true, + "optional": true, "requires": { "fs.realpath": "^1.0.0", "inflight": "^1.0.4", @@ -2318,7 +2336,8 @@ "graceful-fs": { "version": "4.1.11", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "har-schema": { "version": "1.0.5", @@ -2346,6 +2365,7 @@ "version": "3.1.3", "bundled": true, "dev": true, + "optional": true, "requires": { "boom": "2.x.x", "cryptiles": "2.x.x", @@ -2356,7 +2376,8 @@ "hoek": { "version": "2.16.3", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "http-signature": { "version": "1.1.1", @@ -2373,6 +2394,7 @@ "version": "1.0.6", "bundled": true, "dev": true, + "optional": true, "requires": { "once": "^1.3.0", "wrappy": "1" @@ -2381,7 +2403,8 @@ "inherits": { "version": "2.0.3", "bundled": true, - "dev": true + "dev": true, + "optional": true }, 
"ini": { "version": "1.3.4", @@ -2393,6 +2416,7 @@ "version": "1.0.0", "bundled": true, "dev": true, + "optional": true, "requires": { "number-is-nan": "^1.0.0" } @@ -2406,7 +2430,8 @@ "isarray": { "version": "1.0.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "isstream": { "version": "0.1.2", @@ -2479,12 +2504,14 @@ "mime-db": { "version": "1.27.0", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "mime-types": { "version": "2.1.15", "bundled": true, "dev": true, + "optional": true, "requires": { "mime-db": "~1.27.0" } @@ -2493,6 +2520,7 @@ "version": "3.0.4", "bundled": true, "dev": true, + "optional": true, "requires": { "brace-expansion": "^1.1.7" } @@ -2500,12 +2528,14 @@ "minimist": { "version": "0.0.8", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "mkdirp": { "version": "0.5.1", "bundled": true, "dev": true, + "optional": true, "requires": { "minimist": "0.0.8" } @@ -2560,7 +2590,8 @@ "number-is-nan": { "version": "1.0.1", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "oauth-sign": { "version": "0.8.2", @@ -2578,6 +2609,7 @@ "version": "1.4.0", "bundled": true, "dev": true, + "optional": true, "requires": { "wrappy": "1" } @@ -2607,7 +2639,8 @@ "path-is-absolute": { "version": "1.0.1", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "performance-now": { "version": "0.2.0", @@ -2618,7 +2651,8 @@ "process-nextick-args": { "version": "1.0.7", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "punycode": { "version": "1.4.1", @@ -2656,6 +2690,7 @@ "version": "2.2.9", "bundled": true, "dev": true, + "optional": true, "requires": { "buffer-shims": "~1.0.0", "core-util-is": "~1.0.0", @@ -2700,6 +2735,7 @@ "version": "2.6.1", "bundled": true, "dev": true, + "optional": true, "requires": { "glob": "^7.0.5" } @@ -2707,7 +2743,8 @@ "safe-buffer": { "version": "5.0.1", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "semver": 
{ "version": "5.3.0", @@ -2731,6 +2768,7 @@ "version": "1.0.9", "bundled": true, "dev": true, + "optional": true, "requires": { "hoek": "2.x.x" } @@ -2764,6 +2802,7 @@ "version": "1.0.2", "bundled": true, "dev": true, + "optional": true, "requires": { "code-point-at": "^1.0.0", "is-fullwidth-code-point": "^1.0.0", @@ -2774,6 +2813,7 @@ "version": "1.0.1", "bundled": true, "dev": true, + "optional": true, "requires": { "safe-buffer": "^5.0.1" } @@ -2788,6 +2828,7 @@ "version": "3.0.1", "bundled": true, "dev": true, + "optional": true, "requires": { "ansi-regex": "^2.0.0" } @@ -2802,6 +2843,7 @@ "version": "2.2.1", "bundled": true, "dev": true, + "optional": true, "requires": { "block-stream": "*", "fstream": "^1.0.2", @@ -2857,7 +2899,8 @@ "util-deprecate": { "version": "1.0.2", "bundled": true, - "dev": true + "dev": true, + "optional": true }, "uuid": { "version": "3.0.1", @@ -2886,7 +2929,8 @@ "wrappy": { "version": "1.0.2", "bundled": true, - "dev": true + "dev": true, + "optional": true } } }, @@ -4309,7 +4353,8 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/longest/-/longest-1.0.1.tgz", "integrity": "sha1-MKCy2jj3N3DoKUoNIuZiXtd9AJc=", - "dev": true + "dev": true, + "optional": true }, "loose-envify": { "version": "1.3.1", @@ -4715,9 +4760,9 @@ "dev": true }, "prettier": { - "version": "1.11.1", - "resolved": "https://registry.npmjs.org/prettier/-/prettier-1.11.1.tgz", - "integrity": "sha512-T/KD65Ot0PB97xTrG8afQ46x3oiVhnfGjGESSI9NWYcG92+OUPZKkwHqGWXH2t9jK1crnQjubECW0FuOth+hxw==", + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/prettier/-/prettier-1.17.1.tgz", + "integrity": "sha512-TzGRNvuUSmPgwivDqkZ9tM/qTGW9hqDKWOE9YHiyQdixlKbv7kvEqsmDPrcHJTKwthU774TQwZXVtaQ/mMsvjg==", "dev": true }, "pretty-format": { diff --git a/package.json b/package.json index ea06fb3..fa6ae35 100644 --- a/package.json +++ b/package.json @@ -31,7 +31,7 @@ "babel-preset-flow": "6.23.0", "eslint": "^4.11.0", "jest-cli": "^19.0.2", - "prettier": "^1.11.1", 
+ "prettier": "^1.17.1", "shelljs": "^0.7.8", "syntax-cli": "^0.1.11" } diff --git a/src/parser/__tests__/parser-unicode-properties-test.js b/src/parser/__tests__/parser-unicode-properties-test.js new file mode 100644 index 0000000..b425f66 --- /dev/null +++ b/src/parser/__tests__/parser-unicode-properties-test.js @@ -0,0 +1,305 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2017-present Dmitry Soshnikov + */ + +const parser = require('..'); + +const { + getCanonicalValue, + + BINARY_PROP_NAMES_TO_ALIASES, + BINARY_ALIASES_TO_PROP_NAMES, + + GENERAL_CATEGORY_VALUE_TO_ALIASES, + GENERAL_CATEGORY_VALUE_ALIASES_TO_VALUES, + + SCRIPT_VALUE_TO_ALIASES, + SCRIPT_VALUE_ALIASES_TO_VALUE, +} = require('../unicode/parser-unicode-properties'); + +describe('parser-unicode-properties', () => { + it('general-category', () => { + expect(parser.parse('/\\p{General_Category=Letter}/u').body).toEqual({ + type: 'UnicodeProperty', + name: 'General_Category', + value: 'Letter', + negative: false, + shorthand: false, + binary: false, + canonicalName: 'General_Category', + canonicalValue: 'Letter', + }); + }); + + it('shorthand', () => { + expect(parser.parse('/\\p{Letter}/u').body).toEqual({ + type: 'UnicodeProperty', + name: 'General_Category', + value: 'Letter', + negative: false, + shorthand: true, + binary: false, + canonicalName: 'General_Category', + canonicalValue: 'Letter', + }); + }); + + it('negative', () => { + expect(parser.parse('/\\P{Letter}/u').body).toEqual({ + type: 'UnicodeProperty', + name: 'General_Category', + value: 'Letter', + negative: true, + shorthand: true, + binary: false, + canonicalName: 'General_Category', + canonicalValue: 'Letter', + }); + }); + + it('binary', () => { + expect(parser.parse('/\\p{Hex_Digit}/u').body).toEqual({ + type: 'UnicodeProperty', + name: 'Hex_Digit', + value: 'Hex_Digit', + negative: false, + shorthand: false, + binary: true, + canonicalName: 'Hex_Digit', + canonicalValue: 'Hex_Digit', + }); + }); + + it('script', () => { + 
expect(parser.parse('/\\p{Script=Cyrillic}/u').body).toEqual({ + type: 'UnicodeProperty', + name: 'Script', + value: 'Cyrillic', + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script', + canonicalValue: 'Cyrillic', + }); + }); + + it('script-extensions', () => { + expect(parser.parse('/\\p{Script_Extensions=Cyrillic}/u').body).toEqual({ + type: 'UnicodeProperty', + name: 'Script_Extensions', + value: 'Cyrillic', + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script_Extensions', + canonicalValue: 'Cyrillic', + }); + }); + + it('auto-general-category', () => { + for (const value in GENERAL_CATEGORY_VALUE_TO_ALIASES) { + expect(parser.parse(`/\\p{General_Category=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'General_Category', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'General_Category', + canonicalValue: value, + }); + + expect(parser.parse(`/\\p{gc=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'gc', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'General_Category', + canonicalValue: value, + }); + } + + for (const value in GENERAL_CATEGORY_VALUE_ALIASES_TO_VALUES) { + expect(parser.parse(`/\\p{General_Category=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'General_Category', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'General_Category', + canonicalValue: getCanonicalValue(value), + }); + + expect(parser.parse(`/\\p{gc=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'gc', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'General_Category', + canonicalValue: getCanonicalValue(value), + }); + } + }); + + it('auto-script', () => { + for (const value in SCRIPT_VALUE_TO_ALIASES) { + expect(parser.parse(`/\\p{Script=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'Script', + value, + 
negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script', + canonicalValue: value, + }); + + expect(parser.parse(`/\\p{sc=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'sc', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script', + canonicalValue: value, + }); + } + + for (const value in SCRIPT_VALUE_ALIASES_TO_VALUE) { + expect(parser.parse(`/\\p{Script=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'Script', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script', + canonicalValue: getCanonicalValue(value), + }); + + expect(parser.parse(`/\\p{sc=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'sc', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script', + canonicalValue: getCanonicalValue(value), + }); + } + }); + + it('auto-script-extensions', () => { + for (const value in SCRIPT_VALUE_TO_ALIASES) { + expect(parser.parse(`/\\p{Script_Extensions=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'Script_Extensions', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script_Extensions', + canonicalValue: value, + }); + + expect(parser.parse(`/\\p{scx=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'scx', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script_Extensions', + canonicalValue: value, + }); + } + + for (const value in SCRIPT_VALUE_ALIASES_TO_VALUE) { + expect(parser.parse(`/\\p{Script_Extensions=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'Script_Extensions', + value, + negative: false, + shorthand: false, + binary: false, + canonicalName: 'Script_Extensions', + canonicalValue: getCanonicalValue(value), + }); + + expect(parser.parse(`/\\p{scx=${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'scx', + value, + negative: false, + shorthand: 
false, + binary: false, + canonicalName: 'Script_Extensions', + canonicalValue: getCanonicalValue(value), + }); + } + }); + + it('auto-shorthand', () => { + for (const value in GENERAL_CATEGORY_VALUE_TO_ALIASES) { + expect(parser.parse(`/\\p{${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'General_Category', + value, + negative: false, + shorthand: true, + binary: false, + canonicalName: 'General_Category', + canonicalValue: value, + }); + } + + for (const value in GENERAL_CATEGORY_VALUE_ALIASES_TO_VALUES) { + expect(parser.parse(`/\\p{${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: 'General_Category', + value, + negative: false, + shorthand: true, + binary: false, + canonicalName: 'General_Category', + canonicalValue: getCanonicalValue(value), + }); + } + }); + + it('auto-binary', () => { + for (const value in BINARY_PROP_NAMES_TO_ALIASES) { + expect(parser.parse(`/\\p{${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: value, + value, + negative: false, + shorthand: false, + binary: true, + canonicalName: value, + canonicalValue: value, + }); + } + + for (const value in BINARY_ALIASES_TO_PROP_NAMES) { + expect(parser.parse(`/\\p{${value}}/u`).body).toEqual({ + type: 'UnicodeProperty', + name: value, + value, + negative: false, + shorthand: false, + binary: true, + canonicalName: getCanonicalValue(value), + canonicalValue: getCanonicalValue(value), + }); + } + }); +}); diff --git a/src/parser/regexp.bnf b/src/parser/regexp.bnf index f11bbac..826022d 100644 --- a/src/parser/regexp.bnf +++ b/src/parser/regexp.bnf @@ -24,6 +24,8 @@ U_TRAIL_SURROGATE [dD][c-fC-F][0-9a-fA-F]{2} GROUP_NAME ([\w$]|\\'u'[0-9a-fA-F]{4}|\\'u{'[0-9a-fA-F]{1,}'}')+ +NAME \w+ + /** * Lexer state for character class. 
* @@ -101,6 +103,8 @@ GROUP_NAME ([\w$]|\\'u'[0-9a-fA-F]{4}|\\'u{'[0-9a-fA-F]{1,}'}')+ {ESC}'u'{U}{4} return 'U_CODE' +{ESC}[pP]'{'{NAME}(?:\={NAME})?'}' return 'U_PROP_VALUE_EXP' + {ESC}'x'{U}{2} return 'HEX_CODE' {ESC}[tnrdDsSwWvf] return 'META_CHAR' @@ -237,6 +241,60 @@ function checkClassRange(from, to) { } } +// ---------------------- Unicode property ------------------------------------------- + +const unicodeProperties = require('../unicode/parser-unicode-properties.js'); + +/** + * Unicode property. + */ +function UnicodeProperty(matched, loc) { + const negative = matched[1] === 'P'; + const separatorIdx = matched.indexOf('='); + + let name = matched.slice(3, separatorIdx !== -1 ? separatorIdx : -1); + let value; + + // General_Category allows using only value as a shorthand. + const isShorthand = separatorIdx === -1 && + unicodeProperties.isGeneralCategoryValue(name); + + // Binary propery name. + const isBinaryProperty = separatorIdx === -1 && + unicodeProperties.isBinaryPropertyName(name); + + if (isShorthand) { + value = name; + name = 'General_Category'; + } else if (isBinaryProperty) { + value = name; + } else { + if (!unicodeProperties.isValidName(name)) { + throw new SyntaxError(`Invalid unicode property name: ${name}.`); + } + + value = matched.slice(separatorIdx + 1, -1); + + if (!unicodeProperties.isValidValue(name, value)) { + throw new SyntaxError(`Invalid ${name} unicode property value: ${value}.`); + } + } + + return Node({ + type: 'UnicodeProperty', + name, + value, + negative, + shorthand: isShorthand, + binary: isBinaryProperty, + canonicalName: unicodeProperties.getCanonicalName(name) || name, + canonicalValue: unicodeProperties.getCanonicalValue(value) || value, + }, loc); +} + +// ---------------------------------------------------------------------------------- + + /** * Creates a character node. 
*/ @@ -680,6 +738,9 @@ SourceCharacter | U_CODE { $$ = Char($1, 'unicode', @$) } + | U_PROP_VALUE_EXP + { $$ = UnicodeProperty($1, @$) } + | CTRL_CH { $$ = Char($1, 'control', @$) } diff --git a/src/parser/unicode/parser-unicode-properties.js b/src/parser/unicode/parser-unicode-properties.js new file mode 100644 index 0000000..55c9731 --- /dev/null +++ b/src/parser/unicode/parser-unicode-properties.js @@ -0,0 +1,395 @@ +/** + * The MIT License (MIT) + * Copyright (c) 2017-present Dmitry Soshnikov + */ + +const NON_BINARY_PROP_NAMES_TO_ALIASES = { + General_Category: 'gc', + Script: 'sc', + Script_Extensions: 'scx', +}; + +const NON_BINARY_ALIASES_TO_PROP_NAMES = inverseMap( + NON_BINARY_PROP_NAMES_TO_ALIASES +); + +const BINARY_PROP_NAMES_TO_ALIASES = { + ASCII: 'ASCII', + ASCII_Hex_Digit: 'AHex', + Alphabetic: 'Alpha', + Any: 'Any', + Assigned: 'Assigned', + Bidi_Control: 'Bidi_C', + Bidi_Mirrored: 'Bidi_M', + Case_Ignorable: 'CI', + Cased: 'Cased', + Changes_When_Casefolded: 'CWCF', + Changes_When_Casemapped: 'CWCM', + Changes_When_Lowercased: 'CWL', + Changes_When_NFKC_Casefolded: 'CWKCF', + Changes_When_Titlecased: 'CWT', + Changes_When_Uppercased: 'CWU', + Dash: 'Dash', + Default_Ignorable_Code_Point: 'DI', + Deprecated: 'Dep', + Diacritic: 'Dia', + Emoji: 'Emoji', + Emoji_Component: 'Emoji_Component', + Emoji_Modifier: 'Emoji_Modifier', + Emoji_Modifier_Base: 'Emoji_Modifier_Base', + Emoji_Presentation: 'Emoji_Presentation', + Extender: 'Ext', + Grapheme_Base: 'Gr_Base', + Grapheme_Extend: 'Gr_Ext', + Hex_Digit: 'Hex', + IDS_Binary_Operator: 'IDSB', + IDS_Trinary_Operator: 'IDST', + ID_Continue: 'IDC', + ID_Start: 'IDS', + Ideographic: 'Ideo', + Join_Control: 'Join_C', + Logical_Order_Exception: 'LOE', + Lowercase: 'Lower', + Math: 'Math', + Noncharacter_Code_Point: 'NChar', + Pattern_Syntax: 'Pat_Syn', + Pattern_White_Space: 'Pat_WS', + Quotation_Mark: 'QMark', + Radical: 'Radical', + Regional_Indicator: 'RI', + Sentence_Terminal: 'STerm', + Soft_Dotted: 
'SD', + Terminal_Punctuation: 'Term', + Unified_Ideograph: 'UIdeo', + Uppercase: 'Upper', + Variation_Selector: 'VS', + White_Space: 'space', + XID_Continue: 'XIDC', + XID_Start: 'XIDS', +}; + +const BINARY_ALIASES_TO_PROP_NAMES = inverseMap(BINARY_PROP_NAMES_TO_ALIASES); + +const GENERAL_CATEGORY_VALUE_TO_ALIASES = { + Cased_Letter: 'LC', + Close_Punctuation: 'Pe', + Connector_Punctuation: 'Pc', + Control: ['Cc', 'cntrl'], + Currency_Symbol: 'Sc', + Dash_Punctuation: 'Pd', + Decimal_Number: ['Nd', 'digit'], + Enclosing_Mark: 'Me', + Final_Punctuation: 'Pf', + Format: 'Cf', + Initial_Punctuation: 'Pi', + Letter: 'L', + Letter_Number: 'Nl', + Line_Separator: 'Zl', + Lowercase_Letter: 'Ll', + Mark: ['M', 'Combining_Mark'], + Math_Symbol: 'Sm', + Modifier_Letter: 'Lm', + Modifier_Symbol: 'Sk', + Nonspacing_Mark: 'Mn', + Number: 'N', + Open_Punctuation: 'Ps', + Other: 'C', + Other_Letter: 'Lo', + Other_Number: 'No', + Other_Punctuation: 'Po', + Other_Symbol: 'So', + Paragraph_Separator: 'Zp', + Private_Use: 'Co', + Punctuation: ['P', 'punct'], + Separator: 'Z', + Space_Separator: 'Zs', + Spacing_Mark: 'Mc', + Surrogate: 'Cs', + Symbol: 'S', + Titlecase_Letter: 'Lt', + Unassigned: 'Cn', + Uppercase_Letter: 'Lu', +}; + +const GENERAL_CATEGORY_VALUE_ALIASES_TO_VALUES = inverseMap( + GENERAL_CATEGORY_VALUE_TO_ALIASES +); + +const SCRIPT_VALUE_TO_ALIASES = { + Adlam: 'Adlm', + Ahom: 'Ahom', + Anatolian_Hieroglyphs: 'Hluw', + Arabic: 'Arab', + Armenian: 'Armn', + Avestan: 'Avst', + Balinese: 'Bali', + Bamum: 'Bamu', + Bassa_Vah: 'Bass', + Batak: 'Batk', + Bengali: 'Beng', + Bhaiksuki: 'Bhks', + Bopomofo: 'Bopo', + Brahmi: 'Brah', + Braille: 'Brai', + Buginese: 'Bugi', + Buhid: 'Buhd', + Canadian_Aboriginal: 'Cans', + Carian: 'Cari', + Caucasian_Albanian: 'Aghb', + Chakma: 'Cakm', + Cham: 'Cham', + Cherokee: 'Cher', + Common: 'Zyyy', + Coptic: ['Copt', 'Qaac'], + Cuneiform: 'Xsux', + Cypriot: 'Cprt', + Cyrillic: 'Cyrl', + Deseret: 'Dsrt', + Devanagari: 'Deva', + Duployan: 
'Dupl', + Egyptian_Hieroglyphs: 'Egyp', + Elbasan: 'Elba', + Ethiopic: 'Ethi', + Georgian: 'Geor', + Glagolitic: 'Glag', + Gothic: 'Goth', + Grantha: 'Gran', + Greek: 'Grek', + Gujarati: 'Gujr', + Gurmukhi: 'Guru', + Han: 'Hani', + Hangul: 'Hang', + Hanunoo: 'Hano', + Hatran: 'Hatr', + Hebrew: 'Hebr', + Hiragana: 'Hira', + Imperial_Aramaic: 'Armi', + Inherited: ['Zinh', 'Qaai'], + Inscriptional_Pahlavi: 'Phli', + Inscriptional_Parthian: 'Prti', + Javanese: 'Java', + Kaithi: 'Kthi', + Kannada: 'Knda', + Katakana: 'Kana', + Kayah_Li: 'Kali', + Kharoshthi: 'Khar', + Khmer: 'Khmr', + Khojki: 'Khoj', + Khudawadi: 'Sind', + Lao: 'Laoo', + Latin: 'Latn', + Lepcha: 'Lepc', + Limbu: 'Limb', + Linear_A: 'Lina', + Linear_B: 'Linb', + Lisu: 'Lisu', + Lycian: 'Lyci', + Lydian: 'Lydi', + Mahajani: 'Mahj', + Malayalam: 'Mlym', + Mandaic: 'Mand', + Manichaean: 'Mani', + Marchen: 'Marc', + Masaram_Gondi: 'Gonm', + Meetei_Mayek: 'Mtei', + Mende_Kikakui: 'Mend', + Meroitic_Cursive: 'Merc', + Meroitic_Hieroglyphs: 'Mero', + Miao: 'Plrd', + Modi: 'Modi', + Mongolian: 'Mong', + Mro: 'Mroo', + Multani: 'Mult', + Myanmar: 'Mymr', + Nabataean: 'Nbat', + New_Tai_Lue: 'Talu', + Newa: 'Newa', + Nko: 'Nkoo', + Nushu: 'Nshu', + Ogham: 'Ogam', + Ol_Chiki: 'Olck', + Old_Hungarian: 'Hung', + Old_Italic: 'Ital', + Old_North_Arabian: 'Narb', + Old_Permic: 'Perm', + Old_Persian: 'Xpeo', + Old_South_Arabian: 'Sarb', + Old_Turkic: 'Orkh', + Oriya: 'Orya', + Osage: 'Osge', + Osmanya: 'Osma', + Pahawh_Hmong: 'Hmng', + Palmyrene: 'Palm', + Pau_Cin_Hau: 'Pauc', + Phags_Pa: 'Phag', + Phoenician: 'Phnx', + Psalter_Pahlavi: 'Phlp', + Rejang: 'Rjng', + Runic: 'Runr', + Samaritan: 'Samr', + Saurashtra: 'Saur', + Sharada: 'Shrd', + Shavian: 'Shaw', + Siddham: 'Sidd', + SignWriting: 'Sgnw', + Sinhala: 'Sinh', + Sora_Sompeng: 'Sora', + Soyombo: 'Soyo', + Sundanese: 'Sund', + Syloti_Nagri: 'Sylo', + Syriac: 'Syrc', + Tagalog: 'Tglg', + Tagbanwa: 'Tagb', + Tai_Le: 'Tale', + Tai_Tham: 'Lana', + Tai_Viet: 'Tavt', + 
Takri: 'Takr', + Tamil: 'Taml', + Tangut: 'Tang', + Telugu: 'Telu', + Thaana: 'Thaa', + Thai: 'Thai', + Tibetan: 'Tibt', + Tifinagh: 'Tfng', + Tirhuta: 'Tirh', + Ugaritic: 'Ugar', + Vai: 'Vaii', + Warang_Citi: 'Wara', + Yi: 'Yiii', + Zanabazar_Square: 'Zanb', +}; + +const SCRIPT_VALUE_ALIASES_TO_VALUE = inverseMap(SCRIPT_VALUE_TO_ALIASES); + +function inverseMap(data) { + const inverse = {}; + + for (const name in data) { + if (!data.hasOwnProperty(name)) { + continue; + } + const value = data[name]; + if (Array.isArray(value)) { + for (let i = 0; i < value.length; i++) { + inverse[value[i]] = name; + } + } else { + inverse[value] = name; + } + } + + return inverse; +} + +function isValidName(name) { + return ( + NON_BINARY_PROP_NAMES_TO_ALIASES.hasOwnProperty(name) || + NON_BINARY_ALIASES_TO_PROP_NAMES.hasOwnProperty(name) || + BINARY_PROP_NAMES_TO_ALIASES.hasOwnProperty(name) || + BINARY_ALIASES_TO_PROP_NAMES.hasOwnProperty(name) + ); +} + +function isValidValue(name, value) { + if (isGeneralCategoryName(name)) { + return isGeneralCategoryValue(value); + } + + if (isScriptCategoryName(name)) { + return isScriptCategoryValue(value); + } + + return false; +} + +function isAlias(name) { + return ( + NON_BINARY_ALIASES_TO_PROP_NAMES.hasOwnProperty(name) || + BINARY_ALIASES_TO_PROP_NAMES.hasOwnProperty(name) + ); +} + +function isGeneralCategoryName(name) { + return name === 'General_Category' || name === 'gc'; +} + +function isScriptCategoryName(name) { + return ( + name === 'Script' || + name === 'Script_Extensions' || + name === 'sc' || + name === 'scx' + ); +} + +function isGeneralCategoryValue(value) { + return ( + GENERAL_CATEGORY_VALUE_TO_ALIASES.hasOwnProperty(value) || + GENERAL_CATEGORY_VALUE_ALIASES_TO_VALUES.hasOwnProperty(value) + ); +} + +function isScriptCategoryValue(value) { + return ( + SCRIPT_VALUE_TO_ALIASES.hasOwnProperty(value) || + SCRIPT_VALUE_ALIASES_TO_VALUE.hasOwnProperty(value) + ); +} + +function isBinaryPropertyName(name) { + return ( + 
BINARY_PROP_NAMES_TO_ALIASES.hasOwnProperty(name) || + BINARY_ALIASES_TO_PROP_NAMES.hasOwnProperty(name) + ); +} + +function getCanonicalName(name) { + if (NON_BINARY_ALIASES_TO_PROP_NAMES.hasOwnProperty(name)) { + return NON_BINARY_ALIASES_TO_PROP_NAMES[name]; + } + + if (BINARY_ALIASES_TO_PROP_NAMES.hasOwnProperty(name)) { + return BINARY_ALIASES_TO_PROP_NAMES[name]; + } + + return null; +} + +function getCanonicalValue(value) { + if (GENERAL_CATEGORY_VALUE_ALIASES_TO_VALUES.hasOwnProperty(value)) { + return GENERAL_CATEGORY_VALUE_ALIASES_TO_VALUES[value]; + } + + if (SCRIPT_VALUE_ALIASES_TO_VALUE.hasOwnProperty(value)) { + return SCRIPT_VALUE_ALIASES_TO_VALUE[value]; + } + + if (BINARY_ALIASES_TO_PROP_NAMES.hasOwnProperty(value)) { + return BINARY_ALIASES_TO_PROP_NAMES[value]; + } + + return null; +} + +module.exports = { + isAlias, + isValidName, + isValidValue, + isGeneralCategoryValue, + isScriptCategoryValue, + isBinaryPropertyName, + getCanonicalName, + getCanonicalValue, + + NON_BINARY_PROP_NAMES_TO_ALIASES, + NON_BINARY_ALIASES_TO_PROP_NAMES, + + BINARY_PROP_NAMES_TO_ALIASES, + BINARY_ALIASES_TO_PROP_NAMES, + + GENERAL_CATEGORY_VALUE_TO_ALIASES, + GENERAL_CATEGORY_VALUE_ALIASES_TO_VALUES, + + SCRIPT_VALUE_TO_ALIASES, + SCRIPT_VALUE_ALIASES_TO_VALUE, +};