From 70c9af0338878be37289e65525b8efc1e5eb6143 Mon Sep 17 00:00:00 2001 From: IKEDA Soji Date: Sun, 25 Jun 2017 12:07:36 +0900 Subject: [PATCH 1/4] Emphasis and East Asian text: code changes. --- src/inlines.c | 10 ++++++++-- src/utf8.c | 19 +++++++++++++++++++ src/utf8.h | 1 + 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/inlines.c b/src/inlines.c index bbda78f3b..4c21e8137 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -421,17 +421,23 @@ static int scan_delims(subject *subj, unsigned char c, bool *can_open, } left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) && (!cmark_utf8proc_is_punctuation(after_char) || + cmark_utf8proc_is_eastasian_punctuation(after_char) || cmark_utf8proc_is_space(before_char) || cmark_utf8proc_is_punctuation(before_char)); right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) && (!cmark_utf8proc_is_punctuation(before_char) || + cmark_utf8proc_is_eastasian_punctuation(before_char) || cmark_utf8proc_is_space(after_char) || cmark_utf8proc_is_punctuation(after_char)); if (c == '_') { *can_open = left_flanking && - (!right_flanking || cmark_utf8proc_is_punctuation(before_char)); + (!right_flanking || + cmark_utf8proc_is_punctuation(before_char) || + cmark_utf8proc_is_eastasian_punctuation(after_char)); *can_close = right_flanking && - (!left_flanking || cmark_utf8proc_is_punctuation(after_char)); + (!left_flanking || + cmark_utf8proc_is_punctuation(after_char) || + cmark_utf8proc_is_eastasian_punctuation(before_char)); } else if (c == '\'' || c == '"') { *can_open = left_flanking && !right_flanking && before_char != ']' && before_char != ')'; diff --git a/src/utf8.c b/src/utf8.c index c29bbf770..6cfd676b9 100644 --- a/src/utf8.c +++ b/src/utf8.c @@ -315,3 +315,22 @@ int cmark_utf8proc_is_punctuation(int32_t uc) { uc == 92917 || (uc >= 92983 && uc <= 92987) || uc == 92996 || uc == 113823); } + +// matches punctuations with East_Asian_Width property value A, W, F or H. 
+int cmark_utf8proc_is_eastasian_punctuation(int32_t uc) { + return ( + uc == 161 || uc == 183 || uc == 191 || uc == 8208 || + (uc >= 8211 && uc <= 8214) || uc == 8216 || uc == 8217 || uc == 8220 || + uc == 8221 || (uc >= 8224 && uc <= 8226) || (uc >= 8228 && uc <= 8231) || + uc == 8240 || uc == 8242 || uc == 8243 || uc == 8245 || uc == 8251 || + uc == 8254 || uc == 9001 || uc == 9002 || (uc >= 12289 && uc <= 12291) || + (uc >= 12296 && uc <= 12305) || (uc >= 12308 && uc <= 12319) || + uc == 12336 || uc == 12349 || uc == 12448 || uc == 12539 || + (uc >= 65040 && uc <= 65049) || (uc >= 65072 && uc <= 65106) || + (uc >= 65108 && uc <= 65121) || uc == 65123 || uc == 65128 || + uc == 65130 || uc == 65131 || (uc >= 65281 && uc <= 65283) || + (uc >= 65285 && uc <= 65290) || (uc >= 65292 && uc <= 65295) || + uc == 65306 || uc == 65307 || uc == 65311 || uc == 65312 || + (uc >= 65339 && uc <= 65341) || uc == 65343 || uc == 65371 || + uc == 65373 || (uc >= 65375 && uc <= 65381)); +} diff --git a/src/utf8.h b/src/utf8.h index 8e45714d4..91b8a6b01 100644 --- a/src/utf8.h +++ b/src/utf8.h @@ -16,6 +16,7 @@ void cmark_utf8proc_check(cmark_strbuf *dest, const uint8_t *line, bufsize_t size); int cmark_utf8proc_is_space(int32_t uc); int cmark_utf8proc_is_punctuation(int32_t uc); +int cmark_utf8proc_is_eastasian_punctuation(int32_t uc); #ifdef __cplusplus } From d529d6c8f001167bfb91d856299c0eab7c748afc Mon Sep 17 00:00:00 2001 From: IKEDA Soji Date: Sun, 25 Jun 2017 12:12:37 +0900 Subject: [PATCH 2/4] Emphasis and East Asian text: Proposed changes to spec. --- test/spec.txt | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/test/spec.txt b/test/spec.txt index 686283830..fd914c943 100644 --- a/test/spec.txt +++ b/test/spec.txt @@ -335,6 +335,9 @@ A [punctuation character](@) is an [ASCII punctuation character] or anything in the general Unicode categories `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`. 
+An [East Asian punctuation character](@) is a [punctuation character] with +the East_Asian_Width property values `A`, `F`, `H` or `W`. + ## Tabs Tabs in lines are not expanded to [spaces]. However, @@ -5984,14 +5987,16 @@ a non-backslash-escaped `_` character. A [left-flanking delimiter run](@) is a [delimiter run] that is (a) not followed by [Unicode whitespace], -and (b) not followed by a [punctuation character], or +and (b) not followed by a [punctuation character] which is not +[East Asian punctuation character], or preceded by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. A [right-flanking delimiter run](@) is a [delimiter run] that is (a) not preceded by [Unicode whitespace], -and (b) not preceded by a [punctuation character], or +and (b) not preceded by a [punctuation character] which is not +[East Asian punctuation character], or followed by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. @@ -6045,36 +6050,44 @@ The following rules define emphasis and strong emphasis: 2. A single `_` character [can open emphasis] iff it is part of a [left-flanking delimiter run] - and either (a) not part of a [right-flanking delimiter run] - or (b) part of a [right-flanking delimiter run] - preceded by punctuation. + and either (a) not part of a [right-flanking delimiter run], + (b) part of a [right-flanking delimiter run] + preceded by punctuation + or (c) part of a [right-flanking delimiter run] + followed by East Asian punctuation. 3. A single `*` character [can close emphasis](@) iff it is part of a [right-flanking delimiter run]. 4. A single `_` character [can close emphasis] iff it is part of a [right-flanking delimiter run] - and either (a) not part of a [left-flanking delimiter run] - or (b) part of a [left-flanking delimiter run] - followed by punctuation. 
+ and either (a) not part of a [left-flanking delimiter run], + (b) part of a [left-flanking delimiter run] + followed by punctuation + or (c) part of a [left-flanking delimiter run] + preceded by East Asian punctuation. 5. A double `**` [can open strong emphasis](@) iff it is part of a [left-flanking delimiter run]. 6. A double `__` [can open strong emphasis] iff it is part of a [left-flanking delimiter run] - and either (a) not part of a [right-flanking delimiter run] - or (b) part of a [right-flanking delimiter run] - preceded by punctuation. + and either (a) not part of a [right-flanking delimiter run], + (b) part of a [right-flanking delimiter run] + preceded by punctuation + or (c) part of a [right-flanking delimiter run] + followed by East Asian punctuation. 7. A double `**` [can close strong emphasis](@) iff it is part of a [right-flanking delimiter run]. 8. A double `__` [can close strong emphasis] iff it is part of a [right-flanking delimiter run] - and either (a) not part of a [left-flanking delimiter run] - or (b) part of a [left-flanking delimiter run] - followed by punctuation. + and either (a) not part of a [left-flanking delimiter run], + (b) part of a [left-flanking delimiter run] + followed by punctuation + or (c) part of a [left-flanking delimiter run] + preceded by East Asian punctuation. 9. Emphasis begins with a delimiter that [can open emphasis] and ends with a delimiter that [can close emphasis], and that uses the same From ff2a079b0091041547b7374f92b4e942dc0a117a Mon Sep 17 00:00:00 2001 From: IKEDA Soji Date: Sun, 25 Jun 2017 12:18:09 +0900 Subject: [PATCH 3/4] Emphasis and East Asian text: Adding test cases. 
--- test/spec.txt | 144 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) diff --git a/test/spec.txt b/test/spec.txt index fd914c943..b8cb27b5e 100644 --- a/test/spec.txt +++ b/test/spec.txt @@ -6177,6 +6177,24 @@ a*"foo"* ```````````````````````````````` +This is emphasis, because the opening `*` is preceded +by an alphanumeric and followed by East Asian punctuation, and hence +part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a*「foo」* +. +

<p>a<em>「foo」</em></p>

+```````````````````````````````` + + +```````````````````````````````` example +『a』*「foo」* +. +

<p>『a』<em>「foo」</em></p>

+```````````````````````````````` + + Unicode nonbreaking spaces count as whitespace, too: ```````````````````````````````` example @@ -6202,6 +6220,16 @@ foo*bar* ```````````````````````````````` +Emphasis including East Asian punctuation without leading whitespace is +permitted: + +```````````````````````````````` example +5*「6」*78 +. +

<p>5<em>「6」</em>78</p>

+```````````````````````````````` + + Rule 2: ```````````````````````````````` example @@ -6231,6 +6259,25 @@ a_"foo"_ ```````````````````````````````` +This is emphasis, because the opening `_` is preceded +by an alphanumeric and followed by East Asian punctuation, and hence +part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a_「foo」_ +. +

<p>a<em>「foo」</em></p>

+```````````````````````````````` + + +```````````````````````````````` example +『a』_「foo」_ +. +

<p>『a』<em>「foo」</em></p>

+```````````````````````````````` + + +Unicode nonbreaking spaces count as whitespace, too: Emphasis with `_` is not allowed inside words: ```````````````````````````````` example @@ -6254,6 +6301,13 @@ foo_bar_ ```````````````````````````````` +```````````````````````````````` example +五_六_七八 +. +

<p>五_六_七八</p>

+```````````````````````````````` + + Here `_` does not generate emphasis, because the first delimiter run is right-flanking and the second left-flanking: @@ -6393,6 +6447,13 @@ _foo_bar_baz_ ```````````````````````````````` +```````````````````````````````` example +_み_か_ん_ +. +

<p><em>み_か_ん</em></p>

+```````````````````````````````` + + This is emphasis, even though the closing delimiter is both left- and right-flanking, because it is followed by punctuation: @@ -6434,6 +6495,24 @@ a**"foo"** ```````````````````````````````` +This is strong emphasis, because the opening `**` is preceded +by an alphanumeric and followed by East Asian punctuation, and hence +part of a [left-flanking delimiter run]: + +```````````````````````````````` example +a**「foo」** +. +

<p>a<strong>「foo」</strong></p>

+```````````````````````````````` + + +```````````````````````````````` example +『a』**「foo」** +. +

<p>『a』<strong>「foo」</strong></p>

+```````````````````````````````` + + Intraword strong emphasis with `**` is permitted: ```````````````````````````````` example @@ -6482,6 +6561,23 @@ a__"foo"__ ```````````````````````````````` +This is strong emphasis, because the opening `__` is preceded +by an alphanumeric and followed by East Asian punctuation: + +```````````````````````````````` example +a__「foo」__ +. +

<p>a<strong>「foo」</strong></p>

+```````````````````````````````` + + +```````````````````````````````` example +『a』__「foo」__ +. +

<p>『a』<strong>「foo」</strong></p>

+```````````````````````````````` + + Intraword strong emphasis is forbidden with `__`: ```````````````````````````````` example @@ -6505,6 +6601,13 @@ foo__bar__ ```````````````````````````````` +```````````````````````````````` example +五__六__七八 +. +

<p>五__六__七八</p>

+```````````````````````````````` + + ```````````````````````````````` example __foo, __bar__, baz__ . @@ -6512,6 +6615,15 @@ __foo, __bar__, baz__ ```````````````````````````````` +East Asian punctuation introduces ambiguity: + +```````````````````````````````` example +__foo、__bar__、baz__ +. +

foo、bar、baz

+```````````````````````````````` + + This is strong emphasis, even though the opening delimiter is both left- and right-flanking, because it is preceded by punctuation: @@ -6549,6 +6661,15 @@ preceded by punctuation and followed by an alphanumeric: ```````````````````````````````` +This is not the case with East Asian punctuation: + +```````````````````````````````` example +**〔**foo〕 +. +

foo〕

+```````````````````````````````` + + The point of this restriction is more easily appreciated with these examples: @@ -6575,6 +6696,13 @@ with these examples: ```````````````````````````````` +```````````````````````````````` example +**foo「*bar*」foo** +. +

<p><strong>foo「<em>bar</em>」foo</strong></p>

+```````````````````````````````` + + Intraword emphasis: ```````````````````````````````` example @@ -6606,6 +6734,15 @@ __(__foo) ```````````````````````````````` +This is not the case with East Asian punctuation: + +```````````````````````````````` example +__〔__foo〕 +. +

foo〕

+```````````````````````````````` + + The point of this restriction is more easily appreciated with this example: @@ -6632,6 +6769,13 @@ __пристаням__стремятся ```````````````````````````````` +```````````````````````````````` example +__か__き +. +

<p>__か__き</p>

+```````````````````````````````` + + ```````````````````````````````` example __foo__bar__baz__ . From 9a24044403268ab6c50177b6f3bf7724f8481700 Mon Sep 17 00:00:00 2001 From: IKEDA Soji Date: Tue, 27 Jun 2017 23:48:53 +0900 Subject: [PATCH 4/4] Copyedit on proposed spec. --- test/spec.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/spec.txt b/test/spec.txt index b8cb27b5e..273c843eb 100644 --- a/test/spec.txt +++ b/test/spec.txt @@ -337,6 +337,8 @@ the general Unicode categories `Pc`, `Pd`, `Pe`, `Pf`, `Pi`, `Po`, or `Ps`. An [East Asian punctuation character](@) is a [punctuation character] with the East_Asian_Width property values `A`, `F`, `H` or `W`. +A [non-East Asian punctuation character](@) is a [punctuation character] with +the East_Asian_Width property values `N` or `Na`. ## Tabs @@ -5987,16 +5989,14 @@ a non-backslash-escaped `_` character. A [left-flanking delimiter run](@) is a [delimiter run] that is (a) not followed by [Unicode whitespace], -and (b) not followed by a [punctuation character] which is not -[East Asian punctuation character], or +and (b) not followed by a [non-East Asian punctuation character], or preceded by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. A [right-flanking delimiter run](@) is a [delimiter run] that is (a) not preceded by [Unicode whitespace], -and (b) not preceded by a [punctuation character] which is not -[East Asian punctuation character], or +and (b) not preceded by a [non-East Asian punctuation character], or followed by [Unicode whitespace] or a [punctuation character]. For purposes of this definition, the beginning and the end of the line count as Unicode whitespace.