brucemiller · brucemiller · Aug 25, 2024 · Aug 20, 2024 · Aug 20, 2024 · Aug 20, 2024
diff --git a/lib/LaTeXML/Engine/TeX_Character.pool.ltxml b/lib/LaTeXML/Engine/TeX_Character.pool.ltxml
@@ -14,7 +14,7 @@ package LaTeXML::Package::Pool;
 use strict;
 use warnings;
 use LaTeXML::Package;
-use Unicode::Normalize;
+use LaTeXML::Util::Unicode;
 
 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 # Character Family of primitive control sequences
@@ -70,43 +70,31 @@ sub applyAccent {
 
 # Defines an accent command using a combining char that follows the
 # 1st char of the argument.  In cases where there is no argument, $standalonechar is used.
+# Ideally, the pair matches an entry in Util::Unicode's accent table. (%options is accepted but no longer consulted.)
 sub DefAccent {
   my ($accent, $combiningchar, $standalonechar, %options) = @_;
-  $options{above} = 1 if !(defined $options{above}) && !$options{below};
-  # Used for converting a char used as an above-accent to a combining char (See \accent)
-  AssignMapping('accent_combiner_above', $standalonechar => $combiningchar) if $options{above};
-  AssignMapping('accent_combiner_below', $standalonechar => $combiningchar) unless $options{above};
-  DefMacroI($accent, "{}",
-    Tokens(T_CS('\lx@applyaccent'), T_OTHER($accent),
-      T_OTHER($combiningchar), T_OTHER($standalonechar),
-      T_BEGIN, T_ARG(1), T_END),
+  $accent = T_CS($accent) unless ref $accent;
+  DefPrimitiveI($accent, "{}", sub {
+      my ($stomach, $letter) = @_;
+      applyAccent($stomach, $letter, $combiningchar, $standalonechar,
+        Tokens($accent, T_BEGIN, $letter, T_END)); },
     protected => 1);
   return; }
 
-DefPrimitiveI('\lx@applyaccent', "DefToken Token Token {}", sub {
-    my ($stomach, $accent, $combiningchar, $standalonechar, $letter) = @_;
-    applyAccent($stomach, $letter, $combiningchar->getString, $standalonechar->getString,
-      Tokens(T_CS($accent->getString), T_BEGIN, $letter, T_END)); },
-  mode => 'text');
-
-# This will fail if there really are "assignments" after the number!
-# We're given a number pointing into the font, from which we can derive the standalone char.
-# From that, we want to figure out the combining character, but there could be one for
-# both the above & below cases!  We'll prefer the above case.
+# This will fail if there really are "assignments" after the number! (See TeX Book)
+# We're given a number pointing into the font; the FontMap presumably has the standalone char.
+# If there's no letter to be accented, just use the standalone.
+# Otherwise, use the Util::Unicode module to find the appropriate combining character
 DefPrimitive('\accent Number {}', sub {
     my ($stomach, $num, $letter) = @_;
     my $n        = $num->valueOf;
-    my $fontinfo = lookupFontinfo(LookupValue('textfont_0'));
-    my $acc      = ($fontinfo && $$fontinfo{encoding} ? FontDecode($n, $$fontinfo{encoding}) : chr($n));
-    my $reversion = Invocation(T_CS('\accent'), $num, $letter);
-    # NOTE: REVERSE LOOKUP in above accent list for the non-spacing accent char
-    # BUT, \accent always (?) makes an above type accent... doesn't it?
-    if (my $combiner = LookupMapping('accent_combiner_above', $acc)
-      || LookupMapping('accent_combiner_below', $acc)) {
-      applyAccent($stomach, $letter, $combiner, $acc, $reversion); }
-    else {
-      Warn('unexpected', "accent$n", $stomach, "Accent '$n' not recognized");
-      Box(ToString($letter), undef, undef, $reversion); } });
+    my $encoding = LookupValue('font')->getEncoding || 'OT1';    # current font's encoding; OT1 if none
+    my $char     = FontDecode($n, $encoding);    # $encoding is never false here, so no chr($n) fallback
+    if (my $entry = unicode_accent($char)) {     # known accent: use the table's combiner/standalone pair
+      applyAccent($stomach, $letter, $$entry{combiner}, $$entry{standalone},
+        Invocation(T_CS('\accent'), $num, $letter)); }
+    else {    # Unknown accent ?  Really should OVERLAY it on top of $letter???
+      List(Digest($letter), Box($char)); } });
 
 #======================================================================
 # \chardef        iq provides an alternate way to define a control sequence that returns a character.

diff --git a/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml b/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
@@ -140,37 +140,28 @@ DeclareFontMap('ASCII',
     'p',   'q',   'r',   's',   't',   'u',   'v',   'w',
     'x',   'y',   'z',   "{",   "|",   "}",   "~",   undef]);
 
-# Note that several entries are used for accents, and in practice will actually
-# be used in something like an m:mover; thus they needn't (shouldn't?) be "small"
-# There are also some questions about which choices are best
-# grave & acute accents (entry 0x12 & 0x13) (often typed using 0x60 & 0x27)
-#   are probably best using U+60(grave accent) & U+B4(acute accent)
-#   but could be U+2035 (reversed prime) & U+2032 (prime).  (particularly for math?)
-#   [we do use these for \prime, however!]
-#   or U+02CB (modifier letter grave accent) & U+02CA (modifier letter acute accent)
-# Similarly, hat & tilde (entries 0x5E & 0x7E)
-#   typed using ^ 0x5E circumflex accent) & ~ 0x7E  tilde
-#   are probably best just sticking with U+5E & U+7E
-#   but could be U+02C6 (modifier letter circumflex accent) U+02DC (small tilde)
-# [Note that generally we're using codepoints characterized as "modifier letter"
-# only when no other spacing point is available.]
+# Note that several entries are used for accents.
+# TeX fonts typically contain a standalone version of an accent, ie smallish & raised.
+# We'll consult a table in LaTeXML::Util::Unicode to determine the equivalent combining character,
+# as well as an "unwrapped" one for use in Math tokens (eg. as an overaccent)
+# NOTE: 0x12--0x18, 0x5E-0x5F, 0x7D-0x7F are accents
 DeclareFontMap('OT1',
   ["\x{0393}", "\x{0394}", "\x{0398}", "\x{039B}", "\x{039E}", "\x{03A0}", "\x{03A3}", "\x{03A5}",
     "\x{03A6}", "\x{03A8}",      "\x{03A9}", "\x{FB00}", "\x{FB01}", "\x{FB02}", "\x{FB03}", "\x{FB04}",
     "\x{0131}", "\x{0237}",      UTF(0x60),  UTF(0xB4),  "\x{02C7}", "\x{02D8}", UTF(0xAF),  "\x{02DA}",
     UTF(0xB8),  UTF(0xDF),       UTF(0xE6),  "\x{0153}", UTF(0xF8),  UTF(0xC6),  "\x{152}",  UTF(0xD8),
-    UTF(0xA0) . "\x{0335}", '!', "\x{201D}", '#',        '$',        '%',        '&',       "\x{2019}",
-    '(',                    ')', '*',        '+',        ',',        '-',        '.',       '/',
-    '0',                    '1', '2',        '3',        '4',        '5',        '6',       '7',
-    '8',                    '9', ':',        ';',        UTF(0xA1),  '=',        UTF(0xBF), '?',
-    '@',                    'A', 'B',        'C',        'D',        'E',        'F',       'G',
-    'H',                    'I', 'J',        'K',        'L',        'M',        'N',       'O',
-    'P',                    'Q', 'R',        'S',        'T',        'U',        'V',       'W',
-    'X',                    'Y', 'Z',        '[',        "\x{201C}", ']',        "^",       "\x{02D9}",
-    "\x{2018}",             'a', 'b',        'c',        'd',        'e',        'f',       'g',
-    'h',                    'i', 'j',        'k',        'l',        'm',        'n',       'o',
-    'p',                    'q', 'r',        's',        't',        'u',        'v',       'w',
-    'x',                    'y', 'z',        "\x{2013}", "\x{2014}", "\x{02DD}", UTF(0x7E), UTF(0xA8)]);
+    UTF(0xA0) . "\x{0335}", '!', "\x{201D}", '#',        '$',        '%',        '&',        "\x{2019}",
+    '(',                    ')', '*',        '+',        ',',        '-',        '.',        '/',
+    '0',                    '1', '2',        '3',        '4',        '5',        '6',        '7',
+    '8',                    '9', ':',        ';',        UTF(0xA1),  '=',        UTF(0xBF),  '?',
+    '@',                    'A', 'B',        'C',        'D',        'E',        'F',        'G',
+    'H',                    'I', 'J',        'K',        'L',        'M',        'N',        'O',
+    'P',                    'Q', 'R',        'S',        'T',        'U',        'V',        'W',
+    'X',                    'Y', 'Z',        '[',        "\x{201C}", ']',        "\x{02C6}", "\x{02D9}",
+    "\x{2018}",             'a', 'b',        'c',        'd',        'e',        'f',        'g',
+    'h',                    'i', 'j',        'k',        'l',        'm',        'n',        'o',
+    'p',                    'q', 'r',        's',        't',        'u',        'v',        'w',
+    'x',                    'y', 'z', "\x{2013}", "\x{2014}", "\x{02DD}", "\x{02DC}", UTF(0xA8)]);
 
 DeclareFontMap('OT1',
   ["\x{0393}", "\x{0394}", "\x{0398}", "\x{039B}", "\x{039E}", "\x{03A0}", "\x{03A3}", "\x{03A5}",

diff --git a/lib/LaTeXML/Engine/TeX_Math.pool.ltxml b/lib/LaTeXML/Engine/TeX_Math.pool.ltxml
@@ -618,14 +618,19 @@ DefPrimitive('\mathchardef Token SkipSpaces SkipMatch:=', sub {
     return; });
 
 DefConstructor('\mathaccent Number Digested',
-  "<ltx:XMApp><ltx:XMTok role='OVERACCENT'>#glyph</ltx:XMTok><ltx:XMArg>#2</ltx:XMArg></ltx:XMApp>",
+  "<ltx:XMApp><ltx:XMTok role='#accrole'>#glyph</ltx:XMTok><ltx:XMArg>#2</ltx:XMArg></ltx:XMApp>",
   sizer       => '#2',    # Close enough?
   afterDigest => sub {
     my ($stomach, $whatsit) = @_;
     my $n = $whatsit->getArg(1)->valueOf;
     my ($role, $glyph) = decodeMathChar($n);
-    $whatsit->setProperty(glyph => $glyph)                                  if $glyph;
-    $whatsit->setProperty(font  => LookupValue('font')->specialize($glyph)) if $glyph;
+    my $accrole = 'OVERACCENT';                  # default role when the glyph is not a known accent
+    if (my $entry = unicode_accent($glyph)) {    # known accent: take glyph & role from the accent table
+      $glyph   = $$entry{unwrapped};
+      $accrole = $$entry{pos}; }
+    $whatsit->setProperty(glyph   => $glyph)                                  if $glyph;
+    $whatsit->setProperty(font    => LookupValue('font')->specialize($glyph)) if $glyph;
+    $whatsit->setProperty(accrole => $accrole);    # always set: the XMTok had a role even without a glyph
     return; });
 
 # # Only used for active math characters, so far

diff --git a/lib/LaTeXML/Engine/plain.pool.ltxml b/lib/LaTeXML/Engine/plain.pool.ltxml
@@ -696,21 +696,21 @@ DefPrimitiveI('\pounds',    undef, UTF(0xA3));                          # POUND
 
 DefAccent('\`',           "\x{0300}", UTF(0x60));  # COMBINING GRAVE ACCENT & GRAVE ACCENT
 DefAccent("\\'",          "\x{0301}", UTF(0xB4));  # COMBINING ACUTE ACCENT & ACUTE ACCENT
-DefAccent('\^',           "\x{0302}", UTF(0x5E));  # COMBINING CIRCUMFLEX ACCENT & CIRCUMFLEX ACCENT
+DefAccent('\^',           "\x{0302}", "\x{02C6}"); # COMBINING CIRCUMFLEX ACCENT & MODIFIER LETTER CIRCUMFLEX ACCENT
 DefAccent('\"',           "\x{0308}", UTF(0xA8));  # COMBINING DIAERESIS & DIAERESIS
-DefAccent('\~',           "\x{0303}", "~");        # COMBINING TILDE
+DefAccent('\~',           "\x{0303}", "\x{02DC}"); # COMBINING TILDE & SMALL TILDE
 DefAccent('\=',           "\x{0304}", UTF(0xAF));  # COMBINING MACRON & MACRON
 DefAccent('\.',           "\x{0307}", "\x{02D9}"); # COMBINING DOT ABOVE & DOT ABOVE
 DefAccent('\u',           "\x{0306}", "\x{02D8}"); # COMBINING BREVE & BREVE
 DefAccent('\v',           "\x{030C}", "\x{02C7}"); # COMBINING CARON & CARON
-DefAccent('\@ringaccent', "\x{030A}", "o");        # COMBINING RING ABOVE & non-combining
-DefAccent('\r',           "\x{030A}", "o");        # COMBINING RING ABOVE & non-combining
+DefAccent('\@ringaccent', "\x{030A}", "\x{02DA}"); # COMBINING RING ABOVE & RING ABOVE
+DefAccent('\r',           "\x{030A}", "\x{02DA}"); # COMBINING RING ABOVE & RING ABOVE
 DefAccent('\H',           "\x{030B}", "\x{02DD}"); # COMBINING DOUBLE ACUTE ACCENT & non-combining
 DefAccent('\c',           "\x{0327}", UTF(0xB8), below => 1);    # COMBINING CEDILLA & CEDILLA
     # NOTE: The next two get define for math, as well; See below
-DefAccent('\@text@daccent', "\x{0323}", '.',       below => 1);   # COMBINING DOT BELOW & DOT (?)
-DefAccent('\@text@baccent', "\x{0331}", UTF(0xAF), below => 1);   # COMBINING MACRON BELOW  & MACRON
-DefAccent('\t',             "\x{0361}", "-");    # COMBINING DOUBLE INVERTED BREVE & ???? What????
+DefAccent('\@text@daccent', "\x{0323}", '.', below => 1);    # COMBINING DOT BELOW & FULL STOP (?)
+DefAccent('\@text@baccent', "\x{0331}", '_', below => 1);    # COMBINING MACRON BELOW & LOW LINE
+DefAccent('\t', "\x{0361}", NBSP() . "\x{0361}");  # COMBINING DOUBLE INVERTED BREVE & NBSP followed by the combiner
     # this one's actually defined in mathscinet.sty, but just stick it here!
 DefAccent('\lfhook', "\x{0326}", ",", below => 1);   # COMBINING COMMA BELOW
                                                      # I doubt that latter covers multiple chars...?

diff --git a/lib/LaTeXML/Package/cleveref.sty.ltxml b/lib/LaTeXML/Package/cleveref.sty.ltxml
@@ -77,7 +77,7 @@ sub crefMulti {
     return @tokens; } }
 # Since we're not grouping by type, we're ignoring \crefpairgroupconjunction, etc
 
-DefConstructor('\lx@cref OptionalMatch:* {} Semiverbatim',
+DefConstructor('\lx@cref OptionalMatch:* HyperVerbatim Semiverbatim',
   "<ltx:ref labelref='#label' show='#2' ?#1(class='ltx_nolink')() _force_font='true'/>",
   properties => sub { (label => CleanLabel($_[3])); });
 

diff --git a/lib/LaTeXML/Package/textcomp.sty.ltxml b/lib/LaTeXML/Package/textcomp.sty.ltxml
@@ -25,17 +25,17 @@ DefAccent('\capitalacute',        "\x{0301}", UTF(0xB4));                # \'
 DefAccent('\capitalbreve',        "\x{0306}", "\x{02D8}");               # \u
 DefAccent('\capitalcaron',        "\x{030C}", "\x{02C7}");               # \v
 DefAccent('\capitalcedilla',      "\x{0327}", UTF(0xB8), below => 1);    # \c
-DefAccent('\capitalcircumflex',   "\x{0302}", UTF(0x5E));                # \^
+DefAccent('\capitalcircumflex',   "\x{0302}", "\x{02C6}");               # \^
 DefAccent('\capitaldieresis',     "\x{0308}", UTF(0xA8));                # \"
 DefAccent('\capitaldotaccent',    "\x{0307}", "\x{02D9}");               # \.
 DefAccent('\capitalgrave',        "\x{0300}", UTF(0x60));                # \`
 DefAccent('\capitalhungarumlaut', "\x{030B}", "\x{02DD}");               # \H
 DefAccent('\capitalmacron',       "\x{0304}", UTF(0xAF));                # \=
 DefAccent('\capitalnewtie',       "\x{0361}", "-");                      # \t
 DefAccent('\capitalogonek',       "\x{0328}", "\x{02DB}");               #
-DefAccent('\capitalring',         "\x{030A}", "o");                      # \r
+DefAccent('\capitalring',         "\x{030A}", "\x{02DA}");               # \r
 DefAccent('\capitaltie',          "\x{0361}", "-");                      # \t
-DefAccent('\capitaltilde',        "\x{0303}", "~");                      # \~
+DefAccent('\capitaltilde',        "\x{0303}", "\x{02DC}");               # \~
 DefAccent('\newtie',              "\x{0361}", "-");                      # \t
 
 #======================================================================

diff --git a/lib/LaTeXML/Util/Unicode.pm b/lib/LaTeXML/Util/Unicode.pm
@@ -13,8 +13,7 @@ package LaTeXML::Util::Unicode;
 use strict;
 use warnings;
 use base qw(Exporter);
-our @EXPORT = qw( &UTF &unicode_mathvariant &unicode_convert);
-
+our @EXPORT = qw( &UTF &NBSP &unicode_accent &unicode_mathvariant &unicode_convert);
 #======================================================================
 # Unicode manipulation utilities useful for LaTeXML
 # Mostly, but not exclusively, about Mathematics
@@ -24,6 +23,73 @@ sub UTF {
   my ($code) = @_;
   return pack('U', $code); }
 
+my $NBSP = UTF(0xA0);
+sub NBSP { return $NBSP; }
+
+#======================================================================
+# Accents
+# There are potentially several Unicode codepoints that characterize a given accent:
+#  combiner   : Unicode combining character that effects the accent when following a base char;
+#       generally in a Combining block
+#  standalone : form that shows accent w/o base, but small(ish) and already raised/lowered!
+#       sometimes called "isolated". Usually a "spacing" form, else NBSP followed by combiner.
+#  unwrapped  : form that shows the accent itself, typically larger and on baseline;
+#        Used in operand for eg. MathML mover/munder
+#  name       : arbitrary short descriptive name, for good measure
+#  pos        : 'OVERACCENT' or 'UNDERACCENT'; \mathaccent uses it as the role of the accent token
+# The ideal glyphs don't necessarily exist in Unicode, nor are the best choices always clear.
+# Ideally, we would cover ALL accents that might appear in TeX!
+our @accent_data = (
+  { name => 'grave', combiner => "\x{0300}", standalone => UTF(0x60),    # \`
+    unwrapped => "`", pos => 'OVERACCENT' },                             #  (OR \x{2035} or UTF(0x60) ?)
+  { name => 'acute', combiner => "\x{0301}", standalone => UTF(0xB4),    # \'
+    unwrapped => UTF(0xB4), pos => 'OVERACCENT' },                       # (OR \x{2032} or UTF(0xB4)?)
+  { name => 'hat', combiner => "\x{0302}", standalone => "\x{02C6}",     # \^
+    unwrapped => UTF(0x5E), pos => 'OVERACCENT' },
+  { name => 'ddot', combiner => "\x{0308}", standalone => UTF(0xA8),     # \"
+    unwrapped => UTF(0xA8), pos => 'OVERACCENT' },                       # (or \x{22C5})
+  { name => 'tilde', combiner => "\x{0303}", standalone => "\x{02DC}",    # \~
+    unwrapped => UTF(0x7E), pos => 'OVERACCENT' },
+  { name => 'bar', combiner => "\x{0304}", standalone => UTF(0xAF),       # \=
+    unwrapped => UTF(0xAF), pos => 'OVERACCENT' },
+  { name => 'dot', combiner => "\x{0307}", standalone => "\x{02D9}",      # \.
+    unwrapped => "\x{02D9}", pos => 'OVERACCENT' },                       # (OR \x{22C5} or \x{0209} ?)
+  { name => 'dtick', combiner => "\x{030B}", standalone => "\x{02DD}",    # \H
+    unwrapped => "\x{2032}\x{2032}", pos => 'OVERACCENT' },               # (Or UTF(0xA8) or " ?)
+  { name => 'breve', combiner => "\x{0306}", standalone => "\x{02D8}",    # \u
+    unwrapped => "\x{02D8}", pos => 'OVERACCENT' },
+  { name => 'check', combiner => "\x{030C}", standalone => "\x{02C7}",    # \v
+    unwrapped => "\x{02C7}", pos => 'OVERACCENT' },
+  { name => 'ring', combiner => "\x{030A}", standalone => "\x{02DA}",     # \r
+    unwrapped => "\x{02DA}", pos => 'OVERACCENT' },                       # (or \x{2218} ?)
+  { name => 'vec', combiner => "\x{20D7}", standalone => $NBSP . "\x{20D7}",    # \vec
+    unwrapped => "\x{2192}", pos => 'OVERACCENT' },
+  { name => 'tie', combiner => "\x{0361}", standalone => $NBSP . "\x{0361}",    # \t
+    unwrapped => "u", pos => 'OVERACCENT' },
+  ## UNDERACCENT accents
+  { name => 'cedilla', combiner => "\x{0327}", standalone => UTF(0xB8),         # \c
+    unwrapped => UTF(0xB8), pos => 'UNDERACCENT' },                             # not even math?
+  { name => 'underdot', combiner => "\x{0323}", standalone => '.',              # \@text@daccent
+    unwrapped => "\x{22C5}", pos => 'UNDERACCENT' },                            # (Or \x{02D9} ?)
+  { name => 'underbar', combiner => "\x{0331}", standalone => '_',              # \@text@baccent
+    unwrapped => UTF(0xAF), pos => 'UNDERACCENT' },
+  { name => 'lfhook', combiner => "\x{0326}", standalone => ",",                # \lfhook
+    unwrapped => ',', pos => 'UNDERACCENT' },
+  { name => 'ogonek', combiner => "\x{0328}", standalone => "\x{02DB}",
+    unwrapped => "\x{02DB}", pos => 'UNDERACCENT' },                            # not even math???
+);
+# Set up a hash keyed on both standalone & combiner chars
+our %accent_data = ();
+foreach my $entry (@accent_data) {
+  $accent_data{ $$entry{standalone} } = $entry;
+  $accent_data{ $$entry{combiner} }   = $entry;
+}
+
+# Lookup accent data keyed by either combiner or standalone unicode.
+sub unicode_accent {
+  my ($char) = @_;
+  return (defined $char) && $accent_data{$char}; }
+
 #======================================================================
 # Unicode Math Codepoints
 # The basic latin and greek alphabets, as well as numbers appear in several

diff --git a/t/digestion/io.xml b/t/digestion/io.xml
@@ -341,15 +341,15 @@ Line (eTeX): “macro “foo      followed by spacesﬂ”<break/>
 
 Line (eTeX): “keyword = 1ﬂ”<break/>
 
-Line (eTeX): “fake spaces^^20^^20^^20ﬂ”<break/>
+Line (eTeX): “fake spacesˆˆ20ˆˆ20ˆˆ20ﬂ”<break/>
 
-Line (eTeX): “macro followed by fake spaces “foo^^20^^20^^20ﬂ”<break/>
+Line (eTeX): “macro followed by fake spaces “fooˆˆ20ˆˆ20ˆˆ20ﬂ”<break/>
 
-Line (eTeX): “fake cr^^M apparently looses remainder of lineﬂ”<break/>
+Line (eTeX): “fake crˆˆM apparently looses remainder of lineﬂ”<break/>
 
 Line (eTeX): “face cr on next lineﬂ”<break/>
 
-Line (eTeX): “^^Mﬂ”<break/>
+Line (eTeX): “ˆˆMﬂ”<break/>
 
 Line (eTeX): “line – in bracketsﬂ”<break/>