diff --git a/lib/LaTeXML/Engine/TeX.pool.ltxml b/lib/LaTeXML/Engine/TeX.pool.ltxml
index 01e380c85..c3abcaaf8 100644
--- a/lib/LaTeXML/Engine/TeX.pool.ltxml
+++ b/lib/LaTeXML/Engine/TeX.pool.ltxml
@@ -16,6 +16,7 @@ use warnings;
use LaTeXML::Package;
use Unicode::Normalize;
use LaTeXML::Util::Pathname;
+use charnames ':full';
use List::Util qw(min max);
###$LaTeXML::DEBUG{compiled} = 1 unless $LaTeXML::DEBUG{compiling} || $LaTeXML::DEBUG{nocompiled};
diff --git a/lib/LaTeXML/Engine/TeX_Box.pool.ltxml b/lib/LaTeXML/Engine/TeX_Box.pool.ltxml
index c46060a1e..579719e57 100644
--- a/lib/LaTeXML/Engine/TeX_Box.pool.ltxml
+++ b/lib/LaTeXML/Engine/TeX_Box.pool.ltxml
@@ -53,14 +53,23 @@ DefConstructor('\lx@hidden@egroup', '',
reversion => '');
#======================================================================
-DefMacro('\lx@nounicode {}', '\ifmmode\lx@math@nounicode#1\else\lx@text@nounicode#1\fi');
+# A few useful low-level boxing things
DefConstructor('\lx@framed[]{}',
"#2",
properties => { frame => sub { ToString($_[1] || 'rectangle'); } });
+
DefConstructor('\lx@hflipped{}',
"#1");
+DefConstructor('\lx@overlay{}{}',
+ ""
+ . "#1"
+ . "#2");
+
+#======================================================================
+DefMacro('\lx@nounicode {}', '\ifmmode\lx@math@nounicode#1\else\lx@text@nounicode#1\fi');
+
sub reportNoUnicode {
my ($cs) = @_;
$cs = ToString($cs);
@@ -77,7 +86,7 @@ DefPrimitive('\lx@math@nounicode DefToken', sub {
Box(ToString($cs), undef, undef, $cs, class => 'ltx_nounicode'); });
DefConstructor('\lx@text@nounicode DefToken',
- "#1",
+ "#1",
afterDigest => sub {
reportNoUnicode(ToString($_[1]->getArg(0))); });
diff --git a/lib/LaTeXML/Engine/TeX_Character.pool.ltxml b/lib/LaTeXML/Engine/TeX_Character.pool.ltxml
index 8eaeafa93..c727ed67d 100644
--- a/lib/LaTeXML/Engine/TeX_Character.pool.ltxml
+++ b/lib/LaTeXML/Engine/TeX_Character.pool.ltxml
@@ -14,7 +14,7 @@ package LaTeXML::Package::Pool;
use strict;
use warnings;
use LaTeXML::Package;
-use Unicode::Normalize;
+use LaTeXML::Util::Unicode;
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Character Family of primitive control sequences
@@ -70,43 +70,31 @@ sub applyAccent {
# Defines an accent command using a combining char that follows the
# 1st char of the argument. In cases where there is no argument, $standalonechar is used.
+# Ideally, the pair matches an entry in Util::Unicode's accents. NOTE: %options (eg. below=>1) is still accepted, but is not used here.
sub DefAccent {
my ($accent, $combiningchar, $standalonechar, %options) = @_;
- $options{above} = 1 if !(defined $options{above}) && !$options{below};
- # Used for converting a char used as an above-accent to a combining char (See \accent)
- AssignMapping('accent_combiner_above', $standalonechar => $combiningchar) if $options{above};
- AssignMapping('accent_combiner_below', $standalonechar => $combiningchar) unless $options{above};
- DefMacroI($accent, "{}",
- Tokens(T_CS('\lx@applyaccent'), T_OTHER($accent),
- T_OTHER($combiningchar), T_OTHER($standalonechar),
- T_BEGIN, T_ARG(1), T_END),
+ $accent = T_CS($accent) unless ref $accent; # accept the control sequence as a plain string or as a Token
+ DefPrimitiveI($accent, "{}", sub {
+ my ($stomach, $letter) = @_;
+ applyAccent($stomach, $letter, $combiningchar, $standalonechar,
+ Tokens($accent, T_BEGIN, $letter, T_END)); }, # last argument is the reversion: \accent{letter}
protected => 1);
return; }
-DefPrimitiveI('\lx@applyaccent', "DefToken Token Token {}", sub {
- my ($stomach, $accent, $combiningchar, $standalonechar, $letter) = @_;
- applyAccent($stomach, $letter, $combiningchar->getString, $standalonechar->getString,
- Tokens(T_CS($accent->getString), T_BEGIN, $letter, T_END)); },
- mode => 'text');
-
-# This will fail if there really are "assignments" after the number!
-# We're given a number pointing into the font, from which we can derive the standalone char.
-# From that, we want to figure out the combining character, but there could be one for
-# both the above & below cases! We'll prefer the above case.
+# This will fail if there really are "assignments" after the number! (See TeX Book)
+# We're given a number pointing into the font; the FontMap presumably has the standalone char.
+# If there's no letter to be accented, just use the standalone.
+# Otherwise, use the Util::Unicode module to find the appropriate combining character
DefPrimitive('\accent Number {}', sub {
my ($stomach, $num, $letter) = @_;
my $n = $num->valueOf;
- my $fontinfo = lookupFontinfo(LookupValue('textfont_0'));
- my $acc = ($fontinfo && $$fontinfo{encoding} ? FontDecode($n, $$fontinfo{encoding}) : chr($n));
- my $reversion = Invocation(T_CS('\accent'), $num, $letter);
- # NOTE: REVERSE LOOKUP in above accent list for the non-spacing accent char
- # BUT, \accent always (?) makes an above type accent... doesn't it?
- if (my $combiner = LookupMapping('accent_combiner_above', $acc)
- || LookupMapping('accent_combiner_below', $acc)) {
- applyAccent($stomach, $letter, $combiner, $acc, $reversion); }
- else {
- Warn('unexpected', "accent$n", $stomach, "Accent '$n' not recognized");
- Box(ToString($letter), undef, undef, $reversion); } });
+ my $encoding = LookupValue('font')->getEncoding || 'OT1'; # fall back to OT1 when the current font reports no encoding
+ my $char = FontDecode($n, $encoding); # $encoding is never empty here, so no chr($n) fallback is needed
+ if (my $entry = unicode_accent($char)) { # known accent: look up its combiner & standalone forms
+ applyAccent($stomach, $letter, $$entry{combiner}, $$entry{standalone},
+ Invocation(T_CS('\accent'), $num, $letter)); }
+ else { # Unknown accent ? Attempt to OVERLAY the accent on top of $letter
+ Digest(Tokens(T_CS('\lx@overlay'), T_BEGIN, $letter, T_END, T_BEGIN, T_OTHER($char), T_END)); } });
#======================================================================
# \chardef iq provides an alternate way to define a control sequence that returns a character.
diff --git a/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml b/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
index 9f3d82b21..b8cecb755 100644
--- a/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
+++ b/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
@@ -140,37 +140,28 @@ DeclareFontMap('ASCII',
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
'x', 'y', 'z', "{", "|", "}", "~", undef]);
-# Note that several entries are used for accents, and in practice will actually
-# be used in something like an m:mover; thus they needn't (shouldn't?) be "small"
-# There are also some questions about which choices are best
-# grave & acute accents (entry 0x12 & 0x13) (often typed using 0x60 & 0x27)
-# are probably best using U+60(grave accent) & U+B4(acute accent)
-# but could be U+2035 (reversed prime) & U+2032 (prime). (particularly for math?)
-# [we do use these for \prime, however!]
-# or U+02CB (modifier letter grave accent) & U+02CA (modifier letter acute accent)
-# Similarly, hat & tilde (entries 0x5E & 0x7E)
-# typed using ^ 0x5E circumflex accent) & ~ 0x7E tilde
-# are probably best just sticking with U+5E & U+7E
-# but could be U+02C6 (modifier letter circumflex accent) U+02DC (small tilde)
-# [Note that generally we're using codepoints characterized as "modifier letter"
-# only when no other spacing point is available.]
+# Note that several entries are used for accents.
+# TeX fonts typically contain a standalone version of an accent, ie smallish & raised.
+# We'll consult a table in LaTeXML::Util::Unicode to determine the equivalent combining character,
+# as well as an "unwrapped" one for use in Math tokens (eg. as an overaccent)
+# NOTE: 0x12--0x18, 0x5E-0x5F, 0x7D-0x7F are accents
DeclareFontMap('OT1',
["\x{0393}", "\x{0394}", "\x{0398}", "\x{039B}", "\x{039E}", "\x{03A0}", "\x{03A3}", "\x{03A5}",
"\x{03A6}", "\x{03A8}", "\x{03A9}", "\x{FB00}", "\x{FB01}", "\x{FB02}", "\x{FB03}", "\x{FB04}",
"\x{0131}", "\x{0237}", UTF(0x60), UTF(0xB4), "\x{02C7}", "\x{02D8}", UTF(0xAF), "\x{02DA}",
UTF(0xB8), UTF(0xDF), UTF(0xE6), "\x{0153}", UTF(0xF8), UTF(0xC6), "\x{152}", UTF(0xD8),
- UTF(0xA0) . "\x{0335}", '!', "\x{201D}", '#', '$', '%', '&', "\x{2019}",
- '(', ')', '*', '+', ',', '-', '.', '/',
- '0', '1', '2', '3', '4', '5', '6', '7',
- '8', '9', ':', ';', UTF(0xA1), '=', UTF(0xBF), '?',
- '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
- 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
- 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
- 'X', 'Y', 'Z', '[', "\x{201C}", ']', "^", "\x{02D9}",
- "\x{2018}", 'a', 'b', 'c', 'd', 'e', 'f', 'g',
- 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
- 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
- 'x', 'y', 'z', "\x{2013}", "\x{2014}", "\x{02DD}", UTF(0x7E), UTF(0xA8)]);
+ UTF(0xA0) . "\x{0335}", '!', "\x{201D}", '#', '$', '%', '&', "\x{2019}",
+ '(', ')', '*', '+', ',', '-', '.', '/',
+ '0', '1', '2', '3', '4', '5', '6', '7',
+ '8', '9', ':', ';', UTF(0xA1), '=', UTF(0xBF), '?',
+ '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
+ 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
+ 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
+ 'X', 'Y', 'Z', '[', "\x{201C}", ']', "\x{02C6}", "\x{02D9}",
+ "\x{2018}", 'a', 'b', 'c', 'd', 'e', 'f', 'g',
+ 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
+ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
+ 'x', 'y', 'z', "\x{2013}", "\x{2014}", "\x{02DD}", "\x{02DC}", UTF(0xA8)]);
DeclareFontMap('OT1',
["\x{0393}", "\x{0394}", "\x{0398}", "\x{039B}", "\x{039E}", "\x{03A0}", "\x{03A3}", "\x{03A5}",
diff --git a/lib/LaTeXML/Engine/TeX_Math.pool.ltxml b/lib/LaTeXML/Engine/TeX_Math.pool.ltxml
index b6c0f6437..5ebd644eb 100644
--- a/lib/LaTeXML/Engine/TeX_Math.pool.ltxml
+++ b/lib/LaTeXML/Engine/TeX_Math.pool.ltxml
@@ -618,14 +618,19 @@ DefPrimitive('\mathchardef Token SkipSpaces SkipMatch:=', sub {
return; });
DefConstructor('\mathaccent Number Digested',
- "#glyph#2",
+ "#glyph#2",
sizer => '#2', # Close enough?
afterDigest => sub {
my ($stomach, $whatsit) = @_;
my $n = $whatsit->getArg(1)->valueOf;
my ($role, $glyph) = decodeMathChar($n);
- $whatsit->setProperty(glyph => $glyph) if $glyph;
- $whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph;
+ my $accrole = 'OVERACCENT'; # role used when the glyph is not in the accent table
+ if (my $entry = unicode_accent($glyph)) { # is it a known accent (keyed by standalone or combiner char)?
+ $glyph = $$entry{unwrapped}; # swap in the table's 'unwrapped' form of the accent
+ $accrole = $$entry{role}; } # OVERACCENT or UNDERACCENT, as recorded in the table
+ $whatsit->setProperty(glyph => $glyph) if $glyph;
+ $whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph;
+ $whatsit->setProperty(accrole => $accrole) if $glyph;
return; });
# # Only used for active math characters, so far
diff --git a/lib/LaTeXML/Engine/plain.pool.ltxml b/lib/LaTeXML/Engine/plain.pool.ltxml
index 4047a73b1..672d71c87 100644
--- a/lib/LaTeXML/Engine/plain.pool.ltxml
+++ b/lib/LaTeXML/Engine/plain.pool.ltxml
@@ -14,6 +14,7 @@ package LaTeXML::Package::Pool;
use strict;
use warnings;
use LaTeXML::Package;
+use charnames ':full';
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#**********************************************************************
@@ -696,21 +697,21 @@ DefPrimitiveI('\pounds', undef, UTF(0xA3)); # POUND
DefAccent('\`', "\x{0300}", UTF(0x60)); # COMBINING GRAVE ACCENT & GRAVE ACCENT
DefAccent("\\'", "\x{0301}", UTF(0xB4)); # COMBINING ACUTE ACCENT & ACUTE ACCENT
-DefAccent('\^', "\x{0302}", UTF(0x5E)); # COMBINING CIRCUMFLEX ACCENT & CIRCUMFLEX ACCENT
+DefAccent('\^', "\x{0302}", "\x{02C6}"); # COMBINING CIRCUMFLEX ACCENT & MODIFIER LETTER CIRCUMFLEX ACCENT
DefAccent('\"', "\x{0308}", UTF(0xA8)); # COMBINING DIAERESIS & DIAERESIS
-DefAccent('\~', "\x{0303}", "~"); # COMBINING TILDE
+DefAccent('\~', "\x{0303}", "\x{02DC}"); # COMBINING TILDE & SMALL TILDE
DefAccent('\=', "\x{0304}", UTF(0xAF)); # COMBINING MACRON & MACRON
DefAccent('\.', "\x{0307}", "\x{02D9}"); # COMBINING DOT ABOVE & DOT ABOVE
DefAccent('\u', "\x{0306}", "\x{02D8}"); # COMBINING BREVE & BREVE
DefAccent('\v', "\x{030C}", "\x{02C7}"); # COMBINING CARON & CARON
-DefAccent('\@ringaccent', "\x{030A}", "o"); # COMBINING RING ABOVE & non-combining
-DefAccent('\r', "\x{030A}", "o"); # COMBINING RING ABOVE & non-combining
+DefAccent('\@ringaccent', "\x{030A}", "\x{02DA}"); # COMBINING RING ABOVE & RING ABOVE
+DefAccent('\r', "\x{030A}", "\x{02DA}"); # COMBINING RING ABOVE & RING ABOVE
DefAccent('\H', "\x{030B}", "\x{02DD}"); # COMBINING DOUBLE ACUTE ACCENT & non-combining
DefAccent('\c', "\x{0327}", UTF(0xB8), below => 1); # COMBINING CEDILLA & CEDILLA
# NOTE: The next two get define for math, as well; See below
-DefAccent('\@text@daccent', "\x{0323}", '.', below => 1); # COMBINING DOT BELOW & DOT (?)
-DefAccent('\@text@baccent', "\x{0331}", UTF(0xAF), below => 1); # COMBINING MACRON BELOW & MACRON
-DefAccent('\t', "\x{0361}", "-"); # COMBINING DOUBLE INVERTED BREVE & ???? What????
+DefAccent('\@text@daccent', "\x{0323}", '.', below => 1); # COMBINING DOT BELOW & DOT (?)
+DefAccent('\@text@baccent', "\x{0331}", '_', below => 1); # COMBINING MACRON BELOW & LOW LINE
+DefAccent('\t', "\x{0361}", "\N{NBSP}\x{0361}"); # COMBINING DOUBLE INVERTED BREVE & NBSP followed by the combiner
# this one's actually defined in mathscinet.sty, but just stick it here!
DefAccent('\lfhook', "\x{0326}", ",", below => 1); # COMBINING COMMA BELOW
# I doubt that latter covers multiple chars...?
diff --git a/lib/LaTeXML/Package/cleveref.sty.ltxml b/lib/LaTeXML/Package/cleveref.sty.ltxml
index 3336bb149..cff030f18 100644
--- a/lib/LaTeXML/Package/cleveref.sty.ltxml
+++ b/lib/LaTeXML/Package/cleveref.sty.ltxml
@@ -77,7 +77,7 @@ sub crefMulti {
return @tokens; } }
# Since we're not grouping by type, we're ignoring \crefpairgroupconjunction, etc
-DefConstructor('\lx@cref OptionalMatch:* {} Semiverbatim',
+DefConstructor('\lx@cref OptionalMatch:* HyperVerbatim Semiverbatim',
"",
properties => sub { (label => CleanLabel($_[3])); });
diff --git a/lib/LaTeXML/Package/textcomp.sty.ltxml b/lib/LaTeXML/Package/textcomp.sty.ltxml
index ea0cc74ef..e8e363455 100644
--- a/lib/LaTeXML/Package/textcomp.sty.ltxml
+++ b/lib/LaTeXML/Package/textcomp.sty.ltxml
@@ -25,7 +25,7 @@ DefAccent('\capitalacute', "\x{0301}", UTF(0xB4)); # \'
DefAccent('\capitalbreve', "\x{0306}", "\x{02D8}"); # \u
DefAccent('\capitalcaron', "\x{030C}", "\x{02C7}"); # \v
DefAccent('\capitalcedilla', "\x{0327}", UTF(0xB8), below => 1); # \c
-DefAccent('\capitalcircumflex', "\x{0302}", UTF(0x5E)); # \^
+DefAccent('\capitalcircumflex', "\x{0302}", "\x{02C6}"); # \^
DefAccent('\capitaldieresis', "\x{0308}", UTF(0xA8)); # \"
DefAccent('\capitaldotaccent', "\x{0307}", "\x{02D9}"); # \.
DefAccent('\capitalgrave', "\x{0300}", UTF(0x60)); # \`
@@ -33,9 +33,9 @@ DefAccent('\capitalhungarumlaut', "\x{030B}", "\x{02DD}"); # \H
DefAccent('\capitalmacron', "\x{0304}", UTF(0xAF)); # \=
DefAccent('\capitalnewtie', "\x{0361}", "-"); # \t
DefAccent('\capitalogonek', "\x{0328}", "\x{02DB}"); #
-DefAccent('\capitalring', "\x{030A}", "o"); # \r
+DefAccent('\capitalring', "\x{030A}", "\x{02DA}"); # \r
DefAccent('\capitaltie', "\x{0361}", "-"); # \t
-DefAccent('\capitaltilde', "\x{0303}", "~"); # \~
+DefAccent('\capitaltilde', "\x{0303}", "\x{02DC}"); # \~
DefAccent('\newtie', "\x{0361}", "-"); # \t
#======================================================================
diff --git a/lib/LaTeXML/Post.pm b/lib/LaTeXML/Post.pm
index 8159e3d12..8859e5d72 100644
--- a/lib/LaTeXML/Post.pm
+++ b/lib/LaTeXML/Post.pm
@@ -266,6 +266,7 @@ use LaTeXML::Post;
use LaTeXML::Common::Error;
use base qw(LaTeXML::Post::Processor);
use LaTeXML::Common::XML;
+use charnames ':full';
# This is an abstract class; A complete MathProcessor will need to define:
# $self->convertNode($doc,$xmath)
@@ -451,7 +452,6 @@ sub combineParallel {
# AND the nested math needs to be converted to ONLY the current target's markup
# NOT parallel within each nested math, although it should still be cross-referencable to others!
# moreover, the math will need the outerWrapper.
-my $NBSP = pack('U', 0xA0); # CONSTANT
sub convertXMTextContent {
my ($self, $doc, $convertspaces, @nodes) = @_;
@@ -460,7 +460,7 @@ sub convertXMTextContent {
if ($node->nodeType == XML_TEXT_NODE) {
my $string = $node->textContent;
if ($convertspaces) {
- $string =~ s/^\s+/$NBSP/; $string =~ s/\s+$/$NBSP/; }
+ $string =~ s/^\s+/\N{NBSP}/; $string =~ s/\s+$/\N{NBSP}/; }
push(@result, $string); }
else {
my $tag = $doc->getQName($node);
diff --git a/lib/LaTeXML/Post/CrossRef.pm b/lib/LaTeXML/Post/CrossRef.pm
index aed2f74e4..72cbc3a0c 100644
--- a/lib/LaTeXML/Post/CrossRef.pm
+++ b/lib/LaTeXML/Post/CrossRef.pm
@@ -21,8 +21,6 @@ use charnames qw(:full);
use LaTeXML::Post;
use base qw(LaTeXML::Post::Processor);
-my $NBSP = pack('U', 0xA0); # CONSTANT
-
sub new {
my ($class, %options) = @_;
my $self = $class->SUPER::new(%options);
@@ -635,7 +633,7 @@ sub make_bibcite {
elsif ($show =~ s/^\{([^\}]*)\}//) { # pass-thru literal, quoted with {}
push(@stuff, $1) if $1; }
elsif ($show =~ s/^~//) { # Pass-thru spaces
- push(@stuff, $NBSP) if @stuff; }
+ push(@stuff, "\N{NBSP}") if @stuff; }
elsif ($show =~ s/^(\s+)//) { # Pass-thru spaces
push(@stuff, $1) if @stuff; }
elsif ($show =~ s/^(\W+)//) { # Pass-thru non show keywords
@@ -782,7 +780,7 @@ sub generateRef_aux {
elsif ($show =~ s/^\{([^\}]*)\}//) { # pass-thru literal, quoted with {}
push(@stuff, $1) if $1; }
elsif ($show =~ s/^~//) { # Pass-thru spaces
- push(@stuff, $NBSP) if @stuff; }
+ push(@stuff, "\N{NBSP}") if @stuff; }
elsif ($show =~ s/^(\s+)//) { # Pass-thru spaces
push(@stuff, $1) if @stuff; }
elsif ($show =~ s/^(\W+)//) { # Pass-thru non show keywords
diff --git a/lib/LaTeXML/Post/MathML.pm b/lib/LaTeXML/Post/MathML.pm
index 3c4403e9c..a147757a1 100644
--- a/lib/LaTeXML/Post/MathML.pm
+++ b/lib/LaTeXML/Post/MathML.pm
@@ -18,8 +18,9 @@ use LaTeXML::Util::Unicode;
use LaTeXML::Post;
use LaTeXML::Common::Font;
use List::Util qw(max);
-use base qw(LaTeXML::Post::MathProcessor);
-use base qw(Exporter);
+use charnames ':full';
+use base qw(LaTeXML::Post::MathProcessor);
+use base qw(Exporter);
our @EXPORT = (
qw( &DefMathML ),
qw( &pmml &pmml_scriptsize &pmml_smaller
@@ -383,8 +384,6 @@ sub getXMHintSpacing {
else {
return 0; } }
-my $NBSP = pack('U', 0xA0); # CONSTANT
-
sub pmml_internal {
no warnings 'recursion';
my ($node) = @_;
@@ -506,8 +505,8 @@ sub pmml_internal {
['m:mtext', {}, $node->textContent]]; }
else {
my $text = $node->textContent; # Spaces are significant here
- $text =~ s/^\s+/$NBSP/;
- $text =~ s/\s+$/$NBSP/;
+ $text =~ s/^\s+/\N{NBSP}/;
+ $text =~ s/\s+$/\N{NBSP}/;
return ['m:mtext', {}, $text]; } }
sub needsMathstyle {
@@ -1027,7 +1026,7 @@ sub pmml_text_aux {
my $type = $node->nodeType;
if ($type == XML_TEXT_NODE) {
my ($string, %mmlattr) = stylizeContent($node, 'm:mtext', %attr);
- $string =~ s/^\s+/$NBSP/; $string =~ s/\s+$/$NBSP/;
+ $string =~ s/^\s+/\N{NBSP}/; $string =~ s/\s+$/\N{NBSP}/;
return ['m:mtext', {%mmlattr}, $string]; }
elsif ($type == XML_DOCUMENT_FRAG_NODE) {
return map { pmml_text_aux($_, %attr) } $node->childNodes; }
diff --git a/lib/LaTeXML/Util/Unicode.pm b/lib/LaTeXML/Util/Unicode.pm
index be7b30540..01f8cc14b 100644
--- a/lib/LaTeXML/Util/Unicode.pm
+++ b/lib/LaTeXML/Util/Unicode.pm
@@ -13,8 +13,8 @@ package LaTeXML::Util::Unicode;
use strict;
use warnings;
use base qw(Exporter);
-our @EXPORT = qw( &UTF &unicode_mathvariant &unicode_convert);
-
+use charnames ':full';
+our @EXPORT = qw( &UTF &unicode_accent &unicode_mathvariant &unicode_convert);
#======================================================================
# Unicode manipulation utilities useful for LaTeXML
# Mostly, but not exclusively, about Mathematics
@@ -24,6 +24,70 @@ sub UTF {
my ($code) = @_;
return pack('U', $code); }
+#======================================================================
+# Accents
+# There are potentially several Unicode codepoints that characterize a given accent:
+# combiner : unicode combining character that effects the accent when following a base char.
+# generally in Combining block
+# standalone : form that shows accent w/o base, but small(ish) and already raised/lowered!
+# sometimes called "isolated". Usually a "spacing" form, else NBSP followed by combiner.
+# unwrapped + role : form that shows the accent itself, typically larger and on baseline;
+# Used in operand for eg. MathML mover/munder
+# name : arbitrary short descriptive, for good measure
+# The ideal glyphs for each of these don't necessarily exist in Unicode,
+# nor are the best choices always clear.
+# Ideally, we would cover ALL accents that might appear in TeX!
+our @accent_data = (
+ { name => 'grave', combiner => "\x{0300}", standalone => UTF(0x60), # \`
+ unwrapped => "`", role => 'OVERACCENT' }, # (OR \x{2035} or UTF(0x60) ?)
+ { name => 'acute', combiner => "\x{0301}", standalone => UTF(0xB4), # \'
+ unwrapped => UTF(0xB4), role => 'OVERACCENT' }, # (OR \x{2032} or UTF(0xB4)?)
+ { name => 'hat', combiner => "\x{0302}", standalone => "\x{02C6}", # \^
+ unwrapped => UTF(0x5E), role => 'OVERACCENT' },
+ { name => 'ddot', combiner => "\x{0308}", standalone => UTF(0xA8), # \"
+ unwrapped => UTF(0xA8), role => 'OVERACCENT' }, # (or \x{22C5})
+ { name => 'tilde', combiner => "\x{0303}", standalone => "\x{02DC}", # \~
+ unwrapped => UTF(0x7E), role => 'OVERACCENT' },
+ { name => 'bar', combiner => "\x{0304}", standalone => UTF(0xAF), # \=
+ unwrapped => UTF(0xAF), role => 'OVERACCENT' },
+ { name => 'dot', combiner => "\x{0307}", standalone => "\x{02D9}", # \.
+ unwrapped => "\x{02D9}", role => 'OVERACCENT' }, # (OR \x{22C5} or \x{0209} ?)
+ { name => 'dtick', combiner => "\x{030B}", standalone => "\x{02DD}", # \H
+ unwrapped => "\x{2032}\x{2032}", role => 'OVERACCENT' }, # (Or UTF(0xA8) or " ?)
+ { name => 'breve', combiner => "\x{0306}", standalone => "\x{02D8}", # \u
+ unwrapped => "\x{02D8}", role => 'OVERACCENT' },
+ { name => 'check', combiner => "\x{030C}", standalone => "\x{02C7}", # \v
+ unwrapped => "\x{02C7}", role => 'OVERACCENT' },
+ { name => 'ring', combiner => "\x{030A}", standalone => "\x{02DA}", # \r
+ unwrapped => "\x{02DA}", role => 'OVERACCENT' }, # (or \x{2218} ?)
+ { name => 'vec', combiner => "\x{20D7}", standalone => "\N{NBSP}\x{20D7}", # \vec
+ unwrapped => "\x{2192}", role => 'OVERACCENT' },
+ { name => 'tie', combiner => "\x{0361}", standalone => "\N{NBSP}\x{0361}", # \t
+ unwrapped => "u", role => 'OVERACCENT' },
+ ## UNDERACCENT accents
+ { name => 'cedilla', combiner => "\x{0327}", standalone => UTF(0xB8), # \c
+ unwrapped => UTF(0xB8), role => 'UNDERACCENT' }, # not even math?
+ { name => 'underdot', combiner => "\x{0323}", standalone => '.', # \@text@daccent
+ unwrapped => "\x{22C5}", role => 'UNDERACCENT' }, # (Or \x{02D9} ?)
+ { name => 'underbar', combiner => "\x{0331}", standalone => '_', # \@text@baccent
+ unwrapped => UTF(0xAF), role => 'UNDERACCENT' },
+ { name => 'lfhook', combiner => "\x{0326}", standalone => ",", # '\lfhook'
+ unwrapped => ',', role => 'UNDERACCENT' },
+ { name => 'ogonek', combiner => "\x{0328}", standalone => "\x{02DB}",
+ unwrapped => "\x{02DB}", role => 'UNDERACCENT' }, # not even math???
+);
+# Set up a hash keyed on both standalone & combiner chars (if two entries share a key, the later entry in @accent_data wins)
+our %accent_data_lookup = ();
+foreach my $entry (@accent_data) {
+ $accent_data_lookup{ $$entry{standalone} } = $entry;
+ $accent_data_lookup{ $$entry{combiner} } = $entry;
+}
+
+# Lookup accent data keyed by either combiner or standalone unicode; returns the entry hashref (name, combiner, standalone, unwrapped, role), or a false value if $char is undef or unknown.
+sub unicode_accent {
+ my ($char) = @_;
+ return (defined $char) && $accent_data_lookup{$char}; }
+
#======================================================================
# Unicode Math Codepoints
# The basic latin and greek alphabets, as well as numbers appear in several
diff --git a/lib/LaTeXML/resources/CSS/LaTeXML.css b/lib/LaTeXML/resources/CSS/LaTeXML.css
index fe81cfa62..b954b01b1 100644
--- a/lib/LaTeXML/resources/CSS/LaTeXML.css
+++ b/lib/LaTeXML/resources/CSS/LaTeXML.css
@@ -507,3 +507,6 @@ cite { font-style: normal; }
.ltx_minipage > .ltx_graphics {
max-width:100%;
}
+
+.ltx_overlay {position:relative; }
+.ltx_overlay > span:nth-child(2) {position:absolute; left:0; }
diff --git a/t/digestion/io.xml b/t/digestion/io.xml
index f7dccd946..14e45cf6a 100644
--- a/t/digestion/io.xml
+++ b/t/digestion/io.xml
@@ -341,15 +341,15 @@ Line (eTeX): “macro “foo followed by spacesfl”
Line (eTeX): “keyword = 1fl”
-Line (eTeX): “fake spaces^^20^^20^^20fl”
+Line (eTeX): “fake spacesˆˆ20ˆˆ20ˆˆ20fl”
-Line (eTeX): “macro followed by fake spaces “foo^^20^^20^^20fl”
+Line (eTeX): “macro followed by fake spaces “fooˆˆ20ˆˆ20ˆˆ20fl”
-Line (eTeX): “fake cr^^M apparently looses remainder of linefl”
+Line (eTeX): “fake crˆˆM apparently looses remainder of linefl”
Line (eTeX): “face cr on next linefl”
-Line (eTeX): “^^Mfl”
+Line (eTeX): “ˆˆMfl”
Line (eTeX): “line – in bracketsfl”
diff --git a/t/encoding/ansinew.xml b/t/encoding/ansinew.xml
index 34f58a92f..addbed526 100644
--- a/t/encoding/ansinew.xml
+++ b/t/encoding/ansinew.xml
@@ -48,7 +48,7 @@
… |
† |
‡ |
- ^ |
+ ˆ |
‰ |
Š |
‹ |
@@ -67,7 +67,7 @@
• |
– |
— |
- ~ |
+ ˜ |
™ |
š |
› |
diff --git a/t/encoding/applemac.xml b/t/encoding/applemac.xml
index 641b6941f..b7e717880 100644
--- a/t/encoding/applemac.xml
+++ b/t/encoding/applemac.xml
@@ -253,12 +253,12 @@
Û |
Ù |
ı |
- ^ |
- ~ |
+ ˆ |
+ ˜ |
¯ |
˘ |
˙ |
- o |
+ ˚ |
¸ |
˝ |
˛ |
diff --git a/t/encoding/cp1252.xml b/t/encoding/cp1252.xml
index 35429a699..40f0e8c83 100644
--- a/t/encoding/cp1252.xml
+++ b/t/encoding/cp1252.xml
@@ -48,7 +48,7 @@
… |
† |
‡ |
- ^ |
+ ˆ |
‰ |
Š |
‹ |
@@ -67,7 +67,7 @@
• |
– |
— |
- ~ |
+ ˜ |
™ |
š |
› |
diff --git a/t/encoding/ot1.xml b/t/encoding/ot1.xml
index 6777b0fd2..8f85a5534 100644
--- a/t/encoding/ot1.xml
+++ b/t/encoding/ot1.xml
@@ -154,7 +154,7 @@
[ |
“ |
] |
- ^ |
+ ˆ |
˙ |
@@ -198,7 +198,7 @@
– |
— |
˝ |
- ~ |
+ ˜ |
¨ |
diff --git a/t/expansion/definedness.xml b/t/expansion/definedness.xml
index 199895a59..c15b713e5 100644
--- a/t/expansion/definedness.xml
+++ b/t/expansion/definedness.xml
@@ -15,7 +15,7 @@ it is NOT relax,
it is NOT relax,
- The upcaret: superscript character ^.
+
The upcaret: superscript character ˆ.
IS defined,
it is NOT relax,
diff --git a/t/expansion/etex.xml b/t/expansion/etex.xml
index f775918b5..5fdd9ce25 100644
--- a/t/expansion/etex.xml
+++ b/t/expansion/etex.xml
@@ -79,7 +79,7 @@ Got Was expanded Later.
5Tokens
- This˙and^that.
+ This˙andˆthat.
A Shooting STAR.
diff --git a/t/expansion/noexpand_conditional.xml b/t/expansion/noexpand_conditional.xml
index 61358b5c5..2c3a814c2 100644
--- a/t/expansion/noexpand_conditional.xml
+++ b/t/expansion/noexpand_conditional.xml
@@ -175,7 +175,7 @@
(ifcat) T F T F.
- 3.10 ^ vs @
+
3.10 ˆ vs @
(ifx) F F F F;
(if) F F F F;
(ifcat) F F F F.
diff --git a/t/fonts/accents.pdf b/t/fonts/accents.pdf
index b5e202985..e1701e410 100644
Binary files a/t/fonts/accents.pdf and b/t/fonts/accents.pdf differ
diff --git a/t/fonts/accents.tex b/t/fonts/accents.tex
index af64d3221..a821df56b 100644
--- a/t/fonts/accents.tex
+++ b/t/fonts/accents.tex
@@ -19,5 +19,6 @@ \section{Testing Accents}
c & \c{a} & \c{o} & \c{A} & \c{O} & \c{\i} & \c{ao} & \accent24 a\\
d & \d{a} & \d{o} & \d{A} & \d{O} & \d{\i} & \d{ao} & \accent`. a\\
b & \b{a} & \b{o} & \b{A} & \b{O} & \b{\i} & \b{ao} & \accent22 a\\
+ & & & & & & \accent"30 X\\
\end{tabular}
\end{document}
diff --git a/t/fonts/accents.xml b/t/fonts/accents.xml
index 4b4c0d9a1..52044c97f 100644
--- a/t/fonts/accents.xml
+++ b/t/fonts/accents.xml
@@ -166,6 +166,16 @@
a̱o |
ā |
+
+ |
+ |
+ |
+ |
+ |
+ |
+ X0 |
+ |
+
diff --git a/t/tokenize/hashes.xml b/t/tokenize/hashes.xml
index 468b0d02c..76ca9f727 100644
--- a/t/tokenize/hashes.xml
+++ b/t/tokenize/hashes.xml
@@ -44,15 +44,15 @@
3File io
Reading file badchars.tex:
-Line:“Bad chars: ^b$@##˙x ! ”
+Line:“Bad chars: ˆb$@##˙x ! ”
-Line:“More Bad chars: ^b$@##˙x ! ”
+Line:“More Bad chars: ˆb$@##˙x ! ”
-Line:“And More Bad chars: ^b$@##˙x ! ”
+Line:“And More Bad chars: ˆb$@##˙x ! ”
Line:““par ”
-Full Contents:“Bad chars: ^b$@##˙x ! More Bad chars: ^b$@##˙x ! And More Bad chars: ^b$@##˙x ! “par ”
+Full Contents:“Bad chars: ˆb$@##˙x ! More Bad chars: ˆb$@##˙x ! And More Bad chars: ˆb$@##˙x ! “par ”
diff --git a/tools/xtest b/tools/xtest
index a41f3f436..19628b52c 100755
--- a/tools/xtest
+++ b/tools/xtest
@@ -97,7 +97,7 @@ foreach my $set (@testsets) {
"--dest=$dest",
"--log=$logdir/$set-$test-$type.log",
"--format=$type",
- "--javascript=LaTeXML-maybeMathjax.js",
+ # "--javascript=LaTeXML-maybeMathjax.js",
(defined $timestamp ? ("--timestamp=$timestamp") : ()),
($verbosity > 0 ? map { "--verbose" } 1 .. $verbosity
: ($verbosity < 0 ? map { "--quiet" } 1 .. -$verbosity