diff --git a/lib/LaTeXML/Engine/TeX.pool.ltxml b/lib/LaTeXML/Engine/TeX.pool.ltxml index 01e380c85..c3abcaaf8 100644 --- a/lib/LaTeXML/Engine/TeX.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX.pool.ltxml @@ -16,6 +16,7 @@ use warnings; use LaTeXML::Package; use Unicode::Normalize; use LaTeXML::Util::Pathname; +use charnames ':full'; use List::Util qw(min max); ###$LaTeXML::DEBUG{compiled} = 1 unless $LaTeXML::DEBUG{compiling} || $LaTeXML::DEBUG{nocompiled}; diff --git a/lib/LaTeXML/Engine/TeX_Box.pool.ltxml b/lib/LaTeXML/Engine/TeX_Box.pool.ltxml index c46060a1e..579719e57 100644 --- a/lib/LaTeXML/Engine/TeX_Box.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX_Box.pool.ltxml @@ -53,14 +53,23 @@ DefConstructor('\lx@hidden@egroup', '', reversion => ''); #====================================================================== -DefMacro('\lx@nounicode {}', '\ifmmode\lx@math@nounicode#1\else\lx@text@nounicode#1\fi'); +# A few useful low-level boxing things DefConstructor('\lx@framed[]{}', "#2", properties => { frame => sub { ToString($_[1] || 'rectangle'); } }); + DefConstructor('\lx@hflipped{}', "#1"); +DefConstructor('\lx@overlay{}{}', + "" + . "#1" + . 
"#2"); + +#====================================================================== +DefMacro('\lx@nounicode {}', '\ifmmode\lx@math@nounicode#1\else\lx@text@nounicode#1\fi'); + sub reportNoUnicode { my ($cs) = @_; $cs = ToString($cs); @@ -77,7 +86,7 @@ DefPrimitive('\lx@math@nounicode DefToken', sub { Box(ToString($cs), undef, undef, $cs, class => 'ltx_nounicode'); }); DefConstructor('\lx@text@nounicode DefToken', - "#1", + "#1", afterDigest => sub { reportNoUnicode(ToString($_[1]->getArg(0))); }); diff --git a/lib/LaTeXML/Engine/TeX_Character.pool.ltxml b/lib/LaTeXML/Engine/TeX_Character.pool.ltxml index 8eaeafa93..c727ed67d 100644 --- a/lib/LaTeXML/Engine/TeX_Character.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX_Character.pool.ltxml @@ -14,7 +14,7 @@ package LaTeXML::Package::Pool; use strict; use warnings; use LaTeXML::Package; -use Unicode::Normalize; +use LaTeXML::Util::Unicode; #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% # Character Family of primitive control sequences @@ -70,43 +70,31 @@ sub applyAccent { # Defines an accent command using a combining char that follows the # 1st char of the argument. In cases where there is no argument, $standalonechar is used. 
+# Ideally, the pair match up with an entry in Util::Unicode's accents sub DefAccent { my ($accent, $combiningchar, $standalonechar, %options) = @_; - $options{above} = 1 if !(defined $options{above}) && !$options{below}; - # Used for converting a char used as an above-accent to a combining char (See \accent) - AssignMapping('accent_combiner_above', $standalonechar => $combiningchar) if $options{above}; - AssignMapping('accent_combiner_below', $standalonechar => $combiningchar) unless $options{above}; - DefMacroI($accent, "{}", - Tokens(T_CS('\lx@applyaccent'), T_OTHER($accent), - T_OTHER($combiningchar), T_OTHER($standalonechar), - T_BEGIN, T_ARG(1), T_END), + $accent = T_CS($accent) unless ref $accent; + DefPrimitiveI($accent, "{}", sub { + my ($stomach, $letter) = @_; + applyAccent($stomach, $letter, $combiningchar, $standalonechar, + Tokens($accent, T_BEGIN, $letter, T_END)); }, protected => 1); return; } -DefPrimitiveI('\lx@applyaccent', "DefToken Token Token {}", sub { - my ($stomach, $accent, $combiningchar, $standalonechar, $letter) = @_; - applyAccent($stomach, $letter, $combiningchar->getString, $standalonechar->getString, - Tokens(T_CS($accent->getString), T_BEGIN, $letter, T_END)); }, - mode => 'text'); - -# This will fail if there really are "assignments" after the number! -# We're given a number pointing into the font, from which we can derive the standalone char. -# From that, we want to figure out the combining character, but there could be one for -# both the above & below cases! We'll prefer the above case. +# This will fail if there really are "assignments" after the number! (See TeX Book) +# We're given a number pointing into the font; the FontMap presumably has the standalone char. +# If there's no letter to be accented, just use the standalone. 
+# Otherwise, use the Util::Unicode module to find the appropriate combining character DefPrimitive('\accent Number {}', sub { my ($stomach, $num, $letter) = @_; my $n = $num->valueOf; - my $fontinfo = lookupFontinfo(LookupValue('textfont_0')); - my $acc = ($fontinfo && $$fontinfo{encoding} ? FontDecode($n, $$fontinfo{encoding}) : chr($n)); - my $reversion = Invocation(T_CS('\accent'), $num, $letter); - # NOTE: REVERSE LOOKUP in above accent list for the non-spacing accent char - # BUT, \accent always (?) makes an above type accent... doesn't it? - if (my $combiner = LookupMapping('accent_combiner_above', $acc) - || LookupMapping('accent_combiner_below', $acc)) { - applyAccent($stomach, $letter, $combiner, $acc, $reversion); } - else { - Warn('unexpected', "accent$n", $stomach, "Accent '$n' not recognized"); - Box(ToString($letter), undef, undef, $reversion); } }); + my $encoding = LookupValue('font')->getEncoding || 'OT1'; + my $char = ($encoding ? FontDecode($n, $encoding) : chr($n)); + if (my $entry = unicode_accent($char)) { + applyAccent($stomach, $letter, $$entry{combiner}, $$entry{standalone}, + Invocation(T_CS('\accent'), $num, $letter)); } + else { # Unknown accent ? Attempt to OVERLAY the accent on top of $letter + Digest(Tokens(T_CS('\lx@overlay'), T_BEGIN, $letter, T_END, T_BEGIN, T_OTHER($char), T_END)); } }); #====================================================================== # \chardef iq provides an alternate way to define a control sequence that returns a character. 
diff --git a/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml b/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml index 9f3d82b21..b8cecb755 100644 --- a/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml @@ -140,37 +140,28 @@ DeclareFontMap('ASCII', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "{", "|", "}", "~", undef]); -# Note that several entries are used for accents, and in practice will actually -# be used in something like an m:mover; thus they needn't (shouldn't?) be "small" -# There are also some questions about which choices are best -# grave & acute accents (entry 0x12 & 0x13) (often typed using 0x60 & 0x27) -# are probably best using U+60(grave accent) & U+B4(acute accent) -# but could be U+2035 (reversed prime) & U+2032 (prime). (particularly for math?) -# [we do use these for \prime, however!] -# or U+02CB (modifier letter grave accent) & U+02CA (modifier letter acute accent) -# Similarly, hat & tilde (entries 0x5E & 0x7E) -# typed using ^ 0x5E circumflex accent) & ~ 0x7E tilde -# are probably best just sticking with U+5E & U+7E -# but could be U+02C6 (modifier letter circumflex accent) U+02DC (small tilde) -# [Note that generally we're using codepoints characterized as "modifier letter" -# only when no other spacing point is available.] +# Note that several entries are used for accents. +# TeX fonts typically contain a standalone version of an accent, ie smallish & raised. +# We'll consult a table in LaTeXML::Util::Unicode to determine the equivalent combining character, +# as well as an "unwrapped" one for use in Math tokens (eg. 
as an overaccent) +# NOTE: 0x12--0x18, 0x5E-0x5F, 0x7D-0x7F are accents DeclareFontMap('OT1', ["\x{0393}", "\x{0394}", "\x{0398}", "\x{039B}", "\x{039E}", "\x{03A0}", "\x{03A3}", "\x{03A5}", "\x{03A6}", "\x{03A8}", "\x{03A9}", "\x{FB00}", "\x{FB01}", "\x{FB02}", "\x{FB03}", "\x{FB04}", "\x{0131}", "\x{0237}", UTF(0x60), UTF(0xB4), "\x{02C7}", "\x{02D8}", UTF(0xAF), "\x{02DA}", UTF(0xB8), UTF(0xDF), UTF(0xE6), "\x{0153}", UTF(0xF8), UTF(0xC6), "\x{152}", UTF(0xD8), - UTF(0xA0) . "\x{0335}", '!', "\x{201D}", '#', '$', '%', '&', "\x{2019}", - '(', ')', '*', '+', ',', '-', '.', '/', - '0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', ':', ';', UTF(0xA1), '=', UTF(0xBF), '?', - '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', - 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', - 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', - 'X', 'Y', 'Z', '[', "\x{201C}", ']', "^", "\x{02D9}", - "\x{2018}", 'a', 'b', 'c', 'd', 'e', 'f', 'g', - 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', - 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', - 'x', 'y', 'z', "\x{2013}", "\x{2014}", "\x{02DD}", UTF(0x7E), UTF(0xA8)]); + UTF(0xA0) . 
"\x{0335}", '!', "\x{201D}", '#', '$', '%', '&', "\x{2019}", + '(', ')', '*', '+', ',', '-', '.', '/', + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', ':', ';', UTF(0xA1), '=', UTF(0xBF), '?', + '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', + 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', + 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', + 'X', 'Y', 'Z', '[', "\x{201C}", ']', "\x{02C6}", "\x{02D9}", + "\x{2018}", 'a', 'b', 'c', 'd', 'e', 'f', 'g', + 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', + 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', + 'x', 'y', 'z', "\x{2013}", "\x{2014}", "\x{02DD}", "\x{02DC}", UTF(0xA8)]); DeclareFontMap('OT1', ["\x{0393}", "\x{0394}", "\x{0398}", "\x{039B}", "\x{039E}", "\x{03A0}", "\x{03A3}", "\x{03A5}", diff --git a/lib/LaTeXML/Engine/TeX_Math.pool.ltxml b/lib/LaTeXML/Engine/TeX_Math.pool.ltxml index b6c0f6437..5ebd644eb 100644 --- a/lib/LaTeXML/Engine/TeX_Math.pool.ltxml +++ b/lib/LaTeXML/Engine/TeX_Math.pool.ltxml @@ -618,14 +618,19 @@ DefPrimitive('\mathchardef Token SkipSpaces SkipMatch:=', sub { return; }); DefConstructor('\mathaccent Number Digested', - "#glyph#2", + "#glyph#2", sizer => '#2', # Close enough? 
afterDigest => sub { my ($stomach, $whatsit) = @_; my $n = $whatsit->getArg(1)->valueOf; my ($role, $glyph) = decodeMathChar($n); - $whatsit->setProperty(glyph => $glyph) if $glyph; - $whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph; + my $accrole = 'OVERACCENT'; + if (my $entry = unicode_accent($glyph)) { + $glyph = $$entry{unwrapped}; + $accrole = $$entry{role}; } + $whatsit->setProperty(glyph => $glyph) if $glyph; + $whatsit->setProperty(font => LookupValue('font')->specialize($glyph)) if $glyph; + $whatsit->setProperty(accrole => $accrole) if $glyph; return; }); # # Only used for active math characters, so far diff --git a/lib/LaTeXML/Engine/plain.pool.ltxml b/lib/LaTeXML/Engine/plain.pool.ltxml index 4047a73b1..672d71c87 100644 --- a/lib/LaTeXML/Engine/plain.pool.ltxml +++ b/lib/LaTeXML/Engine/plain.pool.ltxml @@ -14,6 +14,7 @@ package LaTeXML::Package::Pool; use strict; use warnings; use LaTeXML::Package; +use charnames ':full'; #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #********************************************************************** @@ -696,21 +697,21 @@ DefPrimitiveI('\pounds', undef, UTF(0xA3)); # POUND DefAccent('\`', "\x{0300}", UTF(0x60)); # COMBINING GRAVE ACCENT & GRAVE ACCENT DefAccent("\\'", "\x{0301}", UTF(0xB4)); # COMBINING ACUTE ACCENT & ACUTE ACCENT -DefAccent('\^', "\x{0302}", UTF(0x5E)); # COMBINING CIRCUMFLEX ACCENT & CIRCUMFLEX ACCENT +DefAccent('\^', "\x{0302}", "\x{02C6}"); # COMBINING CIRCUMFLEX ACCENT & CIRCUMFLEX ACCENT DefAccent('\"', "\x{0308}", UTF(0xA8)); # COMBINING DIAERESIS & DIAERESIS -DefAccent('\~', "\x{0303}", "~"); # COMBINING TILDE +DefAccent('\~', "\x{0303}", "\x{02DC}"); # COMBINING TILDE DefAccent('\=', "\x{0304}", UTF(0xAF)); # COMBINING MACRON & MACRON DefAccent('\.', "\x{0307}", "\x{02D9}"); # COMBINING DOT ABOVE & DOT ABOVE DefAccent('\u', "\x{0306}", "\x{02D8}"); # COMBINING BREVE & BREVE DefAccent('\v', "\x{030C}", "\x{02C7}"); # COMBINING CARON 
& CARON -DefAccent('\@ringaccent', "\x{030A}", "o"); # COMBINING RING ABOVE & non-combining -DefAccent('\r', "\x{030A}", "o"); # COMBINING RING ABOVE & non-combining +DefAccent('\@ringaccent', "\x{030A}", "\x{02DA}"); # COMBINING RING ABOVE & non-combining +DefAccent('\r', "\x{030A}", "\x{02DA}"); # COMBINING RING ABOVE & non-combining DefAccent('\H', "\x{030B}", "\x{02DD}"); # COMBINING DOUBLE ACUTE ACCENT & non-combining DefAccent('\c', "\x{0327}", UTF(0xB8), below => 1); # COMBINING CEDILLA & CEDILLA # NOTE: The next two get define for math, as well; See below -DefAccent('\@text@daccent', "\x{0323}", '.', below => 1); # COMBINING DOT BELOW & DOT (?) -DefAccent('\@text@baccent', "\x{0331}", UTF(0xAF), below => 1); # COMBINING MACRON BELOW & MACRON -DefAccent('\t', "\x{0361}", "-"); # COMBINING DOUBLE INVERTED BREVE & ???? What???? +DefAccent('\@text@daccent', "\x{0323}", '.', below => 1); # COMBINING DOT BELOW & DOT (?) +DefAccent('\@text@baccent', "\x{0331}", '_', below => 1); # COMBINING MACRON BELOW & MACRON +DefAccent('\t', "\x{0361}", "\N{NBSP}\x{0361}"); # COMBINING DOUBLE INVERTED BREVE & ???? What???? # this one's actually defined in mathscinet.sty, but just stick it here! DefAccent('\lfhook', "\x{0326}", ",", below => 1); # COMBINING COMMA BELOW # I doubt that latter covers multiple chars...? 
diff --git a/lib/LaTeXML/Package/cleveref.sty.ltxml b/lib/LaTeXML/Package/cleveref.sty.ltxml index 3336bb149..cff030f18 100644 --- a/lib/LaTeXML/Package/cleveref.sty.ltxml +++ b/lib/LaTeXML/Package/cleveref.sty.ltxml @@ -77,7 +77,7 @@ sub crefMulti { return @tokens; } } # Since we're not grouping by type, we're ignoring \crefpairgroupconjunction, etc -DefConstructor('\lx@cref OptionalMatch:* {} Semiverbatim', +DefConstructor('\lx@cref OptionalMatch:* HyperVerbatim Semiverbatim', "", properties => sub { (label => CleanLabel($_[3])); }); diff --git a/lib/LaTeXML/Package/textcomp.sty.ltxml b/lib/LaTeXML/Package/textcomp.sty.ltxml index ea0cc74ef..e8e363455 100644 --- a/lib/LaTeXML/Package/textcomp.sty.ltxml +++ b/lib/LaTeXML/Package/textcomp.sty.ltxml @@ -25,7 +25,7 @@ DefAccent('\capitalacute', "\x{0301}", UTF(0xB4)); # \' DefAccent('\capitalbreve', "\x{0306}", "\x{02D8}"); # \u DefAccent('\capitalcaron', "\x{030C}", "\x{02C7}"); # \v DefAccent('\capitalcedilla', "\x{0327}", UTF(0xB8), below => 1); # \c -DefAccent('\capitalcircumflex', "\x{0302}", UTF(0x5E)); # \^ +DefAccent('\capitalcircumflex', "\x{0302}", "\x{02C6}"); # \^ DefAccent('\capitaldieresis', "\x{0308}", UTF(0xA8)); # \" DefAccent('\capitaldotaccent', "\x{0307}", "\x{02D9}"); # \. 
DefAccent('\capitalgrave', "\x{0300}", UTF(0x60)); # \` @@ -33,9 +33,9 @@ DefAccent('\capitalhungarumlaut', "\x{030B}", "\x{02DD}"); # \H DefAccent('\capitalmacron', "\x{0304}", UTF(0xAF)); # \= DefAccent('\capitalnewtie', "\x{0361}", "-"); # \t DefAccent('\capitalogonek', "\x{0328}", "\x{02DB}"); # -DefAccent('\capitalring', "\x{030A}", "o"); # \r +DefAccent('\capitalring', "\x{030A}", "\x{02DA}"); # \r DefAccent('\capitaltie', "\x{0361}", "-"); # \t -DefAccent('\capitaltilde', "\x{0303}", "~"); # \~ +DefAccent('\capitaltilde', "\x{0303}", "\x{02DC}"); # \~ DefAccent('\newtie', "\x{0361}", "-"); # \t #====================================================================== diff --git a/lib/LaTeXML/Post.pm b/lib/LaTeXML/Post.pm index 8159e3d12..8859e5d72 100644 --- a/lib/LaTeXML/Post.pm +++ b/lib/LaTeXML/Post.pm @@ -266,6 +266,7 @@ use LaTeXML::Post; use LaTeXML::Common::Error; use base qw(LaTeXML::Post::Processor); use LaTeXML::Common::XML; +use charnames ':full'; # This is an abstract class; A complete MathProcessor will need to define: # $self->convertNode($doc,$xmath) @@ -451,7 +452,6 @@ sub combineParallel { # AND the nested math needs to be converted to ONLY the current target's markup # NOT parallel within each nested math, although it should still be cross-referencable to others! # moreover, the math will need the outerWrapper. 
-my $NBSP = pack('U', 0xA0); # CONSTANT sub convertXMTextContent { my ($self, $doc, $convertspaces, @nodes) = @_; @@ -460,7 +460,7 @@ sub convertXMTextContent { if ($node->nodeType == XML_TEXT_NODE) { my $string = $node->textContent; if ($convertspaces) { - $string =~ s/^\s+/$NBSP/; $string =~ s/\s+$/$NBSP/; } + $string =~ s/^\s+/\N{NBSP}/; $string =~ s/\s+$/\N{NBSP}/; } push(@result, $string); } else { my $tag = $doc->getQName($node); diff --git a/lib/LaTeXML/Post/CrossRef.pm b/lib/LaTeXML/Post/CrossRef.pm index aed2f74e4..72cbc3a0c 100644 --- a/lib/LaTeXML/Post/CrossRef.pm +++ b/lib/LaTeXML/Post/CrossRef.pm @@ -21,8 +21,6 @@ use charnames qw(:full); use LaTeXML::Post; use base qw(LaTeXML::Post::Processor); -my $NBSP = pack('U', 0xA0); # CONSTANT - sub new { my ($class, %options) = @_; my $self = $class->SUPER::new(%options); @@ -635,7 +633,7 @@ sub make_bibcite { elsif ($show =~ s/^\{([^\}]*)\}//) { # pass-thru literal, quoted with {} push(@stuff, $1) if $1; } elsif ($show =~ s/^~//) { # Pass-thru spaces - push(@stuff, $NBSP) if @stuff; } + push(@stuff, "\N{NBSP}") if @stuff; } elsif ($show =~ s/^(\s+)//) { # Pass-thru spaces push(@stuff, $1) if @stuff; } elsif ($show =~ s/^(\W+)//) { # Pass-thru non show keywords @@ -782,7 +780,7 @@ sub generateRef_aux { elsif ($show =~ s/^\{([^\}]*)\}//) { # pass-thru literal, quoted with {} push(@stuff, $1) if $1; } elsif ($show =~ s/^~//) { # Pass-thru spaces - push(@stuff, $NBSP) if @stuff; } + push(@stuff, "\N{NBSP}") if @stuff; } elsif ($show =~ s/^(\s+)//) { # Pass-thru spaces push(@stuff, $1) if @stuff; } elsif ($show =~ s/^(\W+)//) { # Pass-thru non show keywords diff --git a/lib/LaTeXML/Post/MathML.pm b/lib/LaTeXML/Post/MathML.pm index 3c4403e9c..a147757a1 100644 --- a/lib/LaTeXML/Post/MathML.pm +++ b/lib/LaTeXML/Post/MathML.pm @@ -18,8 +18,9 @@ use LaTeXML::Util::Unicode; use LaTeXML::Post; use LaTeXML::Common::Font; use List::Util qw(max); -use base qw(LaTeXML::Post::MathProcessor); -use base qw(Exporter); +use 
charnames ':full'; +use base qw(LaTeXML::Post::MathProcessor); +use base qw(Exporter); our @EXPORT = ( qw( &DefMathML ), qw( &pmml &pmml_scriptsize &pmml_smaller @@ -383,8 +384,6 @@ sub getXMHintSpacing { else { return 0; } } -my $NBSP = pack('U', 0xA0); # CONSTANT - sub pmml_internal { no warnings 'recursion'; my ($node) = @_; @@ -506,8 +505,8 @@ sub pmml_internal { ['m:mtext', {}, $node->textContent]]; } else { my $text = $node->textContent; # Spaces are significant here - $text =~ s/^\s+/$NBSP/; - $text =~ s/\s+$/$NBSP/; + $text =~ s/^\s+/\N{NBSP}/; + $text =~ s/\s+$/\N{NBSP}/; return ['m:mtext', {}, $text]; } } sub needsMathstyle { @@ -1027,7 +1026,7 @@ sub pmml_text_aux { my $type = $node->nodeType; if ($type == XML_TEXT_NODE) { my ($string, %mmlattr) = stylizeContent($node, 'm:mtext', %attr); - $string =~ s/^\s+/$NBSP/; $string =~ s/\s+$/$NBSP/; + $string =~ s/^\s+/\N{NBSP}/; $string =~ s/\s+$/\N{NBSP}/; return ['m:mtext', {%mmlattr}, $string]; } elsif ($type == XML_DOCUMENT_FRAG_NODE) { return map { pmml_text_aux($_, %attr) } $node->childNodes; } diff --git a/lib/LaTeXML/Util/Unicode.pm b/lib/LaTeXML/Util/Unicode.pm index be7b30540..01f8cc14b 100644 --- a/lib/LaTeXML/Util/Unicode.pm +++ b/lib/LaTeXML/Util/Unicode.pm @@ -13,8 +13,8 @@ package LaTeXML::Util::Unicode; use strict; use warnings; use base qw(Exporter); -our @EXPORT = qw( &UTF &unicode_mathvariant &unicode_convert); - +use charnames ':full'; +our @EXPORT = qw( &UTF &unicode_accent &unicode_mathvariant &unicode_convert); #====================================================================== # Unicode manipulation utilities useful for LaTeXML # Mostly, but not exclusively, about Mathematics @@ -24,6 +24,70 @@ sub UTF { my ($code) = @_; return pack('U', $code); } +#====================================================================== +# Accents +# There are potentially several Unicode codepoints that characterize a given accent: +# combiner : unicode combining character that effects the accent when 
following a base char. +# generally in Combining block +# standalone : form that shows accent w/o base, but small(ish) and already raised/lowered! +# sometimes called "isolated". Usually a "spacing" form, else NBSP followed by combiner. +# unwrapped + role : form that shows the accent itself, typically larger and on baseline; +# Used in operand for eg. MathML mover/munder +# name : arbitrary short descriptive, for good measure +# The ideal glyphs for each of these don't necessarily exist in Unicode, +# nor are the best choices always clear. +# Ideally, we would cover ALL accents that might appear in TeX! +our @accent_data = ( + { name => 'grave', combiner => "\x{0300}", standalone => UTF(0x60), # \` + unwrapped => "`", role => 'OVERACCENT' }, # (OR \x{2035} or UTF(0x60) ?) + { name => 'acute', combiner => "\x{0301}", standalone => UTF(0xB4), # \' + unwrapped => UTF(0xB4), role => 'OVERACCENT' }, # (OR \x{2032} or UTF(0xB4)?) + { name => 'hat', combiner => "\x{0302}", standalone => "\x{02C6}", # \^ + unwrapped => UTF(0x5E), role => 'OVERACCENT' }, + { name => 'ddot', combiner => "\x{0308}", standalone => UTF(0xA8), # \" + unwrapped => UTF(0xA8), role => 'OVERACCENT' }, # (or \x{22C5}) + { name => 'tilde', combiner => "\x{0303}", standalone => "\x{02DC}", # \~ + unwrapped => UTF(0x7E), role => 'OVERACCENT' }, + { name => 'bar', combiner => "\x{0304}", standalone => UTF(0xAF), # \= + unwrapped => UTF(0xAF), role => 'OVERACCENT' }, + { name => 'dot', combiner => "\x{0307}", standalone => "\x{02D9}", # \. + unwrapped => "\x{02D9}", role => 'OVERACCENT' }, # (OR \x{22C5} or \x{0209} ?) + { name => 'dtick', combiner => "\x{030B}", standalone => "\x{02DD}", # \H + unwrapped => "\x{2032}\x{2032}", role => 'OVERACCENT' }, # (Or UTF(0xA8) or " ?) 
+ { name => 'breve', combiner => "\x{0306}", standalone => "\x{02D8}", # \u + unwrapped => "\x{02D8}", role => 'OVERACCENT' }, + { name => 'check', combiner => "\x{030C}", standalone => "\x{02C7}", # \v + unwrapped => "\x{02C7}", role => 'OVERACCENT' }, + { name => 'ring', combiner => "\x{030A}", standalone => "\x{02DA}", # \r + unwrapped => "\x{02DA}", role => 'OVERACCENT' }, # (or \x{2218} ?) + { name => 'vec', combiner => "\x{20D7}", standalone => "\N{NBSP}\x{20D7}", # \vec + unwrapped => "\x{2192}", role => 'OVERACCENT' }, + { name => 'tie', combiner => "\x{0361}", standalone => "\N{NBSP}\x{0361}", # \t + unwrapped => "u", role => 'OVERACCENT' }, + ## UNDERACCENT accents + { name => 'cedilla', combiner => "\x{0327}", standalone => UTF(0xB8), # \c + unwrapped => UTF(0xB8), role => 'UNDERACCENT' }, # not even math? + { name => 'underdot', combiner => "\x{0323}", standalone => '.', # \@text@daccent + unwrapped => "\x{22C5}", role => 'UNDERACCENT' }, # (Or \x{02D9} ?) + { name => 'underbar', combiner => "\x{0331}", standalone => '_', + unwrapped => UTF(0xAF), role => 'UNDERACCENT' }, + { name => 'lfhook', combiner => "\x{0326}", standalone => ",", # '\lfhook' + unwrapped => ',', role => 'UNDERACCENT' }, + { name => 'ogonek', combiner => "\x{0328}", standalone => "\x{02DB}", + unwrapped => "\x{02DB}", role => 'UNDERACCENT' }, # not even math??? +); +# Set up a hash keyed on both standalone & combiner chars +our %accent_data_lookup = (); +foreach my $entry (@accent_data) { + $accent_data_lookup{ $$entry{standalone} } = $entry; + $accent_data_lookup{ $$entry{combiner} } = $entry; +} + +# Lookup accent data keyed by either combiner or standalone unicode. 
+sub unicode_accent { + my ($char) = @_; + return (defined $char) && $accent_data_lookup{$char}; } + #====================================================================== # Unicode Math Codepoints # The basic latin and greek alphabets, as well as numbers appear in several diff --git a/lib/LaTeXML/resources/CSS/LaTeXML.css b/lib/LaTeXML/resources/CSS/LaTeXML.css index fe81cfa62..b954b01b1 100644 --- a/lib/LaTeXML/resources/CSS/LaTeXML.css +++ b/lib/LaTeXML/resources/CSS/LaTeXML.css @@ -507,3 +507,6 @@ cite { font-style: normal; } .ltx_minipage > .ltx_graphics { max-width:100%; } + +.ltx_overlay {position:relative; } +.ltx_overlay > span:nth-child(2) {position:absolute; left:0; } diff --git a/t/digestion/io.xml b/t/digestion/io.xml index f7dccd946..14e45cf6a 100644 --- a/t/digestion/io.xml +++ b/t/digestion/io.xml @@ -341,15 +341,15 @@ Line (eTeX): “macro “foo followed by spacesfl” Line (eTeX): “keyword = 1fl” -Line (eTeX): “fake spaces^^20^^20^^20fl” +Line (eTeX): “fake spacesˆˆ20ˆˆ20ˆˆ20fl” -Line (eTeX): “macro followed by fake spaces “foo^^20^^20^^20fl” +Line (eTeX): “macro followed by fake spaces “fooˆˆ20ˆˆ20ˆˆ20fl” -Line (eTeX): “fake cr^^M apparently looses remainder of linefl” +Line (eTeX): “fake crˆˆM apparently looses remainder of linefl” Line (eTeX): “face cr on next linefl” -Line (eTeX): “^^Mfl” +Line (eTeX): “ˆˆMfl” Line (eTeX): “line – in bracketsfl” diff --git a/t/encoding/ansinew.xml b/t/encoding/ansinew.xml index 34f58a92f..addbed526 100644 --- a/t/encoding/ansinew.xml +++ b/t/encoding/ansinew.xml @@ -48,7 +48,7 @@ … † ‡ - ^ + ˆ ‰ Š ‹ @@ -67,7 +67,7 @@ • – — - ~ + ˜ ™ š › diff --git a/t/encoding/applemac.xml b/t/encoding/applemac.xml index 641b6941f..b7e717880 100644 --- a/t/encoding/applemac.xml +++ b/t/encoding/applemac.xml @@ -253,12 +253,12 @@ Û Ù ı - ^ - ~ + ˆ + ˜ ¯ ˘ ˙ - o + ˚ ¸ ˝ ˛ diff --git a/t/encoding/cp1252.xml b/t/encoding/cp1252.xml index 35429a699..40f0e8c83 100644 --- a/t/encoding/cp1252.xml +++ b/t/encoding/cp1252.xml @@ -48,7 +48,7 
@@ … † ‡ - ^ + ˆ ‰ Š ‹ @@ -67,7 +67,7 @@ • – — - ~ + ˜ ™ š › diff --git a/t/encoding/ot1.xml b/t/encoding/ot1.xml index 6777b0fd2..8f85a5534 100644 --- a/t/encoding/ot1.xml +++ b/t/encoding/ot1.xml @@ -154,7 +154,7 @@ [ “ ] - ^ + ˆ ˙ @@ -198,7 +198,7 @@ – — ˝ - ~ + ˜ ¨ diff --git a/t/expansion/definedness.xml b/t/expansion/definedness.xml index 199895a59..c15b713e5 100644 --- a/t/expansion/definedness.xml +++ b/t/expansion/definedness.xml @@ -15,7 +15,7 @@ it is NOT relax,

it is NOT relax,

-

The upcaret: superscript character ^. +

The upcaret: superscript character ˆ. IS defined, it is NOT relax,

diff --git a/t/expansion/etex.xml b/t/expansion/etex.xml index f775918b5..5fdd9ce25 100644 --- a/t/expansion/etex.xml +++ b/t/expansion/etex.xml @@ -79,7 +79,7 @@ Got Was expanded Later.

<tag close=" ">5</tag>Tokens -

This˙and^that.

+

This˙andˆthat.

A Shooting STAR.

diff --git a/t/expansion/noexpand_conditional.xml b/t/expansion/noexpand_conditional.xml index 61358b5c5..2c3a814c2 100644 --- a/t/expansion/noexpand_conditional.xml +++ b/t/expansion/noexpand_conditional.xml @@ -175,7 +175,7 @@ (ifcat) T F T F.

-

3.10 ^ vs @ +

3.10 ˆ vs @ (ifx) F F F F; (if) F F F F; (ifcat) F F F F.

diff --git a/t/fonts/accents.pdf b/t/fonts/accents.pdf index b5e202985..e1701e410 100644 Binary files a/t/fonts/accents.pdf and b/t/fonts/accents.pdf differ diff --git a/t/fonts/accents.tex b/t/fonts/accents.tex index af64d3221..a821df56b 100644 --- a/t/fonts/accents.tex +++ b/t/fonts/accents.tex @@ -19,5 +19,6 @@ \section{Testing Accents} c & \c{a} & \c{o} & \c{A} & \c{O} & \c{\i} & \c{ao} & \accent24 a\\ d & \d{a} & \d{o} & \d{A} & \d{O} & \d{\i} & \d{ao} & \accent`. a\\ b & \b{a} & \b{o} & \b{A} & \b{O} & \b{\i} & \b{ao} & \accent22 a\\ + & & & & & & \accent"30 X\\ \end{tabular} \end{document} diff --git a/t/fonts/accents.xml b/t/fonts/accents.xml index 4b4c0d9a1..52044c97f 100644 --- a/t/fonts/accents.xml +++ b/t/fonts/accents.xml @@ -166,6 +166,16 @@ a̱o ā + + + + + + + + X0 + +
diff --git a/t/tokenize/hashes.xml b/t/tokenize/hashes.xml index 468b0d02c..76ca9f727 100644 --- a/t/tokenize/hashes.xml +++ b/t/tokenize/hashes.xml @@ -44,15 +44,15 @@ <tag close=" ">3</tag>File io

Reading file badchars.tex: -Line:“Bad chars: ^b$@##˙x ! ” +Line:“Bad chars: ˆb$@##˙x ! ” -Line:“More Bad chars: ^b$@##˙x ! ” +Line:“More Bad chars: ˆb$@##˙x ! ” -Line:“And More Bad chars: ^b$@##˙x ! ” +Line:“And More Bad chars: ˆb$@##˙x ! ” Line:““par ” -Full Contents:“Bad chars: ^b$@##˙x ! More Bad chars: ^b$@##˙x ! And More Bad chars: ^b$@##˙x ! “par ”

+Full Contents:“Bad chars: ˆb$@##˙x ! More Bad chars: ˆb$@##˙x ! And More Bad chars: ˆb$@##˙x ! “par ”

diff --git a/tools/xtest b/tools/xtest index a41f3f436..19628b52c 100755 --- a/tools/xtest +++ b/tools/xtest @@ -97,7 +97,7 @@ foreach my $set (@testsets) { "--dest=$dest", "--log=$logdir/$set-$test-$type.log", "--format=$type", - "--javascript=LaTeXML-maybeMathjax.js", + # "--javascript=LaTeXML-maybeMathjax.js", (defined $timestamp ? ("--timestamp=$timestamp") : ()), ($verbosity > 0 ? map { "--verbose" } 1 .. $verbosity : ($verbosity < 0 ? map { "--quiet" } 1 .. -$verbosity