-
Notifications
You must be signed in to change notification settings - Fork 101
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Accents #2404
Accents #2404
Changes from 8 commits
3b3a651
d65604a
b3d434c
c40389e
ff8bec4
c3f28b3
0a495ec
8cf43d5
a5b8e94
650cbfb
642281a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,8 +13,7 @@ package LaTeXML::Util::Unicode; | |
use strict; | ||
use warnings; | ||
use base qw(Exporter); | ||
our @EXPORT = qw( &UTF &unicode_mathvariant &unicode_convert); | ||
|
||
our @EXPORT = qw( &UTF &NBSP &unicode_accent &unicode_mathvariant &unicode_convert); | ||
#====================================================================== | ||
# Unicode manipulation utilities useful for LaTeXML | ||
# Mostly, but not exclusively, about Mathematics | ||
|
@@ -24,6 +23,73 @@ sub UTF { | |
my ($code) = @_; | ||
return pack('U', $code); } | ||
|
||
# Non-breaking space (U+00A0), built once via UTF() and held in a file-scoped lexical. | ||
my $NBSP = UTF(0xA0); | ||
# Accessor returning that cached NBSP character; listed in @EXPORT as &NBSP. | ||
sub NBSP { return $NBSP; } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this attempting to guard the lexical Surprised it wasn't introduced as a package-level There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, switched over to cleaner |
||
|
||
#====================================================================== | ||
# Accents | ||
# There are potentially several Unicode codepoints that characterize a given accent: | ||
# combiner : unicode combining character that effects the accent when following a base char. | ||
# generally in Combining block | ||
# standalone : form that shows accent w/o base, but small(ish) and already raised/lowered! | ||
# sometimes called "isolated". Usually a "spacing" form, else NBSP followed by combiner. | ||
# unwrapped : form that shows the accent itself, typically larger and on baseline; | ||
# Used in operand for eg. MathML mover/munder | ||
# name : arbitrary short descriptive, for good measure | ||
# The ideal glyphs for each of these don't necessarily exist in Unicode, | ||
# nor are the best choices always clear. | ||
# Ideally, we would cover ALL accents that might appear in TeX! | ||
# Table of known accents, one hashref per accent, with keys: | ||
# name, combiner, standalone, unwrapped, and pos ('OVERACCENT' or 'UNDERACCENT'). | ||
# Trailing comments give what appears to be the corresponding TeX control sequence, | ||
# and parenthesized notes list alternative codepoints that were considered. | ||
our @accent_data = ( | ||
{ name => 'grave', combiner => "\x{0300}", standalone => UTF(0x60), # \` | ||
unwrapped => "`", pos => 'OVERACCENT' }, # (OR \x{2035} or UTF(0x60) ?) | ||
{ name => 'acute', combiner => "\x{0301}", standalone => UTF(0xB4), # \' | ||
unwrapped => UTF(0xB4), pos => 'OVERACCENT' }, # (OR \x{2032} or UTF(0xB4)?) | ||
{ name => 'hat', combiner => "\x{0302}", standalone => "\x{02C6}", # \^ | ||
unwrapped => UTF(0x5E), pos => 'OVERACCENT' }, | ||
{ name => 'ddot', combiner => "\x{0308}", standalone => UTF(0xA8), # \" | ||
unwrapped => UTF(0xA8), pos => 'OVERACCENT' }, # (or \x{22C5}) | ||
{ name => 'tilde', combiner => "\x{0303}", standalone => "\x{02DC}", # \~ | ||
unwrapped => UTF(0x7E), pos => 'OVERACCENT' }, | ||
{ name => 'bar', combiner => "\x{0304}", standalone => UTF(0xAF), # \= | ||
unwrapped => UTF(0xAF), pos => 'OVERACCENT' }, | ||
{ name => 'dot', combiner => "\x{0307}", standalone => "\x{02D9}", # \. | ||
unwrapped => "\x{02D9}", pos => 'OVERACCENT' }, # (OR \x{22C5} or \x{0209} ?) NOTE(review): \x{0209} looks like a typo -- confirm | ||
{ name => 'dtick', combiner => "\x{030B}", standalone => "\x{02DD}", # \H | ||
unwrapped => "\x{2032}\x{2032}", pos => 'OVERACCENT' }, # (Or UTF(0xA8) or " ?) | ||
{ name => 'breve', combiner => "\x{0306}", standalone => "\x{02D8}", # \u | ||
unwrapped => "\x{02D8}", pos => 'OVERACCENT' }, | ||
{ name => 'check', combiner => "\x{030C}", standalone => "\x{02C7}", # \v | ||
unwrapped => "\x{02C7}", pos => 'OVERACCENT' }, | ||
{ name => 'ring', combiner => "\x{030A}", standalone => "\x{02DA}", # \r | ||
unwrapped => "\x{02DA}", pos => 'OVERACCENT' }, # (or \x{2218} ?) | ||
{ name => 'vec', combiner => "\x{20D7}", standalone => $NBSP . "\x{20D7}", # \vec | ||
unwrapped => "\x{2192}", pos => 'OVERACCENT' }, | ||
{ name => 'tie', combiner => "\x{0361}", standalone => $NBSP . "\x{0361}", # \t | ||
unwrapped => "u", pos => 'OVERACCENT' }, | ||
## UNDERACCENT accents | ||
{ name => 'cedilla', combiner => "\x{0327}", standalone => UTF(0xB8), # \c | ||
unwrapped => UTF(0xB8), pos => 'UNDERACCENT' }, # not even math? | ||
{ name => 'underdot', combiner => "\x{0323}", standalone => '.', # \@text@daccent | ||
unwrapped => "\x{22C5}", pos => 'UNDERACCENT' }, # (Or \x{02D9} ?) | ||
{ name => 'underbar', combiner => "\x{0331}", standalone => '_', | ||
unwrapped => UTF(0xAF), pos => 'UNDERACCENT' }, | ||
{ name => 'lfhook', combiner => "\x{0326}", standalone => ",", # '\lfhook' | ||
unwrapped => ',', pos => 'UNDERACCENT' }, | ||
{ name => 'ogonek', combiner => "\x{0328}", standalone => "\x{02DB}", | ||
unwrapped => "\x{02DB}", pos => 'UNDERACCENT' }, # not even math??? | ||
); | ||
# Set up a hash keyed on both standalone & combiner chars | ||
# NOTE: %accent_data (this hash) and @accent_data (the array) are distinct variables | ||
# that merely share a name; $accent_data{...} indexes the hash, $accent_data[...] the array. | ||
our %accent_data = (); | ||
foreach my $entry (@accent_data) { | ||
# Both keys store the same hashref, so a lookup by either form yields the identical record. | ||
$accent_data{ $$entry{standalone} } = $entry; | ||
$accent_data{ $$entry{combiner} } = $entry; | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, this is rather dangerous. Could you change the name of either `@accent_data` or `%accent_data`? I was thrown for a bit not knowing what happened, seeing the hash indexing syntax in `unicode_accent`. Such a setup is bound to generate a subtle bug one day, when someone uses the wrong delimiters in haste: my @a = ('x','y','z');
my %a = ('1'=>'a', '2'=>'b', '3'=>'c');
print $a[2]; # z
print $a{2}; # b There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the hash variant can be kept, and you can even fully expand the loop that adds Simplest is to run it once, print with Dumper, then copy the result back into the file. And on second read - aren't There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Renamed a bit to make it less scary. |
||
|
||
# Lookup accent data keyed by either combiner or standalone unicode. | ||
# Argument: $char, a single accent character (combiner or standalone form). | ||
# Returns the matching accent record (a hashref stored in %accent_data), or a false | ||
# value when $char is undefined or has no entry in the hash. | ||
sub unicode_accent { | ||
my ($char) = @_; | ||
# && short-circuits: an undefined $char yields false without a hash lookup. | ||
return (defined $char) && $accent_data{$char}; } | ||
|
||
#====================================================================== | ||
# Unicode Math Codepoints | ||
# The basic latin and greek alphabets, as well as numbers appear in several | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we have a small test for using an unknown accent? Might be useful to keep checking as things move forward.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, put in a simplistic handling for this case, along w/CSS and an extra line in test.