Skip to content

Commit

Permalink
add islower/isupper functions (JuliaStrings#196)
Browse files Browse the repository at this point in the history
* add islower/isupper functions

* added test

* more tests + bugfix

* Makefile fix

* rm iscase test on make clean
  • Loading branch information
stevengj authored Aug 25, 2020
1 parent 08f9999 commit 5622a0a
Show file tree
Hide file tree
Showing 9 changed files with 7,527 additions and 5,938 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
/test/valid
/test/iterate
/test/case
/test/iscase
/test/custom
/tmp/
/mingw_static/
Expand Down
14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ clean:
ifneq ($(OS),Darwin)
rm -f libutf8proc.so.$(MAJOR)
endif
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc
rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase
rm -rf MANIFEST.new tmp
$(MAKE) -C bench clean
$(MAKE) -C data clean
Expand Down Expand Up @@ -129,6 +129,12 @@ data/NormalizationTest.txt:
# Data files that live in data/ are produced by the sub-make in that
# directory; $(@F) is the target's file name without the data/ prefix, so
# each target here asks data/Makefile for the file of the same name.
data/GraphemeBreakTest.txt data/Lowercase.txt data/Uppercase.txt:
	$(MAKE) -C data $(@F)

# Compile the shared test helpers. $< is the first prerequisite
# (test/tests.c) and $@ is the object file being built.
test/tests.o: test/tests.c test/tests.h utf8proc.h
	$(CC) $(UCFLAGS) -c -o $@ $<

Expand All @@ -150,6 +156,9 @@ test/valid: test/valid.c test/tests.o utf8proc.o utf8proc.h test/tests.h
# Test drivers that need nothing beyond their own source file, the shared
# test helpers and the library object. The static pattern rule maps
# test/NAME to test/NAME.c; $< is that source file.
test/iterate test/iscase test/case: test/%: test/%.c test/tests.o utf8proc.o utf8proc.h test/tests.h
	$(CC) $(UCFLAGS) $(LDFLAGS) $< test/tests.o utf8proc.o -o $@

Expand All @@ -159,7 +168,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
# Build the misc test driver. The backquoted Perl one-liner scans
# data/Makefile for the line starting with "UNICODE_VERSION=" and prints
# whatever follows the match ($$' reaches Perl as $', the post-match
# variable); the result is passed to the compiler as the string-literal
# macro UNICODE_VERSION.
test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@

check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
$(MAKE) -C bench
test/normtest data/NormalizationTest.txt
test/graphemetest data/GraphemeBreakTest.txt
Expand All @@ -168,4 +177,5 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
test/valid
test/iterate
test/case
test/iscase data/Lowercase.txt data/Uppercase.txt
test/custom
7 changes: 7 additions & 0 deletions data/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,13 @@ GraphemeBreakTest.txt:
emoji-data.txt:
$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt

# Extract one "# Derived Property: ..." section of DerivedCoreProperties.txt
# into its own file.
#
# The output is written to $@.tmp and renamed only when Ruby succeeds: a
# plain "> $@" creates the target before Ruby runs, so a failed or
# interrupted run would leave an empty file that make then treats as up to
# date. $< is the DerivedCoreProperties.txt prerequisite.
Uppercase.txt: DerivedCoreProperties.txt
	$(RUBY) -e 'puts File.read("$<")[/# Derived Property: Uppercase.*?# Total code points:/m]' > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@

Lowercase.txt: DerivedCoreProperties.txt
	$(RUBY) -e 'puts File.read("$<")[/# Derived Property: Lowercase.*?# Total code points:/m]' > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@

clean:
rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
rm -f Uppercase.txt Lowercase.txt
rm -f utf8proc_data.c.new
26 changes: 24 additions & 2 deletions data/data_generator.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,26 @@
end
end

# Extract the "# Derived Property: <property>" section from the text of
# DerivedCoreProperties.txt and expand it into a flat list of codepoints.
#
# text     - full contents of DerivedCoreProperties.txt
# property - property name as it appears in the section header
#
# Returns [section_text, codepoints]. Lines starting with "XXXX..YYYY"
# contribute every codepoint in that inclusive range; lines starting with a
# single hex number contribute that codepoint; all other lines are ignored.
# Raises if the section is not present, instead of failing later on nil.
def derived_property_codepoints(text, property)
  section = text[/# Derived Property: #{property}.*?# Total code points:/m]
  raise "no '#{property}' section found in DerivedCoreProperties.txt" if section.nil?
  codepoints = []
  section.each_line do |entry|
    if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
      codepoints.concat(($1.hex..$2.hex).to_a)
    elsif entry =~ /^[0-9A-F]+/
      codepoints << $&.hex
    end
  end
  [section, codepoints]
end

# Read the data file once and derive both case lists from the same text.
derived_core_properties = File.read("DerivedCoreProperties.txt")
$uppercase_list, $uppercase = derived_property_codepoints(derived_core_properties, "Uppercase")
$lowercase_list, $lowercase = derived_property_codepoints(derived_core_properties, "Lowercase")

$grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
$grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
$grapheme_boundclass_list.each_line do |entry|
Expand Down Expand Up @@ -204,8 +224,10 @@ def initialize(line)
$8.split.collect { |element| element.hex }
@bidi_mirrored = ($13=='Y') ? true : false
# issue #130: use nonstandard uppercase ß -> ẞ
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
@lowercase_mapping = ($17=='') ? nil : $17.hex
# issue #195: if character is uppercase but has no lowercase mapping,
# then make lowercase mapping = itself (vice versa for lowercase)
@uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : ($17=='' && $lowercase.include?(code) ? code : nil)) : $16.hex
@lowercase_mapping = ($17=='') ? ($16=='' && $uppercase.include?(code) ? code : nil) : $17.hex
@titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
end
def case_folding
Expand Down
62 changes: 62 additions & 0 deletions test/iscase.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include "tests.h"

/*
 * Read the next line from `f` and parse the codepoint, or "A..B" codepoint
 * range, found at its start.
 *
 * Returns 0 without touching *start / *end when, after skipspaces(), the
 * line is exhausted or begins with '#'. Otherwise stores the bounds in
 * *start and *end (equal when the line holds a single codepoint) and
 * returns 1.
 *
 * NOTE(review): simple_getline(), skipspaces(), encode() and check() come
 * from tests.h and are not visible here. The "- 1" below presumes encode()
 * returns one more than the number of input bytes it consumed - confirm.
 */
int read_range(FILE *f, utf8proc_int32_t *start, utf8proc_int32_t *end)
{
    unsigned char line[8192];
    unsigned char utf8[16];
    size_t n = simple_getline(line, f);
    size_t i = skipspaces(line, 0);

    /* nothing to parse on this line */
    if (i == n || line[i] == '#')
        return 0;

    /* first (or only) codepoint */
    i += encode(utf8, line + i) - 1;
    check(utf8[0], "invalid line %s in data", line);
    utf8proc_iterate((utf8proc_uint8_t*) utf8, -1, start);

    /* no ".." separator: the range is a single codepoint */
    if (line[i] != '.' || line[i+1] != '.') {
        *end = *start;
        return 1;
    }

    /* second codepoint follows the ".." */
    encode(utf8, line + i + 2);
    check(utf8[0], "invalid line %s in data", line);
    utf8proc_iterate((utf8proc_uint8_t*) utf8, -1, end);
    return 1;
}

/*
 * Verify a case predicate against a data file of codepoint ranges.
 *
 * fname    - path of the data file, parsed line by line with read_range()
 * iscase   - predicate under test (e.g. utf8proc_islower)
 * thatcase - matching case conversion (e.g. utf8proc_tolower)
 *
 * A single cursor `c` walks the codepoints from 0 through 0x110000
 * inclusive. Codepoints that fall between listed ranges, or after the last
 * one, must make iscase() return false; codepoints inside a range must make
 * iscase() return true and must be left unchanged by thatcase(). Because
 * the cursor only moves forward, the ranges are expected to appear in
 * ascending order.
 *
 * Failures are reported through check() (defined in tests.h). `success` is
 * never cleared in this function, so a normal return is always 1.
 */
int test_iscase(const char *fname, int (*iscase)(utf8proc_int32_t),
                utf8proc_int32_t (*thatcase)(utf8proc_int32_t))
{
    FILE *f = fopen(fname, "r");
    int lines = 0, tests = 0, success = 1;
    utf8proc_int32_t c = 0;

    check(f != NULL, "error opening data file \"%s\"\n", fname);

    while (success && !feof(f)) {
        utf8proc_int32_t start, end;
        if (read_range(f, &start, &end)) {
            /* gap before this range: none of these may have the property */
            while (c < start) {
                check(!iscase(c), "failed !iscase(%04x) in %s\n", c, fname);
                ++c;
            }
            /* the range itself: every codepoint must have the property */
            while (c <= end) {
                check(iscase(c), "failed iscase(%04x) in %s\n", c, fname);
                check(thatcase(c) == c, "inconsistent thatcase(%04x) in %s\n", c, fname);
                ++tests;
                ++c;
            }
        }
        ++lines;
    }

    /* everything after the last listed range */
    while (c <= 0x110000) {
        check(!iscase(c), "failed !iscase(%04x) in %s\n", c, fname);
        ++c;
    }

    printf("Checked %d characters from %d lines of %s\n", tests, lines, fname);
    fclose(f);
    return success;
}

/*
 * Entry point: expects exactly two arguments, the Lowercase data file
 * followed by the Uppercase data file, and runs test_iscase() on each with
 * the matching utf8proc predicate and conversion function.
 */
int main(int argc, char **argv)
{
    const char *lowercase_file;
    const char *uppercase_file;
    int ok;

    check(argc == 3, "Expected Lowercase.txt and Uppercase.txt as arguments");
    lowercase_file = argv[1];
    uppercase_file = argv[2];

    ok = test_iscase(lowercase_file, utf8proc_islower, utf8proc_tolower);
    check(ok, "Lowercase tests failed");

    ok = test_iscase(uppercase_file, utf8proc_isupper, utf8proc_toupper);
    check(ok, "Uppercase tests failed");

    printf("utf8proc iscase tests SUCCEEDED.\n");
    return 0;
}
12 changes: 6 additions & 6 deletions test/printproperty.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ int main(int argc, char **argv)
" combining_class = %d\n"
" bidi_class = %d\n"
" decomp_type = %d\n"
" uppercase_mapping = %x\n"
" lowercase_mapping = %x\n"
" titlecase_mapping = %x\n"
" uppercase_mapping = %04x (seqindex %04x)%s\n"
" lowercase_mapping = %04x (seqindex %04x)%s\n"
" titlecase_mapping = %04x (seqindex %04x)\n"
" casefold = %s\n"
" comb_index = %d\n"
" bidi_mirrored = %d\n"
Expand All @@ -43,9 +43,9 @@ int main(int argc, char **argv)
p->combining_class,
p->bidi_class,
p->decomp_type,
utf8proc_toupper(c),
utf8proc_tolower(c),
utf8proc_totitle(c),
utf8proc_toupper(c), p->uppercase_seqindex, utf8proc_isupper(c) ? " (isupper)" : "",
utf8proc_tolower(c), p->lowercase_seqindex, utf8proc_islower(c) ? " (islower)" : "",
utf8proc_totitle(c), p->titlecase_seqindex,
(char *) map,
p->comb_index,
p->bidi_mirrored,
Expand Down
12 changes: 12 additions & 0 deletions utf8proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
}

/* A codepoint counts as lower-case when its property entry has an
   upper-case mapping index but no lower-case one (UINT16_MAX is the
   "no entry" index value). */
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  int no_lower_entry = (prop->lowercase_seqindex == UINT16_MAX);
  int has_upper_entry = (prop->uppercase_seqindex != UINT16_MAX);
  return no_lower_entry && has_upper_entry;
}

/* A codepoint counts as upper-case when its property entry has a
   lower-case mapping index but no upper-case one (UINT16_MAX is the
   "no entry" index value), and it is not a titlecase letter
   (category Lt). */
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
{
  const utf8proc_property_t *prop = utf8proc_get_property(c);
  int no_upper_entry = (prop->uppercase_seqindex == UINT16_MAX);
  int has_lower_entry = (prop->lowercase_seqindex != UINT16_MAX);
  return no_upper_entry && has_lower_entry && prop->category != UTF8PROC_CATEGORY_LT;
}

/* return a character width analogous to wcwidth (except portable and
hopefully less buggy than most system wcwidth functions). */
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
Expand Down
12 changes: 12 additions & 0 deletions utf8proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
*/
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);

/**
 * Given a codepoint `c`, return `1` if the codepoint corresponds to a lower-case character
 * and `0` otherwise.
 *
 * The answer is derived from the codepoint's case-mapping entries in the
 * property table.
 */
UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c);

/**
 * Given a codepoint `c`, return `1` if the codepoint corresponds to an upper-case character
 * and `0` otherwise.
 *
 * The answer is derived from the codepoint's case-mapping entries in the
 * property table. Titlecase letters (category `UTF8PROC_CATEGORY_LT`) are
 * not reported as upper-case.
 */
UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c);

/**
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
* except that a width of 0 is returned for non-printable codepoints
Expand Down
Loading

0 comments on commit 5622a0a

Please sign in to comment.