add islower/isupper functions (JuliaStrings#196)

* add islower/isupper functions * added test * more tests + bugfix * Makefile fix * rm iscase test on make clean
archermarx · Aug 25, 2020 · 5622a0a · 5622a0a
1 parent 08f9999
commit 5622a0a
Show file tree

Hide file tree

Showing 9 changed files with 7,527 additions and 5,938 deletions.
diff --git a/.gitignore b/.gitignore
@@ -26,6 +26,7 @@
 /test/valid
 /test/iterate
 /test/case
+/test/iscase
 /test/custom
 /tmp/
 /mingw_static/

diff --git a/Makefile b/Makefile
@@ -56,7 +56,7 @@ clean:
 ifneq ($(OS),Darwin)
 	rm -f libutf8proc.so.$(MAJOR)
 endif
-	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc
+	rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom test/misc test/iscase
 	rm -rf MANIFEST.new tmp
 	$(MAKE) -C bench clean
 	$(MAKE) -C data clean
@@ -129,6 +129,12 @@ data/NormalizationTest.txt:
 data/GraphemeBreakTest.txt:
 	$(MAKE) -C data GraphemeBreakTest.txt
 
+# Case-property lists passed to test/iscase by the `check` target.
+# Both are produced by the sub-make in data/; $(@F) strips the leading
+# "data/" so the sub-make is asked for the bare file name.
+data/Lowercase.txt data/Uppercase.txt:
+	$(MAKE) -C data $(@F)
+
 test/tests.o: test/tests.c test/tests.h utf8proc.h
 	$(CC) $(UCFLAGS) -c -o test/tests.o test/tests.c
 
@@ -150,6 +156,9 @@ test/valid: test/valid.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) test/iterate.c test/tests.o utf8proc.o -o $@
 
+# Test program for utf8proc_islower/utf8proc_isupper.
+# $< is the first prerequisite (test/iscase.c); the headers are listed only
+# so that the program is rebuilt when they change.
+test/iscase: test/iscase.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+	$(CC) $(UCFLAGS) $(LDFLAGS) $< test/tests.o utf8proc.o -o $@
+
 test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) test/case.c test/tests.o utf8proc.o -o $@
 
@@ -159,7 +168,7 @@ test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 test/misc: test/misc.c test/tests.o utf8proc.o utf8proc.h test/tests.h
 	$(CC) $(UCFLAGS) $(LDFLAGS) -DUNICODE_VERSION='"'`$(PERL) -ne "/^UNICODE_VERSION=/ and print $$';" data/Makefile`'"' test/misc.c test/tests.o utf8proc.o -o $@
 
-check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+check: test/normtest data/NormalizationTest.txt data/Lowercase.txt data/Uppercase.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/iscase test/custom test/charwidth test/misc test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
 	$(MAKE) -C bench
 	test/normtest data/NormalizationTest.txt
 	test/graphemetest data/GraphemeBreakTest.txt
@@ -168,4 +177,5 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
 	test/valid
 	test/iterate
 	test/case
+	test/iscase data/Lowercase.txt data/Uppercase.txt
 	test/custom
diff --git a/data/Makefile b/data/Makefile
@@ -51,6 +51,13 @@ GraphemeBreakTest.txt:
 emoji-data.txt:
 	$(CURL) $(CURLFLAGS) -o $@ $(URLCACHE)https://unicode.org/Public/$(UNICODE_VERSION)/ucd/emoji/emoji-data.txt
 
+# Cut one derived-property section out of DerivedCoreProperties.txt: everything
+# from its "# Derived Property: <name>" header through the next
+# "# Total code points:" line.  <name> is the target without its .txt suffix,
+# so each target selects the section named after itself.
+Uppercase.txt Lowercase.txt: DerivedCoreProperties.txt
+	$(RUBY) -e 'puts File.read("$<")[/# Derived Property: $(basename $@).*?# Total code points:/m]' > $@
+
 clean:
 	rm -f UnicodeData.txt EastAsianWidth.txt GraphemeBreakProperty.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt CharWidths.txt emoji-data.txt
+	rm -f Uppercase.txt Lowercase.txt
 	rm -f utf8proc_data.c.new
diff --git a/data/data_generator.rb b/data/data_generator.rb
@@ -77,6 +77,26 @@
   end
 end
 
+$uppercase_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Uppercase.*?# Total code points:/m]
+$uppercase = []
+$uppercase_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
+    $1.hex.upto($2.hex) { |e2| $uppercase << e2 }
+  elsif entry =~ /^[0-9A-F]+/
+    $uppercase << $&.hex
+  end
+end
+
+$lowercase_list = File.read("DerivedCoreProperties.txt")[/# Derived Property: Lowercase.*?# Total code points:/m]
+$lowercase = []
+$lowercase_list.each_line do |entry|
+  if entry =~ /^([0-9A-F]+)\.\.([0-9A-F]+)/
+    $1.hex.upto($2.hex) { |e2| $lowercase << e2 }
+  elsif entry =~ /^[0-9A-F]+/
+    $lowercase << $&.hex
+  end
+end
+
 $grapheme_boundclass_list = File.read("GraphemeBreakProperty.txt")
 $grapheme_boundclass = Hash.new("UTF8PROC_BOUNDCLASS_OTHER")
 $grapheme_boundclass_list.each_line do |entry|
@@ -204,8 +224,10 @@ def initialize(line)
                          $8.split.collect { |element| element.hex }
     @bidi_mirrored     = ($13=='Y') ? true : false
     # issue #130: use nonstandard uppercase ß -> ẞ
-    @uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : nil) : $16.hex
-    @lowercase_mapping = ($17=='') ? nil : $17.hex
+    # issue #195: if character is uppercase but has no lowercase mapping,
+    #             then make lowercase mapping = itself (vice versa for lowercase)
+    @uppercase_mapping = ($16=='') ? (code==0x00df ? 0x1e9e : ($17=='' && $lowercase.include?(code) ? code : nil)) : $16.hex
+    @lowercase_mapping = ($17=='') ? ($16=='' && $uppercase.include?(code) ? code : nil) : $17.hex
     @titlecase_mapping = ($18=='') ? (code==0x00df ? 0x1e9e : nil) : $18.hex
   end
   def case_folding

diff --git a/test/iscase.c b/test/iscase.c
@@ -0,0 +1,62 @@
+#include "tests.h"
+
+/* Read the next line of a case-property data file and parse its codepoint
+ * range.
+ *
+ * A data line begins with either one codepoint or a range "first..last".
+ * Returns 1 and stores the bounds in *start and *end (equal for a single
+ * codepoint).  Returns 0, leaving the outputs untouched, when the line is
+ * empty after leading whitespace or starts with '#'.
+ *
+ * NOTE(review): encode() comes from tests.h; it is presumed to turn the
+ * textual codepoint into UTF-8 and return the number of input bytes
+ * consumed plus one -- confirm against tests.c. */
+int read_range(FILE *f, utf8proc_int32_t *start, utf8proc_int32_t *end)
+{
+     unsigned char line[8192];
+     unsigned char utf8[16];
+     size_t linelen, i;
+
+     linelen = simple_getline(line, f);
+     i = skipspaces(line, 0);
+     if (i == linelen || line[i] == '#')
+          return 0;
+
+     /* first (or only) codepoint on the line */
+     i += encode(utf8, line + i) - 1;
+     check(utf8[0], "invalid line %s in data", line);
+     utf8proc_iterate((utf8proc_uint8_t*) utf8, -1, start);
+
+     /* no ".." separator: the line names a single codepoint */
+     if (line[i] != '.' || line[i+1] != '.') {
+          *end = *start;
+          return 1;
+     }
+
+     /* upper bound of the range follows the ".." */
+     encode(utf8, line + i + 2);
+     check(utf8[0], "invalid line %s in data", line);
+     utf8proc_iterate((utf8proc_uint8_t*) utf8, -1, end);
+     return 1;
+}
+
+/* Check a case predicate against a property data file.
+ *
+ * fname    - data file listing the codepoints/ranges that have the property
+ * iscase   - predicate under test (e.g. utf8proc_islower)
+ * thatcase - matching case conversion (e.g. utf8proc_tolower); it must map
+ *            every codepoint that has the property to itself
+ *
+ * Every codepoint from 0 through 0x110000 is visited exactly once: those
+ * inside a listed range must satisfy iscase, all others must not.  The
+ * cursor only moves forward, so this assumes the ranges appear in ascending
+ * order.  Failures are reported through check(); the function returns 1
+ * when it reaches the end. */
+int test_iscase(const char *fname, int (*iscase)(utf8proc_int32_t),
+                utf8proc_int32_t (*thatcase)(utf8proc_int32_t))
+{
+     utf8proc_int32_t cp = 0; /* next codepoint that has not been checked yet */
+     int nlines = 0, nchecked = 0, success = 1;
+     FILE *data = fopen(fname, "r");
+
+     check(data != NULL, "error opening data file \"%s\"\n", fname);
+
+     while (success && !feof(data)) {
+          utf8proc_int32_t first, last;
+          if (read_range(data, &first, &last)) {
+               /* gap before this range: property must be absent */
+               while (cp < first) {
+                    check(!iscase(cp), "failed !iscase(%04x) in %s\n", cp, fname);
+                    ++cp;
+               }
+               /* the range itself: property must be present */
+               while (cp <= last) {
+                    check(iscase(cp), "failed iscase(%04x) in %s\n", cp, fname);
+                    check(thatcase(cp) == cp, "inconsistent thatcase(%04x) in %s\n", cp, fname);
+                    ++nchecked;
+                    ++cp;
+               }
+          }
+          ++nlines;
+     }
+
+     /* everything after the last listed range */
+     while (cp <= 0x110000) {
+          check(!iscase(cp), "failed !iscase(%04x) in %s\n", cp, fname);
+          ++cp;
+     }
+
+     printf("Checked %d characters from %d lines of %s\n", nchecked, nlines, fname);
+     fclose(data);
+     return success;
+}
+
+/* Usage: iscase <Lowercase.txt> <Uppercase.txt>
+ *
+ * Runs test_iscase once per file: the first against utf8proc_islower and
+ * utf8proc_tolower, the second against utf8proc_isupper and
+ * utf8proc_toupper. */
+int main(int argc, char **argv)
+{
+     const char *lowercase_data, *uppercase_data;
+
+     check(argc == 3, "Expected Lowercase.txt and Uppercase.txt as arguments");
+     /* only read argv[1..2] once the argument count is known to be right */
+     lowercase_data = argv[1];
+     uppercase_data = argv[2];
+
+     check(test_iscase(lowercase_data, utf8proc_islower, utf8proc_tolower), "Lowercase tests failed");
+     check(test_iscase(uppercase_data, utf8proc_isupper, utf8proc_toupper), "Uppercase tests failed");
+     printf("utf8proc iscase tests SUCCEEDED.\n");
+     return 0;
+}
diff --git a/test/printproperty.c b/test/printproperty.c
@@ -27,9 +27,9 @@ int main(int argc, char **argv)
             "  combining_class = %d\n"
             "  bidi_class = %d\n"
             "  decomp_type = %d\n"
-            "  uppercase_mapping = %x\n"
-            "  lowercase_mapping = %x\n"
-            "  titlecase_mapping = %x\n"
+            "  uppercase_mapping = %04x (seqindex %04x)%s\n"
+            "  lowercase_mapping = %04x (seqindex %04x)%s\n"
+            "  titlecase_mapping = %04x (seqindex %04x)\n"
             "  casefold = %s\n"
             "  comb_index = %d\n"
             "  bidi_mirrored = %d\n"
@@ -43,9 +43,9 @@ int main(int argc, char **argv)
         p->combining_class,
         p->bidi_class,
         p->decomp_type,
-        utf8proc_toupper(c),
-        utf8proc_tolower(c),
-        utf8proc_totitle(c),
+        utf8proc_toupper(c), p->uppercase_seqindex, utf8proc_isupper(c) ? " (isupper)" : "",
+        utf8proc_tolower(c), p->lowercase_seqindex, utf8proc_islower(c) ? " (islower)" : "",
+        utf8proc_totitle(c), p->titlecase_seqindex,
         (char *) map,
         p->comb_index,
         p->bidi_mirrored,

diff --git a/utf8proc.c b/utf8proc.c
@@ -384,6 +384,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
   return cu != UINT16_MAX ? seqindex_decode_index(cu) : c;
 }
 
+/* A codepoint counts as lower-case when it has no lower-case mapping of its
+   own but does have an upper-case mapping. */
+UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c)
+{
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
+  if (prop->lowercase_seqindex != UINT16_MAX) return 0;
+  return prop->uppercase_seqindex != UINT16_MAX;
+}
+
+/* A codepoint counts as upper-case when it has no upper-case mapping of its
+   own but does have a lower-case mapping; titlecase letters (category Lt)
+   are excluded. */
+UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c)
+{
+  const utf8proc_property_t *prop = utf8proc_get_property(c);
+  if (prop->uppercase_seqindex != UINT16_MAX) return 0;
+  if (prop->category == UTF8PROC_CATEGORY_LT) return 0;
+  return prop->lowercase_seqindex != UINT16_MAX;
+}
+
 /* return a character width analogous to wcwidth (except portable and
    hopefully less buggy than most system wcwidth functions). */
 UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {

diff --git a/utf8proc.h b/utf8proc.h
@@ -635,6 +635,18 @@ UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
  */
 UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
 
+/**
+ * Given a codepoint `c`, return `1` if the codepoint corresponds to a lower-case character
+ * and `0` otherwise.
+ *
+ * The bundled tests check this against the Unicode derived property
+ * `Lowercase` (DerivedCoreProperties.txt); for every such codepoint
+ * `utf8proc_tolower(c)` is expected to return `c` itself.
+ */
+UTF8PROC_DLLEXPORT int utf8proc_islower(utf8proc_int32_t c);
+
+/**
+ * Given a codepoint `c`, return `1` if the codepoint corresponds to an upper-case character
+ * and `0` otherwise.
+ *
+ * The bundled tests check this against the Unicode derived property
+ * `Uppercase` (DerivedCoreProperties.txt); for every such codepoint
+ * `utf8proc_toupper(c)` is expected to return `c` itself.
+ * Titlecase letters (category `UTF8PROC_CATEGORY_LT`) are not reported as
+ * upper-case.
+ */
+UTF8PROC_DLLEXPORT int utf8proc_isupper(utf8proc_int32_t c);
+
 /**
  * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
  * except that a width of 0 is returned for non-printable codepoints