From e3b31718ea2ecf15f52b83073a58eac3c62c576b Mon Sep 17 00:00:00 2001 From: Frank Tang Date: Tue, 1 Aug 2023 17:10:11 +0000 Subject: [PATCH] ICU-22365 C API for ULocaleBuilder See #2520 ICU-22365 Fix comments --- icu4c/source/common/BUILD.bazel | 1 + icu4c/source/common/common.vcxproj | 1 + icu4c/source/common/common.vcxproj.filters | 6 + icu4c/source/common/common_uwp.vcxproj | 1 + icu4c/source/common/sources.txt | 1 + icu4c/source/common/ulocbuilder.cpp | 145 ++ icu4c/source/common/unicode/ulocbuilder.h | 406 ++++ icu4c/source/test/cintltst/Makefile.in | 2 +- icu4c/source/test/cintltst/cintltst.vcxproj | 1 + .../test/cintltst/cintltst.vcxproj.filters | 3 + icu4c/source/test/cintltst/cutiltst.c | 2 + icu4c/source/test/cintltst/ulocbuildertst.c | 1735 +++++++++++++++++ icu4c/source/test/depstest/dependencies.txt | 1 + 13 files changed, 2304 insertions(+), 1 deletion(-) create mode 100644 icu4c/source/common/ulocbuilder.cpp create mode 100644 icu4c/source/common/unicode/ulocbuilder.h create mode 100644 icu4c/source/test/cintltst/ulocbuildertst.c diff --git a/icu4c/source/common/BUILD.bazel b/icu4c/source/common/BUILD.bazel index 47d3d24bf518..00f80046dfb1 100644 --- a/icu4c/source/common/BUILD.bazel +++ b/icu4c/source/common/BUILD.bazel @@ -609,6 +609,7 @@ cc_library( "uloc.cpp", "uloc_tag.cpp", "uloc_keytype.cpp", + "ulocbuilder.cpp", "uresbund.cpp", "uresdata.cpp", "wintz.cpp", diff --git a/icu4c/source/common/common.vcxproj b/icu4c/source/common/common.vcxproj index 2b4cc05357ac..caa62100ea92 100644 --- a/icu4c/source/common/common.vcxproj +++ b/icu4c/source/common/common.vcxproj @@ -167,6 +167,7 @@ + diff --git a/icu4c/source/common/common.vcxproj.filters b/icu4c/source/common/common.vcxproj.filters index 28a5d903429f..abb276f217b3 100644 --- a/icu4c/source/common/common.vcxproj.filters +++ b/icu4c/source/common/common.vcxproj.filters @@ -322,6 +322,9 @@ locales & resources + + locales & resources + locales & resources @@ -1132,6 +1135,9 @@ locales & resources 
+ + locales & resources + locales & resources diff --git a/icu4c/source/common/common_uwp.vcxproj b/icu4c/source/common/common_uwp.vcxproj index 5df0d57a7de4..247da53f7359 100644 --- a/icu4c/source/common/common_uwp.vcxproj +++ b/icu4c/source/common/common_uwp.vcxproj @@ -301,6 +301,7 @@ + diff --git a/icu4c/source/common/sources.txt b/icu4c/source/common/sources.txt index 90171fe9bd4b..6e7f9b020df7 100644 --- a/icu4c/source/common/sources.txt +++ b/icu4c/source/common/sources.txt @@ -139,6 +139,7 @@ ulist.cpp uloc.cpp uloc_keytype.cpp uloc_tag.cpp +ulocbuilder.cpp umapfile.cpp umath.cpp umutablecptrie.cpp
diff --git a/icu4c/source/common/ulocbuilder.cpp b/icu4c/source/common/ulocbuilder.cpp
new file mode 100644
index 000000000000..be344c9dfbd6
--- /dev/null
+++ b/icu4c/source/common/ulocbuilder.cpp
@@ -0,0 +1,182 @@
+// © 2023 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+#include <utility>
+
+#include "unicode/bytestream.h"
+#include "unicode/localebuilder.h"
+#include "unicode/locid.h"
+#include "unicode/stringpiece.h"
+#include "unicode/umachine.h"
+#include "unicode/ulocbuilder.h"
+#include "charstr.h"
+#include "cstring.h"
+
+using icu::CheckedArrayByteSink;
+using icu::StringPiece;
+
+// ULocaleBuilder is an opaque C handle; the object behind every handle is an
+// icu::LocaleBuilder. These macros convert between the two views.
+#define EXTERNAL(i) (reinterpret_cast<ULocaleBuilder*>(i))
+#define INTERNAL(e) (reinterpret_cast<icu::LocaleBuilder*>(e))
+#define CONST_INTERNAL(e) (reinterpret_cast<const icu::LocaleBuilder*>(e))
+
+// Creates an empty builder; release it with ulocbld_close().
+ULocaleBuilder* ulocbld_open() {
+    return EXTERNAL(new icu::LocaleBuilder());
+}
+
+// Destroys the builder. A null builder is ignored.
+void ulocbld_close(ULocaleBuilder* builder) {
+    if (builder == nullptr) return;
+    delete INTERNAL(builder);
+}
+
+// Resets the builder to the given locale ID. A negative length means that
+// locale is NUL-terminated; otherwise only the first length bytes are read.
+void ulocbld_setLocale(ULocaleBuilder* builder, const char* locale, int32_t length) {
+    if (builder == nullptr) return;
+    icu::Locale l;
+    if (length < 0) {
+        l = icu::Locale(locale);
+    } else {
+        // The Locale constructor needs a NUL-terminated string, but the caller
+        // only guarantees length readable bytes. Build a terminated copy
+        // instead of probing locale[length], which may lie outside the buffer.
+        UErrorCode status = U_ZERO_ERROR;
+        icu::CharString terminated(locale, length, status);
+        if (U_FAILURE(status)) {
+            l.setToBogus();
+        } else {
+            l = icu::Locale(terminated.data());
+        }
+    }
+    INTERNAL(builder)->setLocale(l);
+}
+
+// Wraps (s, l) in a StringPiece; a negative l means that s is NUL-terminated.
+#define STRING_PIECE(s, l) ((l)<0 ? StringPiece(s) : StringPiece((s), (l)))
+
+// Defines ulocbld_N(), which forwards its string argument to
+// icu::LocaleBuilder::N(). A null builder is ignored.
+#define IMPL_ULOCBLD_SETTER(N) \
+void ulocbld_##N(ULocaleBuilder* bld, const char* s, int32_t l) { \
+    if (bld == nullptr) return; \
+    INTERNAL(bld)->N(STRING_PIECE(s,l)); \
+}
+
+IMPL_ULOCBLD_SETTER(setLanguageTag)
+IMPL_ULOCBLD_SETTER(setLanguage)
+IMPL_ULOCBLD_SETTER(setScript)
+IMPL_ULOCBLD_SETTER(setRegion)
+IMPL_ULOCBLD_SETTER(setVariant)
+IMPL_ULOCBLD_SETTER(addUnicodeLocaleAttribute)
+IMPL_ULOCBLD_SETTER(removeUnicodeLocaleAttribute)
+
+// Sets the extension identified by key to value (negative length: NUL-terminated).
+void ulocbld_setExtension(ULocaleBuilder* builder, char key, const char* value, int32_t length) {
+    if (builder == nullptr) return;
+    INTERNAL(builder)->setExtension(key, STRING_PIECE(value, length));
+}
+
+// Sets the Unicode locale keyword key to type (negative lengths: NUL-terminated).
+void ulocbld_setUnicodeLocaleKeyword(
+        ULocaleBuilder* builder, const char* key, int32_t keyLength,
+        const char* type, int32_t typeLength) {
+    if (builder == nullptr) return;
+    INTERNAL(builder)->setUnicodeLocaleKeyword(
+        STRING_PIECE(key, keyLength), STRING_PIECE(type, typeLength));
+}
+
+// Resets the builder to its initial state.
+void ulocbld_clear(ULocaleBuilder* builder) {
+    if (builder == nullptr) return;
+    INTERNAL(builder)->clear();
+}
+
+// Removes all extensions from the builder.
+void ulocbld_clearExtensions(ULocaleBuilder* builder) {
+    if (builder == nullptr) return;
+    INTERNAL(builder)->clearExtensions();
+}
+
+// Shared argument check of the build functions: returns true when the call may
+// proceed. Leaves an existing failure in *err untouched and otherwise sets
+// U_ILLEGAL_ARGUMENT_ERROR for a null builder, a negative capacity, or a null
+// buffer with a non-zero capacity.
+static UBool checkBuildArguments(const ULocaleBuilder* builder, const char* buffer,
+                                 int32_t bufferCapacity, UErrorCode* err) {
+    if (err == nullptr || U_FAILURE(*err)) return false;
+    if (builder == nullptr || bufferCapacity < 0 ||
+            (buffer == nullptr && bufferCapacity > 0)) {
+        *err = U_ILLEGAL_ARGUMENT_ERROR;
+        return false;
+    }
+    return true;
+}
+
+// Builds the locale and writes its ID to buffer. Returns the full length of the
+// ID. The output is NUL-terminated only if it fits with room to spare; an exact
+// fit sets U_STRING_NOT_TERMINATED_WARNING, and a buffer that is too small
+// receives the truncated ID and U_BUFFER_OVERFLOW_ERROR.
+int32_t ulocbld_buildLocaleID(ULocaleBuilder* builder,
+                              char* buffer, int32_t bufferCapacity, UErrorCode* err) {
+    if (!checkBuildArguments(builder, buffer, bufferCapacity, err)) return 0;
+    icu::Locale l = INTERNAL(builder)->build(*err);
+    if (U_FAILURE(*err)) return 0;
+    const char* name = l.getName();
+    int32_t length = static_cast<int32_t>(uprv_strlen(name));
+    int32_t copied = length < bufferCapacity ? length : bufferCapacity;
+    if (copied > 0) {
+        uprv_memcpy(buffer, name, copied);
+    }
+    if (length < bufferCapacity) {
+        buffer[length] = '\0';
+    } else if (length == bufferCapacity) {
+        *err = U_STRING_NOT_TERMINATED_WARNING;
+    } else {
+        *err = U_BUFFER_OVERFLOW_ERROR;
+    }
+    return length;
+}
+
+// Builds the locale and writes its BCP 47 language tag to buffer, following the
+// same termination rules as ulocbld_buildLocaleID(). On overflow the number of
+// bytes the tag needs is returned together with U_BUFFER_OVERFLOW_ERROR.
+int32_t ulocbld_buildLanguageTag(ULocaleBuilder* builder,
+                                 char* buffer, int32_t bufferCapacity, UErrorCode* err) {
+    if (!checkBuildArguments(builder, buffer, bufferCapacity, err)) return 0;
+    icu::Locale l = INTERNAL(builder)->build(*err);
+    if (U_FAILURE(*err)) return 0;
+    CheckedArrayByteSink sink(buffer, bufferCapacity);
+    l.toLanguageTag(sink, *err);
+    if (U_FAILURE(*err)) {
+        return 0;
+    }
+    if (sink.Overflowed()) {
+        *err = U_BUFFER_OVERFLOW_ERROR;
+        return sink.NumberOfBytesAppended();
+    }
+    int32_t written = sink.NumberOfBytesWritten();
+
+    if (written < bufferCapacity) {
+        // null terminate
+        buffer[written] = '\0';
+    } else {
+        *err = U_STRING_NOT_TERMINATED_WARNING;
+    }
+    return written;
+}
+
+// Copies an error recorded by the setters into *outErrorCode unless it already
+// holds a failure. Returns true if *outErrorCode holds a failure afterwards.
+UBool ulocbld_copyErrorTo(const ULocaleBuilder* builder, UErrorCode *outErrorCode) {
+    if (builder == nullptr) {
+        // Keep an earlier failure, as for a valid builder.
+        if (U_SUCCESS(*outErrorCode)) {
+            *outErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        }
+        return true;
+    }
+    return CONST_INTERNAL(builder)->copyErrorTo(*outErrorCode);
+}
diff --git a/icu4c/source/common/unicode/ulocbuilder.h b/icu4c/source/common/unicode/ulocbuilder.h
new file mode 100644
index 000000000000..dfcf973c8c96
--- /dev/null
+++ b/icu4c/source/common/unicode/ulocbuilder.h
@@ -0,0 +1,406 @@
+// © 2023 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+#ifndef __ULOCBUILDER_H__
+#define __ULOCBUILDER_H__
+
+#include "unicode/localpointer.h"
+#include "unicode/utypes.h"
+
+/**
+ * \file
+ * \brief C API: Builder API for Locale
+ */
+
+#ifndef U_HIDE_DRAFT_API
+
+/**
+ * Opaque C service object type for the locale builder API
+ * @draft ICU 74
+ */
+struct ULocaleBuilder;
+
+/**
+ * C typedef for struct ULocaleBuilder.
+ * @draft ICU 74
+ */
+typedef struct ULocaleBuilder ULocaleBuilder;
+
+/**
+ * ULocaleBuilder is used to build valid locale id
+ * string or IETF BCP 47 language tag from values configured by the setters.
+ * The ULocaleBuilder checks if a value configured by a + * setter satisfies the syntax requirements defined by the Locale + * class. A string of Locale created by a ULocaleBuilder is + * well-formed and can be transformed to a well-formed IETF BCP 47 language tag + * without losing information. + * + *

The following example shows how to create a locale string + * with the ULocaleBuilder. + *

+ *
+ *     UErrorCode err = U_ZERO_ERROR;
+ *     char buffer[ULOC_FULLNAME_CAPACITY];
+ *     ULocaleBuilder* builder = ulocbld_open();
+ *     ulocbld_setLanguage(builder, "sr", -1);
+ *     ulocbld_setScript(builder, "Latn", -1);
+ *     ulocbld_setRegion(builder, "RS", -1);
+ *     int32_t length = ulocbld_buildLocaleID(
+ *         builder, buffer, ULOC_FULLNAME_CAPACITY, &err);
+ *     ulocbld_close(builder);
+ * 
+ *
+ * + *

ULocaleBuilders can be reused; ulocbld_clear() resets all + * fields to their default values. + * + *

ULocaleBuilder tracks errors in an internal UErrorCode. For all setters, + * except ulocbld_setLanguageTag and ulocbld_setLocale, ULocaleBuilder will return immediately + * if the internal UErrorCode is in error state. + * To reset internal state and error code, call clear method. + * The ulocbld_setLanguageTag and setLocale method will first clear the internal + * UErrorCode, then track the error of the validation of the input parameter + * into the internal UErrorCode. + * + * @draft ICU 74 + */ + +/** + * Constructs an empty ULocaleBuilder. The default value of all + * fields, extensions, and private use information is the + * empty string. The created builder should be destroyed by calling + * ulocbld_close(); + * + * @draft ICU 74 + */ +U_CAPI ULocaleBuilder* U_EXPORT2 +ulocbld_open(); + +/** + * Close the builder and destroy it's internal states. + * @param builder the builder + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_close(ULocaleBuilder* builder); + +/** + * Resets the ULocaleBuilder to match the provided + * locale. Existing state is discarded. + * + *

All fields of the locale must be well-formed. + *

This method clears the internal UErrorCode. + * + * @param builder the builder + * @param locale the locale, a const char * pointer (need not be terminated when + * the length is non-negative) + * @param length the length of the locale; if negative, then the locale need to be + * null terminated, + * + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_setLocale(ULocaleBuilder* builder, const char* locale, int32_t length); + +/** + * Resets the ULocaleBuilder to match the provided IETF BCP 47 language tag. + * Discards the existing state. + * The empty string causes the builder to be reset, like {@link #ulocbld_clear}. + * Legacy language tags (marked as “Type: grandfathered” in BCP 47) + * are converted to their canonical form before being processed. + * Otherwise, the language tag must be well-formed, + * or else the ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods + * will later report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

This method clears the internal UErrorCode. + * + * @param builder the builder + * @param tag the language tag, defined as IETF BCP 47 language tag, a + * const char * pointer (need not be terminated when + * the length is non-negative) + * @param length the length of the tag; if negative, then the tag need to be + * null terminated, + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_setLanguageTag(ULocaleBuilder* builder, const char* tag, int32_t length); + +/** + * Sets the language. If language is the empty string, the + * language in this ULocaleBuilder is removed. Otherwise, the + * language must be well-formed, or else the ulocbld_buildLocaleID() + * and ulocbld_buildLanguageTag() methods will + * later report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

The syntax of language value is defined as + * [unicode_language_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag). + * + * @param builder the builder + * @param language the language, a const char * pointer (need not be terminated when + * the length is non-negative) + * @param length the length of the language; if negative, then the language need to be + * null terminated, + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_setLanguage(ULocaleBuilder* builder, const char* language, int32_t length); + +/** + * Sets the script. If script is the empty string, the script in + * this ULocaleBuilder is removed. + * Otherwise, the script must be well-formed, or else the + * ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods will later + * report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

The script value is a four-letter script code as + * [unicode_script_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_script_subtag) + * defined by ISO 15924 + * + * @param builder the builder + * @param script the script, a const char * pointer (need not be terminated when + * the length is non-negative) + * @param length the length of the script; if negative, then the script need to be + * null terminated, + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_setScript(ULocaleBuilder* builder, const char* script, int32_t length); + +/** + * Sets the region. If region is the empty string, the region in this + * ULocaleBuilder is removed. Otherwise, the region + * must be well-formed, or else the ulocbld_buildLocaleID() and + * ulocbld_buildLanguageTag() methods will later report an + * U_ILLEGAL_ARGUMENT_ERROR. + * + *

The region value is defined by + * [unicode_region_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_region_subtag) + * as a two-letter ISO 3166 code or a three-digit UN M.49 area code. + * + *

The region value in the Locale created by the + * ULocaleBuilder is always normalized to upper case. + * + * @param builder the builder + * @param region the region, a const char * pointer (need not be terminated when + * the length is non-negative) + * @param length the length of the region; if negative, then the region need to be + * null terminated, + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_setRegion(ULocaleBuilder* builder, const char* region, int32_t length); + +/** + * Sets the variant. If variant is the empty string, the variant in this + * ULocaleBuilder is removed. Otherwise, the variant + * must be well-formed, or else the ulocbld_buildLocaleID() and + * ulocbld_buildLanguageTag() methods will later report an + * U_ILLEGAL_ARGUMENT_ERROR. + * + *

Note: This method checks if variant + * satisfies the + * [unicode_variant_subtag](http://www.unicode.org/reports/tr35/tr35.html#unicode_variant_subtag) + * syntax requirements, and normalizes the value to lowercase letters. However, + * the Locale class does not impose any syntactic + * restriction on variant. To set an ill-formed variant, use a Locale constructor. + * If there are multiple unicode_variant_subtag, the caller must concatenate + * them with '-' as separator (ex: "foobar-fibar"). + * + * @param builder the builder + * @param variant the variant, a const char * pointer (need not be terminated when + * the length is non-negative) + * @param length the length of the variant; if negative, then the variant need to be + * null terminated, + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_setVariant(ULocaleBuilder* builder, const char* variant, int32_t length); + +/** + * Sets the extension for the given key. If the value is the empty string, + * the extension is removed. Otherwise, the key and + * value must be well-formed, or else the ulocbld_buildLocaleID() + * and ulocbld_buildLanguageTag() methods will + * later report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

Note: The key ('u') is used for the Unicode locale extension. + * Setting a value for this key replaces any existing Unicode locale key/type + * pairs with those defined in the extension. + * + *

Note: The key ('x') is used for the private use code. To be + * well-formed, the value for this key needs only to have subtags of one to + * eight alphanumeric characters, not two to eight as in the general case. + * + * @param builder the builder + * @param key the extension key + * @param value the value, a const char * pointer (need not be terminated when + * the length is non-negative) + * @param length the length of the value; if negative, then the value need to be + * null terminated, + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_setExtension(ULocaleBuilder* builder, char key, const char* value, int32_t length); + +/** + * Sets the Unicode locale keyword type for the given key. If the type + * StringPiece is constructed with a nullptr, the keyword is removed. + * If the type is the empty string, the keyword is set without type subtags. + * Otherwise, the key and type must be well-formed, or else the + * ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() methods will later + * report an U_ILLEGAL_ARGUMENT_ERROR. + * + *

Keys and types are converted to lower case. + * + *

Note: Setting the 'u' extension via {@link #ulocbld_setExtension}
+ * replaces all Unicode locale keywords with those defined in the
+ * extension.
+ *
+ * @param builder the builder
+ * @param key the Unicode locale key, a const char * pointer (need not be
+ *        terminated when the length is non-negative)
+ * @param keyLength the length of the key; if negative, then the key needs to be
+ *        null-terminated
+ * @param type the Unicode locale type, a const char * pointer (need not be
+ *        terminated when the length is non-negative)
+ * @param typeLength the length of the type; if negative, then the type needs to
+ *        be null-terminated
+ *
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_setUnicodeLocaleKeyword(ULocaleBuilder* builder,
+        const char* key, int32_t keyLength, const char* type, int32_t typeLength);
+
+/**
+ * Adds a unicode locale attribute, if not already present, otherwise
+ * has no effect. The attribute must not be empty string and must be
+ * well-formed or U_ILLEGAL_ARGUMENT_ERROR will be set to status
+ * during the ulocbld_buildLocaleID() and ulocbld_buildLanguageTag() calls.
+ *
+ * @param builder the builder
+ * @param attribute the attribute, a const char * pointer (need not be
+ *        terminated when the length is non-negative)
+ * @param length the length of the attribute; if negative, then the attribute
+ *        needs to be null-terminated
+ * @draft ICU 74
+ */
+U_CAPI void U_EXPORT2
+ulocbld_addUnicodeLocaleAttribute(
+    ULocaleBuilder* builder, const char* attribute, int32_t length);
+
+/**
+ * Removes a unicode locale attribute, if present, otherwise has no
+ * effect. The attribute must not be empty string and must be well-formed
+ * or U_ILLEGAL_ARGUMENT_ERROR will be set to status during the ulocbld_buildLocaleID()
+ * and ulocbld_buildLanguageTag() calls.
+ *
+ *

Attribute comparison for removal is case-insensitive. + * + * @param builder the builder + * @param attribute the attribute, a const char * pointer (need not be + * terminated when the length is non-negative) + * @param length the length of the attribute; if negative, then the attribute + * need to be null terminated, + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_removeUnicodeLocaleAttribute( + ULocaleBuilder* builder, const char* attribute, int32_t length); + +/** + * Resets the builder to its initial, empty state. + *

This method clears the internal UErrorCode. + * + * @param builder the builder + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_clear(ULocaleBuilder* builder); + +/** + * Resets the extensions to their initial, empty state. + * Language, script, region and variant are unchanged. + * + * @param builder the builder + * @draft ICU 74 + */ +U_CAPI void U_EXPORT2 +ulocbld_clearExtensions(ULocaleBuilder* builder); + +/** + * Build the LocaleID string from the fields set on this builder. + * If any set methods or during the ulocbld_buildLocaleID() call require memory + * allocation but fail U_MEMORY_ALLOCATION_ERROR will be set to status. + * If any of the fields set by the setters are not well-formed, the status + * will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will + * not change after the ulocbld_buildLocaleID() call and the caller is + * free to keep using the same builder to build more locales. + * + * @param builder the builder + * @param locale the locale id + * @param localeCapacity the size of the locale buffer to store the locale id + * @param err the error code + * @return the length of the locale id in buffer + * @draft ICU 74 + */ +U_CAPI int32_t U_EXPORT2 +ulocbld_buildLocaleID(ULocaleBuilder* builder, char* locale, + int32_t localeCapacity, UErrorCode* err); + +/** + * Build the IETF BCP 47 language tag string from the fields set on this builder. + * If any set methods or during the ulocbld_buildLanguageTag() call require memory + * allocation but fail U_MEMORY_ALLOCATION_ERROR will be set to status. + * If any of the fields set by the setters are not well-formed, the status + * will be set to U_ILLEGAL_ARGUMENT_ERROR. The state of the builder will + * not change after the ulocbld_buildLanguageTag() call and the caller is free + * to keep using the same builder to build more locales. 
+ * + * @param builder the builder + * @param language the language tag + * @param languageCapacity the size of the language buffer to store the language + * tag + * @param err the error code + * @return the length of the language tag in buffer + * @draft ICU 74 + */ +U_CAPI int32_t U_EXPORT2 +ulocbld_buildLanguageTag(ULocaleBuilder* builder, char* language, + int32_t languageCapacity, UErrorCode* err); + +/** + * Sets the UErrorCode if an error occurred while recording sets. + * Preserves older error codes in the outErrorCode. + * + * @param builder the builder + * @param outErrorCode Set to an error code that occurred while setting subtags. + * Unchanged if there is no such error or if outErrorCode + * already contained an error. + * @return true if U_FAILURE(*outErrorCode) + * @draft ICU 74 + */ +U_CAPI UBool U_EXPORT2 +ulocbld_copyErrorTo(const ULocaleBuilder* builder, UErrorCode *outErrorCode); + +#if U_SHOW_CPLUSPLUS_API + +U_NAMESPACE_BEGIN + +/** + * \class LocalULocaleBuilderPointer + * "Smart pointer" class, closes a ULocaleBuilder via ulocbld_close(). + * For most methods see the LocalPointerBase base class. 
+ * + * @see LocalPointerBase + * @see LocalPointer + * @draft ICU 74 + */ +U_DEFINE_LOCAL_OPEN_POINTER(LocalULocaleBuilderPointer, ULocaleBuilder, ulocbld_close); + +U_NAMESPACE_END + +#endif /* U_SHOW_CPLUSPLUS_API */ + +#endif /* U_HIDE_DRAFT_API */ + +#endif // __ULOCBUILDER_H__ diff --git a/icu4c/source/test/cintltst/Makefile.in b/icu4c/source/test/cintltst/Makefile.in index 34f8ea2acdd6..6516fb1a4468 100644 --- a/icu4c/source/test/cintltst/Makefile.in +++ b/icu4c/source/test/cintltst/Makefile.in @@ -42,7 +42,7 @@ LIBS = $(LIBCTESTFW) $(LIBICUI18N) $(LIBICUTOOLUTIL) $(LIBICUUC) $(DEFAULT_LIBS) OBJECTS = callcoll.o calltest.o capitst.o cbiapts.o cbkittst.o \ ccaltst.o ucnvseltst.o cctest.o ccapitst.o ccolltst.o encoll.o cconvtst.o ccurrtst.o \ cdateintervalformattest.o cdattst.o cdetst.o cdtdptst.o cdtrgtst.o cestst.o cfintst.o \ -cformtst.o cfrtst.o cg7coll.o chashtst.o cintltst.o citertst.o cjaptst.o cloctst.o \ +cformtst.o cfrtst.o cg7coll.o chashtst.o cintltst.o citertst.o cjaptst.o cloctst.o ulocbuildertst.o \ cmsccoll.o cmsgtst.o cpluralrulestest.o cposxtst.o cldrtest.o \ cnmdptst.o cnormtst.o cnumtst.o crelativedateformattest.o crestst.o creststn.o cturtst.o \ cucdapi.o cucdtst.o custrtst.o cstrcase.o cutiltst.o nucnvtst.o nccbtst.o bocu1tst.o \ diff --git a/icu4c/source/test/cintltst/cintltst.vcxproj b/icu4c/source/test/cintltst/cintltst.vcxproj index 251d4fde6bb2..0f5a5c7ae196 100644 --- a/icu4c/source/test/cintltst/cintltst.vcxproj +++ b/icu4c/source/test/cintltst/cintltst.vcxproj @@ -127,6 +127,7 @@ + diff --git a/icu4c/source/test/cintltst/cintltst.vcxproj.filters b/icu4c/source/test/cintltst/cintltst.vcxproj.filters index d39436d44517..03d13c2ec799 100644 --- a/icu4c/source/test/cintltst/cintltst.vcxproj.filters +++ b/icu4c/source/test/cintltst/cintltst.vcxproj.filters @@ -234,6 +234,9 @@ locales & resources + + locales & resources + locales & resources diff --git a/icu4c/source/test/cintltst/cutiltst.c b/icu4c/source/test/cintltst/cutiltst.c index 
5f43c48ad280..bbac813103d6 100644 --- a/icu4c/source/test/cintltst/cutiltst.c +++ b/icu4c/source/test/cintltst/cutiltst.c @@ -17,6 +17,7 @@ #include "cintltst.h" void addLocaleTest(TestNode**); +void addLocaleBuilderTest(TestNode**); void addCLDRTest(TestNode**); void addUnicodeTest(TestNode**); void addUStringTest(TestNode**); @@ -41,6 +42,7 @@ void addUtility(TestNode** root) addTrie2Test(root); addUCPTrieTest(root); addLocaleTest(root); + addLocaleBuilderTest(root); addCLDRTest(root); addUnicodeTest(root); addUStringTest(root); diff --git a/icu4c/source/test/cintltst/ulocbuildertst.c b/icu4c/source/test/cintltst/ulocbuildertst.c new file mode 100644 index 000000000000..1fb9017f22d6 --- /dev/null +++ b/icu4c/source/test/cintltst/ulocbuildertst.c @@ -0,0 +1,1735 @@ +// © 2023 and later: Unicode, Inc. and others. +// License & terms of use: http://www.unicode.org/copyright.html + +#include "cintltst.h" +#include "cstring.h" +#include "unicode/uloc.h" +#include "unicode/ulocbuilder.h" + +#ifndef UPRV_LENGTHOF +#define UPRV_LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) +#endif +void addLocaleBuilderTest(TestNode** root); + +static void Verify(ULocaleBuilder* bld, const char* expected, const char* msg) { + UErrorCode status = U_ZERO_ERROR; + UErrorCode copyStatus = U_ZERO_ERROR; + UErrorCode errorStatus = U_ILLEGAL_ARGUMENT_ERROR; + if (ulocbld_copyErrorTo(bld, ©Status)) { + log_err(msg, u_errorName(copyStatus)); + } + if (!ulocbld_copyErrorTo(bld, &errorStatus)) { + log_err("Should always get the previous error and return false"); + } + char tag[ULOC_FULLNAME_CAPACITY]; + ulocbld_buildLanguageTag(bld, tag, ULOC_FULLNAME_CAPACITY, &status); + + if (U_FAILURE(status)) { + log_err(msg, u_errorName(status)); + } + if (status != copyStatus) { + log_err(msg, u_errorName(status)); + } + if (strcmp(tag, expected) != 0) { + log_err("should get \"%s\", but got \"%s\"\n", expected, tag); + } +} + +static void TestLocaleBuilder() { + + // The following test data 
are copy from + // icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/LocaleBuilderTest.java + // "L": +1 = language + // "S": +1 = script + // "R": +1 = region + // "V": +1 = variant + // "K": +1 = Unicode locale key / +2 = Unicode locale type + // "A": +1 = Unicode locale attribute + // "E": +1 = extension letter / +2 = extension value + // "P": +1 = private use + // "U": +1 = ULocale + // "B": +1 = BCP47 language tag + // "C": Clear all + // "N": Clear extensions + // "D": +1 = Unicode locale attribute to be removed + // "X": indicates an exception must be thrown + // "T": +1 = expected language tag / +2 = expected locale string + const char* TESTCASES[][14] = { + {"L", "en", "R", "us", "T", "en-US", "en_US"}, + {"L", "en", "R", "CA", "L", NULL, "T", "und-CA", "_CA"}, + {"L", "en", "R", "CA", "L", "", "T", "und-CA", "_CA"}, + {"L", "en", "R", "FR", "L", "fr", "T", "fr-FR", "fr_FR"}, + {"L", "123", "X"}, + {"R", "us", "T", "und-US", "_US"}, + {"R", "usa", "X"}, + {"R", "123", "L", "it", "R", NULL, "T", "it", "it"}, + {"R", "123", "L", "it", "R", "", "T", "it", "it"}, + {"R", "123", "L", "en", "T", "en-123", "en_123"}, + {"S", "LATN", "L", "DE", "T", "de-Latn", "de_Latn"}, + {"L", "De", "S", "latn", "R", "de", "S", "", "T", "de-DE", "de_DE"}, + {"L", "De", "S", "Arab", "R", "de", "S", NULL, "T", "de-DE", "de_DE"}, + {"S", "latin", "X"}, + {"V", "1234", "L", "en", "T", "en-1234", "en__1234"}, + {"V", "1234", "L", "en", "V", "5678", "T", "en-5678", "en__5678"}, + {"V", "1234", "L", "en", "V", NULL, "T", "en", "en"}, + {"V", "1234", "L", "en", "V", "", "T", "en", "en"}, + {"V", "123", "X"}, + {"U", "en_US", "T", "en-US", "en_US"}, + {"U", "en_US_WIN", "X"}, + {"B", "fr-FR-1606nict-u-ca-gregory-x-test", "T", + "fr-FR-1606nict-u-ca-gregory-x-test", + "fr_FR_1606NICT@calendar=gregorian;x=test"}, + {"B", "ab-cde-fghij", "T", "cde-fghij", "cde__FGHIJ"}, + {"B", "und-CA", "T", "und-CA", "_CA"}, + // Blocked by ICU-20327 + // {"B", "en-US-x-test-lvariant-var", "T", 
"en-US-x-test-lvariant-var", + // "en_US_VAR@x=test"}, + {"B", "en-US-VAR", "X"}, + {"U", "ja_JP@calendar=japanese;currency=JPY", "L", "ko", "T", + "ko-JP-u-ca-japanese-cu-jpy", "ko_JP@calendar=japanese;currency=JPY"}, + {"U", "ja_JP@calendar=japanese;currency=JPY", "K", "ca", NULL, "T", + "ja-JP-u-cu-jpy", "ja_JP@currency=JPY"}, + {"U", "ja_JP@calendar=japanese;currency=JPY", "E", "u", + "attr1-ca-gregory", "T", "ja-JP-u-attr1-ca-gregory", + "ja_JP@attribute=attr1;calendar=gregorian"}, + {"U", "en@colnumeric=yes", "K", "kn", "true", "T", "en-u-kn", + "en@colnumeric=yes"}, + {"L", "th", "R", "th", "K", "nu", "thai", "T", "th-TH-u-nu-thai", + "th_TH@numbers=thai"}, + {"U", "zh_Hans", "R", "sg", "K", "ca", "badcalendar", "X"}, + {"U", "zh_Hans", "R", "sg", "K", "cal", "gregory", "X"}, + {"E", "z", "ExtZ", "L", "en", "T", "en-z-extz", "en@z=extz"}, + {"E", "z", "ExtZ", "L", "en", "E", "z", "", "T", "en", "en"}, + {"E", "z", "ExtZ", "L", "en", "E", "z", NULL, "T", "en", "en"}, + {"E", "a", "x", "X"}, + {"E", "a", "abc_def", "T", "und-a-abc-def", "@a=abc-def"}, + // Design limitation - typeless u extension keyword 0a below is interpreted as a boolean value true/yes. + // With the legacy keyword syntax, "yes" is used for such boolean value instead of "true". + // However, once the legacy keyword is translated back to BCP 47 u extension, key "0a" is unknown, + // so "yes" is preserved - not mapped to "true". 
We could change the code to automatically transform + // key = alphanum alpha + {"L", "en", "E", "u", "bbb-aaa-0a", "T", "en-u-aaa-bbb-0a", + "en@0a=yes;attribute=aaa-bbb"}, + {"L", "fr", "R", "FR", "P", "Yoshito-ICU", "T", "fr-FR-x-yoshito-icu", + "fr_FR@x=yoshito-icu"}, + {"L", "ja", "R", "jp", "K", "ca", "japanese", "T", "ja-JP-u-ca-japanese", + "ja_JP@calendar=japanese"}, + {"K", "co", "PHONEBK", "K", "ca", "gregory", "L", "De", "T", + "de-u-ca-gregory-co-phonebk", "de@calendar=gregorian;collation=phonebook"}, + {"E", "o", "OPQR", "E", "a", "aBcD", "T", "und-a-abcd-o-opqr", "@a=abcd;o=opqr"}, + {"E", "u", "nu-thai-ca-gregory", "L", "TH", "T", "th-u-ca-gregory-nu-thai", + "th@calendar=gregorian;numbers=thai"}, + {"L", "en", "K", "tz", "usnyc", "R", "US", "T", "en-US-u-tz-usnyc", + "en_US@timezone=America/New_York"}, + {"L", "de", "K", "co", "phonebk", "K", "ks", "level1", "K", "kk", + "true", "T", "de-u-co-phonebk-kk-ks-level1", + "de@collation=phonebook;colnormalization=yes;colstrength=primary"}, + {"L", "en", "R", "US", "K", "ca", "gregory", "T", "en-US-u-ca-gregory", + "en_US@calendar=gregorian"}, + {"L", "en", "R", "US", "K", "cal", "gregory", "X"}, + {"L", "en", "R", "US", "K", "ca", "gregorian", "X"}, + {"L", "en", "R", "US", "K", "kn", "true", "T", "en-US-u-kn", + "en_US@colnumeric=yes"}, + {"B", "de-DE-u-co-phonebk", "C", "L", "pt", "T", "pt", "pt"}, + {"B", "ja-jp-u-ca-japanese", "N", "T", "ja-JP", "ja_JP"}, + {"B", "es-u-def-abc-co-trad", "A", "hij", "D", "def", "T", + "es-u-abc-hij-co-trad", "es@attribute=abc-hij;collation=traditional"}, + {"B", "es-u-def-abc-co-trad", "A", "hij", "D", "def", "D", "def", "T", + "es-u-abc-hij-co-trad", "es@attribute=abc-hij;collation=traditional"}, + {"L", "en", "A", "aa", "X"}, + {"B", "fr-u-attr1-cu-eur", "D", "attribute1", "X"}, + }; + UErrorCode status = U_ZERO_ERROR; + ULocaleBuilder* bld = ulocbld_open(); + char tag[ULOC_FULLNAME_CAPACITY]; + char locale[ULOC_FULLNAME_CAPACITY]; + for (int tidx = 0; tidx < 
UPRV_LENGTHOF(TESTCASES); tidx++) { + char actions[1000]; + actions[0] = '\0'; + for (int p = 0; p < UPRV_LENGTHOF(TESTCASES[tidx]); p++) { + if (TESTCASES[tidx][p] == NULL) { + strcpy(actions, " (nullptr)"); + break; + } + if (p > 0) strcpy(actions, " "); + strcpy(actions, TESTCASES[tidx][p]); + } + int i = 0; + const char* method; + status = U_ZERO_ERROR; + ulocbld_clear(bld); + while (true) { + status = U_ZERO_ERROR; + UErrorCode copyStatus = U_ZERO_ERROR; + method = TESTCASES[tidx][i++]; + if (strcmp("L", method) == 0) { + ulocbld_setLanguage(bld, TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("S", method) == 0) { + ulocbld_setScript(bld, TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("R", method) == 0) { + ulocbld_setRegion(bld, TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("V", method) == 0) { + ulocbld_setVariant(bld, TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("K", method) == 0) { + const char* key = TESTCASES[tidx][i++]; + const char* type = TESTCASES[tidx][i++]; + ulocbld_setUnicodeLocaleKeyword(bld, key, -1, type, -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("A", method) == 0) { + ulocbld_addUnicodeLocaleAttribute(bld, TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("E", method) == 0) { + const char* key = TESTCASES[tidx][i++]; + const char* value = TESTCASES[tidx][i++]; + ulocbld_setExtension(bld, key[0], value, -1); + 
ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("P", method) == 0) { + ulocbld_setExtension(bld, 'x', TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("U", method) == 0) { + ulocbld_setLocale(bld, TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("B", method) == 0) { + ulocbld_setLanguageTag(bld, TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } + // clear / remove + else if (strcmp("C", method) == 0) { + ulocbld_clear(bld); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("N", method) == 0) { + ulocbld_clearExtensions(bld); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } else if (strcmp("D", method) == 0) { + ulocbld_removeUnicodeLocaleAttribute(bld, TESTCASES[tidx][i++], -1); + ulocbld_copyErrorTo(bld, ©Status); + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + } + // result + else if (strcmp("X", method) == 0) { + if (U_SUCCESS(status)) { + log_err("FAIL: No error return - test case: %s", actions); + } + } else if (strcmp("T", method) == 0) { + status = U_ZERO_ERROR; + ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status); + if (status != copyStatus) { + log_err("copyErrorTo not matching"); + } + if (U_FAILURE(status) || + strcmp(locale, TESTCASES[tidx][i + 1]) != 0) { + log_err("FAIL: Wrong locale ID - %s %s %s", locale, + " for test case: ", actions); + } + ulocbld_buildLanguageTag(bld, tag, ULOC_FULLNAME_CAPACITY, &status); + if (U_FAILURE(status) || strcmp(tag, TESTCASES[tidx][i]) != 0) { + log_err("FAIL: Wrong 
language tag - %s %s %s", tag,
+                            " for test case: ", actions);
+                }
+                break;
+            } else {
+                // Unknown test method
+                log_err("Unknown test case method: There is an error in the test case data.");
+                break;
+            }
+            if (status != copyStatus) {
+                log_err("copyErrorTo not matching");
+            }
+            if (U_FAILURE(status)) {
+                if (strcmp("X", TESTCASES[tidx][i]) == 0) {
+                    // This failure is expected
+                    break;
+                } else {
+                    log_err("FAIL: U_ILLEGAL_ARGUMENT_ERROR at offset %d %s %s", i,
+                            " in test case: ", actions);
+                    break;
+                }
+            }
+            if (strcmp("T", method) == 0) {
+                break;
+            }
+        } // while(true)
+    } // for TESTCASES
+    ulocbld_close(bld);
+}
+
+// Builds a locale one field at a time on a single builder and checks the
+// result after each step, then empties the fields again one by one.
+// Several calls pass an explicit length shorter than the string ("revised###"
+// with length 7, "co###" with length 2) to exercise the length parameter.
+// Verify() is defined earlier in this file; it is handed the builder, the
+// expected tag and a failure format (presumably filled with the error name).
+static void TestLocaleBuilderBasic() {
+    ULocaleBuilder* bld = ulocbld_open();
+    ulocbld_setLanguage(bld, "zh", -1);
+    Verify(bld, "zh", "ulocbld_setLanguage('zh') got Error: %s\n");
+    ulocbld_setScript(bld, "Hant", -1);
+    Verify(bld, "zh-Hant", "ulocbld_setScript('Hant') got Error: %s\n");
+
+    ulocbld_setRegion(bld, "SG", -1);
+    Verify(bld, "zh-Hant-SG", "ulocbld_setRegion('SG') got Error: %s\n");
+
+    // Later setters replace earlier values of the same field.
+    ulocbld_setRegion(bld, "HK", -1);
+    ulocbld_setScript(bld, "Hans", -1);
+
+    Verify(bld, "zh-Hans-HK",
+           "ulocbld_setRegion('HK') and ulocbld_setScript('Hans') got Error: %s\n");
+
+    ulocbld_setVariant(bld, "revised###", 7);
+    Verify(bld, "zh-Hans-HK-revised",
+           "ulocbld_setVariant('revised') got Error: %s\n");
+
+    ulocbld_setUnicodeLocaleKeyword(bld, "nu", -1, "thai###", 4);
+    Verify(bld, "zh-Hans-HK-revised-u-nu-thai",
+           "ulocbld_setUnicodeLocaleKeyword('nu', 'thai') got Error: %s\n");
+
+    ulocbld_setUnicodeLocaleKeyword(bld, "co###", 2, "pinyin", -1);
+    Verify(bld, "zh-Hans-HK-revised-u-co-pinyin-nu-thai",
+           "ulocbld_setUnicodeLocaleKeyword('co', 'pinyin') got Error: %s\n");
+
+    ulocbld_setUnicodeLocaleKeyword(bld, "nu", -1, "latn###", 4);
+    Verify(bld, "zh-Hans-HK-revised-u-co-pinyin-nu-latn",
+           "ulocbld_setUnicodeLocaleKeyword('nu', 'latn') got Error: %s\n");
+
+    // A NULL type removes the keyword.
+    ulocbld_setUnicodeLocaleKeyword(bld, "nu", -1, "latn", -1);
+    ulocbld_setUnicodeLocaleKeyword(bld, "nu", -1, NULL, 0);
+    Verify(bld, "zh-Hans-HK-revised-u-co-pinyin",
+           "ulocbld_setUnicodeLocaleKeyword('nu', NULL) got Error: %s\n");
+
+
+    ulocbld_setUnicodeLocaleKeyword(bld, "co", -1, NULL, 0);
+    Verify(bld, "zh-Hans-HK-revised",
+           "ulocbld_setUnicodeLocaleKeyword('co', NULL) got Error: %s\n");
+
+    // An empty string clears the corresponding field.
+    ulocbld_setScript(bld, "", -1);
+    Verify(bld, "zh-HK-revised",
+           "ulocbld_setScript('') got Error: %s\n");
+
+    ulocbld_setVariant(bld, "", -1);
+    Verify(bld, "zh-HK",
+           "ulocbld_setVariant('') got Error: %s\n");
+
+    ulocbld_setRegion(bld, "", -1);
+    Verify(bld, "zh",
+           "ulocbld_setRegion('') got Error: %s\n");
+
+    ulocbld_close(bld);
+}
+
+// Re-runs TestLocaleBuilderBasic() while the process default locale carries
+// a -u- extension, then restores the previous default. Returns early,
+// without running the basic test, if the default cannot be changed.
+static void TestLocaleBuilderBasicWithExtensionsOnDefaultLocale() {
+    // Change the default locale to one with extension tags.
+    UErrorCode status = U_ZERO_ERROR;
+    char originalDefault[ULOC_FULLNAME_CAPACITY];
+    strcpy(originalDefault, uloc_getDefault());
+    uloc_setDefault("en-US-u-hc-h12", &status);
+    if (U_FAILURE(status)) {
+        log_err("ERROR: Could not change the default locale");
+        return;
+    }
+
+    // Invoke the basic test now that the default locale has been changed.
+    TestLocaleBuilderBasic();
+
+    uloc_setDefault(originalDefault, &status);
+    if (U_FAILURE(status)) {
+        log_err("ERROR: Could not restore the default locale");
+    }
+}
+
+static void TestSetLanguageWellFormed() {
+    // http://www.unicode.org/reports/tr35/tr35.html#unicode_language_subtag
+    // unicode_language_subtag = alpha{2,3} | alpha{5,8};
+    // ICUTC decided also support alpha{4}
+    static const char* wellFormedLanguages[] = {
+        "",
+
+        // alpha{2}
+        "en",
+        "NE",
+        "eN",
+        "Ne",
+
+        // alpha{3}
+        "aNe",
+        "zzz",
+        "AAA",
+
+        // alpha{4}
+        "ABCD",
+        "abcd",
+
+        // alpha{5}
+        "efgij",
+        "AbCAD",
+        "ZAASD",
+
+        // alpha{6}
+        "efgijk",
+        "AADGFE",
+        "AkDfFz",
+
+        // alpha{7}
+        "asdfads",
+        "ADSFADF",
+        "piSFkDk",
+
+        // alpha{8}
+        "oieradfz",
+        "IADSFJKR",
+        "kkDSFJkR",
+    };
+    // NOTE(review): this copy of the patch appears to have lost text between
+    // "i" and "2) {" on the next line (possibly stripped as markup). The
+    // statements that follow use bld, status and wellFormedAttributes, none of
+    // which is declared in this function, so they look like the tail of a
+    // different test. Restore this region from the upstream commit.
+    for(int32_t i=0;i 2) {
+            ulocbld_removeUnicodeLocaleAttribute(bld, wellFormedAttributes[i - 1], -1);
+            if (ulocbld_copyErrorTo(bld, &status)) {
+                log_err("removeUnicodeLocaleAttribute(\"%s\") got Error: %s\n",
+                        wellFormedAttributes[i - 1], u_errorName(status));
+            }
+            ulocbld_removeUnicodeLocaleAttribute(bld, wellFormedAttributes[i - 3], -1);
+            if (ulocbld_copyErrorTo(bld, &status)) {
+                log_err("removeUnicodeLocaleAttribute(\"%s\") got Error: %s\n",
+                        wellFormedAttributes[i - 3], u_errorName(status));
+            }
+        }
+    }
+    ulocbld_close(bld);
+}
+
+// Every entry must be rejected: after adding it as a Unicode locale
+// attribute on a fresh builder, building the locale ID has to report
+// U_ILLEGAL_ARGUMENT_ERROR.
+static void TestAddUnicodeLocaleAttributeIllFormed() {
+    static const char* illFormed[] = {
+        "aa",
+        "34",
+        "ab-",
+        "-cd",
+        "-ef-",
+        "zyzabcdef",
+        "123456789",
+        "ab-abc",
+        "1ZB30zfk9-abc",
+        "2ck30zfk9-adsf023-234kcZ",
+    };
+    for (int i = 0; i < UPRV_LENGTHOF(illFormed); i++) {
+        const char* ill = illFormed[i];
+        UErrorCode status = U_ZERO_ERROR;
+        ULocaleBuilder* bld = ulocbld_open();
+        ulocbld_addUnicodeLocaleAttribute(bld, ill, -1);
+        char buffer[ULOC_FULLNAME_CAPACITY];
+        ulocbld_buildLocaleID(bld, buffer, ULOC_FULLNAME_CAPACITY, &status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            log_err("addUnicodeLocaleAttribute(\"%s\") should fail but has no Error\n",
+                    ill);
+        }
+        ulocbld_close(bld);
+    }
+}
+
+// Exercises ulocbld_setExtension() with the 'u' / 'U' key: each call replaces
+// the whole -u- extension, an empty value removes it, and explicit lengths
+// cut the value short ("ca-islamicABCDE" with length 10 yields "ca-islamic").
+static void TestSetExtensionU() {
+    ULocaleBuilder* bld = ulocbld_open();
+    ulocbld_setLanguage(bld, "zhABC", 2);
+    Verify(bld, "zh",
+           "ulocbld_setLanguage(\"zh\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'u', "co-stroke", -1);
+    Verify(bld, "zh-u-co-stroke",
+           "ulocbld_setExtension('u', \"co-stroke\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'U', "ca-islamicABCDE", 10);
+    Verify(bld, "zh-u-ca-islamic",
+           "ulocbld_setExtension('U', \"ca-islamic\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'u', "ca-chinese", 10);
+    Verify(bld, "zh-u-ca-chinese",
+           "ulocbld_setExtension('u', \"ca-chinese\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'U', "co-pinyin1234", 9);
+    Verify(bld, "zh-u-co-pinyin",
+           "ulocbld_setExtension('U', \"co-pinyin\") got Error: %s\n");
+
+    ulocbld_setRegion(bld, "TW123", 2);
+    Verify(bld, "zh-TW-u-co-pinyin",
+           "ulocbld_setRegion(\"TW\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'U', "", 0);
+    Verify(bld, "zh-TW",
+           "ulocbld_setExtension('U', \"\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'u', "abc-defg-kr-face", -1);
+    Verify(bld, "zh-TW-u-abc-defg-kr-face",
+           "ulocbld_setExtension('u', \"abc-defg-kr-face\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'U', "ca-japanese", -1);
+    Verify(bld, "zh-TW-u-ca-japanese",
+           "ulocbld_setExtension('U', \"ca-japanese\") got Error: %s\n");
+
+    ulocbld_close(bld);
+}
+
+// Every entry must be accepted as the value of the 'u' extension: a fresh
+// builder is given the value and building the locale ID must succeed.
+static void TestSetExtensionValidateUWellFormed() {
+    static const char* wellFormedExtensions[] = {
+        // keyword
+        //   keyword = key (sep type)? ;
+        //   key = alphanum alpha ;
+        //   type = alphanum{3,8} (sep alphanum{3,8})* ;
+        "3A",
+        "ZA",
+        "az-abc",
+        "zz-123",
+        "7z-12345678",
+        "kb-A234567Z",
+        // (sep keyword)+
+        "1z-ZZ",
+        "2z-ZZ-123",
+        "3z-ZZ-123-cd",
+        "0z-ZZ-123-cd-efghijkl",
+        // attribute
+        "abc",
+        "456",
+        "87654321",
+        "ZABADFSD",
+        // (sep attribute)+
+        "abc-ZABADFSD",
+        "123-ZABADFSD",
+        "K2K-12345678",
+        "K2K-12345678-zzz",
+        // (sep attribute)+ (sep keyword)*
+        "K2K-12345678-zz",
+        "K2K-12345678-zz-0z",
+        "K2K-12345678-9z-AZ-abc",
+        "K2K-12345678-zz-9A-234",
+        "K2K-12345678-zk0-abc-efg-zz-9k-234",
+    };
+    for (int i = 0; i < UPRV_LENGTHOF(wellFormedExtensions); i++) {
+        const char* extension = wellFormedExtensions[i];
+        UErrorCode status = U_ZERO_ERROR;
+        ULocaleBuilder* bld = ulocbld_open();
+        ulocbld_setExtension(bld, 'u', extension, -1);
+        char buffer[ULOC_FULLNAME_CAPACITY];
+        ulocbld_buildLocaleID(bld, buffer, ULOC_FULLNAME_CAPACITY, &status);
+        if (U_FAILURE(status)) {
+            log_err("setExtension('u', \"%s\") got Error: %s\n",
+                    extension, u_errorName(status));
+        }
+        ulocbld_close(bld);
+    }
+}
+
+// Every entry must be rejected as the value of the 'u' extension: building
+// the locale ID afterwards has to report U_ILLEGAL_ARGUMENT_ERROR.
+static void TestSetExtensionValidateUIllFormed() {
+    static const char* illFormed[] = {
+        // bad key
+        "-",
+        "-ab",
+        "ab-",
+        "abc-",
+        "-abc",
+        "0",
+        "a",
+        "A0",
+        "z9",
+        "09",
+        "90",
+        // bad keyword
+        "AB-A0",
+        "AB-efg-A0",
+        "xy-123456789",
+        "AB-Aa-",
+        "AB-Aac-",
+        // bad attribute
+        "abcdefghi",
+        "abcdefgh-",
+        "abcdefgh-abcdefghi",
+        "abcdefgh-1",
+        "abcdefgh-a",
+        "abcdefgh-a2345678z",
+    };
+    for (int i = 0; i < UPRV_LENGTHOF(illFormed); i++) {
+        const char* ill = illFormed[i];
+        UErrorCode status = U_ZERO_ERROR;
+        ULocaleBuilder* bld = ulocbld_open();
+        ulocbld_setExtension(bld, 'u', ill, -1);
+        char buffer[ULOC_FULLNAME_CAPACITY];
+        ulocbld_buildLocaleID(bld, buffer, ULOC_FULLNAME_CAPACITY, &status);
+        if (status != U_ILLEGAL_ARGUMENT_ERROR) {
+            log_err("setExtension('u', \"%s\") should fail but has no Error\n",
+                    ill);
+        }
+        ulocbld_close(bld);
+    }
+}
+
+static void
TestSetExtensionT() {
+    // Exercises ulocbld_setExtension() with the 't' / 'T' key: each call
+    // replaces the whole -t- extension, the value comes back lower-cased in
+    // the expected tag, and an empty value removes the extension again.
+    ULocaleBuilder* bld = ulocbld_open();
+    ulocbld_setLanguage(bld, "fr", 2);
+    Verify(bld, "fr",
+           "ulocbld_setLanguage(\"fr\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'T', "zh", -1);
+    Verify(bld, "fr-t-zh",
+           "ulocbld_setExtension('T', \"zh\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 't', "zh-Hant-TW-1234-A9-123-456ABCDE", -1);
+    Verify(bld, "fr-t-zh-hant-tw-1234-a9-123-456abcde",
+           "ulocbld_setExtension('t', \"zh-Hant-TW-1234-A9-123-456ABCDE\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'T', "a9-123", -1);
+    Verify(bld, "fr-t-a9-123",
+           "ulocbld_setExtension('T', \"a9-123\") got Error: %s\n");
+
+    ulocbld_setRegion(bld, "MX###", 2);
+    Verify(bld, "fr-MX-t-a9-123",
+           "ulocbld_setRegion(\"MX\") got Error: %s\n");
+
+    ulocbld_setScript(bld, "Hans##", 4);
+    Verify(bld, "fr-Hans-MX-t-a9-123",
+           "ulocbld_setScript(\"Hans\") got Error: %s\n");
+
+    ulocbld_setVariant(bld, "9abc-abcde1234", 10);
+    Verify(bld, "fr-Hans-MX-9abc-abcde-t-a9-123",
+           "ulocbld_setVariant(\"9abc-abcde\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'T', "", 0);
+    Verify(bld, "fr-Hans-MX-9abc-abcde",
+           "ulocbld_setExtension('T', \"\") got Error: %s\n");
+
+    ulocbld_close(bld);
+}
+
+// Every entry must be accepted as the value of the 't' extension. One
+// builder is reused and cleared before each entry; the builder's stored
+// error is read back with ulocbld_copyErrorTo().
+static void TestSetExtensionValidateTWellFormed() {
+    // ((sep tlang (sep tfield)*) | (sep tfield)+)
+    static const char* wellFormedExtensions[] = {
+        // tlang
+        //  tlang = unicode_language_subtag (sep unicode_script_subtag)?
+        //          (sep unicode_region_subtag)? (sep unicode_variant_subtag)* ;
+        // unicode_language_subtag
+        "en",
+        "abc",
+        "abcde",
+        "ABCDEFGH",
+        // unicode_language_subtag sep unicode_script_subtag
+        "en-latn",
+        "abc-arab",
+        "ABCDEFGH-Thai",
+        // unicode_language_subtag sep unicode_script_subtag sep unicode_region_subtag
+        "en-latn-ME",
+        "abc-arab-RU",
+        "ABCDEFGH-Thai-TH",
+        "en-latn-409",
+        "abc-arab-123",
+        "ABCDEFGH-Thai-456",
+        // unicode_language_subtag sep unicode_region_subtag
+        "en-ME",
+        "abc-RU",
+        "ABCDEFGH-TH",
+        "en-409",
+        "abc-123",
+        "ABCDEFGH-456",
+        // unicode_language_subtag sep unicode_script_subtag sep unicode_region_subtag
+        // sep (sep unicode_variant_subtag)*
+        "en-latn-ME-abcde",
+        "abc-arab-RU-3abc-abcdef",
+        "ABCDEFGH-Thai-TH-ADSFS-9xyz-abcdef",
+        "en-latn-409-xafsa",
+        "abc-arab-123-ADASDF",
+        "ABCDEFGH-Thai-456-9sdf-ADASFAS",
+        // (sep tfield)+
+        "A0-abcde",
+        "z9-abcde123",
+        "z9-abcde123-a1-abcde",
+        // tlang (sep tfield)*
+        "fr-A0-abcde",
+        "fr-FR-A0-abcde",
+        "fr-123-z9-abcde123-a1-abcde",
+        "fr-Latn-FR-z9-abcde123-a1-abcde",
+        "gab-Thai-TH-abcde-z9-abcde123-a1-abcde",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-abcde",
+    };
+    ULocaleBuilder* bld = ulocbld_open();
+    for (int i = 0; i < UPRV_LENGTHOF(wellFormedExtensions); i++) {
+        ulocbld_clear(bld);
+        const char* extension = wellFormedExtensions[i];
+        ulocbld_setExtension(bld, 't', extension, -1);
+        UErrorCode status = U_ZERO_ERROR;
+        if (ulocbld_copyErrorTo(bld, &status)) {
+            log_err("ulocbld_setExtension('t', \"%s\") got Error: %s\n",
+                    extension, u_errorName(status));
+        }
+    }
+    ulocbld_close(bld);
+}
+
+// Every entry must be rejected as the value of the 't' extension:
+// ulocbld_copyErrorTo() has to report U_ILLEGAL_ARGUMENT_ERROR.
+static void TestSetExtensionValidateTIllFormed() {
+    static const char* illFormed[] = {
+        "a",
+        "a-",
+        "0",
+        "9-",
+        "-9",
+        "-z",
+        "Latn",
+        "Latn-",
+        "en-",
+        "nob-",
+        "-z9",
+        "a3",
+        "a3-",
+        "3a",
+        "0z-",
+        "en-123-a1",
+        "en-TH-a1",
+        "gab-TH-a1",
+        "gab-Thai-a1",
+        "gab-Thai-TH-a1",
+        "gab-Thai-TH-0bde-a1",
+        "gab-Thai-TH-0bde-3b",
+        "gab-Thai-TH-0bde-z9-a1",
+        "gab-Thai-TH-0bde-z9-3b",
+        "gab-Thai-TH-0bde-z9-abcde123-3b",
+        "gab-Thai-TH-0bde-z9-abcde123-ab",
+        "gab-Thai-TH-0bde-z9-abcde123-ab",
+        "gab-Thai-TH-0bde-z9-abcde123-a1",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-a",
+        "gab-Thai-TH-0bde-z9-abcde123-a1-ab",
+        // ICU-21408
+        "root",
+    };
+    ULocaleBuilder* bld = ulocbld_open();
+    for (int i = 0; i < UPRV_LENGTHOF(illFormed); i++) {
+        ulocbld_clear(bld);
+        const char* ill = illFormed[i];
+        UErrorCode status = U_ZERO_ERROR;
+        ulocbld_setExtension(bld, 't', ill, -1);
+        if (!ulocbld_copyErrorTo(bld, &status) || status != U_ILLEGAL_ARGUMENT_ERROR) {
+            log_err("setExtension('t', \"%s\") should fail but has no Error\n",
+                    ill);
+        }
+    }
+    ulocbld_close(bld);
+}
+
+// Exercises ulocbld_setExtension() with the private-use key 'x' / 'X': each
+// call replaces the whole -x- part and an empty value removes it.
+static void TestSetExtensionPU() {
+    ULocaleBuilder* bld = ulocbld_open();
+    ulocbld_setLanguage(bld, "ar123", 2);
+    Verify(bld, "ar",
+           "ulocbld_setLanguage(\"ar\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'X', "a-b-c-d-e12345", 9);
+    Verify(bld, "ar-x-a-b-c-d-e",
+           "ulocbld_setExtension('X', \"a-b-c-d-e\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'x', "0-1-2-3", -1);
+    Verify(bld, "ar-x-0-1-2-3",
+           "ulocbld_setExtension('x', \"0-1-2-3\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'X', "0-12345678-x-x", -1);
+    Verify(bld, "ar-x-0-12345678-x-x",
+           "ulocbld_setExtension('X', \"0-12345678-x-x\") got Error: %s\n");
+
+    ulocbld_setRegion(bld, "TH123", 2);
+    Verify(bld, "ar-TH-x-0-12345678-x-x",
+           "ulocbld_setRegion(\"TH\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'X', "", -1);
+    Verify(bld, "ar-TH",
+           "ulocbld_setExtension('X', \"\") got Error: %s\n");
+    ulocbld_close(bld);
+}
+
+// Every entry must be accepted as a private-use ('x') value. The table mixes
+// 1-character and 8-character subtags, letters and digits.
+static void TestSetExtensionValidatePUWellFormed() {
+    // pu_extensions = sep [xX] (sep alphanum{1,8})+ ;
+    static const char* wellFormedExtensions[] = {
+        "a",  // Short subtag
+        "z",  // Short subtag
+        "0",  // Short subtag, digit
+        "9",  // Short subtag, digit
+        "a-0",  // Two short subtag, alpha and digit
+        "9-z",  // Two short subtag, digit and alpha
+        "ab",
+        "abc",
+        "abcefghi",  // Long subtag
+        "87654321",
+        "01",
+        "234",
+        "0a-ab-87654321",  // Three subtags
+        "87654321-ab-00-3A",  // Four subtags
+        "a-9-87654321",  // Three subtags with short and long subtags
+        "87654321-ab-0-3A",
+    };
+    ULocaleBuilder* bld = ulocbld_open();
+    for (int i = 0; i < UPRV_LENGTHOF(wellFormedExtensions); i++) {
+        ulocbld_clear(bld);
+        const char* extension = wellFormedExtensions[i];
+        UErrorCode status = U_ZERO_ERROR;
+        ulocbld_setExtension(bld, 'x', extension, -1);
+        if (ulocbld_copyErrorTo(bld, &status) || U_FAILURE(status)) {
+            log_err("setExtension('x', \"%s\") got Error: %s\n",
+                    extension, u_errorName(status));
+        }
+    }
+    ulocbld_close(bld);
+}
+
+// Every entry contains a 9-character subtag and must be rejected as a
+// private-use ('x') value with U_ILLEGAL_ARGUMENT_ERROR.
+static void TestSetExtensionValidatePUIllFormed() {
+    static const char* illFormed[] = {
+        "123456789",  // Too long
+        "abcdefghi",  // Too long
+        "ab-123456789",  // Second subtag too long
+        "abcdefghi-12",  // First subtag too long
+        "a-ab-987654321",  // Third subtag too long
+        "987654321-a-0-3",  // First subtag too long
+    };
+    ULocaleBuilder* bld = ulocbld_open();
+    for (int i = 0; i < UPRV_LENGTHOF(illFormed); i++) {
+        const char* ill = illFormed[i];
+        ulocbld_clear(bld);
+        ulocbld_setExtension(bld, 'x', ill, -1);
+        UErrorCode status = U_ZERO_ERROR;
+        if (!ulocbld_copyErrorTo(bld, &status) || status != U_ILLEGAL_ARGUMENT_ERROR) {
+            log_err("ulocbld_setExtension('x', \"%s\") should fail but has no Error\n",
+                    ill);
+        }
+    }
+    ulocbld_close(bld);
+}
+
+// Exercises ulocbld_setExtension() with keys other than t, u and x. The
+// expected tags list the extensions ordered by key ('0' < 'a' < 'z')
+// whatever the order they were set in, and lower-case key and value.
+static void TestSetExtensionOthers() {
+    ULocaleBuilder* bld = ulocbld_open();
+    ulocbld_setLanguage(bld, "fr", -1);
+    Verify(bld, "fr",
+           "ulocbld_setLanguage(\"fr\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'Z', "ab1234", 2);
+    Verify(bld, "fr-z-ab",
+           "ulocbld_setExtension('Z', \"ab\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, '0', "xyz12345-abcdefg", -1);
+    Verify(bld, "fr-0-xyz12345-abcdefg-z-ab",
+           "ulocbld_setExtension('0', \"xyz12345-abcdefg\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'a', "01-12345678-ABcdef", -1);
+    Verify(bld, "fr-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "ulocbld_setExtension('a', \"01-12345678-ABcdef\") got Error: %s\n");
+
+    ulocbld_setRegion(bld, "TH1234", 2);
+    Verify(bld, "fr-TH-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "ulocbld_setRegion(\"TH\") got Error: %s\n");
+
+    ulocbld_setScript(bld, "Arab", -1);
+    Verify(bld, "fr-Arab-TH-0-xyz12345-abcdefg-a-01-12345678-abcdef-z-ab",
+           "ulocbld_setScript(\"Arab\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'A', "97", 2);
+    Verify(bld, "fr-Arab-TH-0-xyz12345-abcdefg-a-97-z-ab",
+           "ulocbld_setExtension('A', \"97\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, 'a', "", 0);
+    Verify(bld, "fr-Arab-TH-0-xyz12345-abcdefg-z-ab",
+           "ulocbld_setExtension('a', \"\") got Error: %s\n");
+
+    ulocbld_setExtension(bld, '0', "", -1);
+    Verify(bld, "fr-Arab-TH-z-ab",
+           "ulocbld_setExtension('0', \"\") got Error: %s\n");
+    ulocbld_close(bld);
+}
+
+// Two passes over ulocbld_setExtension() for keys other than t, u and x.
+//  1. Every table entry is set under a letter key and must be accepted.
+//  2. Every character of someChars is tried as the key: letters and digits
+//     (except t/T/u/U/x/X, whose result is not checked here) must be
+//     accepted; anything else must fail with U_ILLEGAL_ARGUMENT_ERROR.
+static void TestSetExtensionValidateOthersWellFormed() {
+    static const char* wellFormedExtensions[] = {
+        "ab",
+        "abc",
+        "abcefghi",
+        "01",
+        "234",
+        "87654321",
+        "0a-ab-87654321",
+        "87654321-ab-00-3A",
+    };
+
+    const char * aToZ = "abcdefghijklmnopqrstuvwxyz";
+    const int32_t aToZLen = (int32_t)strlen(aToZ);
+    ULocaleBuilder* bld = ulocbld_open();
+    for (int i = 0; i < UPRV_LENGTHOF(wellFormedExtensions); i++) {
+        const char* extension = wellFormedExtensions[i];
+        ulocbld_clear(bld);
+        // Derive the key from the index without modifying the loop counter,
+        // so that every table entry is exercised. With the current table
+        // size the key stays within 'a'..'h' and never reaches t, u or x.
+        char ch = aToZ[i % aToZLen];
+        UErrorCode status = U_ZERO_ERROR;
+        ulocbld_setExtension(bld, ch, extension, -1);
+        if (ulocbld_copyErrorTo(bld, &status) || U_FAILURE(status)) {
+            log_err("ulocbld_setExtension('%c', \"%s\") got Error: %s\n",
+                    ch, extension, u_errorName(status));
+        }
+    }
+
+    const char* someChars =
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789`~!@#$%^&*()-_=+;:,.<>?";
+    const int32_t someCharsLen = (int32_t)strlen(someChars);
+    for (int32_t i = 0; i < someCharsLen; i++) {
+        char ch = someChars[i];
+        // Pick some well-formed value for this key; which one does not matter.
+        const char* extension =
+            wellFormedExtensions[ch % UPRV_LENGTHOF(wellFormedExtensions)];
+        UErrorCode status = U_ZERO_ERROR;
+        ulocbld_clear(bld);
+        ulocbld_setExtension(bld, ch, extension, -1);
+        if (uprv_isASCIILetter(ch) || ('0' <= ch && ch <= '9')) {
+            if (ch != 't' && ch != 'T' && ch != 'u' && ch != 'U' && ch != 'x' && ch != 'X') {
+                if (ulocbld_copyErrorTo(bld, &status) || U_FAILURE(status)) {
+                    log_err("setExtension('%c', \"%s\") got Error: %s\n",
+                            ch, extension, u_errorName(status));
+                }
+            }
+        } else {
+            if (!ulocbld_copyErrorTo(bld, &status) || status != U_ILLEGAL_ARGUMENT_ERROR) {
+                log_err("setExtension('%c', \"%s\") should fail but has no Error\n",
+                        ch, extension);
+            }
+        }
+
+    }
+    ulocbld_close(bld);
+}
+
+// Every entry must be rejected as the value of an extension other than
+// t, u and x: ulocbld_copyErrorTo() has to report U_ILLEGAL_ARGUMENT_ERROR.
+static void TestSetExtensionValidateOthersIllFormed() {
+    static const char* illFormed[] = {
+        "0",  // Too short
+        "a",  // Too short
+        "123456789",  // Too long
+        "abcdefghi",  // Too long
+        "ab-123456789",  // Second subtag too long
+        "abcdefghi-12",  // First subtag too long
+        "a-ab-87654321",  // First subtag too short
+        "87654321-a-0-3",  // Second to fourth subtags too short
+    };
+    const char * aToZ = "abcdefghijklmnopqrstuvwxyz";
+    const int32_t aToZLen = (int32_t)strlen(aToZ);
+    ULocaleBuilder* bld = ulocbld_open();
+    for (int i = 0; i < UPRV_LENGTHOF(illFormed); i++) {
+        const char* ill = illFormed[i];
+        // Derive the key from the index without modifying the loop counter,
+        // so that every table entry is exercised.
+        char ch = aToZ[i % aToZLen];
+        ulocbld_clear(bld);
+        UErrorCode status = U_ZERO_ERROR;
+        ulocbld_setExtension(bld, ch, ill, -1);
+        if (!ulocbld_copyErrorTo(bld, &status) || status != U_ILLEGAL_ARGUMENT_ERROR) {
+            log_err("setExtension('%c', \"%s\") should fail but has no Error\n",
+                    ch, ill);
+        }
+    }
+    ulocbld_close(bld);
+}
+
+// Round trip: builds a locale ID from individually set fields, feeds that
+// ID to a second builder through ulocbld_setLocale(), and expects the second
+// builder to produce the identical ID.
+static void TestSetLocale() {
+    ULocaleBuilder* bld1 = ulocbld_open();
+    ULocaleBuilder* bld2 = ulocbld_open();
+    UErrorCode status = U_ZERO_ERROR;
+
+    ulocbld_setLanguage(bld1, "en", -1);
+    ulocbld_setScript(bld1, "Latn", -1);
+    ulocbld_setRegion(bld1, "MX", -1);
+    ulocbld_setVariant(bld1, "3456-abcde", -1);
+    ulocbld_addUnicodeLocaleAttribute(bld1, "456", -1);
+    ulocbld_addUnicodeLocaleAttribute(bld1, "123", -1);
+    ulocbld_setUnicodeLocaleKeyword(bld1, "nu", -1, "thai", -1);
+    ulocbld_setUnicodeLocaleKeyword(bld1, "co", -1, "stroke", -1);
+    ulocbld_setUnicodeLocaleKeyword(bld1, "ca", -1, "chinese", -1);
+    // Start from empty strings so the comparison and the log output below
+    // read initialized memory even if a build fails.
+    char locale1[ULOC_FULLNAME_CAPACITY] = "";
+    ulocbld_buildLocaleID(bld1, locale1, ULOC_FULLNAME_CAPACITY, &status);
+
+    if (U_FAILURE(status)) {
+        log_err("build got Error: %s\n", u_errorName(status));
+    }
+    ulocbld_setLocale(bld2, locale1, -1);
+    char locale2[ULOC_FULLNAME_CAPACITY] = "";
+    ulocbld_buildLocaleID(bld2, locale2, ULOC_FULLNAME_CAPACITY, &status);
+    if (U_FAILURE(status)) {
+        log_err("build got Error: %s\n", u_errorName(status));
+    }
+    if (strcmp(locale1, locale2) != 0) {
+        log_err("Two locales should be the same, but one is '%s' and the other is '%s'",
+                locale1, locale2);
+    }
+    ulocbld_close(bld1);
+    ulocbld_close(bld2);
+}
+
+
+// ulocbld_setLocale() with "en_US_POSIX" must replace everything set on the
+// builder before it, and the built locale ID must equal that input.
+static void TestPosixCases() {
+    UErrorCode status = U_ZERO_ERROR;
+    ULocaleBuilder* bld = ulocbld_open();
+    ulocbld_setLanguage(bld, "en", -1);
+    ulocbld_setRegion(bld, "MX", -1);
+    ulocbld_setScript(bld, "Arab", -1);
+    ulocbld_setUnicodeLocaleKeyword(bld, "nu", -1, "Thai", -1);
+    ulocbld_setExtension(bld, 'x', "1", -1);
+    // All of above should be cleared by the setLocale call.
+    const char* posix = "en_US_POSIX";
+    ulocbld_setLocale(bld, posix, -1);
+    // Start from an empty string so the comparison below reads initialized
+    // memory even if the build fails.
+    char locale[ULOC_FULLNAME_CAPACITY] = "";
+    ulocbld_buildLocaleID(bld, locale, ULOC_FULLNAME_CAPACITY, &status);
+    if (U_FAILURE(status)) {
+        log_err("build got Error: %s\n", u_errorName(status));
+    }
+    if (strcmp(posix, locale) != 0) {
+        log_err("The result locale should be the same as the one passed to setLocale %s but got %s\n",
+                posix, locale);
+    }
+    ulocbld_close(bld);
+}
+
+// Registers each test function of this file under
+// "tsutil/ulocbuildertst/<function name>" in the test tree rooted at root.
+#define TESTCASE(name) addTest(root, &name, "tsutil/ulocbuildertst/" #name)
+void addLocaleBuilderTest(TestNode** root)
+{
+    TESTCASE(TestLocaleBuilder);
+    TESTCASE(TestLocaleBuilderBasic);
+    TESTCASE(TestLocaleBuilderBasicWithExtensionsOnDefaultLocale);
+    TESTCASE(TestSetLanguageWellFormed);
+    TESTCASE(TestSetLanguageIllFormed);
+    TESTCASE(TestSetScriptWellFormed);
+    TESTCASE(TestSetScriptIllFormed);
+    TESTCASE(TestSetRegionWellFormed);
+    TESTCASE(TestSetRegionIllFormed);
+    TESTCASE(TestSetVariantWellFormed);
+    TESTCASE(TestSetVariantIllFormed);
+    TESTCASE(TestSetUnicodeLocaleKeywordWellFormed);
+    TESTCASE(TestSetUnicodeLocaleKeywordIllFormedKey);
+    TESTCASE(TestSetUnicodeLocaleKeywordIllFormedValue);
+    TESTCASE(TestAddRemoveUnicodeLocaleAttribute);
+    TESTCASE(TestAddRemoveUnicodeLocaleAttributeWellFormed);
+    TESTCASE(TestAddUnicodeLocaleAttributeIllFormed);
+    TESTCASE(TestSetExtensionU);
+    TESTCASE(TestSetExtensionValidateUWellFormed);
+    TESTCASE(TestSetExtensionValidateUIllFormed);
+    TESTCASE(TestSetExtensionT);
+    TESTCASE(TestSetExtensionValidateTWellFormed);
+    TESTCASE(TestSetExtensionValidateTIllFormed);
+    TESTCASE(TestSetExtensionPU);
+    TESTCASE(TestSetExtensionValidatePUWellFormed);
+    TESTCASE(TestSetExtensionValidatePUIllFormed);
+    TESTCASE(TestSetExtensionOthers);
+    TESTCASE(TestSetExtensionValidateOthersWellFormed);
+    TESTCASE(TestSetExtensionValidateOthersIllFormed);
+    TESTCASE(TestSetLocale);
+    TESTCASE(TestPosixCases);
+}
diff --git a/icu4c/source/test/depstest/dependencies.txt
b/icu4c/source/test/depstest/dependencies.txt index d95e4de3374e..29819db31af5 100644 --- a/icu4c/source/test/depstest/dependencies.txt +++ b/icu4c/source/test/depstest/dependencies.txt @@ -653,6 +653,7 @@ group: resourcebundle locbased.o loclikely.o localebuilder.o + ulocbuilder.o deps udata ucol_swp sort stringenumeration uhash uvector