From 80ab31a43577ab95eb3ddfac637bd792989555b1 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= Date: Tue, 1 Sep 2020 10:43:21 +0200 Subject: [PATCH] shared/utf8: add utf8_is_valid_n() Sometimes we need to check strings without the terminating NUL. Add a variant that does that. --- src/basic/utf8.c | 18 +++++++++++------- src/basic/utf8.h | 5 ++++- src/test/test-utf8.c | 20 ++++++++++++++++++++ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/src/basic/utf8.c b/src/basic/utf8.c index 174075be54..f0233397ef 100644 --- a/src/basic/utf8.c +++ b/src/basic/utf8.c @@ -150,18 +150,22 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newlin return true; } -char *utf8_is_valid(const char *str) { - const char *p; +char *utf8_is_valid_n(const char *str, size_t len_bytes) { + /* Check if the string is composed of valid utf8 characters. If length len_bytes is given, stop after + * len_bytes. Otherwise, stop at NUL. */ assert(str); - p = str; - while (*p) { + for (const char *p = str; len_bytes != (size_t) -1 ? (size_t) (p - str) < len_bytes : *p != '\0'; ) { int len; - len = utf8_encoded_valid_unichar(p, (size_t) -1); - if (len < 0) - return NULL; + if (_unlikely_(*p == '\0') && len_bytes != (size_t) -1) + return NULL; /* embedded NUL */ + + len = utf8_encoded_valid_unichar(p, + len_bytes != (size_t) -1 ? 
len_bytes - (p - str) : (size_t) -1); + if (_unlikely_(len < 0)) + return NULL; /* invalid character */ p += len; } diff --git a/src/basic/utf8.h b/src/basic/utf8.h index 52b487955b..f315ea0f1e 100644 --- a/src/basic/utf8.h +++ b/src/basic/utf8.h @@ -14,7 +14,10 @@ bool unichar_is_valid(char32_t c); -char *utf8_is_valid(const char *s) _pure_; +char *utf8_is_valid_n(const char *str, size_t len_bytes) _pure_; +static inline char *utf8_is_valid(const char *s) { + return utf8_is_valid_n(s, (size_t) -1); +} char *ascii_is_valid(const char *s) _pure_; char *ascii_is_valid_n(const char *str, size_t len); diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c index 8937f56237..66003ac13e 100644 --- a/src/test/test-utf8.c +++ b/src/test/test-utf8.c @@ -18,6 +18,25 @@ static void test_utf8_is_printable(void) { assert_se(utf8_is_printable("\t", 1)); } +static void test_utf8_n_is_valid(void) { + log_info("/* %s */", __func__); + + assert_se( utf8_is_valid_n("ascii is valid unicode", 21)); + assert_se( utf8_is_valid_n("ascii is valid unicode", 22)); + assert_se(!utf8_is_valid_n("ascii is valid unicode", 23)); + assert_se( utf8_is_valid_n("\342\204\242", 0)); + assert_se(!utf8_is_valid_n("\342\204\242", 1)); + assert_se(!utf8_is_valid_n("\342\204\242", 2)); + assert_se( utf8_is_valid_n("\342\204\242", 3)); + assert_se(!utf8_is_valid_n("\342\204\242", 4)); + assert_se( utf8_is_valid_n("<ZZ>", 0)); + assert_se( utf8_is_valid_n("<ZZ>", 1)); + assert_se( utf8_is_valid_n("<ZZ>", 2)); + assert_se( utf8_is_valid_n("<ZZ>", 3)); + assert_se( utf8_is_valid_n("<ZZ>", 4)); + assert_se(!utf8_is_valid_n("<ZZ>", 5)); +} + static void test_utf8_is_valid(void) { log_info("/* %s */", __func__); @@ -216,6 +235,7 @@ static void test_utf8_to_utf16(void) { } int main(int argc, char *argv[]) { + test_utf8_n_is_valid(); test_utf8_is_valid(); test_utf8_is_printable(); test_ascii_is_valid(); -- 2.25.1