From 80ab31a43577ab95eb3ddfac637bd792989555b1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Zbigniew=20J=C4=99drzejewski-Szmek?= <zbyszek@in.waw.pl>
Date: Tue, 1 Sep 2020 10:43:21 +0200
Subject: [PATCH] shared/utf8: add utf8_is_valid_n()

Sometimes we need to validate strings that are not NUL-terminated. Add a
variant that takes an explicit length in bytes.
---
 src/basic/utf8.c     | 18 +++++++++++-------
 src/basic/utf8.h     |  5 ++++-
 src/test/test-utf8.c | 20 ++++++++++++++++++++
 3 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/src/basic/utf8.c b/src/basic/utf8.c
index 174075be54..f0233397ef 100644
--- a/src/basic/utf8.c
+++ b/src/basic/utf8.c
@@ -150,18 +150,22 @@ bool utf8_is_printable_newline(const char* str, size_t length, bool allow_newlin
         return true;
 }
 
-char *utf8_is_valid(const char *str) {
-        const char *p;
+char *utf8_is_valid_n(const char *str, size_t len_bytes) {
+        /* Check if the string is composed of valid utf8 characters. If len_bytes is not (size_t) -1, check
+         * exactly len_bytes bytes and reject any NUL among them. Otherwise, stop at the terminating NUL. */
 
         assert(str);
 
-        p = str;
-        while (*p) {
+        for (const char *p = str; len_bytes != (size_t) -1 ? (size_t) (p - str) < len_bytes : *p != '\0'; ) {
                 int len;
 
-                len = utf8_encoded_valid_unichar(p, (size_t) -1);
-                if (len < 0)
-                        return NULL;
+                if (_unlikely_(*p == '\0') && len_bytes != (size_t) -1)
+                        return NULL; /* embedded NUL */
+
+                len = utf8_encoded_valid_unichar(p,
+                                                 len_bytes != (size_t) -1 ? len_bytes - (p - str) : (size_t) -1);
+                if (_unlikely_(len < 0))
+                        return NULL; /* invalid character */
 
                 p += len;
         }
diff --git a/src/basic/utf8.h b/src/basic/utf8.h
index 52b487955b..f315ea0f1e 100644
--- a/src/basic/utf8.h
+++ b/src/basic/utf8.h
@@ -14,7 +14,10 @@
 
 bool unichar_is_valid(char32_t c);
 
-char *utf8_is_valid(const char *s) _pure_;
+char *utf8_is_valid_n(const char *str, size_t len_bytes) _pure_;
+static inline char *utf8_is_valid(const char *s) { /* NUL-terminated variant of utf8_is_valid_n() */
+        return utf8_is_valid_n(s, (size_t) -1); /* (size_t) -1: no byte limit, stop at NUL */
+}
 char *ascii_is_valid(const char *s) _pure_;
 char *ascii_is_valid_n(const char *str, size_t len);
 
diff --git a/src/test/test-utf8.c b/src/test/test-utf8.c
index 8937f56237..66003ac13e 100644
--- a/src/test/test-utf8.c
+++ b/src/test/test-utf8.c
@@ -18,6 +18,25 @@ static void test_utf8_is_printable(void) {
         assert_se(utf8_is_printable("\t", 1));
 }
 
+static void test_utf8_n_is_valid(void) { /* exercises utf8_is_valid_n() with explicit byte lengths */
+        log_info("/* %s */", __func__);
+
+        assert_se( utf8_is_valid_n("ascii is valid unicode", 21)); /* proper prefix of the 22-byte string */
+        assert_se( utf8_is_valid_n("ascii is valid unicode", 22)); /* whole string, NUL excluded */
+        assert_se(!utf8_is_valid_n("ascii is valid unicode", 23)); /* range covers the terminating NUL */
+        assert_se( utf8_is_valid_n("\342\204\242", 0)); /* empty range */
+        assert_se(!utf8_is_valid_n("\342\204\242", 1)); /* cut inside the 3-byte sequence */
+        assert_se(!utf8_is_valid_n("\342\204\242", 2)); /* cut inside the 3-byte sequence */
+        assert_se( utf8_is_valid_n("\342\204\242", 3)); /* the complete 3-byte sequence */
+        assert_se(!utf8_is_valid_n("\342\204\242", 4)); /* range covers the terminating NUL */
+        assert_se( utf8_is_valid_n("<ZZ>", 0)); /* empty range */
+        assert_se( utf8_is_valid_n("<ZZ>", 1)); /* every prefix of a 4-byte ASCII string is accepted */
+        assert_se( utf8_is_valid_n("<ZZ>", 2));
+        assert_se( utf8_is_valid_n("<ZZ>", 3));
+        assert_se( utf8_is_valid_n("<ZZ>", 4)); /* whole string, NUL excluded */
+        assert_se(!utf8_is_valid_n("<ZZ>", 5)); /* range covers the terminating NUL */
+}
+
 static void test_utf8_is_valid(void) {
         log_info("/* %s */", __func__);
 
@@ -216,6 +235,7 @@ static void test_utf8_to_utf16(void) {
 }
 
 int main(int argc, char *argv[]) {
+        test_utf8_n_is_valid();
         test_utf8_is_valid();
         test_utf8_is_printable();
         test_ascii_is_valid();
-- 
2.25.1