[PATCH 3/3] fs: unicode: Make UTF-8 encoding loadable

Shreeya Patel Sat, 13 Mar 2021 15:14:53 -0800

utf8data.h_shipped contains a large, auto-generated decoding trie that is
used by the unicode normalization functions.
There is no need to carry this large table in the core kernel image, so
make the UTF-8 encoding loadable by converting it into a module.
Also modify unicode-core, which acts as a layer for the unicode subsystem:
it loads the UTF-8 module and accesses its functions whenever a filesystem
that needs unicode support is mounted.


Signed-off-by: Shreeya Patel <[email protected]>
---
 fs/unicode/Kconfig        |   7 +-
 fs/unicode/Makefile       |   5 +-
 fs/unicode/unicode-core.c | 201 ++++++-------------------------
 fs/unicode/utf8-core.c    | 112 +++++++++++++++++
 fs/unicode/utf8mod.c      | 246 ++++++++++++++++++++++++++++++++++++++
 include/linux/unicode.h   |  20 ++++
 6 files changed, 427 insertions(+), 164 deletions(-)
 create mode 100644 fs/unicode/utf8-core.c
 create mode 100644 fs/unicode/utf8mod.c

diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
index 2c27b9a5cd6c..33a27deef729 100644
--- a/fs/unicode/Kconfig
+++ b/fs/unicode/Kconfig
@@ -8,7 +8,12 @@ config UNICODE
          Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
          support.
 
+config UNICODE_UTF8
+       tristate "UTF-8 module"
+       depends on UNICODE
+       default m
+
 config UNICODE_NORMALIZATION_SELFTEST
        tristate "Test UTF-8 normalization support"
-       depends on UNICODE
+       depends on UNICODE_UTF8
        default n
diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
index fbf9a629ed0d..9dbb04194b32 100644
--- a/fs/unicode/Makefile
+++ b/fs/unicode/Makefile
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: GPL-2.0
 
 obj-$(CONFIG_UNICODE) += unicode.o
+obj-$(CONFIG_UNICODE_UTF8) += utf8.o
 obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
 
-unicode-y := utf8-norm.o unicode-core.o
+unicode-y := unicode-core.o
+utf8-y := utf8mod.o utf8-norm.o
 
 $(obj)/utf8-norm.o: $(obj)/utf8data.h
+$(obj)/utf8mod.o: $(obj)/utf8-norm.o
 
 # In the normal build, the checked-in utf8data.h is just shipped.
 #
diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
index d5f09e022ac5..b832341f1e7b 100644
--- a/fs/unicode/unicode-core.c
+++ b/fs/unicode/unicode-core.c
@@ -7,70 +7,29 @@
 #include <linux/errno.h>
 #include <linux/unicode.h>
 #include <linux/stringhash.h>
+#include <linux/delay.h>
 
-#include "utf8n.h"
+struct unicode_ops *utf8_ops;
+
+static int unicode_load_module(void);
 
 int unicode_validate(const struct unicode_map *um, const struct qstr *str)
 {
-       const struct utf8data *data = utf8nfdi(um->version);
-
-       if (utf8nlen(data, str->name, str->len) < 0)
-               return -1;
-       return 0;
+       return utf8_ops->validate(um, str);
 }
 EXPORT_SYMBOL(unicode_validate);
 
 int unicode_strncmp(const struct unicode_map *um,
                    const struct qstr *s1, const struct qstr *s2)
 {
-       const struct utf8data *data = utf8nfdi(um->version);
-       struct utf8cursor cur1, cur2;
-       int c1, c2;
-
-       if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
-               return -EINVAL;
-
-       if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
-               return -EINVAL;
-
-       do {
-               c1 = utf8byte(&cur1);
-               c2 = utf8byte(&cur2);
-
-               if (c1 < 0 || c2 < 0)
-                       return -EINVAL;
-               if (c1 != c2)
-                       return 1;
-       } while (c1);
-
-       return 0;
+       return utf8_ops->strncmp(um, s1, s2);
 }
 EXPORT_SYMBOL(unicode_strncmp);
 
 int unicode_strncasecmp(const struct unicode_map *um,
                        const struct qstr *s1, const struct qstr *s2)
 {
-       const struct utf8data *data = utf8nfdicf(um->version);
-       struct utf8cursor cur1, cur2;
-       int c1, c2;
-
-       if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
-               return -EINVAL;
-
-       if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
-               return -EINVAL;
-
-       do {
-               c1 = utf8byte(&cur1);
-               c2 = utf8byte(&cur2);
-
-               if (c1 < 0 || c2 < 0)
-                       return -EINVAL;
-               if (c1 != c2)
-                       return 1;
-       } while (c1);
-
-       return 0;
+       return utf8_ops->strncasecmp(um, s1, s2);
 }
 EXPORT_SYMBOL(unicode_strncasecmp);
 
@@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct unicode_map *um,
                               const struct qstr *cf,
                               const struct qstr *s1)
 {
-       const struct utf8data *data = utf8nfdicf(um->version);
-       struct utf8cursor cur1;
-       int c1, c2;
-       int i = 0;
-
-       if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
-               return -EINVAL;
-
-       do {
-               c1 = utf8byte(&cur1);
-               c2 = cf->name[i++];
-               if (c1 < 0)
-                       return -EINVAL;
-               if (c1 != c2)
-                       return 1;
-       } while (c1);
-
-       return 0;
+       return utf8_ops->strncasecmp_folded(um, cf, s1);
 }
 EXPORT_SYMBOL(unicode_strncasecmp_folded);
 
 int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
                     unsigned char *dest, size_t dlen)
 {
-       const struct utf8data *data = utf8nfdicf(um->version);
-       struct utf8cursor cur;
-       size_t nlen = 0;
-
-       if (utf8ncursor(&cur, data, str->name, str->len) < 0)
-               return -EINVAL;
-
-       for (nlen = 0; nlen < dlen; nlen++) {
-               int c = utf8byte(&cur);
-
-               dest[nlen] = c;
-               if (!c)
-                       return nlen;
-               if (c == -1)
-                       break;
-       }
-       return -EINVAL;
+       return utf8_ops->casefold(um, str, dest, dlen);
 }
 EXPORT_SYMBOL(unicode_casefold);
 
 int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
                          struct qstr *str)
 {
-       const struct utf8data *data = utf8nfdicf(um->version);
-       struct utf8cursor cur;
-       int c;
-       unsigned long hash = init_name_hash(salt);
-
-       if (utf8ncursor(&cur, data, str->name, str->len) < 0)
-               return -EINVAL;
-
-       while ((c = utf8byte(&cur))) {
-               if (c < 0)
-                       return -EINVAL;
-               hash = partial_name_hash((unsigned char)c, hash);
-       }
-       str->hash = end_name_hash(hash);
-       return 0;
+       return utf8_ops->casefold_hash(um, salt, str);
 }
 EXPORT_SYMBOL(unicode_casefold_hash);
 
 int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
                      unsigned char *dest, size_t dlen)
 {
-       const struct utf8data *data = utf8nfdi(um->version);
-       struct utf8cursor cur;
-       ssize_t nlen = 0;
+       return utf8_ops->normalize(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_normalize);
 
-       if (utf8ncursor(&cur, data, str->name, str->len) < 0)
-               return -EINVAL;
+struct unicode_map *unicode_load(const char *version)
+{
+       int ret = unicode_load_module();
 
-       for (nlen = 0; nlen < dlen; nlen++) {
-               int c = utf8byte(&cur);
+       if (ret)
+               return ERR_PTR(ret);
 
-               dest[nlen] = c;
-               if (!c)
-                       return nlen;
-               if (c == -1)
-                       break;
-       }
-       return -EINVAL;
+       /* Loading succeeded, but the module may not have registered its ops. */
+       return utf8_ops ? utf8_ops->load(version) : ERR_PTR(-ENODEV);
 }
-EXPORT_SYMBOL(unicode_normalize);
+EXPORT_SYMBOL(unicode_load);
 
-static int unicode_parse_version(const char *version, unsigned int *maj,
-                                unsigned int *min, unsigned int *rev)
+void unicode_unload(struct unicode_map *um)
 {
-       substring_t args[3];
-       char version_string[12];
-       static const struct match_token token[] = {
-               {1, "%d.%d.%d"},
-               {0, NULL}
-       };
+       kfree(um);
+}
+EXPORT_SYMBOL(unicode_unload);
 
-       strncpy(version_string, version, sizeof(version_string));
+static int unicode_load_module(void)
+{
+       int ret = utf8_ops ? 0 : request_module("utf8"); /* skip if registered */
 
-       if (match_token(version_string, token, args) != 1)
-               return -EINVAL;
+       /* A positive value is modprobe's exit status; callers need an errno. */
 
-       if (match_int(&args[0], maj) || match_int(&args[1], min) ||
-           match_int(&args[2], rev))
-               return -EINVAL;
+       if (ret) {
+               pr_err("Failed to load UTF-8 module\n");
+               return ret < 0 ? ret : -ENODEV;
+       }
 
        return 0;
 }
 
-struct unicode_map *unicode_load(const char *version)
+void unicode_register(struct unicode_ops *ops)
 {
-       struct unicode_map *um = NULL;
-       int unicode_version;
-
-       if (version) {
-               unsigned int maj, min, rev;
-
-               if (unicode_parse_version(version, &maj, &min, &rev) < 0)
-                       return ERR_PTR(-EINVAL);
-
-               if (!utf8version_is_supported(maj, min, rev))
-                       return ERR_PTR(-EINVAL);
-
-               unicode_version = UNICODE_AGE(maj, min, rev);
-       } else {
-               unicode_version = utf8version_latest();
-               printk(KERN_WARNING"UTF-8 version not specified. "
-                      "Assuming latest supported version (%d.%d.%d).",
-                      (unicode_version >> 16) & 0xff,
-                      (unicode_version >> 8) & 0xff,
-                      (unicode_version & 0xff));
-       }
-
-       um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
-       if (!um)
-               return ERR_PTR(-ENOMEM);
-
-       um->charset = "UTF-8";
-       um->version = unicode_version;
-
-       return um;
+       utf8_ops = ops;
 }
-EXPORT_SYMBOL(unicode_load);
+EXPORT_SYMBOL(unicode_register);
 
-void unicode_unload(struct unicode_map *um)
+void unicode_unregister(void)
 {
-       kfree(um);
+       utf8_ops = NULL;
 }
-EXPORT_SYMBOL(unicode_unload);
+EXPORT_SYMBOL(unicode_unregister);
 
 MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
new file mode 100644
index 000000000000..009faa68330c
--- /dev/null
+++ b/fs/unicode/utf8-core.c
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+#include <linux/delay.h>
+
+struct unicode_ops *utf8_ops;
+
+static int unicode_load_module(void);
+
+int unicode_validate(const struct unicode_map *um, const struct qstr *str)
+{
+       return utf8_ops->validate(um, str);
+}
+EXPORT_SYMBOL(unicode_validate);
+
+int unicode_strncmp(const struct unicode_map *um,
+                   const struct qstr *s1, const struct qstr *s2)
+{
+       return utf8_ops->strncmp(um, s1, s2);
+}
+EXPORT_SYMBOL(unicode_strncmp);
+
+int unicode_strncasecmp(const struct unicode_map *um,
+                       const struct qstr *s1, const struct qstr *s2)
+{
+       return utf8_ops->strncasecmp(um, s1, s2);
+}
+EXPORT_SYMBOL(unicode_strncasecmp);
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+int unicode_strncasecmp_folded(const struct unicode_map *um,
+                              const struct qstr *cf,
+                              const struct qstr *s1)
+{
+       return utf8_ops->strncasecmp_folded(um, cf, s1);
+}
+EXPORT_SYMBOL(unicode_strncasecmp_folded);
+
+int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
+                    unsigned char *dest, size_t dlen)
+{
+       return utf8_ops->casefold(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_casefold);
+
+int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
+                         struct qstr *str)
+{
+       return utf8_ops->casefold_hash(um, salt, str);
+}
+EXPORT_SYMBOL(unicode_casefold_hash);
+
+int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
+                     unsigned char *dest, size_t dlen)
+{
+       return utf8_ops->normalize(um, str, dest, dlen);
+}
+EXPORT_SYMBOL(unicode_normalize);
+
+struct unicode_map *unicode_load(const char *version)
+{
+       int ret = unicode_load_module();
+
+       if (ret)
+               return ERR_PTR(ret);
+
+       /* Loading succeeded, but the module may not have registered its ops. */
+       return utf8_ops ? utf8_ops->load(version) : ERR_PTR(-ENODEV);
+}
+EXPORT_SYMBOL(unicode_load);
+
+void unicode_unload(struct unicode_map *um)
+{
+       kfree(um);
+}
+EXPORT_SYMBOL(unicode_unload);
+
+void unicode_register(struct unicode_ops *ops)
+{
+       utf8_ops = ops;
+}
+EXPORT_SYMBOL(unicode_register);
+
+void unicode_unregister(void)
+{
+       utf8_ops = NULL;
+}
+EXPORT_SYMBOL(unicode_unregister);
+
+static int unicode_load_module(void)
+{
+       int ret = utf8_ops ? 0 : request_module("utf8"); /* skip if registered */
+
+       /* A positive value is modprobe's exit status; callers need an errno. */
+
+       if (ret) {
+               pr_err("Failed to load UTF-8 module\n");
+               return ret < 0 ? ret : -ENODEV;
+       }
+
+       return 0;
+}
+
+MODULE_LICENSE("GPL v2");
diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
new file mode 100644
index 000000000000..8eaeeb27255c
--- /dev/null
+++ b/fs/unicode/utf8mod.c
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/parser.h>
+#include <linux/errno.h>
+#include <linux/unicode.h>
+#include <linux/stringhash.h>
+
+#include "utf8n.h"
+
+static int utf8_validate(const struct unicode_map *um, const struct qstr *str)
+{
+       const struct utf8data *data = utf8nfdi(um->version);
+
+       if (utf8nlen(data, str->name, str->len) < 0)
+               return -1;
+       return 0;
+}
+
+static int utf8_strncmp(const struct unicode_map *um,
+                       const struct qstr *s1, const struct qstr *s2)
+{
+       const struct utf8data *data = utf8nfdi(um->version);
+       struct utf8cursor cur1, cur2;
+       int c1, c2;
+
+       if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+               return -EINVAL;
+
+       if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+               return -EINVAL;
+
+       do {
+               c1 = utf8byte(&cur1);
+               c2 = utf8byte(&cur2);
+
+               if (c1 < 0 || c2 < 0)
+                       return -EINVAL;
+               if (c1 != c2)
+                       return 1;
+       } while (c1);
+
+       return 0;
+}
+
+static int utf8_strncasecmp(const struct unicode_map *um,
+                           const struct qstr *s1, const struct qstr *s2)
+{
+       const struct utf8data *data = utf8nfdicf(um->version);
+       struct utf8cursor cur1, cur2;
+       int c1, c2;
+
+       if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+               return -EINVAL;
+
+       if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
+               return -EINVAL;
+
+       do {
+               c1 = utf8byte(&cur1);
+               c2 = utf8byte(&cur2);
+
+               if (c1 < 0 || c2 < 0)
+                       return -EINVAL;
+               if (c1 != c2)
+                       return 1;
+       } while (c1);
+
+       return 0;
+}
+
+/* String cf is expected to be a valid UTF-8 casefolded
+ * string.
+ */
+static int utf8_strncasecmp_folded(const struct unicode_map *um,
+                                  const struct qstr *cf,
+                                  const struct qstr *s1)
+{
+       const struct utf8data *data = utf8nfdicf(um->version);
+       struct utf8cursor cur1;
+       int c1, c2;
+       int i = 0;
+
+       if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+               return -EINVAL;
+
+       do {
+               c1 = utf8byte(&cur1);
+               c2 = cf->name[i++];
+               if (c1 < 0)
+                       return -EINVAL;
+               if (c1 != c2)
+                       return 1;
+       } while (c1);
+
+       return 0;
+}
+
+static int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
+                        unsigned char *dest, size_t dlen)
+{
+       const struct utf8data *data = utf8nfdicf(um->version);
+       struct utf8cursor cur;
+       size_t nlen = 0;
+
+       if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+               return -EINVAL;
+
+       for (nlen = 0; nlen < dlen; nlen++) {
+               int c = utf8byte(&cur);
+
+               dest[nlen] = c;
+               if (!c)
+                       return nlen;
+               if (c == -1)
+                       break;
+       }
+       return -EINVAL;
+}
+
+static int utf8_casefold_hash(const struct unicode_map *um, const void *salt,
+                             struct qstr *str)
+{
+       const struct utf8data *data = utf8nfdicf(um->version);
+       struct utf8cursor cur;
+       int c;
+       unsigned long hash = init_name_hash(salt);
+
+       if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+               return -EINVAL;
+
+       while ((c = utf8byte(&cur))) {
+               if (c < 0)
+                       return -EINVAL;
+               hash = partial_name_hash((unsigned char)c, hash);
+       }
+       str->hash = end_name_hash(hash);
+       return 0;
+}
+
+static int utf8_normalize(const struct unicode_map *um, const struct qstr *str,
+                         unsigned char *dest, size_t dlen)
+{
+       const struct utf8data *data = utf8nfdi(um->version);
+       struct utf8cursor cur;
+       ssize_t nlen = 0;
+
+       if (utf8ncursor(&cur, data, str->name, str->len) < 0)
+               return -EINVAL;
+
+       for (nlen = 0; nlen < dlen; nlen++) {
+               int c = utf8byte(&cur);
+
+               dest[nlen] = c;
+               if (!c)
+                       return nlen;
+               if (c == -1)
+                       break;
+       }
+       return -EINVAL;
+}
+
+static int utf8_parse_version(const char *version, unsigned int *maj,
+                             unsigned int *min, unsigned int *rev)
+{
+       substring_t args[3];
+       char version_string[12];
+       static const struct match_token token[] = {
+               {1, "%d.%d.%d"},        /* major.minor.revision */
+               {0, NULL}
+       };
+
+       /* Bounded copy; unlike strncpy() the result is always NUL-terminated. */
+       strscpy(version_string, version, sizeof(version_string));
+       if (match_token(version_string, token, args) != 1)
+               return -EINVAL;
+
+       if (match_int(&args[0], maj) || match_int(&args[1], min) ||
+           match_int(&args[2], rev))
+               return -EINVAL;
+
+       return 0;
+}
+
+static struct unicode_map *utf8_load(const char *version)
+{
+       struct unicode_map *um;
+       int unicode_version;
+
+       if (version) {
+               unsigned int maj, min, rev;
+
+               if (utf8_parse_version(version, &maj, &min, &rev) < 0)
+                       return ERR_PTR(-EINVAL);
+
+               if (!utf8version_is_supported(maj, min, rev))
+                       return ERR_PTR(-EINVAL);
+
+               unicode_version = UNICODE_AGE(maj, min, rev);
+       } else {
+               unicode_version = utf8version_latest();
+               pr_warn("UTF-8 version not specified. "
+                       "Assuming latest supported version (%d.%d.%d).\n",
+                       (unicode_version >> 16) & 0xff,
+                       (unicode_version >> 8) & 0xff,
+                       (unicode_version & 0xff));
+       }
+
+       /* Zeroed map describing the chosen version; ERR_PTR() on failure. */
+       um = kzalloc(sizeof(*um), GFP_KERNEL);
+       if (!um)
+               return ERR_PTR(-ENOMEM);
+       um->charset = "UTF-8";
+       um->version = unicode_version;
+
+       return um;
+}
+
+static struct unicode_ops ops = {
+       .validate = utf8_validate,
+       .strncmp = utf8_strncmp,
+       .strncasecmp = utf8_strncasecmp,
+       .strncasecmp_folded = utf8_strncasecmp_folded,
+       .casefold = utf8_casefold,
+       .casefold_hash = utf8_casefold_hash,
+       .normalize = utf8_normalize,
+       .load = utf8_load,
+};
+
+static int __init utf8_init(void)
+{
+       unicode_register(&ops);
+       return 0;
+}
+
+static void __exit utf8_exit(void)
+{
+       unicode_unregister();
+}
+
+module_init(utf8_init);
+module_exit(utf8_exit);
+
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/unicode.h b/include/linux/unicode.h
index de23f9ee720b..b0d59069e438 100644
--- a/include/linux/unicode.h
+++ b/include/linux/unicode.h
@@ -10,6 +10,23 @@ struct unicode_map {
        int version;
 };
 
+struct unicode_ops {   /* encoding callbacks, set via unicode_register() */
+       int (*validate)(const struct unicode_map *um, const struct qstr *str);
+       int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
+                      const struct qstr *s2);
+       int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
+                          const struct qstr *s2);
+       int (*strncasecmp_folded)(const struct unicode_map *um,
+                                 const struct qstr *cf, const struct qstr *s1);
+       int (*normalize)(const struct unicode_map *um, const struct qstr *str,
+                        unsigned char *dest, size_t dlen);
+       int (*casefold)(const struct unicode_map *um, const struct qstr *str,
+                       unsigned char *dest, size_t dlen);
+       int (*casefold_hash)(const struct unicode_map *um, const void *salt,
+                            struct qstr *str);
+       struct unicode_map *(*load)(const char *version);
+};
+
 int unicode_validate(const struct unicode_map *um, const struct qstr *str);
 
 int unicode_strncmp(const struct unicode_map *um,
@@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
 struct unicode_map *unicode_load(const char *version);
 void unicode_unload(struct unicode_map *um);
 
+void unicode_register(struct unicode_ops *ops);
+void unicode_unregister(void);
+
 #endif /* _LINUX_UNICODE_H */
-- 
2.30.1

[PATCH 3/3] fs: unicode: Make UTF-8 encoding loadable

Reply via email to