Re: [PATCH 3/3] fs: unicode: Add utf8 module and a unicode layer

Gabriel Krisman Bertazi Sun, 14 Mar 2021 21:22:58 -0700

Shreeya Patel <[email protected]> writes:

> On 14/03/21 7:19 am, Gabriel Krisman Bertazi wrote:
>> Shreeya Patel <[email protected]> writes:
>>
>>> utf8data.h_shipped has a large database table which is an auto-generated
>>> decodification trie for the unicode normalization functions.
>>> It is not necessary to carry this large table in the kernel hence make
>>> UTF-8 encoding loadable by converting it into a module.
>>> Also, modify the file called unicode-core which will act as a layer for
>>> unicode subsystem. It will load the UTF-8 module and access it's functions
>>> whenever any filesystem that needs unicode is mounted.
>>>
>>> Signed-off-by: Shreeya Patel <[email protected]>
>> Hi Shreeya,
> Hi Gabriel,
>>
>>> ---
>>>   fs/unicode/Kconfig        |   7 +-
>>>   fs/unicode/Makefile       |   5 +-
>>>   fs/unicode/unicode-core.c | 201 ++++++-------------------------
>>>   fs/unicode/utf8-core.c    | 112 +++++++++++++++++
>>>   fs/unicode/utf8mod.c      | 246 ++++++++++++++++++++++++++++++++++++++
>>>   include/linux/unicode.h   |  20 ++++
>>>   6 files changed, 427 insertions(+), 164 deletions(-)
>>>   create mode 100644 fs/unicode/utf8-core.c
>>>   create mode 100644 fs/unicode/utf8mod.c
>>>
>>> diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig
>>> index 2c27b9a5cd6c..33a27deef729 100644
>>> --- a/fs/unicode/Kconfig
>>> +++ b/fs/unicode/Kconfig
>>> @@ -8,7 +8,12 @@ config UNICODE
>>>       Say Y here to enable UTF-8 NFD normalization and NFD+CF casefolding
>>>       support.
>>>   +config UNICODE_UTF8
>>> +   tristate "UTF-8 module"
>>> +   depends on UNICODE
>>> +   default m
>>> +
>>>   config UNICODE_NORMALIZATION_SELFTEST
>>>     tristate "Test UTF-8 normalization support"
>>> -   depends on UNICODE
>>> +   depends on UNICODE_UTF8
>>>     default n
>>> diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile
>>> index fbf9a629ed0d..9dbb04194b32 100644
>>> --- a/fs/unicode/Makefile
>>> +++ b/fs/unicode/Makefile
>>> @@ -1,11 +1,14 @@
>>>   # SPDX-License-Identifier: GPL-2.0
>>>     obj-$(CONFIG_UNICODE) += unicode.o
>>> +obj-$(CONFIG_UNICODE_UTF8) += utf8.o
>>>   obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o
>>>   -unicode-y := utf8-norm.o unicode-core.o
>>> +unicode-y := unicode-core.o
>>> +utf8-y := utf8mod.o utf8-norm.o
>>>     $(obj)/utf8-norm.o: $(obj)/utf8data.h
>>> +$(obj)/utf8mod.o: $(obj)/utf8-norm.o
>>>     # In the normal build, the checked-in utf8data.h is just shipped.
>>>   #
>>> diff --git a/fs/unicode/unicode-core.c b/fs/unicode/unicode-core.c
>>> index d5f09e022ac5..b832341f1e7b 100644
>>> --- a/fs/unicode/unicode-core.c
>>> +++ b/fs/unicode/unicode-core.c
>>> @@ -7,70 +7,29 @@
>>>   #include <linux/errno.h>
>>>   #include <linux/unicode.h>
>>>   #include <linux/stringhash.h>
>>> +#include <linux/delay.h>
>>>   -#include "utf8n.h"
>>> +struct unicode_ops *utf8_ops;
>>> +
>>> +static int unicode_load_module(void);
>> This is unnecessary
>>>     int unicode_validate(const struct unicode_map *um, const struct
>>> qstr *str)
>>>   {
>>> -   const struct utf8data *data = utf8nfdi(um->version);
>>> -
>>> -   if (utf8nlen(data, str->name, str->len) < 0)
>>> -           return -1;
>>> -   return 0;
>>> +   return utf8_ops->validate(um, str);
>>>   }
>>>   EXPORT_SYMBOL(unicode_validate);
>>>     int unicode_strncmp(const struct unicode_map *um,
>>>                 const struct qstr *s1, const struct qstr *s2)
>>>   {
>>> -   const struct utf8data *data = utf8nfdi(um->version);
>>> -   struct utf8cursor cur1, cur2;
>>> -   int c1, c2;
>>> -
>>> -   if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> -           return -EINVAL;
>>> -
>>> -   if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>>> -           return -EINVAL;
>>> -
>>> -   do {
>>> -           c1 = utf8byte(&cur1);
>>> -           c2 = utf8byte(&cur2);
>>> -
>>> -           if (c1 < 0 || c2 < 0)
>>> -                   return -EINVAL;
>>> -           if (c1 != c2)
>>> -                   return 1;
>>> -   } while (c1);
>>> -
>>> -   return 0;
>>> +   return utf8_ops->strncmp(um, s1, s2);
>>>   }
>> I think these would go on a header file and  inlined.
>>
>>>   EXPORT_SYMBOL(unicode_strncmp);
>>>     int unicode_strncasecmp(const struct unicode_map *um,
>>>                     const struct qstr *s1, const struct qstr *s2)
>>>   {
>>> -   const struct utf8data *data = utf8nfdicf(um->version);
>>> -   struct utf8cursor cur1, cur2;
>>> -   int c1, c2;
>>> -
>>> -   if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> -           return -EINVAL;
>>> -
>>> -   if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>>> -           return -EINVAL;
>>> -
>>> -   do {
>>> -           c1 = utf8byte(&cur1);
>>> -           c2 = utf8byte(&cur2);
>>> -
>>> -           if (c1 < 0 || c2 < 0)
>>> -                   return -EINVAL;
>>> -           if (c1 != c2)
>>> -                   return 1;
>>> -   } while (c1);
>>> -
>>> -   return 0;
>>> +   return utf8_ops->strncasecmp(um, s1, s2);
>>>   }
>>>   EXPORT_SYMBOL(unicode_strncasecmp);
>>>   @@ -81,155 +40,73 @@ int unicode_strncasecmp_folded(const struct
>>> unicode_map *um,
>>>                            const struct qstr *cf,
>>>                            const struct qstr *s1)
>>>   {
>>> -   const struct utf8data *data = utf8nfdicf(um->version);
>>> -   struct utf8cursor cur1;
>>> -   int c1, c2;
>>> -   int i = 0;
>>> -
>>> -   if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> -           return -EINVAL;
>>> -
>>> -   do {
>>> -           c1 = utf8byte(&cur1);
>>> -           c2 = cf->name[i++];
>>> -           if (c1 < 0)
>>> -                   return -EINVAL;
>>> -           if (c1 != c2)
>>> -                   return 1;
>>> -   } while (c1);
>>> -
>>> -   return 0;
>>> +   return utf8_ops->strncasecmp_folded(um, cf, s1);
>>>   }
>>>   EXPORT_SYMBOL(unicode_strncasecmp_folded);
>>>     int unicode_casefold(const struct unicode_map *um, const struct
>>> qstr *str,
>>>                  unsigned char *dest, size_t dlen)
>>>   {
>>> -   const struct utf8data *data = utf8nfdicf(um->version);
>>> -   struct utf8cursor cur;
>>> -   size_t nlen = 0;
>>> -
>>> -   if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> -           return -EINVAL;
>>> -
>>> -   for (nlen = 0; nlen < dlen; nlen++) {
>>> -           int c = utf8byte(&cur);
>>> -
>>> -           dest[nlen] = c;
>>> -           if (!c)
>>> -                   return nlen;
>>> -           if (c == -1)
>>> -                   break;
>>> -   }
>>> -   return -EINVAL;
>>> +   return utf8_ops->casefold(um, str, dest, dlen);
>>>   }
>>>   EXPORT_SYMBOL(unicode_casefold);
>>>     int unicode_casefold_hash(const struct unicode_map *um, const
>>> void *salt,
>>>                       struct qstr *str)
>>>   {
>>> -   const struct utf8data *data = utf8nfdicf(um->version);
>>> -   struct utf8cursor cur;
>>> -   int c;
>>> -   unsigned long hash = init_name_hash(salt);
>>> -
>>> -   if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> -           return -EINVAL;
>>> -
>>> -   while ((c = utf8byte(&cur))) {
>>> -           if (c < 0)
>>> -                   return -EINVAL;
>>> -           hash = partial_name_hash((unsigned char)c, hash);
>>> -   }
>>> -   str->hash = end_name_hash(hash);
>>> -   return 0;
>>> +   return utf8_ops->casefold_hash(um, salt, str);
>>>   }
>>>   EXPORT_SYMBOL(unicode_casefold_hash);
>>>     int unicode_normalize(const struct unicode_map *um, const struct
>>> qstr *str,
>>>                   unsigned char *dest, size_t dlen)
>>>   {
>>> -   const struct utf8data *data = utf8nfdi(um->version);
>>> -   struct utf8cursor cur;
>>> -   ssize_t nlen = 0;
>>> +   return utf8_ops->normalize(um, str, dest, dlen);
>>> +}
>>> +EXPORT_SYMBOL(unicode_normalize);
>>>   - if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> -           return -EINVAL;
>>> +struct unicode_map *unicode_load(const char *version)
>>> +{
>>> +   int ret = unicode_load_module();
>>>   - for (nlen = 0; nlen < dlen; nlen++) {
>>> -           int c = utf8byte(&cur);
>>> +   if (ret)
>>> +           return ERR_PTR(ret);
>>>   -         dest[nlen] = c;
>>> -           if (!c)
>>> -                   return nlen;
>>> -           if (c == -1)
>>> -                   break;
>>> -   }
>>> -   return -EINVAL;
>>> +   else
>>> +           return utf8_ops->load(version);
>>>   }
>>> -EXPORT_SYMBOL(unicode_normalize);
>>> +EXPORT_SYMBOL(unicode_load);
>>>   -static int unicode_parse_version(const char *version, unsigned int
>>> *maj,
>>> -                            unsigned int *min, unsigned int *rev)
>>> +void unicode_unload(struct unicode_map *um)
>>>   {
>>> -   substring_t args[3];
>>> -   char version_string[12];
>>> -   static const struct match_token token[] = {
>>> -           {1, "%d.%d.%d"},
>>> -           {0, NULL}
>>> -   };
>>> +   kfree(um);
>>> +}
>>> +EXPORT_SYMBOL(unicode_unload);
>>>   - strncpy(version_string, version, sizeof(version_string));
>>> +static int unicode_load_module(void)
>>> +{
>>> +   int ret = request_module("utf8");
>>>   - if (match_token(version_string, token, args) != 1)
>>> -           return -EINVAL;
>>> +   msleep(100);
>> I think I misunderstood when you mentioned you did this msleep.  It was
>> ok to debug the issue you were observing, but it is not a solution.
>> Setting an arbitrary amount of time will either waste time, or you can
>> still fail if things take longer than expected.  There are mechanisms to
>> load and wait on a module.  See how fs/nls/nls_base.c do exactly this.
>>
>>> -   if (match_int(&args[0], maj) || match_int(&args[1], min) ||
>>> -       match_int(&args[2], rev))
>>> -           return -EINVAL;
>>> +   if (ret) {
>>> +           pr_err("Failed to load UTF-8 module\n");
>>> +           return ret;
>>> +   }
>>>             return 0;
>>>   }
>>>   -struct unicode_map *unicode_load(const char *version)
>>> +void unicode_register(struct unicode_ops *ops)
>>>   {
>>> -   struct unicode_map *um = NULL;
>>> -   int unicode_version;
>>> -
>>> -   if (version) {
>>> -           unsigned int maj, min, rev;
>>> -
>>> -           if (unicode_parse_version(version, &maj, &min, &rev) < 0)
>>> -                   return ERR_PTR(-EINVAL);
>>> -
>>> -           if (!utf8version_is_supported(maj, min, rev))
>>> -                   return ERR_PTR(-EINVAL);
>>> -
>>> -           unicode_version = UNICODE_AGE(maj, min, rev);
>>> -   } else {
>>> -           unicode_version = utf8version_latest();
>>> -           printk(KERN_WARNING"UTF-8 version not specified. "
>>> -                  "Assuming latest supported version (%d.%d.%d).",
>>> -                  (unicode_version >> 16) & 0xff,
>>> -                  (unicode_version >> 8) & 0xff,
>>> -                  (unicode_version & 0xff));
>>> -   }
>>> -
>>> -   um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
>>> -   if (!um)
>>> -           return ERR_PTR(-ENOMEM);
>>> -
>>> -   um->charset = "UTF-8";
>>> -   um->version = unicode_version;
>>> -
>>> -   return um;
>>> +   utf8_ops = ops;
>>>   }
>>> -EXPORT_SYMBOL(unicode_load);
>>> +EXPORT_SYMBOL(unicode_register);
>>>   -void unicode_unload(struct unicode_map *um)
>>> +void unicode_unregister(void)
>>>   {
>>> -   kfree(um);
>>> +   utf8_ops = NULL;
>>>   }
>>> -EXPORT_SYMBOL(unicode_unload);
>>> +EXPORT_SYMBOL(unicode_unregister);
>>>     MODULE_LICENSE("GPL v2");
>>> diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
>>> new file mode 100644
>>> index 000000000000..009faa68330c
>>> --- /dev/null
>>> +++ b/fs/unicode/utf8-core.c
>>> @@ -0,0 +1,112 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#include <linux/module.h>
>>> +#include <linux/kernel.h>
>>> +#include <linux/string.h>
>>> +#include <linux/slab.h>
>>> +#include <linux/parser.h>
>>> +#include <linux/errno.h>
>>> +#include <linux/unicode.h>
>>> +#include <linux/stringhash.h>
>>> +#include <linux/delay.h>
>>> +
>>> +struct unicode_ops *utf8_ops;
>>> +
>>> +static int unicode_load_module(void);
>>> +
>>> +int unicode_validate(const struct unicode_map *um, const struct qstr *str)
>>> +{
>>> +   return utf8_ops->validate(um, str);
>>> +}
>>> +EXPORT_SYMBOL(unicode_validate);
>>> +
>>> +int unicode_strncmp(const struct unicode_map *um,
>>> +               const struct qstr *s1, const struct qstr *s2)
>>> +{
>>> +   return utf8_ops->strncmp(um, s1, s2);
>>> +}
>>> +EXPORT_SYMBOL(unicode_strncmp);
>> I'm confused now.  Isn't this redefining unicode_strncmp ?  It was
>> defined in unicode_core.c on the hunk above and now it is redefined on
>> utf8_core.c.  There is something odd here.
> sorry, I think I messed up patches while using git send-email and that
> is why you might see
> two copies of the last patch. Let me resend the series and then it might
> make sense. One question


Hi Shreeya,

Yes, I noticed that too, but this comment just was about the patch I reviewed.
If I read the patch correctly, unicode_strncmp is defined in two places,
in file unicode_core.c and utf8_core.c, in this single patch?

> though, why would unicode_strncmp go into the header file?

Since unicode_strnmcp is now just a wrapper that calls the ->ops hook,
putting it in the header file will allow it to be inlined by the
compiler inside the caller function.


>>> +
>>> +int unicode_strncasecmp(const struct unicode_map *um,
>>> +                   const struct qstr *s1, const struct qstr *s2)
>>> +{
>>> +   return utf8_ops->strncasecmp(um, s1, s2);
>>> +}
>>> +EXPORT_SYMBOL(unicode_strncasecmp);
>>> +
>>> +/* String cf is expected to be a valid UTF-8 casefolded
>>> + * string.
>>> + */
>>> +int unicode_strncasecmp_folded(const struct unicode_map *um,
>>> +                          const struct qstr *cf,
>>> +                          const struct qstr *s1)
>>> +{
>>> +   return utf8_ops->strncasecmp_folded(um, cf, s1);
>>> +}
>>> +EXPORT_SYMBOL(unicode_strncasecmp_folded);
>>> +
>>> +int unicode_casefold(const struct unicode_map *um, const struct qstr *str,
>>> +                unsigned char *dest, size_t dlen)
>>> +{
>>> +   return utf8_ops->casefold(um, str, dest, dlen);
>>> +}
>>> +EXPORT_SYMBOL(unicode_casefold);
>>> +
>>> +int unicode_casefold_hash(const struct unicode_map *um, const void *salt,
>>> +                     struct qstr *str)
>>> +{
>>> +   return utf8_ops->casefold_hash(um, salt, str);
>>> +}
>>> +EXPORT_SYMBOL(unicode_casefold_hash);
>>> +
>>> +int unicode_normalize(const struct unicode_map *um, const struct qstr *str,
>>> +                 unsigned char *dest, size_t dlen)
>>> +{
>>> +   return utf8_ops->normalize(um, str, dest, dlen);
>>> +}
>>> +EXPORT_SYMBOL(unicode_normalize);
>>> +
>>> +struct unicode_map *unicode_load(const char *version)
>>> +{
>>> +   int ret = unicode_load_module();
>>> +
>>> +   if (ret)
>>> +           return ERR_PTR(ret);
>>> +
>>> +   else
>>> +           return utf8_ops->load(version);
>>> +}
>>> +EXPORT_SYMBOL(unicode_load);
>>> +
>>> +void unicode_unload(struct unicode_map *um)
>>> +{
>>> +   kfree(um);
>>> +}
>>> +EXPORT_SYMBOL(unicode_unload);
>>> +
>>> +void unicode_register(struct unicode_ops *ops)
>>> +{
>>> +   utf8_ops = ops;
>>> +}
>>> +EXPORT_SYMBOL(unicode_register);
>>> +
>>> +void unicode_unregister(void)
>>> +{
>>> +   utf8_ops = NULL;
>>> +}
>>> +EXPORT_SYMBOL(unicode_unregister);
>>> +
>>> +static int unicode_load_module(void)
>>> +{
>>> +   int ret = request_module("utf8");
>>> +
>>> +   msleep(100);
>>> +
>>> +   if (ret) {
>>> +           pr_err("Failed to load UTF-8 module\n");
>>> +           return ret;
>>> +   }
>>> +
>>> +   return 0;
>>> +}
>>> +
>>> +MODULE_LICENSE("GPL v2");
>>> diff --git a/fs/unicode/utf8mod.c b/fs/unicode/utf8mod.c
>>> new file mode 100644
>>> index 000000000000..8eaeeb27255c
>>> --- /dev/null
>>> +++ b/fs/unicode/utf8mod.c
>>> @@ -0,0 +1,246 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#include <linux/module.h>
>>> +#include <linux/kernel.h>
>>> +#include <linux/string.h>
>>> +#include <linux/slab.h>
>>> +#include <linux/parser.h>
>>> +#include <linux/errno.h>
>>> +#include <linux/unicode.h>
>>> +#include <linux/stringhash.h>
>>> +
>>> +#include "utf8n.h"
>>> +
>>> +static int utf8_validate(const struct unicode_map *um, const struct qstr 
>>> *str)
>>> +{
>>> +   const struct utf8data *data = utf8nfdi(um->version);
>>> +
>>> +   if (utf8nlen(data, str->name, str->len) < 0)
>>> +           return -1;
>>> +   return 0;
>>> +}
>>> +
>>> +static int utf8_strncmp(const struct unicode_map *um,
>>> +                   const struct qstr *s1, const struct qstr *s2)
>>> +{
>>> +   const struct utf8data *data = utf8nfdi(um->version);
>>> +   struct utf8cursor cur1, cur2;
>>> +   int c1, c2;
>>> +
>>> +   if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> +           return -EINVAL;
>>> +
>>> +   if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>>> +           return -EINVAL;
>>> +
>>> +   do {
>>> +           c1 = utf8byte(&cur1);
>>> +           c2 = utf8byte(&cur2);
>>> +
>>> +           if (c1 < 0 || c2 < 0)
>>> +                   return -EINVAL;
>>> +           if (c1 != c2)
>>> +                   return 1;
>>> +   } while (c1);
>>> +
>>> +   return 0;
>>> +}
>>> +
>>> +static int utf8_strncasecmp(const struct unicode_map *um,
>>> +                       const struct qstr *s1, const struct qstr *s2)
>>> +{
>>> +   const struct utf8data *data = utf8nfdicf(um->version);
>>> +   struct utf8cursor cur1, cur2;
>>> +   int c1, c2;
>>> +
>>> +   if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> +           return -EINVAL;
>>> +
>>> +   if (utf8ncursor(&cur2, data, s2->name, s2->len) < 0)
>>> +           return -EINVAL;
>>> +
>>> +   do {
>>> +           c1 = utf8byte(&cur1);
>>> +           c2 = utf8byte(&cur2);
>>> +
>>> +           if (c1 < 0 || c2 < 0)
>>> +                   return -EINVAL;
>>> +           if (c1 != c2)
>>> +                   return 1;
>>> +   } while (c1);
>>> +
>>> +   return 0;
>>> +}
>>> +
>>> +/* String cf is expected to be a valid UTF-8 casefolded
>>> + * string.
>>> + */
>>> +static int utf8_strncasecmp_folded(const struct unicode_map *um,
>>> +                              const struct qstr *cf,
>>> +                              const struct qstr *s1)
>>> +{
>>> +   const struct utf8data *data = utf8nfdicf(um->version);
>>> +   struct utf8cursor cur1;
>>> +   int c1, c2;
>>> +   int i = 0;
>>> +
>>> +   if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
>>> +           return -EINVAL;
>>> +
>>> +   do {
>>> +           c1 = utf8byte(&cur1);
>>> +           c2 = cf->name[i++];
>>> +           if (c1 < 0)
>>> +                   return -EINVAL;
>>> +           if (c1 != c2)
>>> +                   return 1;
>>> +   } while (c1);
>>> +
>>> +   return 0;
>>> +}
>>> +
>>> +static int utf8_casefold(const struct unicode_map *um, const struct qstr 
>>> *str,
>>> +                    unsigned char *dest, size_t dlen)
>>> +{
>>> +   const struct utf8data *data = utf8nfdicf(um->version);
>>> +   struct utf8cursor cur;
>>> +   size_t nlen = 0;
>>> +
>>> +   if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> +           return -EINVAL;
>>> +
>>> +   for (nlen = 0; nlen < dlen; nlen++) {
>>> +           int c = utf8byte(&cur);
>>> +
>>> +           dest[nlen] = c;
>>> +           if (!c)
>>> +                   return nlen;
>>> +           if (c == -1)
>>> +                   break;
>>> +   }
>>> +   return -EINVAL;
>>> +}
>>> +
>>> +static int utf8_casefold_hash(const struct unicode_map *um, const void 
>>> *salt,
>>> +                         struct qstr *str)
>>> +{
>>> +   const struct utf8data *data = utf8nfdicf(um->version);
>>> +   struct utf8cursor cur;
>>> +   int c;
>>> +   unsigned long hash = init_name_hash(salt);
>>> +
>>> +   if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> +           return -EINVAL;
>>> +
>>> +   while ((c = utf8byte(&cur))) {
>>> +           if (c < 0)
>>> +                   return -EINVAL;
>>> +           hash = partial_name_hash((unsigned char)c, hash);
>>> +   }
>>> +   str->hash = end_name_hash(hash);
>>> +   return 0;
>>> +}
>>> +
>>> +static int utf8_normalize(const struct unicode_map *um, const struct qstr 
>>> *str,
>>> +                     unsigned char *dest, size_t dlen)
>>> +{
>>> +   const struct utf8data *data = utf8nfdi(um->version);
>>> +   struct utf8cursor cur;
>>> +   ssize_t nlen = 0;
>>> +
>>> +   if (utf8ncursor(&cur, data, str->name, str->len) < 0)
>>> +           return -EINVAL;
>>> +
>>> +   for (nlen = 0; nlen < dlen; nlen++) {
>>> +           int c = utf8byte(&cur);
>>> +
>>> +           dest[nlen] = c;
>>> +           if (!c)
>>> +                   return nlen;
>>> +           if (c == -1)
>>> +                   break;
>>> +   }
>>> +   return -EINVAL;
>>> +}
>>> +
>>> +static int utf8_parse_version(const char *version, unsigned int *maj,
>>> +                         unsigned int *min, unsigned int *rev)
>>> +{
>>> +   substring_t args[3];
>>> +   char version_string[12];
>>> +   static const struct match_token token[] = {
>>> +           {1, "%d.%d.%d"},
>>> +           {0, NULL}
>>> +   };
>>> +
>>> +   strncpy(version_string, version, sizeof(version_string));
>>> +
>>> +   if (match_token(version_string, token, args) != 1)
>>> +           return -EINVAL;
>>> +
>>> +   if (match_int(&args[0], maj) || match_int(&args[1], min) ||
>>> +       match_int(&args[2], rev))
>>> +           return -EINVAL;
>>> +
>>> +   return 0;
>>> +}
>>> +
>>> +static struct unicode_map *utf8_load(const char *version)
>>> +{
>>> +   struct unicode_map *um = NULL;
>>> +   int unicode_version;
>>> +
>>> +   if (version) {
>>> +           unsigned int maj, min, rev;
>>> +
>>> +           if (utf8_parse_version(version, &maj, &min, &rev) < 0)
>>> +                   return ERR_PTR(-EINVAL);
>>> +
>>> +           if (!utf8version_is_supported(maj, min, rev))
>>> +                   return ERR_PTR(-EINVAL);
>>> +
>>> +           unicode_version = UNICODE_AGE(maj, min, rev);
>>> +   } else {
>>> +           unicode_version = utf8version_latest();
>>> +           printk(KERN_WARNING"UTF-8 version not specified. "
>>> +                  "Assuming latest supported version (%d.%d.%d).",
>>> +                  (unicode_version >> 16) & 0xff,
>>> +                  (unicode_version >> 8) & 0xff,
>>> +                  (unicode_version & 0xff));
>>> +   }
>>> +
>>> +   um = kzalloc(sizeof(struct unicode_map), GFP_KERNEL);
>>> +   if (!um)
>>> +           return ERR_PTR(-ENOMEM);
>>> +
>>> +   um->charset = "UTF-8";
>>> +   um->version = unicode_version;
>>> +
>>> +   return um;
>>> +}
>>> +
>>> +static struct unicode_ops ops = {
>>> +   .validate = utf8_validate,
>>> +   .strncmp = utf8_strncmp,
>>> +   .strncasecmp = utf8_strncasecmp,
>>> +   .strncasecmp_folded = utf8_strncasecmp_folded,
>>> +   .casefold = utf8_casefold,
>>> +   .casefold_hash = utf8_casefold_hash,
>>> +   .normalize = utf8_normalize,
>>> +   .load = utf8_load,
>>> +};
>>> +
>>> +static int __init utf8_init(void)
>>> +{
>>> +   unicode_register(&ops);
>>> +   return 0;
>>> +}
>>> +
>>> +static void __exit utf8_exit(void)
>>> +{
>>> +   unicode_unregister();
>>> +}
>>> +
>>> +module_init(utf8_init);
>>> +module_exit(utf8_exit);
>>> +
>>> +MODULE_LICENSE("GPL v2");
>>> diff --git a/include/linux/unicode.h b/include/linux/unicode.h
>>> index de23f9ee720b..b0d59069e438 100644
>>> --- a/include/linux/unicode.h
>>> +++ b/include/linux/unicode.h
>>> @@ -10,6 +10,23 @@ struct unicode_map {
>>>     int version;
>>>   };
>>>   +struct unicode_ops {
>>> +   int (*validate)(const struct unicode_map *um, const struct qstr *str);
>>> +   int (*strncmp)(const struct unicode_map *um, const struct qstr *s1,
>>> +                  const struct qstr *s2);
>>> +   int (*strncasecmp)(const struct unicode_map *um, const struct qstr *s1,
>>> +                      const struct qstr *s2);
>>> +   int (*strncasecmp_folded)(const struct unicode_map *um, const struct 
>>> qstr *cf,
>>> +                             const struct qstr *s1);
>>> +   int (*normalize)(const struct unicode_map *um, const struct qstr *str,
>>> +                    unsigned char *dest, size_t dlen);
>>> +   int (*casefold)(const struct unicode_map *um, const struct qstr *str,
>>> +                   unsigned char *dest, size_t dlen);
>>> +   int (*casefold_hash)(const struct unicode_map *um, const void *salt,
>>> +                        struct qstr *str);
>>> +   struct unicode_map* (*load)(const char *version);
>>> +};
>> Also, make sure you run checkpatch.pl on the patch series before
>> submitting.
> I ran checkpatch.pl over the patch, but it seems there were some
> previously existing warnings
> which are not introduced due to any change made in this patch series.
> I am not sure if I am supposed to resolve those warnings in this patch
> series.
>
>>> +
>>>   int unicode_validate(const struct unicode_map *um, const struct qstr 
>>> *str);
>>>     int unicode_strncmp(const struct unicode_map *um,
>>> @@ -33,4 +50,7 @@ int unicode_casefold_hash(const struct unicode_map *um, 
>>> const void *salt,
>>>   struct unicode_map *unicode_load(const char *version);
>>>   void unicode_unload(struct unicode_map *um);
>>>   +void unicode_register(struct unicode_ops *ops);
>>> +void unicode_unregister(void);
>>> +
>>>   #endif /* _LINUX_UNICODE_H */
>

-- 
Gabriel Krisman Bertazi

Re: [PATCH 3/3] fs: unicode: Add utf8 module and a unicode layer

Reply via email to