The branch main has been updated by se:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=f983cb362de77773efba8975eb0f4b247ce5e6f9

commit f983cb362de77773efba8975eb0f4b247ce5e6f9
Author:     Stefan Eßer <[email protected]>
AuthorDate: 2026-06-01 05:50:22 +0000
Commit:     Stefan Eßer <[email protected]>
CommitDate: 2026-06-01 06:22:31 +0000

    fs/msdosfs: add support for file names with surrogate pairs
    
    Long file names are using UTF-16 symbols to represent international or
    special characters. The implementation in FreeBSD did not support the
    "Supplementary Private Use Area-B" (PUA-B), which requires a surrogate
    pair to be represented in UTF-16 (Unicode code points beyond U+FFFF).
    
    The PUA-B is used to represent emoji characters, which are supported
    in file names on other common operating systems. The motivation for
    this change was that removable media written on another system were
    only partially readable on FreeBSD, since they contained emojis in
    file names.
    
    A test script that verifies correct operations on file names with
    emojis has been added to the tools/test/stress2/misc directory under
    the name msdos24.sh.
    
    Reported by:    Fabian Keil <[email protected]>
    Reviewed by:    ib
    Approved by:    mkcusick
    MFC after:      1 week
    Relnotes:       yes
    Differential Revision:  https://reviews.freebsd.org/D57313
---
 sys/fs/msdosfs/direntry.h       |   9 ++-
 sys/fs/msdosfs/msdosfs_conv.c   | 164 +++++++++++++++++++++++++++++++---------
 sys/fs/msdosfs/msdosfs_lookup.c |   7 +-
 3 files changed, 139 insertions(+), 41 deletions(-)

diff --git a/sys/fs/msdosfs/direntry.h b/sys/fs/msdosfs/direntry.h
index fe66ca3612fb..439556fab587 100644
--- a/sys/fs/msdosfs/direntry.h
+++ b/sys/fs/msdosfs/direntry.h
@@ -136,9 +136,10 @@ struct winentry {
 
 #ifdef _KERNEL
 struct mbnambuf {
-       size_t  nb_len;
-       int     nb_last_id;
-       char    nb_buf[WIN_MAXLEN + 1];
+       size_t          nb_len;
+       int             nb_last_id;
+       uint32_t        nb_surrogate;
+       char            nb_buf[WIN_MAXLEN + 1];
 };
 
 struct dirent;
@@ -152,7 +153,7 @@ int dos2unixfn(u_char dn[11], u_char *un, int lower,
 int    unix2dosfn(const u_char *un, u_char dn[12], size_t unlen, u_int gen,
            struct msdosfsmount *pmp);
 int    unix2winfn(const u_char *un, size_t unlen, struct winentry *wep, int 
cnt,
-           int chksum, struct msdosfsmount *pmp);
+           int chksum, uint32_t *surrogate, struct msdosfsmount *pmp);
 int    winChkName(struct mbnambuf *nbp, const u_char *un, size_t unlen,
            int chksum, struct msdosfsmount *pmp);
 int    win2unixfn(struct mbnambuf *nbp, struct winentry *wep, int chksum,
diff --git a/sys/fs/msdosfs/msdosfs_conv.c b/sys/fs/msdosfs/msdosfs_conv.c
index 208b64930e61..a005fe03e6fb 100644
--- a/sys/fs/msdosfs/msdosfs_conv.c
+++ b/sys/fs/msdosfs/msdosfs_conv.c
@@ -63,9 +63,9 @@ extern struct iconv_functions *msdosfs_iconv;
 
 static int mbsadjpos(const char **, size_t, size_t, int, int, void *handle);
 static u_char * dos2unixchr(u_char *, const u_char **, size_t *, int, struct 
msdosfsmount *);
-static uint16_t unix2doschr(const u_char **, size_t *, struct msdosfsmount *);
-static u_char * win2unixchr(u_char *, uint16_t, struct msdosfsmount *);
-static uint16_t unix2winchr(const u_char **, size_t *, int, struct 
msdosfsmount *);
+static uint32_t unix2doschr(const u_char **, size_t *, struct msdosfsmount *);
+static u_char * win2unixchr(u_char *, uint32_t, struct msdosfsmount *);
+static uint32_t unix2winchr(const u_char **, size_t *, int, struct 
msdosfsmount *);
 
 /*
  * 0 - character disallowed in long file name.
@@ -482,7 +482,7 @@ unix2dosfn(const u_char *un, u_char dn[12], size_t unlen, 
u_int gen,
        if (gentext + sizeof(gentext) - wcp + 1 > 8 - i)
                i = 8 - (gentext + sizeof(gentext) - wcp + 1);
        /*
-        * Correct posision to where insert the generation number
+        * Correct position to where insert the generation number
         */
        cp = dn;
        i -= mbsadjpos((const char**)&cp, i, unlen, 1, pmp->pm_flags, 
pmp->pm_d2u);
@@ -516,22 +516,29 @@ done:
  */
 int
 unix2winfn(const u_char *un, size_t unlen, struct winentry *wep, int cnt,
-    int chksum, struct msdosfsmount *pmp)
+    int chksum, uint32_t *surrogate, struct msdosfsmount *pmp)
 {
        uint8_t *wcp;
        int i, end;
-       uint16_t code;
+       uint32_t code;
 
        /*
         * Drop trailing blanks and dots
         */
        unlen = winLenFixup(un, unlen);
 
+#ifdef MSDOSFS_DEBUG
+       //printf("unix2winfn(%zd, '%.*s')\n", unlen, (int)unlen, un);
+#endif
        /*
-        * Cut *un for this slot
+        * Cut *un for this slot.
+        * If *surrogate is non-zero, allow for 14 UTF-16 symbols
+        * since it means we start in the middle of a surrogate pair.
         */
-       unlen = mbsadjpos((const char **)&un, unlen, (cnt - 1) * WIN_CHARS, 2,
-                         pmp->pm_flags, pmp->pm_u2w);
+       code = *surrogate;
+       unlen = mbsadjpos((const char **)&un, unlen,
+           (cnt - 1) * WIN_CHARS + ((code >> 16) != 0 ? 1 : 0), 2,
+           pmp->pm_flags, pmp->pm_u2w);
 
        /*
         * Initialize winentry to some useful default
@@ -548,27 +555,43 @@ unix2winfn(const u_char *un, size_t unlen, struct 
winentry *wep, int cnt,
         */
        end = 0;
        for (wcp = wep->wePart1, i = sizeof(wep->wePart1)/2; --i >= 0 && !end;) 
{
-               code = unix2winchr(&un, &unlen, 0, pmp);
+               code >>= 16;
+               if (code == 0)
+                       code = unix2winchr(&un, &unlen, 0, pmp);
+#ifdef MSDOSFS_DEBUG
+               //printf("write(%d/%d)=%04x (%s)\n", cnt, 5-i, code, un);
+#endif
                *wcp++ = code;
                *wcp++ = code >> 8;
                if (!code)
                        end = WIN_LAST;
        }
        for (wcp = wep->wePart2, i = sizeof(wep->wePart2)/2; --i >= 0 && !end;) 
{
-               code = unix2winchr(&un, &unlen, 0, pmp);
+               code >>= 16;
+               if (code == 0)
+                       code = unix2winchr(&un, &unlen, 0, pmp);
+#ifdef MSDOSFS_DEBUG
+               //printf("write(%d/%d)=%04x (%s)\n", cnt,11-i, code, un);
+#endif
                *wcp++ = code;
                *wcp++ = code >> 8;
                if (!code)
                        end = WIN_LAST;
        }
        for (wcp = wep->wePart3, i = sizeof(wep->wePart3)/2; --i >= 0 && !end;) 
{
-               code = unix2winchr(&un, &unlen, 0, pmp);
+               code >>= 16;
+               if (code == 0)
+                       code = unix2winchr(&un, &unlen, 0, pmp);
+#ifdef MSDOSFS_DEBUG
+               //printf("write(%d/%d)=%04x (%s)\n", cnt, 13-i, code, un);
+#endif
                *wcp++ = code;
                *wcp++ = code >> 8;
                if (!code)
                        end = WIN_LAST;
        }
-       if (!unlen)
+       *surrogate = code;
+       if (!unlen && (code >> 16) == 0)
                end = WIN_LAST;
        wep->weCnt |= end;
        return !end;
@@ -583,7 +606,7 @@ winChkName(struct mbnambuf *nbp, const u_char *un, size_t 
unlen, int chksum,
     struct msdosfsmount *pmp)
 {
        size_t len;
-       uint16_t c1, c2;
+       uint32_t c1, c2;
        u_char *np;
        struct dirent dirbuf;
 
@@ -632,7 +655,7 @@ win2unixfn(struct mbnambuf *nbp, struct winentry *wep, int 
chksum,
        u_char *c, tmpbuf[5];
        uint8_t *cp;
        uint8_t *np, name[WIN_CHARS * 3 + 1];
-       uint16_t code;
+       uint32_t code, keep, kept;
        int i;
 
        if ((wep->weCnt&WIN_CNT) > howmany(WIN_MAXLEN, WIN_CHARS)
@@ -644,6 +667,7 @@ win2unixfn(struct mbnambuf *nbp, struct winentry *wep, int 
chksum,
         */
        if (wep->weCnt&WIN_LAST) {
                chksum = wep->weChksum;
+               nbp->nb_surrogate = 0;
        } else if (chksum != wep->weChksum)
                chksum = -1;
        if (chksum == -1)
@@ -653,8 +677,36 @@ win2unixfn(struct mbnambuf *nbp, struct winentry *wep, int 
chksum,
         * Convert the name parts
         */
        np = name;
+       keep = 0;
+       kept = nbp->nb_surrogate;
+       nbp->nb_surrogate = 0;
        for (cp = wep->wePart1, i = sizeof(wep->wePart1)/2; --i >= 0;) {
                code = (cp[1] << 8) | cp[0];
+               if (keep == 0) {
+                       /* Not currently within a surrogate pair. */
+                       if ((code & 0xfc00) == 0xd800) {
+                               /* Found 1st half of a surrogate pair. */
+                               keep = code;
+                               cp += 2;
+                               continue;
+                       } else if ((code & 0xfc00) == 0xdc00) {
+                               /*
+                                * Found 2nd part of surrogate pair without 
prior 1st part in
+                                * this directory slot. Keep it for combining 
with the surrogate
+                                * at the end of the next (i.e., logically 
preceding) slot.
+                                */
+                               nbp->nb_surrogate = code;
+                               cp += 2;
+                               continue;
+                       }
+               } else {
+                       /*
+                        * Within a surrogate pair: put new value in upper half 
of the code
+                        * variable and combine with previous value in lower 
half.
+                        */
+                       code = code << 16 | keep;
+                       keep = 0;
+               }
                switch (code) {
                case 0:
                        *np = '\0';
@@ -675,6 +727,16 @@ win2unixfn(struct mbnambuf *nbp, struct winentry *wep, int 
chksum,
        }
        for (cp = wep->wePart2, i = sizeof(wep->wePart2)/2; --i >= 0;) {
                code = (cp[1] << 8) | cp[0];
+               if (keep == 0) {
+                       if ((code & 0xfc00) == 0xd800) {
+                               keep = code;
+                               cp += 2;
+                               continue;
+                       }
+               } else {
+                       code = code << 16 | keep;
+                       keep = 0;
+               }
                switch (code) {
                case 0:
                        *np = '\0';
@@ -695,6 +757,31 @@ win2unixfn(struct mbnambuf *nbp, struct winentry *wep, int 
chksum,
        }
        for (cp = wep->wePart3, i = sizeof(wep->wePart3)/2; --i >= 0;) {
                code = (cp[1] << 8) | cp[0];
+               if (keep == 0) {
+                       /* Not currently within a surrogate pair. */
+                       if ((code & 0xfc00) == 0xd800) {
+                               /* 1st part of a surrogate pair. */
+                               if (i != 0) {
+                                       /*
+                                        * Not on last symbol of this directory 
slot,
+                                        * the 2nd part of the surrogate pair 
will follow.
+                                        */
+                                       keep = code;
+                                       cp += 2;
+                                       continue;
+                               }
+                               /*
+                                * Last symbol in this directory slot is 1st 
part of
+                                * a surrogate pair. The 2nd part has been 
saved from
+                                * the last symbol in the previous directory 
slot
+                                * (which is logically succeeding this one).
+                                */
+                               code |= (kept << 16);
+                       }
+               } else {
+                       code = code << 16 | keep;
+                       keep = 0;
+               }
                switch (code) {
                case 0:
                        *np = '\0';
@@ -845,19 +932,19 @@ dos2unixchr(u_char *outbuf, const u_char **instr, size_t 
*ilen, int lower, struc
 /*
  * Convert Local char to DOS char
  */
-static uint16_t
+static uint32_t
 unix2doschr(const u_char **instr, size_t *ilen, struct msdosfsmount *pmp)
 {
        u_char c;
-       char *up, *outp, unicode[3], outbuf[3];
-       uint16_t wc;
+       char *up, *outp, unicode[4], outbuf[4];
+       uint32_t wc;
        size_t len, ucslen, unixlen, olen;
 
        if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
                /*
                 * to hide an invisible character, using a unicode filter
                 */
-               ucslen = 2;
+               ucslen = 4;
                len = *ilen;
                up = unicode;
                msdosfs_iconv->convchr(pmp->pm_u2w, (const char **)instr,
@@ -891,7 +978,7 @@ unix2doschr(const u_char **instr, size_t *ilen, struct 
msdosfsmount *pmp)
                *instr -= unixlen;
                *ilen = len;
 
-               olen = len = 2;
+               olen = len = 4;
                outp = outbuf;
                msdosfs_iconv->convchr_case(pmp->pm_u2d, (const char **)instr,
                                          ilen, &outp, &olen, 
KICONV_FROM_UPPER);
@@ -916,25 +1003,27 @@ unix2doschr(const u_char **instr, size_t *ilen, struct 
msdosfsmount *pmp)
        c = *(*instr)++;
        c = l2u[c];
        c = unix2dos[c];
-       return ((uint16_t)c);
+       return ((uint32_t)c);
 }
 
 /*
  * Convert Windows char to Local char
  */
 static u_char *
-win2unixchr(u_char *outbuf, uint16_t wc, struct msdosfsmount *pmp)
+win2unixchr(u_char *outbuf, uint32_t wc, struct msdosfsmount *pmp)
 {
-       u_char *inp, *outp, inbuf[3];
+       u_char *inp, *outp, inbuf[5];
        size_t ilen, olen, len;
 
        outp = outbuf;
        if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
                inbuf[0] = (u_char)(wc>>8);
                inbuf[1] = (u_char)wc;
-               inbuf[2] = '\0';
+               inbuf[2] = (u_char)(wc>>24);
+               inbuf[3] = (u_char)(wc>>16);
+               inbuf[4] = '\0';
 
-               ilen = 2;
+               ilen = 4;
                olen = len = 4;
                inp = inbuf;
                msdosfs_iconv->convchr(pmp->pm_w2u, __DECONST(const char **,
@@ -947,7 +1036,7 @@ win2unixchr(u_char *outbuf, uint16_t wc, struct 
msdosfsmount *pmp)
                if (len == 0)
                        *outp++ = '?';
        } else {
-               *outp++ = (wc & 0xff00) ? '?' : (u_char)(wc & 0xff);
+               *outp++ = (wc & 0xffffff00) ? '?' : (u_char)(wc & 0xff);
        }
 
        *outp = '\0';
@@ -956,13 +1045,15 @@ win2unixchr(u_char *outbuf, uint16_t wc, struct 
msdosfsmount *pmp)
 }
 
 /*
- * Convert Local char to Windows char
+ * Convert Local char to Windows char (UTF-16).
+ * If a surrogate pair is required, then put the high part into the upper half
+ * of the word that is returned.
  */
-static uint16_t
+static uint32_t
 unix2winchr(const u_char **instr, size_t *ilen, int lower, struct msdosfsmount 
*pmp)
 {
-       u_char *outp, outbuf[3];
-       uint16_t wc;
+       u_char *outp, outbuf[5];
+       uint32_t wc;
        size_t olen;
 
        if (*ilen == 0)
@@ -970,7 +1061,7 @@ unix2winchr(const u_char **instr, size_t *ilen, int lower, 
struct msdosfsmount *
 
        if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
                outp = outbuf;
-               olen = 2;
+               olen = 4;
                if (lower & (LCASE_BASE | LCASE_EXT))
                        msdosfs_iconv->convchr_case(pmp->pm_u2w, (const char 
**)instr,
                                                  ilen, (char **)&outp, &olen,
@@ -982,10 +1073,12 @@ unix2winchr(const u_char **instr, size_t *ilen, int 
lower, struct msdosfsmount *
                /*
                 * return '0' if end of filename
                 */
-               if (olen == 2)
+               if (olen == 4)
                        return (0);
 
                wc = (outbuf[0]<<8) | outbuf[1];
+               if (olen == 0)
+                       wc |= outbuf[2]<<24 | outbuf[3]<<16;
 
                return (wc);
        }
@@ -1044,8 +1137,11 @@ mbnambuf_write(struct mbnambuf *nbp, char *name, int id)
                return (ENAMETOOLONG);
        }
 
-       /* Shift suffix upwards by the amount length exceeds WIN_CHARS. */
-       if (count > WIN_CHARS && nbp->nb_len != 0) {
+       /*
+        * Shift suffix upwards or downwards by the amount length differs
+        * from WIN_CHARS.
+        */
+       if (count != WIN_CHARS && nbp->nb_len != 0) {
                if ((id * WIN_CHARS + count + nbp->nb_len) >
                    sizeof(nbp->nb_buf))
                        return (ENAMETOOLONG);
diff --git a/sys/fs/msdosfs/msdosfs_lookup.c b/sys/fs/msdosfs/msdosfs_lookup.c
index 58ce8eff9dbd..527c6cbf73a2 100644
--- a/sys/fs/msdosfs/msdosfs_lookup.c
+++ b/sys/fs/msdosfs/msdosfs_lookup.c
@@ -304,8 +304,7 @@ msdosfs_lookup_ino(struct vnode *vdp, struct vnode **vpp, 
struct componentname
                                                continue;
 
                                        chksum = win2unixfn(&nb,
-                                           (struct winentry *)dep, chksum,
-                                           pmp);
+                                           (struct winentry *)dep, chksum, 
pmp);
                                        continue;
                                }
 
@@ -638,6 +637,7 @@ createde(struct denode *dep, struct denode *ddep, struct 
denode **depp,
        struct buf *bp;
        daddr_t bn;
        int blsize;
+       uint32_t surrogate;
 
 #ifdef MSDOSFS_DEBUG
        printf("createde(dep %p, ddep %p, depp %p, cnp %p)\n",
@@ -692,6 +692,7 @@ createde(struct denode *dep, struct denode *ddep, struct 
denode **depp,
        /*
         * Now write the Win95 long name
         */
+       surrogate = 0;
        if (ddep->de_fndcnt > 0) {
                uint8_t chksum = winChksum(ndep->deName);
                const u_char *un = (const u_char *)cnp->cn_nameptr;
@@ -725,7 +726,7 @@ createde(struct denode *dep, struct denode *ddep, struct 
denode **depp,
                        }
                        rootde_alloced(ddep);
                        if (!unix2winfn(un, unlen, (struct winentry *)ndep,
-                                       cnt++, chksum, pmp))
+                                       cnt++, chksum, &surrogate, pmp))
                                break;
                }
        }

Reply via email to