Commit: patch 9.1.2124: blob2str() does not handle UTF-16 encoding

Christian Brabandt Sat, 31 Jan 2026 08:01:28 -0800

patch 9.1.2124: blob2str() does not handle UTF-16 encoding

Commit: 
https://github.com/vim/vim/commit/2b184d4b97c370179a33aeb832cd77e737bbc480
Author: Yasuhiro Matsumoto <[email protected]>
Date:   Sat Jan 31 15:53:26 2026 +0000


    patch 9.1.2124: blob2str() does not handle UTF-16 encoding
    
    Problem:  blob2str() does not handle UTF-16 encoding
              (Hirohito Higashi)
    Solution: Refactor the code and fix remaining issues, see below
              (Yasuhiro Matsumoto).
    
    blob2str() function did not properly handle UTF-16/UCS-2/UTF-32/UCS-4
    encodings with endianness suffixes (e.g., utf-16le, utf-16be, ucs-2le).
    The encoding name was canonicalized too aggressively, losing the
    endianness information needed by iconv.
    
    This change includes a few fixes:
    
    - Preserve the raw encoding name with endianness suffix for iconv calls
    - Normalize encoding names properly: "ucs2be" → "ucs-2be", "utf16le" →
      "utf-16le"
    - For multi-byte encodings (UTF-16/32, UCS-2/4), convert the entire blob
      first, then split by newlines
    
    convert_string() cannot handle UTF-16 because it uses string_convert()
    which expects NUL-terminated strings. UTF-16 contains 0x00 bytes within
    characters (e.g., "H" = 0x48 0x00), causing premature termination.
    Therefore, for UTF-16/32 encodings, the fix uses string_convert_ext()
    with an explicit input length to convert the entire blob at once.
    
    The code appends a multi-byte NUL terminator (ga_append(&blob_ga, NUL)
    called nul_size times) because UTF-16/UCS-2 requires a 2-byte NUL
    terminator (0x00 0x00) and UTF-32/UCS-4 requires a 4-byte one, not a
    single-byte NUL.
    
    - src/strings.c: Add from_encoding_raw to preserve endianness, special
      handling for UTF-16/32 and UCS-2/4
    - src/mbyte.c: Fix convert_setup_ext() to use == ENC_UNICODE instead of
      & ENC_UNICODE. The bitwise AND was incorrectly treating UTF-16/UCS-2
      (which have ENC_UNICODE + ENC_2BYTE etc.) as UTF-8, causing iconv
      setup to be skipped.
    
    fixes:  #19198
    closes: #19246
    
    Co-authored-by: Copilot <[email protected]>
    Signed-off-by: Yasuhiro Matsumoto <[email protected]>
    Signed-off-by: Christian Brabandt <[email protected]>

diff --git a/runtime/doc/builtin.txt b/runtime/doc/builtin.txt
index c940ee50c..18712b403 100644
--- a/runtime/doc/builtin.txt
+++ b/runtime/doc/builtin.txt
@@ -1,4 +1,4 @@
-*builtin.txt*  For Vim version 9.1.  Last change: 2026 Jan 17
+*builtin.txt*  For Vim version 9.1.  Last change: 2026 Jan 31
 
 
                  VIM REFERENCE MANUAL    by Bram Moolenaar
@@ -1389,6 +1389,9 @@ blob2str({blob} [, {options}])                            
*blob2str()*
                Can also be used as a |method|: >
                        GetBlob()->blob2str()
 <
+               If `iconv` is not available and the encoding cannot be converted
+               using built-in conversion rules, an error will be reported.
+
                Return type: list<string>
 
 
diff --git a/src/strings.c b/src/strings.c
index 4d878cb7d..f4b335676 100644
--- a/src/strings.c
+++ b/src/strings.c
@@ -1275,10 +1275,123 @@ string_from_blob(blob_T *blob, long *start_idx)
     return ret_str;
 }
 
+/*
+ * Normalize encoding name for iconv by adding hyphens.
+ * For example: "ucs2be" -> "ucs-2be", "utf16le" -> "utf-16le"
+ * Returns allocated string or NULL on allocation failure.
+ */
+    static char_u *
+normalize_encoding_name(char_u *enc_skipped)
+{
+    char_u *from_encoding_raw = alloc(STRLEN(enc_skipped) + 3);
+    if (from_encoding_raw == NULL)
+       return NULL;
+
+    char_u *s = enc_skipped;
+    char_u *pe = from_encoding_raw;
+
+    // Convert to lowercase and replace '_' with '-'
+    while (*s != NUL)
+    {
+       if (*s == '_')
+           *pe++ = '-';
+       else
+           *pe++ = TOLOWER_ASC(*s);
+       ++s;
+    }
+    *pe = NUL;
+
+    // Add hyphen before digit: "ucs2be" -> "ucs-2be", "utf16le" -> "utf-16le"
+    char_u *p = from_encoding_raw;
+    if ((STRNCMP(p, "ucs", 3) == 0 && VIM_ISDIGIT(p[3]) && p[3] != NUL && p[4] 
!= '-') ||
+       (STRNCMP(p, "utf", 3) == 0 && VIM_ISDIGIT(p[3]) && p[3] != NUL && p[4] 
!= '-'))
+    {
+       // Insert hyphen after "ucs" or "utf": "ucs2" -> "ucs-2"
+       mch_memmove(p + 4, p + 3, STRLEN(p + 3) + 1);
+       p[3] = '-';
+    }
+
+    return from_encoding_raw;
+}
+
 /*
  * "blob2str()" function
  * Converts a blob to a string, ensuring valid UTF-8 encoding.
  */
+    static void
+append_converted_string_to_list(
+       char_u *converted,
+       int validate_utf8,
+       list_T *list,
+       char_u *from_encoding)
+{
+    if (converted != NULL)
+    {
+       // After conversion, the output is a valid UTF-8 string (NUL-terminated)
+       int converted_len = (int)STRLEN(converted);
+
+       // Split by newlines and add to list
+       char_u *p = converted;
+       char_u *end = converted + converted_len;
+       while (p < end)
+       {
+           char_u *line_start = p;
+           while (p < end && *p != NL)
+               p++;
+
+           // Add this line to the result list
+           char_u *line = vim_strnsave(line_start, p - line_start);
+           if (line != NULL)
+           {
+               if (validate_utf8 && !utf_valid_string(line, NULL))
+               {
+                   vim_free(line);
+                   semsg(_(e_str_encoding_from_failed), p_enc);
+                   vim_free(converted);
+                   return; // Stop processing
+               }
+               if (list_append_string(list, line, -1) == FAIL)
+               {
+                   vim_free(line);
+                   vim_free(converted);
+                   return; // Stop processing on append failure
+               }
+               vim_free(line);
+           }
+           else
+           {
+               // Allocation failure: report error and stop processing
+               emsg(_(e_out_of_memory));
+               vim_free(converted);
+               return;
+           }
+
+           if (*p == NL)
+               p++;
+       }
+       vim_free(converted);
+    }
+    else
+    {
+       semsg(_(e_str_encoding_from_failed), from_encoding);
+    }
+}
+
+    static int
+append_validated_line_to_list(char_u *line, int validate_utf8, list_T *list)
+{
+    if (validate_utf8 && !utf_valid_string(line, NULL))
+    {
+       semsg(_(e_str_encoding_from_failed), p_enc);
+       vim_free(line);
+       return FAIL;
+    }
+
+    int ret = list_append_string(list, line, -1);
+    vim_free(line);
+    return ret;
+}
+
     void
 f_blob2str(typval_T *argvars, typval_T *rettv)
 {
@@ -1300,6 +1413,7 @@ f_blob2str(typval_T *argvars, typval_T *rettv)
     blen = blob_len(blob);
 
     char_u     *from_encoding = NULL;
+    char_u     *from_encoding_raw = NULL;  // Encoding name with endianness 
preserved for iconv
     if (argvars[1].v_type != VAR_UNKNOWN)
     {
        dict_T *d = argvars[1].vval.v_dict;
@@ -1307,7 +1421,20 @@ f_blob2str(typval_T *argvars, typval_T *rettv)
        {
            char_u *enc = dict_get_string(d, "encoding", FALSE);
            if (enc != NULL)
-               from_encoding = enc_canonize(enc_skip(enc));
+           {
+               char_u *enc_skipped = enc_skip(enc);
+               from_encoding = enc_canonize(enc_skipped);
+
+               // For iconv, preserve the endianness suffix by creating a 
normalized
+               // version with hyphens: "ucs2be" -> "ucs-2be", "utf16le" -> 
"utf-16le"
+               from_encoding_raw = normalize_encoding_name(enc_skipped);
+               if (from_encoding_raw == NULL)
+               {
+                   emsg(_(e_out_of_memory));
+                   VIM_CLEAR(from_encoding);
+                   return;
+               }
+           }
        }
     }
 
@@ -1317,46 +1444,74 @@ f_blob2str(typval_T *argvars, typval_T *rettv)
     if (from_encoding != NULL && STRCMP(from_encoding, "none") == 0)
     {
        validate_utf8 = FALSE;
-       vim_free(from_encoding);
-       from_encoding = NULL;
+       VIM_CLEAR(from_encoding);
+       VIM_CLEAR(from_encoding_raw);
     }
 
-    idx = 0;
-    while (idx < blen)
+    // Special handling for UTF-16/UCS-2/UTF-32/UCS-4 encodings: convert 
entire blob before splitting by newlines
+    int from_prop = 0;
+    if (from_encoding != NULL)
+       from_prop = enc_canon_props(from_encoding);
+    if (from_encoding != NULL && (from_prop & (ENC_2BYTE | ENC_4BYTE | 
ENC_2WORD)))
     {
-       char_u  *str;
-       char_u  *converted_str;
+       // Build a temporary buffer from the blob as a whole
+       // Don't use string_from_blob() because it treats NUL as line separator
+       garray_T blob_ga;
+       int nul_size = (from_prop & ENC_4BYTE) ? 4 : 2;
+       ga_init2(&blob_ga, 1, blen + nul_size);
+       for (long i = 0; i < blen; i++)
+           ga_append(&blob_ga, (int)(unsigned char)blob_get(blob, i));
+       // Add NUL terminator (2 bytes for UTF-16/UCS-2, 4 bytes for 
UTF-32/UCS-4)
+       for (int i = 0; i < nul_size; i++)
+           ga_append(&blob_ga, NUL);
+
+       // Convert the entire blob at once
+       vimconv_T vimconv;
+       vimconv.vc_type = CONV_NONE;
+       // Use raw encoding name for iconv to preserve endianness (utf-16be vs 
utf-16)
+       if (convert_setup_ext(&vimconv, from_encoding_raw ? from_encoding_raw : 
from_encoding, FALSE, p_enc, FALSE) == FAIL)
+       {
+           ga_clear(&blob_ga);
+           semsg(_(e_str_encoding_from_failed), from_encoding);
+           goto done;
+       }
+       vimconv.vc_fail = TRUE;
+       // Use string_convert_ext with explicit input length
+       int inlen = blen;
+       char_u *converted = string_convert_ext(&vimconv, (char_u 
*)blob_ga.ga_data, &inlen, NULL);
+       convert_setup(&vimconv, NULL, NULL);
+       ga_clear(&blob_ga);
+       append_converted_string_to_list(converted, validate_utf8, 
rettv->vval.v_list, from_encoding);
+    }
+    else
+    {
+       // Original logic for non-UTF-16 encodings
+       idx = 0;
+       while (idx < blen)
+       {
+           char_u      *str;
 
-       str = string_from_blob(blob, &idx);
-       if (str == NULL)
-           break;
+           str = string_from_blob(blob, &idx);
+           if (str == NULL)
+               break;
 
-       converted_str = str;
-       if (from_encoding != NULL)
-       {
-           converted_str = convert_string(str, from_encoding, p_enc);
-           vim_free(str);
-           if (converted_str == NULL)
+           if (from_encoding != NULL)
            {
-               semsg(_(e_str_encoding_from_failed), from_encoding);
-               goto done;
+               char_u *converted = convert_string(str,
+                       from_encoding_raw ? from_encoding_raw : from_encoding, 
p_enc);
+               vim_free(str);
+               str = converted;
            }
-       }
 
-       if (validate_utf8)
-       {
-           if (!utf_valid_string(converted_str, NULL))
+           if (str == NULL)
            {
-               semsg(_(e_str_encoding_from_failed), p_enc);
-               vim_free(converted_str);
+               semsg(_(e_str_encoding_from_failed), from_encoding);
                goto done;
            }
-       }
 
-       int ret = list_append_string(rettv->vval.v_list, converted_str, -1);
-       vim_free(converted_str);
-       if (ret == FAIL)
-           break;
+           if (append_validated_line_to_list(str, validate_utf8, 
rettv->vval.v_list) == FAIL)
+               goto done;
+       }
     }
 
     // If the blob ends with a newline, we need to add another empty string.
@@ -1365,6 +1520,7 @@ f_blob2str(typval_T *argvars, typval_T *rettv)
 
 done:
     vim_free(from_encoding);
+    vim_free(from_encoding_raw);
 }
 
 /*
diff --git a/src/testdir/test_blob.vim b/src/testdir/test_blob.vim
index 1ce227d5c..34a5cb750 100644
--- a/src/testdir/test_blob.vim
+++ b/src/testdir/test_blob.vim
@@ -898,4 +898,44 @@ func Test_blob2str_empty_line()
   call assert_equal(['Hello', '', 'World!'], blob2str(b))
 endfunc
 
+func Test_blob2str_multi_byte_encodings()
+  " UTF-16LE: "Hello" = 48 00 65 00 6C 00 6C 00 6F 00
+  call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 
'utf-16le'}))
+  call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 
'utf16le'}))
+
+  " UTF-16BE: "Hello" = 00 48 00 65 00 6C 00 6C 00 6F
+  call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': 
'utf-16be'}))
+  call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': 
'utf16be'}))
+
+  " UCS-2LE: "Hello" = 48 00 65 00 6C 00 6C 00 6F 00
+  call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 
'ucs-2le'}))
+  call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 
'ucs2le'}))
+
+  " UCS-2BE: "Hello" = 00 48 00 65 00 6C 00 6C 00 6F
+  call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': 
'ucs-2be'}))
+  call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': 
'ucs2be'}))
+
+  " UTF-32LE: "Hi" = 48 00 00 00 69 00 00 00
+  call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': 
'utf-32le'}))
+  call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': 
'utf32le'}))
+
+  " UTF-32BE: "Hi" = 00 00 00 48 00 00 00 69
+  call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': 
'utf-32be'}))
+  call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': 
'utf32be'}))
+
+  " UCS-4LE: "Hi" = 48 00 00 00 69 00 00 00
+  call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': 
'ucs-4le'}))
+  call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': 
'ucs4le'}))
+
+  " UCS-4BE: "Hi" = 00 00 00 48 00 00 00 69
+  call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': 
'ucs-4be'}))
+  call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': 
'ucs4be'}))
+
+  " UTF-16LE with newlines: "Hi\nBye" = 48 00 69 00 0A 00 42 00 79 00 65 00
+  call assert_equal(['Hi', 'Bye'], blob2str(0z48006900.0A004200.79006500, 
{'encoding': 'utf-16le'}))
+
+  " UTF-32LE with newlines: "A\nB" = 41 00 00 00 0A 00 00 00 42 00 00 00
+  call assert_equal(['A', 'B'], blob2str(0z41000000.0A000000.42000000, 
{'encoding': 'utf-32le'}))
+endfunc
+
 " vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/testdir/test_functions.vim b/src/testdir/test_functions.vim
index 2fa09eddc..0f3d30e51 100644
--- a/src/testdir/test_functions.vim
+++ b/src/testdir/test_functions.vim
@@ -4557,6 +4557,13 @@ func Test_blob2str()
     call assert_fails("call blob2str(0z6162, [])", 'E1206: Dictionary required 
for argument 2')
     call assert_fails("call blob2str(0z6162, {'encoding': []})", 'E730: Using 
a List as a String')
     call assert_fails("call blob2str(0z6162, {'encoding': 'ab12xy'})", 'E1515: 
Unable to convert from ''ab12xy'' encoding')
+
+    #" UTF-16LE encoding
+    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 
'utf-16le'}))
+    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 
'utf16le'}))
+    #" UCS-2LE encoding
+    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 
'ucs-2le'}))
+    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 
'ucs2le'}))
   END
   call v9.CheckLegacyAndVim9Success(lines)
 endfunc
diff --git a/src/version.c b/src/version.c
index 5dc95056b..011a4d1e6 100644
--- a/src/version.c
+++ b/src/version.c
@@ -734,6 +734,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    2124,
 /**/
     2123,
 /**/

-- 
-- 
You received this message from the "vim_dev" maillist.
Do not top-post! Type your reply below the text you are replying to.
For more information, visit http://www.vim.org/maillist.php

--- 
You received this message because you are subscribed to the Google Groups 
"vim_dev" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to [email protected].
To view this discussion visit 
https://groups.google.com/d/msgid/vim_dev/E1vmDOF-005ljI-EN%40256bit.org.

Commit: patch 9.1.2124: blob2str() does not handle UTF-16 encoding

Raspunde prin e-mail lui