Hi,

Overall I think the implementation looks fine. The test cases are missing some of the more exotic DOS device paths[1], for example `\\.\Volume{b75e2c83-0000-0000-0000-602f00000000}\` but to be frank I don't know what should be the expected outputs for it. Should we simply declare that DOS device paths may not work as expected?

[1]: https://learn.microsoft.com/en-us/dotnet/standard/io/file-path-formats#dos-device-paths

Manual fuzzing with asan and ubsan revealed an issue: The line `info.prefix_end[1] = 0;` (line 213) can result in a buffer overrun. This happens with inputs in the form of an UNC host only, e.g. `\\host`, which is also missing from the test cases.

I would think the fix is as simple as changing line 211 to `if(info.prefix_end < info.path_end - 1) {`, but this revealed another issue because `info.path_end` contained an uninitialized value -- an early return in `do_get_path_info` (line 127) missed setting `info->path_end = pos;`. After applying these fixes there seems to be no other sanitizer issues. I have attached my test which I built with `clang -fsanitize=address,undefined`.

Cheers,
Alvin


On 26/3/2023 17:09, LIU Hao wrote:
As discussed on IRC just now, paths are not affected by the global locale. According to Microsoft documentation about `fopen()` [1], paths are interpreted with `CP_ACP` if `AreFileApisANSI()` returns true, and `CP_OEMCP` otherwise. We had better follow that convention.


[1] https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/fopen-wfopen?view=msvc-170



_______________________________________________
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public
#define TEST_CODEPAGE 950
// #define TEST_CODEPAGE CP_UTF8

#define _CRT_RAND_S

// -----------------

/**
 * This file has no copyright assigned and is placed in the Public Domain.
 * This file is part of the mingw-w64 runtime package.
 * No warranty is given; refer to the file DISCLAIMER.PD within this package.
 */
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <stdlib.h>
#include <libgen.h>
#include <windows.h>

/* A 'directory separator' is a byte that equals 0x2F ('solidus' or more
 * commonly 'forward slash') or 0x5C ('reverse solidus' or more commonly
 * 'backward slash'). The byte 0x5C may look different from a backward slash
 * in some locales; for example, it looks the same as a Yen sign in Japanese
 * locales and a Won sign in Korean locales. Despite its appearance, it still
 * functions as a directory separator.
 *
 * A 'path' comprises an optional DOS drive letter with a colon, and then an
 * arbitrary number of possibily empty components, separated by non-empty
 * sequences of directory separators (in other words, consecutive directory
 * separators are treated as a single one). A path that comprises an empty
 * component denotes the current working directory.
 *
 * An 'absolute path' comprises at least two components, the first of which
 * is empty.
 *
 * A 'relative path' is a path that is not an absolute path. In other words,
 * it either comprises an empty component, or begins with a non-empty
 * component.
 *
 * POSIX doesn't have a concept about DOS drives. A path that does not have a
 * drive letter starts from the same drive as the current working directory.
 *
 * For example:
 * (Examples without drive letters match POSIX.)
 *
 *   Argument                 dirname() returns        basename() returns
 *   --------                 -----------------        ------------------
 *   `` or NULL               `.`                      `.`
 *   `usr`                    `.`                      `usr`
 *   `usr\`                   `.`                      `usr`
 *   `\`                      `\`                      `\`
 *   `\usr`                   `\`                      `usr`
 *   `\usr\lib`               `\usr`                   `lib`
 *   `\home\\dwc\\test`       `\home\\dwc`             `test`
 *   `\\host\usr`             `\\host\.`               `usr`
 *   `\\host\usr\lib`         `\\host\usr`             `lib`
 *   `\\host\\usr`            `\\host\\`               `usr`
 *   `\\host\\usr\lib`        `\\host\\usr`            `lib`
 *   `C:`                     `C:.`                    `.`
 *   `C:usr`                  `C:.`                    `usr`
 *   `C:usr\`                 `C:.`                    `usr`
 *   `C:\`                    `C:\`                    `\`
 *   `C:\\`                   `C:\`                    `\`
 *   `C:\\\`                  `C:\`                    `\`
 *   `C:\usr`                 `C:\`                    `usr`
 *   `C:\usr\lib`             `C:\usr`                 `lib`
 *   `C:\\usr\\lib\\`         `C:\\usr`                `lib`
 *   `C:\home\\dwc\\test`     `C:\home\\dwc`           `test`
 */

struct path_info
  {
    /* This points to end of the UNC prefix and drive letter, if any.  */
    char* prefix_end;

    /* These point to the directory separator in front of the last non-empty
     * component.  */
    char* base_sep_begin;
    char* base_sep_end;

    /* This points to the last directory separator sequence if no other
     * non-separator characters follow it.  */
    char* term_sep_begin;

    /* This points to the end of the string.  */
    char* path_end;
  };

#define IS_DIR_SEP(c)  ((c) == '/' || (c) == '\\')

static
void
do_get_path_info(struct path_info* info, char* path)
  {
    char* pos = path;
    DWORD cp;
    int dbcs_tb, dir_sep;

    /* Get the code page for paths in the same way as `fopen()`.  */
    // cp = AreFileApisANSI() ? CP_ACP : CP_OEMCP;
    cp = TEST_CODEPAGE;

    /* Set the structure to 'no data'.  */
    info->prefix_end = NULL;
    info->base_sep_begin = NULL;
    info->base_sep_end = NULL;
    info->term_sep_begin = NULL;

    /* Check for a UNC prefix.  */
    if(IS_DIR_SEP(pos[0]) && IS_DIR_SEP(pos[1])) {
      pos += 2;
      info->prefix_end = pos;

      /* Seek to the end of the host name.  */
      dbcs_tb = 0;
      while(*pos != 0) {
        dir_sep = 0;

        if(dbcs_tb)
          dbcs_tb = 0;
        else if(IsDBCSLeadByteEx(cp, *pos))
          dbcs_tb = 1;
        else
          dir_sep = IS_DIR_SEP(*pos);

        if(dir_sep)
          break;

        pos ++;
      }

      if(*pos == 0) {
        /* Only a host name exists.  */
        info->prefix_end = pos;
        info->path_end = pos;
        return;
      }

      /* Host name terminates here. The terminating directory separator is
       * part of the prefix.  */
      pos ++;
      info->prefix_end = pos;
    }

    /* Check for a DOS drive letter.  */
    if((pos[0] >= 'A' && pos[0] <= 'Z' && pos[1] == ':')
       || (pos[0] >= 'a' && pos[0] <= 'z' && pos[1] == ':')) {
      pos += 2;
      info->prefix_end = pos;
    }

    /* The remaining part of the path is almost the same as POSIX.  */
    dbcs_tb = 0;
    while(*pos != 0) {
      dir_sep = 0;

      if(dbcs_tb)
        dbcs_tb = 0;
      else if(IsDBCSLeadByteEx(cp, *pos))
        dbcs_tb = 1;
      else
        dir_sep = IS_DIR_SEP(*pos);

      /* If a separator has been encountered and the previous character
       * was not, mark this as the beginning of the terminating separator
       * sequence.  */
      if(dir_sep && !info->term_sep_begin)
        info->term_sep_begin = pos;

      /* If a non-separator character has been encountered and a previous
       * terminating separator sequence exists, start a new component.  */
      if(!dir_sep && info->term_sep_begin) {
        info->base_sep_begin = info->term_sep_begin;
        info->base_sep_end = pos;
        info->term_sep_begin = NULL;
      }

      pos ++;
    }

    /* Stores the end of the path for convenience.  */
    info->path_end = pos;
  }

char*
dirname(char* path)
  {
    struct path_info info;
    char* upath;
    const char* top;
    static char* static_path_copy;

    if(path == NULL|| path[0] == 0)
      return (char*) ".";

    do_get_path_info(&info, path);
    upath = info.prefix_end ? info.prefix_end : path;
    top = IS_DIR_SEP(upath[0]) ? "\\" : ".";

    /* If a non-terminating directory separator exists, it terminates the
     * dirname. Truncate the path there.  */
    if(info.base_sep_begin) {
      info.base_sep_begin[0] = 0;

      /* If the unprefixed path has not been truncated to empty, it is now
       * the dirname, so return it.  */
      if(upath[0])
        return path;
    }

    /* The dirname is empty. In principle we return `<prefix>.` if the
     * path is relative and `<prefix>\` if it is absolute. This can be
     * optimized if there is no prefix.  */
    if(upath == path)
      return (char*) top;

    /* When there is a prefix, we must append a character to the prefix.
     * If there is enough room in the original path, we just reuse its
     * storage.  */
    if(info.prefix_end < info.path_end - 1) {
      info.prefix_end[0] = *top;
      info.prefix_end[1] = 0;
      return path;
    }

    /* This is only the last resort. If there is no room, we have to copy
     * the prefix elsewhere.  */
    upath = realloc(static_path_copy, info.prefix_end - path + 2);
    if(!upath)
      return (char*) top;

    memcpy(upath, path, info.prefix_end - path);
    static_path_copy = upath;

    upath = static_path_copy + (info.prefix_end - path);
    upath[0] = *top;
    upath[1] = 0;
    return static_path_copy;
  }

char*
basename(char* path)
  {
    struct path_info info;
    char* upath;

    if(path == NULL)
      return (char*) ".";

    do_get_path_info(&info, path);
    upath = info.prefix_end ? info.prefix_end : path;

    /* If the unprefixed path is empty, POSIX says '.' shall be returned.  */
    if(upath[0] == 0)
      return (char*) ".";

    /* If a terminating separator sequence exists, it is not part of the
     * name and shall be truncated.  */
    if(info.term_sep_begin)
      info.term_sep_begin[0] = 0;

    /* If some other separator sequence has been found, the basename
     * immediately follows it.  */
    if(info.base_sep_end)
      return info.base_sep_end;

    /* If removal of the terminating separator sequence has caused the
     * unprefixed path to become empty, it must have comprised only
     * separators. POSIX says `/` shall be returned, but on Windows, we
     * return `\` instead.  */
    if(upath[0] == 0)
      return (char*) "\\";

    /* Return the unprefixed path.  */
    return upath;
  }

// -----------------

#include <stdio.h>
#include <string.h>
#include <libgen.h>

struct
  {
    char path[256], dir[128], base[128];
  }
const tests[] =
  {
    { "", ".", "." },
    { "usr", ".", "usr" },
    { "usr\\", ".", "usr" },
    { "\\", "\\", "\\" },
    { "\\usr", "\\", "usr" },
    { "\\usr\\lib", "\\usr", "lib" },
    { "\\home\\\\dwc\\\\test", "\\home\\\\dwc", "test" },
    { "\\\\host", "\\\\host", "" },
    { "\\\\host\\usr", "\\\\host\\.", "usr" },
    { "\\\\host\\usr\\lib", "\\\\host\\usr", "lib" },
    { "\\\\host\\\\usr", "\\\\host\\\\", "usr" },
    { "\\\\host\\\\usr\\lib", "\\\\host\\\\usr", "lib" },
    { "C:", "C:.", "." },
    { "C:usr", "C:.", "usr" },
    { "C:usr\\", "C:.", "usr" },
    { "C:\\", "C:\\", "\\" },
    { "C:\\\\", "C:\\", "\\" },
    { "C:\\\\\\", "C:\\", "\\" },
    { "C:\\usr", "C:\\", "usr" },
    { "C:\\usr\\lib", "C:\\usr", "lib" },
    { "C:\\\\usr\\\\lib\\\\", "C:\\\\usr", "lib" },
    { "C:\\home\\\\dwc\\\\test", "C:\\home\\\\dwc", "test" },
    { "\\\\.\\C:\\Test\\Foo.txt", "\\\\.\\C:\\Test", "Foo.txt" },
    { "\\\\.\\C:\\", "\\\\.\\C:\\", "\\" },
    { "\\\\?\\C:\\Test\\Foo.txt", "\\\\?\\C:\\Test", "Foo.txt" },
    { "\\\\?\\C:\\", "\\\\?\\C:\\", "\\" },
    { "\\\\.\\Volume{b75e2c83-0000-0000-0000-602f00000000}\\Test\\Foo.txt", 
"\\\\.\\Volume{b75e2c83-0000-0000-0000-602f00000000}\\Test", "Foo.txt" },
    { "\\\\.\\Volume{b75e2c83-0000-0000-0000-602f00000000}\\", "\\\\.\\", 
"Volume{b75e2c83-0000-0000-0000-602f00000000}" },
#if TEST_CODEPAGE == 950
    // `C:\你好世界\檔案.txt` in cp950
    { 
"\x43\x3a\x5c\xa7\x41\xa6\x6e\xa5\x40\xac\xc9\x5c\xc0\xc9\xae\xd7\x2e\x74\x78\x74",
 "\x43\x3a\x5c\xa7\x41\xa6\x6e\xa5\x40\xac\xc9", 
"\xc0\xc9\xae\xd7\x2e\x74\x78\x74" },
    // `C:\dir\file功name.txt` in cp950
    { "C:\\dir\\file\xa5\x5cname.txt", "C:\\dir", "file\xa5\x5cname.txt" },
    // Path containing an invalid DBCS code in cp950
    { "C:\\dir\\file\xa5\x00name.txt", "C:\\dir", "file\xa5" },
    { "C:\\dir\\file\xa5", "C:\\dir", "file\xa5" },
#elif TEST_CODEPAGE == CP_UTF8
    // `C:\你好世界こにちは😁\檔案.📃` in UTF-8
    { 
u8"C:\\\u4F60\u597D\u4E16\u754C\u3053\u306B\u3061\u306F\U0001f601\\\u6A94\u6848.\U0001f4c3",
 u8"C:\\\u4F60\u597D\u4E16\u754C\u3053\u306B\u3061\u306F\U0001f601", 
u8"\u6A94\u6848.\U0001f4c3" },
    // Path containing an invalid UTF-8 code unit
    { "C:\\dir\\file\xa5\x00name.txt", "C:\\dir", "file\xa5" },
    { "C:\\dir\\file\xa5", "C:\\dir", "file\xa5" },
#endif
  };


// Fuzz with a random 64-byte string, with all bytes in [0x20, 0x7F) (7-bit 
ASCII)
void fuzz1() {
  unsigned int rnd;
  char string[65] = {0};
  for (int i = 0; i < 64; i++) {
    do {
      if (0 != rand_s(&rnd))
        abort();
      rnd %= 128;
    } while (rnd < 0x20 || rnd >= 0x7F);
    string[i] = rnd;
  }
  char *temp = malloc(strlen(string) + 1);
  char *r;
  strcpy(temp, string);
  r = dirname(temp);
  strcpy(temp, string);
  r = basename(temp);
  free(temp);
}

// Fuzz with a random 64-byte string, with all bytes in [0x20, 0xFF)
void fuzz2() {
  unsigned int rnd;
  char string[65] = {0};
  for (int i = 0; i < 64; i++) {
    do {
      if (0 != rand_s(&rnd))
        abort();
      rnd %= 256;
    } while (rnd < 0x20 || rnd == 0x7F || rnd == 0xFF);
    string[i] = rnd;
  }
  char *temp = malloc(strlen(string) + 1);
  char *r;
  strcpy(temp, string);
  r = dirname(temp);
  strcpy(temp, string);
  r = basename(temp);
  free(temp);
}

// Fuzz with a random 64-byte string, with all bytes in [0x01, 0xFF]
void fuzz3() {
  unsigned int rnd;
  char string[65] = {0};
  for (int i = 0; i < 64; i++) {
    do {
      if (0 != rand_s(&rnd))
        abort();
      rnd %= 256;
    } while (rnd == 0);
    string[i] = rnd;
  }
  char *temp = malloc(strlen(string) + 1);
  char *r;
  strcpy(temp, string);
  r = dirname(temp);
  strcpy(temp, string);
  r = basename(temp);
  free(temp);
}

// Fuzz with completely random 64-byte data.
void fuzz4() {
  unsigned int rnd;
  char string[65] = {0};
  for (int i = 0; i < 16; i++) {
    if (0 != rand_s(&rnd))
      abort();
    memcpy(&string[i * 4], &rnd, 4);
  }
  char *temp = malloc(strlen(string) + 1);
  char *r;
  strcpy(temp, string);
  r = dirname(temp);
  strcpy(temp, string);
  r = basename(temp);
  free(temp);
}

int
main(void)
  {
    srand(0);
    SetConsoleOutputCP(TEST_CODEPAGE);
    // char temp[256];
    char *temp;
    char* r;
    size_t i;

    for(i = 0;  i != sizeof(tests) / sizeof(*tests);  ++i) {
      printf("in: `%s`\nd:  `%s`\nb:  `%s`\n", tests[i].path, tests[i].dir, 
tests[i].base);

      temp = malloc(strlen(tests[i].path) + 1);
      strcpy(temp, tests[i].path);
      r = dirname(temp);
      if(strcmp(r, tests[i].dir) != 0)
        printf("-- error: dirname('%s'): '%s' != '%s'\n", tests[i].path, r, 
tests[i].dir);

      strcpy(temp, tests[i].path);
      r = basename(temp);
      if(strcmp(r, tests[i].base) != 0)
        printf("-- error: basename('%s'): '%s' != '%s'\n", tests[i].path, r, 
tests[i].base);
      puts("\n");
      free(temp);
    }

    printf("done\n");

    for (i = 0; i < 10000; i++) {
      fuzz1();
      fuzz2();
      fuzz3();
      fuzz4();
    }
    printf("done fuzzing\n");
  }
_______________________________________________
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public

Reply via email to