Hi,
Overall I think the implementation looks fine. The test cases are
missing some of the more exotic DOS device paths[1], for example
`\\.\Volume{b75e2c83-0000-0000-0000-602f00000000}\` but to be frank I
don't know what should be the expected outputs for it. Should we simply
declare that DOS device paths may not work as expected?
[1]:
https://learn.microsoft.com/en-us/dotnet/standard/io/file-path-formats#dos-device-paths
Manual fuzzing with asan and ubsan revealed an issue: The line
`info.prefix_end[1] = 0;` (line 213) can result in a buffer overrun.
This happens with inputs in the form of an UNC host only, e.g. `\\host`,
which is also missing from the test cases.
I would think the fix is as simple as changing line 211 to
`if(info.prefix_end < info.path_end - 1) {`, but this revealed another
issue because `info.path_end` contained an uninitialized value -- an
early return in `do_get_path_info` (line 127) missed setting
`info->path_end = pos;`. After applying these fixes there seems to be no
other sanitizer issues. I have attached my test which I built with
`clang -fsanitize=address,undefined`.
Cheers,
Alvin
On 26/3/2023 17:09, LIU Hao wrote:
As discussed on IRC just now, paths are not affected by the global
locale. According to Microsoft documentation about `fopen()` [1],
paths are interpreted with `CP_ACP` if `AreFileApisANSI()` returns
true, and `CP_OEMCP` otherwise. We had better follow that convention.
[1]
https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/fopen-wfopen?view=msvc-170
_______________________________________________
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public
#define TEST_CODEPAGE 950
// #define TEST_CODEPAGE CP_UTF8
#define _CRT_RAND_S
// -----------------
/**
* This file has no copyright assigned and is placed in the Public Domain.
* This file is part of the mingw-w64 runtime package.
* No warranty is given; refer to the file DISCLAIMER.PD within this package.
*/
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#include <stdlib.h>
#include <libgen.h>
#include <windows.h>
/* A 'directory separator' is a byte that equals 0x2F ('solidus' or more
* commonly 'forward slash') or 0x5C ('reverse solidus' or more commonly
* 'backward slash'). The byte 0x5C may look different from a backward slash
* in some locales; for example, it looks the same as a Yen sign in Japanese
* locales and a Won sign in Korean locales. Despite its appearance, it still
* functions as a directory separator.
*
* A 'path' comprises an optional DOS drive letter with a colon, and then an
* arbitrary number of possibily empty components, separated by non-empty
* sequences of directory separators (in other words, consecutive directory
* separators are treated as a single one). A path that comprises an empty
* component denotes the current working directory.
*
* An 'absolute path' comprises at least two components, the first of which
* is empty.
*
* A 'relative path' is a path that is not an absolute path. In other words,
* it either comprises an empty component, or begins with a non-empty
* component.
*
* POSIX doesn't have a concept about DOS drives. A path that does not have a
* drive letter starts from the same drive as the current working directory.
*
* For example:
* (Examples without drive letters match POSIX.)
*
* Argument dirname() returns basename() returns
* -------- ----------------- ------------------
* `` or NULL `.` `.`
* `usr` `.` `usr`
* `usr\` `.` `usr`
* `\` `\` `\`
* `\usr` `\` `usr`
* `\usr\lib` `\usr` `lib`
* `\home\\dwc\\test` `\home\\dwc` `test`
* `\\host\usr` `\\host\.` `usr`
* `\\host\usr\lib` `\\host\usr` `lib`
* `\\host\\usr` `\\host\\` `usr`
* `\\host\\usr\lib` `\\host\\usr` `lib`
* `C:` `C:.` `.`
* `C:usr` `C:.` `usr`
* `C:usr\` `C:.` `usr`
* `C:\` `C:\` `\`
* `C:\\` `C:\` `\`
* `C:\\\` `C:\` `\`
* `C:\usr` `C:\` `usr`
* `C:\usr\lib` `C:\usr` `lib`
* `C:\\usr\\lib\\` `C:\\usr` `lib`
* `C:\home\\dwc\\test` `C:\home\\dwc` `test`
*/
struct path_info
{
/* This points to end of the UNC prefix and drive letter, if any. */
char* prefix_end;
/* These point to the directory separator in front of the last non-empty
* component. */
char* base_sep_begin;
char* base_sep_end;
/* This points to the last directory separator sequence if no other
* non-separator characters follow it. */
char* term_sep_begin;
/* This points to the end of the string. */
char* path_end;
};
#define IS_DIR_SEP(c) ((c) == '/' || (c) == '\\')
static
void
do_get_path_info(struct path_info* info, char* path)
{
char* pos = path;
DWORD cp;
int dbcs_tb, dir_sep;
/* Get the code page for paths in the same way as `fopen()`. */
// cp = AreFileApisANSI() ? CP_ACP : CP_OEMCP;
cp = TEST_CODEPAGE;
/* Set the structure to 'no data'. */
info->prefix_end = NULL;
info->base_sep_begin = NULL;
info->base_sep_end = NULL;
info->term_sep_begin = NULL;
/* Check for a UNC prefix. */
if(IS_DIR_SEP(pos[0]) && IS_DIR_SEP(pos[1])) {
pos += 2;
info->prefix_end = pos;
/* Seek to the end of the host name. */
dbcs_tb = 0;
while(*pos != 0) {
dir_sep = 0;
if(dbcs_tb)
dbcs_tb = 0;
else if(IsDBCSLeadByteEx(cp, *pos))
dbcs_tb = 1;
else
dir_sep = IS_DIR_SEP(*pos);
if(dir_sep)
break;
pos ++;
}
if(*pos == 0) {
/* Only a host name exists. */
info->prefix_end = pos;
info->path_end = pos;
return;
}
/* Host name terminates here. The terminating directory separator is
* part of the prefix. */
pos ++;
info->prefix_end = pos;
}
/* Check for a DOS drive letter. */
if((pos[0] >= 'A' && pos[0] <= 'Z' && pos[1] == ':')
|| (pos[0] >= 'a' && pos[0] <= 'z' && pos[1] == ':')) {
pos += 2;
info->prefix_end = pos;
}
/* The remaining part of the path is almost the same as POSIX. */
dbcs_tb = 0;
while(*pos != 0) {
dir_sep = 0;
if(dbcs_tb)
dbcs_tb = 0;
else if(IsDBCSLeadByteEx(cp, *pos))
dbcs_tb = 1;
else
dir_sep = IS_DIR_SEP(*pos);
/* If a separator has been encountered and the previous character
* was not, mark this as the beginning of the terminating separator
* sequence. */
if(dir_sep && !info->term_sep_begin)
info->term_sep_begin = pos;
/* If a non-separator character has been encountered and a previous
* terminating separator sequence exists, start a new component. */
if(!dir_sep && info->term_sep_begin) {
info->base_sep_begin = info->term_sep_begin;
info->base_sep_end = pos;
info->term_sep_begin = NULL;
}
pos ++;
}
/* Stores the end of the path for convenience. */
info->path_end = pos;
}
char*
dirname(char* path)
{
struct path_info info;
char* upath;
const char* top;
static char* static_path_copy;
if(path == NULL|| path[0] == 0)
return (char*) ".";
do_get_path_info(&info, path);
upath = info.prefix_end ? info.prefix_end : path;
top = IS_DIR_SEP(upath[0]) ? "\\" : ".";
/* If a non-terminating directory separator exists, it terminates the
* dirname. Truncate the path there. */
if(info.base_sep_begin) {
info.base_sep_begin[0] = 0;
/* If the unprefixed path has not been truncated to empty, it is now
* the dirname, so return it. */
if(upath[0])
return path;
}
/* The dirname is empty. In principle we return `<prefix>.` if the
* path is relative and `<prefix>\` if it is absolute. This can be
* optimized if there is no prefix. */
if(upath == path)
return (char*) top;
/* When there is a prefix, we must append a character to the prefix.
* If there is enough room in the original path, we just reuse its
* storage. */
if(info.prefix_end < info.path_end - 1) {
info.prefix_end[0] = *top;
info.prefix_end[1] = 0;
return path;
}
/* This is only the last resort. If there is no room, we have to copy
* the prefix elsewhere. */
upath = realloc(static_path_copy, info.prefix_end - path + 2);
if(!upath)
return (char*) top;
memcpy(upath, path, info.prefix_end - path);
static_path_copy = upath;
upath = static_path_copy + (info.prefix_end - path);
upath[0] = *top;
upath[1] = 0;
return static_path_copy;
}
char*
basename(char* path)
{
struct path_info info;
char* upath;
if(path == NULL)
return (char*) ".";
do_get_path_info(&info, path);
upath = info.prefix_end ? info.prefix_end : path;
/* If the unprefixed path is empty, POSIX says '.' shall be returned. */
if(upath[0] == 0)
return (char*) ".";
/* If a terminating separator sequence exists, it is not part of the
* name and shall be truncated. */
if(info.term_sep_begin)
info.term_sep_begin[0] = 0;
/* If some other separator sequence has been found, the basename
* immediately follows it. */
if(info.base_sep_end)
return info.base_sep_end;
/* If removal of the terminating separator sequence has caused the
* unprefixed path to become empty, it must have comprised only
* separators. POSIX says `/` shall be returned, but on Windows, we
* return `\` instead. */
if(upath[0] == 0)
return (char*) "\\";
/* Return the unprefixed path. */
return upath;
}
// -----------------
#include <stdio.h>
#include <string.h>
#include <libgen.h>
struct
{
char path[256], dir[128], base[128];
}
const tests[] =
{
{ "", ".", "." },
{ "usr", ".", "usr" },
{ "usr\\", ".", "usr" },
{ "\\", "\\", "\\" },
{ "\\usr", "\\", "usr" },
{ "\\usr\\lib", "\\usr", "lib" },
{ "\\home\\\\dwc\\\\test", "\\home\\\\dwc", "test" },
{ "\\\\host", "\\\\host", "" },
{ "\\\\host\\usr", "\\\\host\\.", "usr" },
{ "\\\\host\\usr\\lib", "\\\\host\\usr", "lib" },
{ "\\\\host\\\\usr", "\\\\host\\\\", "usr" },
{ "\\\\host\\\\usr\\lib", "\\\\host\\\\usr", "lib" },
{ "C:", "C:.", "." },
{ "C:usr", "C:.", "usr" },
{ "C:usr\\", "C:.", "usr" },
{ "C:\\", "C:\\", "\\" },
{ "C:\\\\", "C:\\", "\\" },
{ "C:\\\\\\", "C:\\", "\\" },
{ "C:\\usr", "C:\\", "usr" },
{ "C:\\usr\\lib", "C:\\usr", "lib" },
{ "C:\\\\usr\\\\lib\\\\", "C:\\\\usr", "lib" },
{ "C:\\home\\\\dwc\\\\test", "C:\\home\\\\dwc", "test" },
{ "\\\\.\\C:\\Test\\Foo.txt", "\\\\.\\C:\\Test", "Foo.txt" },
{ "\\\\.\\C:\\", "\\\\.\\C:\\", "\\" },
{ "\\\\?\\C:\\Test\\Foo.txt", "\\\\?\\C:\\Test", "Foo.txt" },
{ "\\\\?\\C:\\", "\\\\?\\C:\\", "\\" },
{ "\\\\.\\Volume{b75e2c83-0000-0000-0000-602f00000000}\\Test\\Foo.txt",
"\\\\.\\Volume{b75e2c83-0000-0000-0000-602f00000000}\\Test", "Foo.txt" },
{ "\\\\.\\Volume{b75e2c83-0000-0000-0000-602f00000000}\\", "\\\\.\\",
"Volume{b75e2c83-0000-0000-0000-602f00000000}" },
#if TEST_CODEPAGE == 950
// `C:\你好世界\檔案.txt` in cp950
{
"\x43\x3a\x5c\xa7\x41\xa6\x6e\xa5\x40\xac\xc9\x5c\xc0\xc9\xae\xd7\x2e\x74\x78\x74",
"\x43\x3a\x5c\xa7\x41\xa6\x6e\xa5\x40\xac\xc9",
"\xc0\xc9\xae\xd7\x2e\x74\x78\x74" },
// `C:\dir\file功name.txt` in cp950
{ "C:\\dir\\file\xa5\x5cname.txt", "C:\\dir", "file\xa5\x5cname.txt" },
// Path containing an invalid DBCS code in cp950
{ "C:\\dir\\file\xa5\x00name.txt", "C:\\dir", "file\xa5" },
{ "C:\\dir\\file\xa5", "C:\\dir", "file\xa5" },
#elif TEST_CODEPAGE == CP_UTF8
// `C:\你好世界こにちは😁\檔案.📃` in UTF-8
{
u8"C:\\\u4F60\u597D\u4E16\u754C\u3053\u306B\u3061\u306F\U0001f601\\\u6A94\u6848.\U0001f4c3",
u8"C:\\\u4F60\u597D\u4E16\u754C\u3053\u306B\u3061\u306F\U0001f601",
u8"\u6A94\u6848.\U0001f4c3" },
// Path containing an invalid UTF-8 code unit
{ "C:\\dir\\file\xa5\x00name.txt", "C:\\dir", "file\xa5" },
{ "C:\\dir\\file\xa5", "C:\\dir", "file\xa5" },
#endif
};
// Fuzz with a random 64-byte string, with all bytes in [0x20, 0x7F) (7-bit
ASCII)
void fuzz1() {
unsigned int rnd;
char string[65] = {0};
for (int i = 0; i < 64; i++) {
do {
if (0 != rand_s(&rnd))
abort();
rnd %= 128;
} while (rnd < 0x20 || rnd >= 0x7F);
string[i] = rnd;
}
char *temp = malloc(strlen(string) + 1);
char *r;
strcpy(temp, string);
r = dirname(temp);
strcpy(temp, string);
r = basename(temp);
free(temp);
}
// Fuzz with a random 64-byte string, with all bytes in [0x20, 0xFF)
void fuzz2() {
unsigned int rnd;
char string[65] = {0};
for (int i = 0; i < 64; i++) {
do {
if (0 != rand_s(&rnd))
abort();
rnd %= 256;
} while (rnd < 0x20 || rnd == 0x7F || rnd == 0xFF);
string[i] = rnd;
}
char *temp = malloc(strlen(string) + 1);
char *r;
strcpy(temp, string);
r = dirname(temp);
strcpy(temp, string);
r = basename(temp);
free(temp);
}
// Fuzz with a random 64-byte string, with all bytes in [0x01, 0xFF]
void fuzz3() {
unsigned int rnd;
char string[65] = {0};
for (int i = 0; i < 64; i++) {
do {
if (0 != rand_s(&rnd))
abort();
rnd %= 256;
} while (rnd == 0);
string[i] = rnd;
}
char *temp = malloc(strlen(string) + 1);
char *r;
strcpy(temp, string);
r = dirname(temp);
strcpy(temp, string);
r = basename(temp);
free(temp);
}
// Fuzz with completely random 64-byte data.
void fuzz4() {
unsigned int rnd;
char string[65] = {0};
for (int i = 0; i < 16; i++) {
if (0 != rand_s(&rnd))
abort();
memcpy(&string[i * 4], &rnd, 4);
}
char *temp = malloc(strlen(string) + 1);
char *r;
strcpy(temp, string);
r = dirname(temp);
strcpy(temp, string);
r = basename(temp);
free(temp);
}
int
main(void)
{
srand(0);
SetConsoleOutputCP(TEST_CODEPAGE);
// char temp[256];
char *temp;
char* r;
size_t i;
for(i = 0; i != sizeof(tests) / sizeof(*tests); ++i) {
printf("in: `%s`\nd: `%s`\nb: `%s`\n", tests[i].path, tests[i].dir,
tests[i].base);
temp = malloc(strlen(tests[i].path) + 1);
strcpy(temp, tests[i].path);
r = dirname(temp);
if(strcmp(r, tests[i].dir) != 0)
printf("-- error: dirname('%s'): '%s' != '%s'\n", tests[i].path, r,
tests[i].dir);
strcpy(temp, tests[i].path);
r = basename(temp);
if(strcmp(r, tests[i].base) != 0)
printf("-- error: basename('%s'): '%s' != '%s'\n", tests[i].path, r,
tests[i].base);
puts("\n");
free(temp);
}
printf("done\n");
for (i = 0; i < 10000; i++) {
fuzz1();
fuzz2();
fuzz3();
fuzz4();
}
printf("done fuzzing\n");
}
_______________________________________________
Mingw-w64-public mailing list
Mingw-w64-public@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mingw-w64-public