[PATCH v2 RESEND] target/ppc/arch_dump: set prstatus pid to cpuid
Every other architecture does this, and debuggers need it to be able to identify which prstatus note corresponds to which CPU. Reviewed-by: Thomas Huth Reviewed-by: Harsh Prateek Bora Signed-off-by: Omar Sandoval --- Resend of [1]. No changes other than adding Thomas's Reviewed-by. Thanks, Omar 1: https://lore.kernel.org/qemu-devel/3c3dd56b4e88b6863e971d72daae7c0324499712.1719852483.git.osan...@osandov.com/ target/ppc/arch_dump.c | 24 +++- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/target/ppc/arch_dump.c b/target/ppc/arch_dump.c index a8315659d9..ff93cac61e 100644 --- a/target/ppc/arch_dump.c +++ b/target/ppc/arch_dump.c @@ -47,9 +47,14 @@ struct PPCUserRegStruct { } QEMU_PACKED; struct PPCElfPrstatus { -char pad1[112]; +char pad1[32]; /* 32 == offsetof(struct elf_prstatus, pr_pid) */ +uint32_t pid; +char pad2[76]; /* 76 == offsetof(struct elf_prstatus, pr_reg) - +offsetof(struct elf_prstatus, pr_ppid) */ struct PPCUserRegStruct pr_reg; -char pad2[40]; +char pad3[40]; /* 40 == sizeof(struct elf_prstatus) - + offsetof(struct elf_prstatus, pr_reg) - + sizeof(struct user_pt_regs) */ } QEMU_PACKED; @@ -96,7 +101,7 @@ typedef struct NoteFuncArg { DumpState *state; } NoteFuncArg; -static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; reg_t cr; @@ -109,6 +114,7 @@ static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) prstatus = ¬e->contents.prstatus; memset(prstatus, 0, sizeof(*prstatus)); +prstatus->pid = cpu_to_dump32(s, id); reg = &prstatus->pr_reg; for (i = 0; i < 32; i++) { @@ -127,7 +133,7 @@ static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) reg->ccr = cpu_to_dump_reg(s, cr); } -static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfFpregset *fpregset; @@ -146,7 +152,7 @@ static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu) fpregset->fpscr = cpu_to_dump_reg(s, cpu->env.fpscr); } -static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfVmxregset *vmxregset; @@ -178,7 +184,7 @@ static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu) vmxregset->vscr.u32[3] = cpu_to_dump32(s, ppc_get_vscr(&cpu->env)); } -static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfVsxregset *vsxregset; @@ -195,7 +201,7 @@ static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu) } } -static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { struct PPCElfSperegset *speregset; Note *note = &arg->note; @@ -211,7 +217,7 @@ static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu) static const struct NoteFuncDescStruct { int contents_size; -void (*note_contents_func)(NoteFuncArg *arg, PowerPCCPU *cpu); +void (*note_contents_func)(NoteFuncArg *arg, PowerPCCPU *cpu, int id); } note_func[] = { {sizeof_field(Note, contents.prstatus), ppc_write_elf_prstatus}, {sizeof_field(Note, contents.fpregset), ppc_write_elf_fpregset}, @@ -282,7 +288,7 @@ static int ppc_write_all_elf_notes(const char *note_name, arg.note.hdr.n_descsz = cpu_to_dump32(s, nf->contents_size); strncpy(arg.note.name, note_name, sizeof(arg.note.name)); -(*nf->note_contents_func)(&arg, cpu); +(*nf->note_contents_func)(&arg, cpu, id); note_size = sizeof(arg.note) - sizeof(arg.note.contents) + nf->contents_size; -- 2.45.2
Re: [PATCH v2] target/ppc/arch_dump: set prstatus pid to cpuid
On Mon, Jul 01, 2024 at 09:51:35AM -0700, Omar Sandoval wrote: > Every other architecture does this, and debuggers need it to be able to > identify which prstatus note corresponds to which CPU. > > Reviewed-by: Harsh Prateek Bora Oops, I forgot to copy Thomas's reviewed-by from v1: Reviewed-by: Thomas Huth > Signed-off-by: Omar Sandoval
[PATCH v2] target/ppc/arch_dump: set prstatus pid to cpuid
Every other architecture does this, and debuggers need it to be able to identify which prstatus note corresponds to which CPU. Reviewed-by: Harsh Prateek Bora Signed-off-by: Omar Sandoval --- This is v2 from my small series [1] making QEMU fill the pr_pid field of the NT_PRSTATUS note consistently. Changes from v1: - Add comments explaning sizes of padding fields. 1: https://lore.kernel.org/qemu-devel/cover.1718771802.git.osan...@osandov.com/ target/ppc/arch_dump.c | 24 +++- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/target/ppc/arch_dump.c b/target/ppc/arch_dump.c index a8315659d9..ff93cac61e 100644 --- a/target/ppc/arch_dump.c +++ b/target/ppc/arch_dump.c @@ -47,9 +47,14 @@ struct PPCUserRegStruct { } QEMU_PACKED; struct PPCElfPrstatus { -char pad1[112]; +char pad1[32]; /* 32 == offsetof(struct elf_prstatus, pr_pid) */ +uint32_t pid; +char pad2[76]; /* 76 == offsetof(struct elf_prstatus, pr_reg) - +offsetof(struct elf_prstatus, pr_ppid) */ struct PPCUserRegStruct pr_reg; -char pad2[40]; +char pad3[40]; /* 40 == sizeof(struct elf_prstatus) - + offsetof(struct elf_prstatus, pr_reg) - + sizeof(struct user_pt_regs) */ } QEMU_PACKED; @@ -96,7 +101,7 @@ typedef struct NoteFuncArg { DumpState *state; } NoteFuncArg; -static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; reg_t cr; @@ -109,6 +114,7 @@ static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) prstatus = ¬e->contents.prstatus; memset(prstatus, 0, sizeof(*prstatus)); +prstatus->pid = cpu_to_dump32(s, id); reg = &prstatus->pr_reg; for (i = 0; i < 32; i++) { @@ -127,7 +133,7 @@ static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) reg->ccr = cpu_to_dump_reg(s, cr); } -static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfFpregset *fpregset; @@ -146,7 +152,7 @@ static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu) fpregset->fpscr = cpu_to_dump_reg(s, cpu->env.fpscr); } -static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfVmxregset *vmxregset; @@ -178,7 +184,7 @@ static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu) vmxregset->vscr.u32[3] = cpu_to_dump32(s, ppc_get_vscr(&cpu->env)); } -static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfVsxregset *vsxregset; @@ -195,7 +201,7 @@ static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu) } } -static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { struct PPCElfSperegset *speregset; Note *note = &arg->note; @@ -211,7 +217,7 @@ static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu) static const struct NoteFuncDescStruct { int contents_size; -void (*note_contents_func)(NoteFuncArg *arg, PowerPCCPU *cpu); +void (*note_contents_func)(NoteFuncArg *arg, PowerPCCPU *cpu, int id); } note_func[] = { {sizeof_field(Note, contents.prstatus), ppc_write_elf_prstatus}, {sizeof_field(Note, contents.fpregset), ppc_write_elf_fpregset}, @@ -282,7 +288,7 @@ static int ppc_write_all_elf_notes(const char *note_name, arg.note.hdr.n_descsz = cpu_to_dump32(s, nf->contents_size); strncpy(arg.note.name, note_name, sizeof(arg.note.name)); -(*nf->note_contents_func)(&arg, cpu); +(*nf->note_contents_func)(&arg, cpu, id); note_size = sizeof(arg.note) - sizeof(arg.note.contents) + nf->contents_size; -- 2.45.2
[PATCH 2/2] target/ppc/arch_dump: set prstatus pid to cpuid
Every other architecture does this, and debuggers need it to be able to identify which prstatus note corresponds to which CPU. Signed-off-by: Omar Sandoval --- target/ppc/arch_dump.c | 21 - 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/target/ppc/arch_dump.c b/target/ppc/arch_dump.c index a8315659d9..78b4205319 100644 --- a/target/ppc/arch_dump.c +++ b/target/ppc/arch_dump.c @@ -47,9 +47,11 @@ struct PPCUserRegStruct { } QEMU_PACKED; struct PPCElfPrstatus { -char pad1[112]; +char pad1[32]; +uint32_t pid; +uint8_t pad2[76]; struct PPCUserRegStruct pr_reg; -char pad2[40]; +char pad3[40]; } QEMU_PACKED; @@ -96,7 +98,7 @@ typedef struct NoteFuncArg { DumpState *state; } NoteFuncArg; -static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; reg_t cr; @@ -109,6 +111,7 @@ static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) prstatus = ¬e->contents.prstatus; memset(prstatus, 0, sizeof(*prstatus)); +prstatus->pid = cpu_to_dump32(s, id); reg = &prstatus->pr_reg; for (i = 0; i < 32; i++) { @@ -127,7 +130,7 @@ static void ppc_write_elf_prstatus(NoteFuncArg *arg, PowerPCCPU *cpu) reg->ccr = cpu_to_dump_reg(s, cr); } -static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfFpregset *fpregset; @@ -146,7 +149,7 @@ static void ppc_write_elf_fpregset(NoteFuncArg *arg, PowerPCCPU *cpu) fpregset->fpscr = cpu_to_dump_reg(s, cpu->env.fpscr); } -static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfVmxregset *vmxregset; @@ -178,7 +181,7 @@ static void ppc_write_elf_vmxregset(NoteFuncArg *arg, PowerPCCPU *cpu) vmxregset->vscr.u32[3] = cpu_to_dump32(s, ppc_get_vscr(&cpu->env)); } -static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { int i; struct PPCElfVsxregset *vsxregset; @@ -195,7 +198,7 @@ static void ppc_write_elf_vsxregset(NoteFuncArg *arg, PowerPCCPU *cpu) } } -static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu) +static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu, int id) { struct PPCElfSperegset *speregset; Note *note = &arg->note; @@ -211,7 +214,7 @@ static void ppc_write_elf_speregset(NoteFuncArg *arg, PowerPCCPU *cpu) static const struct NoteFuncDescStruct { int contents_size; -void (*note_contents_func)(NoteFuncArg *arg, PowerPCCPU *cpu); +void (*note_contents_func)(NoteFuncArg *arg, PowerPCCPU *cpu, int id); } note_func[] = { {sizeof_field(Note, contents.prstatus), ppc_write_elf_prstatus}, {sizeof_field(Note, contents.fpregset), ppc_write_elf_fpregset}, @@ -282,7 +285,7 @@ static int ppc_write_all_elf_notes(const char *note_name, arg.note.hdr.n_descsz = cpu_to_dump32(s, nf->contents_size); strncpy(arg.note.name, note_name, sizeof(arg.note.name)); -(*nf->note_contents_func)(&arg, cpu); +(*nf->note_contents_func)(&arg, cpu, id); note_size = sizeof(arg.note) - sizeof(arg.note.contents) + nf->contents_size; -- 2.45.2
[PATCH 1/2] target/s390x/arch_dump: use correct byte order for pid
The pid field of prstatus needs to be big endian like all of the other fields. Fixes: f738f296eaae ("s390x/arch_dump: pass cpuid into notes sections") Signed-off-by: Omar Sandoval --- target/s390x/arch_dump.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/target/s390x/arch_dump.c b/target/s390x/arch_dump.c index 7e8a1b4fc0..029d91d93a 100644 --- a/target/s390x/arch_dump.c +++ b/target/s390x/arch_dump.c @@ -102,7 +102,7 @@ static void s390x_write_elf64_prstatus(Note *note, S390CPU *cpu, int id) regs->acrs[i] = cpu_to_be32(cpu->env.aregs[i]); regs->gprs[i] = cpu_to_be64(cpu->env.regs[i]); } -note->contents.prstatus.pid = id; +note->contents.prstatus.pid = cpu_to_be32(id); } static void s390x_write_elf64_fpregset(Note *note, S390CPU *cpu, int id) -- 2.45.2
[PATCH 0/2] arch_dump: fix prstatus pid on s390x and ppc
Hello, I maintain drgn [1], a debugger for the Linux kernel. I ran into a quirk of the NT_PRSTATUS note in kernel core dumps [2], so I looked into how QEMU's dump-guest-memory command generates NT_PRSTATUS. I noticed that on most architectures, the note's PID field is set to the CPU ID plus 1. There are two exceptions: on s390x, there's an endianness bug (it's not byte swapped if the host is little endian), and on ppc, it's not set at all (it defaults to zero). They're both easy fixes. Thanks, Omar 1: https://github.com/osandov/drgn 2: https://github.com/osandov/drgn/issues/404 Omar Sandoval (2): target/s390x/arch_dump: use correct byte order for pid target/ppc/arch_dump: set prstatus pid to cpuid target/ppc/arch_dump.c | 21 - target/s390x/arch_dump.c | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) -- 2.45.2
Re: [PATCH qemu 2/2] dump: Only use the makedumpfile flattened format when necessary
On Tue, Sep 12, 2023 at 10:34:04AM +0400, Marc-André Lureau wrote: > Hi > > On Wed, Aug 23, 2023 at 2:03 PM Marc-André Lureau > wrote: > > > > Hi > > > > On Wed, Aug 23, 2023 at 4:31 AM Stephen Brennan > > wrote: > > > > > > Stephen Brennan writes: > > > > Marc-André Lureau writes: > > > >> I am a bit reluctant to change the dump format by default. But since > > > >> the > > > >> flatten format is more "complicated" than the "normal" format, perhaps > > > >> we > > > >> can assume all users will handle it. > > > >> > > > >> The change is probably late for 8.1 though.. > > > > > > > > Thank you for your review and testing! > > > > > > > > I totally understand the concern about making the change by default. I > > > > do believe that nobody should notice too much because the normal format > > > > should be easier to work with, and more broadly compatible. I don't know > > > > of any tool which deals with the flattened format that can't handle the > > > > normal format, except for "makedumpfile -R" itself. > > > > > > > > If it's a blocker, I can go ahead and add a new flag to the command to > > > > enable the behavior. However there are a few good justifications not to > > > > add a new flag. I think it's going to be difficult to explain the > > > > difference between the two formats in documentation, as the > > > > implementation of the formats is a bit "into the weeds". The libvirt API > > > > also specifies each format separately (kdump-zlib, kdump-lzo, > > > > kdump-snappy) and so adding several new options there would be > > > > unfortunate for end-users as well. > > > > > > > > At the end of the day, it's your judgment call, and I'll implement it > > > > how you prefer. > > > > > > I just wanted to check back on this to clarify the next step. Are you > > > satisfied with this and waiting to apply it until the right time? Or > > > would you prefer me to send a new version making this opt-in? > > > > > > > Nobody seems to raise concerns. If it would be just me, I would change > > it. But we should rather be safe, so let's make it this opt-in please. > > > > > > Alex, Daniel, Thomas .. Do you have an opinion on changing the dump format? Chiming in as a user here, but I'd much prefer for the normal format to be the default. The last time that I tried dump-guest-memory with the kdump format, it took me awhile to figure out what format it was actually using since libkdumpfile didn't understand it.
[PATCH v2] 9pfs: local: ignore O_NOATIME if we don't have permissions
From: Omar Sandoval QEMU's local 9pfs server passes through O_NOATIME from the client. If the QEMU process doesn't have permissions to use O_NOATIME (namely, it does not own the file nor have the CAP_FOWNER capability), the open will fail. This causes issues when from the client's point of view, it believes it has permissions to use O_NOATIME (e.g., a process running as root in the virtual machine). Additionally, overlayfs on Linux opens files on the lower layer using O_NOATIME, so in this case a 9pfs mount can't be used as a lower layer for overlayfs (cf. https://github.com/osandov/drgn/blob/dabfe1971951701da13863dbe6d8a1d172ad9650/vmtest/onoatimehack.c and https://github.com/NixOS/nixpkgs/issues/54509). Luckily, O_NOATIME is effectively a hint, and is often ignored by, e.g., network filesystems. open(2) notes that O_NOATIME "may not be effective on all filesystems. One example is NFS, where the server maintains the access time." This means that we can honor it when possible but fall back to ignoring it. Acked-by: Christian Schoenebeck Signed-off-by: Omar Sandoval --- Changes from v1: * Add comment * Add Christian's acked-by hw/9pfs/9p-util.h | 13 + 1 file changed, 13 insertions(+) diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index 79ed6b233e..546f46dc7d 100644 --- a/hw/9pfs/9p-util.h +++ b/hw/9pfs/9p-util.h @@ -37,9 +37,22 @@ static inline int openat_file(int dirfd, const char *name, int flags, { int fd, serrno, ret; +again: fd = openat(dirfd, name, flags | O_NOFOLLOW | O_NOCTTY | O_NONBLOCK, mode); if (fd == -1) { +if (errno == EPERM && (flags & O_NOATIME)) { +/* + * The client passed O_NOATIME but we lack permissions to honor it. + * Rather than failing the open, fall back without O_NOATIME. This + * doesn't break the semantics on the client side, as the Linux + * open(2) man page notes that O_NOATIME "may not be effective on + * all filesystems". In particular, NFS and other network + * filesystems ignore it entirely. + */ +flags &= ~O_NOATIME; +goto again; +} return -1; } -- 2.26.1
Re: [PATCH] 9pfs: local: ignore O_NOATIME if we don't have permissions
On Fri, Apr 17, 2020 at 12:45:36PM +0200, Christian Schoenebeck wrote: > On Donnerstag, 16. April 2020 20:47:11 CEST Omar Sandoval wrote: > > > > Luckily, O_NOATIME is effectively a hint, and is often ignored by, e.g., > > > > network filesystems. open(2) notes that O_NOATIME "may not be effective > > > > on all filesystems. One example is NFS, where the server maintains the > > > > access time." This means that we can honor it when possible but fall > > > > back to ignoring it. > > > > > > I am not sure whether NFS would simply silently ignore O_NOATIME i.e. > > > without returning EPERM. I don't read it that way. > > > > As far as I can tell, the NFS protocol has nothing equivalent to > > O_NOATIME and thus can't honor it. Feel free to test it: > > I did not doubt that NFS does not support O_NOATIME, what I said was that I > thought using O_NOATIME on NFS would return EPERM, but ... > > > # mount -t nfs -o vers=4,rw 10.0.2.2:/ /mnt > > # echo foo > /mnt/foo > > # touch -d "1 hour ago" /mnt/foo > > # stat /mnt/foo | grep 'Access: [0-9]' > > Access: 2020-04-16 10:43:36.838952593 -0700 > > # # Drop caches so we have to go to the NFS server. > > # echo 3 > /proc/sys/vm/drop_caches > > # strace -e openat dd if=/mnt/foo of=/dev/null iflag=noatime |& grep > > /mnt/foo openat(AT_FDCWD, "/mnt/foo", O_RDONLY|O_NOATIME) = 3 > > # stat /mnt/foo | grep 'Access: [0-9]' > > Access: 2020-04-16 11:43:36.906462928 -0700 > > ... I tried this as well, and you are right, O_NOATIME is indeed simply > *silently* dropped/ignored by NFS (i.e. without raising any error). > > > > It would certainly fix the problem in your use case. But it would also > > > unmask O_NOATIME for all other ones (i.e. regular users on guest). > > > > The guest kernel will still check whether processes on the guest have > > permission to use O_NOATIME. This only changes the behavior when the > > guest kernel believes that the process has permission even though the > > host QEMU process doesn't. > > Good point! > > > > I mean I understand your point, but I also have to take other use cases > > > into account which might expect to receive EPERM if O_NOATIME cannot be > > > granted. > > If you'd still like to preserve this behavior, would it be acceptable to > > make this a QEMU option? Maybe something like "-virtfs > > honor_noatime=no": the default would be "yes", which is the current > > behavior, and "no" would always mask out NOATIME. > > That was my initial tendency yesterday, but your arguments now convinced me > that the implied re-run, in the way your provided patch already does, is in > this particular case the better choice. > > Making that a configurable option would render this issue unnecessarily > complicated and probably even contra productive for other users which might > stumble over the same issue. > > Just do me a favour: you already thoroughly explained the issue in the commit > log, that's good, but please also add a short comment in code why the rerun > is > required, because it is not obvious by just reading the code. Finding that > info from git log becomes tedious as soon as code is refactored there. > > Aside of the missing comment in code: > > Acked-by: Christian Schoenebeck Thank you! I'll add the comment and resend.
Re: [PATCH] 9pfs: local: ignore O_NOATIME if we don't have permissions
On Thu, Apr 16, 2020 at 04:58:31PM +0200, Christian Schoenebeck wrote: > On Donnerstag, 16. April 2020 02:44:33 CEST Omar Sandoval wrote: > > From: Omar Sandoval > > > > QEMU's local 9pfs server passes through O_NOATIME from the client. If > > the QEMU process doesn't have permissions to use O_NOATIME (namely, it > > does not own the file nor have the CAP_FOWNER capability), the open will > > fail. This causes issues when from the client's point of view, it > > believes it has permissions to use O_NOATIME (e.g., a process running as > > root in the virtual machine). Additionally, overlayfs on Linux opens > > files on the lower layer using O_NOATIME, so in this case a 9pfs mount > > can't be used as a lower layer for overlayfs (cf. > > https://github.com/osandov/drgn/blob/dabfe1971951701da13863dbe6d8a1d172ad965 > > 0/vmtest/onoatimehack.c and https://github.com/NixOS/nixpkgs/issues/54509). > > > > Luckily, O_NOATIME is effectively a hint, and is often ignored by, e.g., > > network filesystems. open(2) notes that O_NOATIME "may not be effective > > on all filesystems. One example is NFS, where the server maintains the > > access time." This means that we can honor it when possible but fall > > back to ignoring it. > > I am not sure whether NFS would simply silently ignore O_NOATIME i.e. without > returning EPERM. I don't read it that way. As far as I can tell, the NFS protocol has nothing equivalent to O_NOATIME and thus can't honor it. Feel free to test it: # mount -t nfs -o vers=4,rw 10.0.2.2:/ /mnt # echo foo > /mnt/foo # touch -d "1 hour ago" /mnt/foo # stat /mnt/foo | grep 'Access: [0-9]' Access: 2020-04-16 10:43:36.838952593 -0700 # # Drop caches so we have to go to the NFS server. # echo 3 > /proc/sys/vm/drop_caches # strace -e openat dd if=/mnt/foo of=/dev/null iflag=noatime |& grep /mnt/foo openat(AT_FDCWD, "/mnt/foo", O_RDONLY|O_NOATIME) = 3 # stat /mnt/foo | grep 'Access: [0-9]' Access: 2020-04-16 11:43:36.906462928 -0700 > Fact is on Linux the expected > behaviour is returning EPERM if O_NOATIME cannot be satisfied, consistent > since its introduction 22 years ago: > http://lkml.iu.edu/hypermail/linux/kernel/9811.2/0118.html The exact phrasing in the man-page is: "EPERM The O_NOATIME flag was specified, but the effective user ID of the caller did not match the owner of the file and the caller was not privileged." IMO, it's about whether the (guest) process has permission from the (guest) kernel's point of view, not whether the filesystem could satisfy it. > > Signed-off-by: Omar Sandoval > > --- > > hw/9pfs/9p-util.h | 5 + > > 1 file changed, 5 insertions(+) > > > > diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h > > index 79ed6b233e..50842d540f 100644 > > --- a/hw/9pfs/9p-util.h > > +++ b/hw/9pfs/9p-util.h > > @@ -37,9 +37,14 @@ static inline int openat_file(int dirfd, const char > > *name, int flags, { > > int fd, serrno, ret; > > > > +again: > > fd = openat(dirfd, name, flags | O_NOFOLLOW | O_NOCTTY | O_NONBLOCK, > > mode); > > if (fd == -1) { > > +if (errno == EPERM && (flags & O_NOATIME)) { > > +flags &= ~O_NOATIME; > > +goto again; > > +} > > return -1; > > } > > It would certainly fix the problem in your use case. But it would also unmask > O_NOATIME for all other ones (i.e. regular users on guest). The guest kernel will still check whether processes on the guest have permission to use O_NOATIME. This only changes the behavior when the guest kernel believes that the process has permission even though the host QEMU process doesn't. > I mean I understand your point, but I also have to take other use cases into > account which might expect to receive EPERM if O_NOATIME cannot be granted. If you'd still like to preserve this behavior, would it be acceptable to make this a QEMU option? Maybe something like "-virtfs honor_noatime=no": the default would be "yes", which is the current behavior, and "no" would always mask out NOATIME. > May I ask how come that file/dir in question does not share the same uid in > your specific use case? Are the file(s) created outside of QEMU, i.e. > directly > by some app on host? My use case is running tests on different versions of the Linux kernel while reusing the host's userspace environment. I export the host's root filesystem read-only to the guest via 9pfs, and the guest sets up overlayfs on top of it (to allow certain modifications) and chroots into that. Without a workaround like the LD_PRELOAD one I mentioned in the commit message, any (read) accesses to files owned by root on the host (like /bin/sh) will fail.
[PATCH] 9pfs: local: ignore O_NOATIME if we don't have permissions
From: Omar Sandoval QEMU's local 9pfs server passes through O_NOATIME from the client. If the QEMU process doesn't have permissions to use O_NOATIME (namely, it does not own the file nor have the CAP_FOWNER capability), the open will fail. This causes issues when from the client's point of view, it believes it has permissions to use O_NOATIME (e.g., a process running as root in the virtual machine). Additionally, overlayfs on Linux opens files on the lower layer using O_NOATIME, so in this case a 9pfs mount can't be used as a lower layer for overlayfs (cf. https://github.com/osandov/drgn/blob/dabfe1971951701da13863dbe6d8a1d172ad9650/vmtest/onoatimehack.c and https://github.com/NixOS/nixpkgs/issues/54509). Luckily, O_NOATIME is effectively a hint, and is often ignored by, e.g., network filesystems. open(2) notes that O_NOATIME "may not be effective on all filesystems. One example is NFS, where the server maintains the access time." This means that we can honor it when possible but fall back to ignoring it. Signed-off-by: Omar Sandoval --- hw/9pfs/9p-util.h | 5 + 1 file changed, 5 insertions(+) diff --git a/hw/9pfs/9p-util.h b/hw/9pfs/9p-util.h index 79ed6b233e..50842d540f 100644 --- a/hw/9pfs/9p-util.h +++ b/hw/9pfs/9p-util.h @@ -37,9 +37,14 @@ static inline int openat_file(int dirfd, const char *name, int flags, { int fd, serrno, ret; +again: fd = openat(dirfd, name, flags | O_NOFOLLOW | O_NOCTTY | O_NONBLOCK, mode); if (fd == -1) { +if (errno == EPERM && (flags & O_NOATIME)) { +flags &= ~O_NOATIME; +goto again; +} return -1; } -- 2.26.1