Hi Shachar,

Thank you for your comments. Please, find my answers below.

Regards,
Alexey Ishchuk

On 08/27/2014 07:18 PM, Shachar Raindel wrote:
Hi Alex,

Few comments on your patch below.


Thanks,
--Shachar

-----Original Message-----
From: linux-rdma-ow...@vger.kernel.org [mailto:linux-rdma-
ow...@vger.kernel.org] On Behalf Of Alexey Ishchuk
Sent: Wednesday, August 27, 2014 1:29 PM
To: linux-rdma@vger.kernel.org
Cc: arlin.r.da...@intel.com; g...@dev.mellanox.co.il; rol...@kernel.org;
linux-s...@vger.kernel.org; gmue...@de.ibm.com;
utz.bac...@de.ibm.com; martin.schwidef...@de.ibm.com;
frank.blasc...@de.ibm.com; Alexey Ishchuk
Subject: [PATCH] s390/kernel: add system calls for access PCI memory

Add the new __NR_s390_pci_mmio_write and __NR_s390_pci_mmio_read
system calls to allow user space applications to access device PCI I/O

Why do you need this to be a special syscall for this functionality? If S390 
platform supports mapping MMIO pages to the user space? If this must happen in 
kernel, it should be provided as a device file (probably character), on which 
writes or ioctls does mmio_write, and reads or ioctl does mmio_reads.

The special syscall is needed for this functionality because the s390
platform does not really support MMIO page mappings to user space. The
PCI memory itself on this platform can be accessed only using special
privileged CPU instructions. Because of that the user space program
cannot simply store the data into a mapped memory that belongs to the
PCI memory address space.

To get as close to the programming model used by "normal" architectures
the PCI MMIO address is stored into the page table for the user space
process as if the process could store to the memory. But in fact it can
not, every access to this memory area would create an exception.

To write data to the PCI memory address space  the user space program on
the s390 platform needs to call into the kernel to have these privileged
PCI instructions executed. A system call is a straight-forward way to do
this.

I tried to provide a new Infiniband verb command for data writing to PCI
memory but that approach was rejected by the community. My tests shown
that the syscall approach implementation works noticeably faster than
new verb command way. Concerning the security issues, please, see my
considerations below.

memory pages on s390x platform.

Signed-off-by: Alexey Ishchuk <aishc...@linux.vnet.ibm.com>
---
  arch/s390/include/uapi/asm/unistd.h |   4 +-
  arch/s390/kernel/Makefile           |   1 +
  arch/s390/kernel/entry.h            |   4 +
  arch/s390/kernel/pci_mmio.c         | 197
++++++++++++++++++++++++++++++++++++
  arch/s390/kernel/syscalls.S         |   2 +
  5 files changed, 207 insertions(+), 1 deletion(-)
  create mode 100644 arch/s390/kernel/pci_mmio.c

diff --git a/arch/s390/include/uapi/asm/unistd.h
b/arch/s390/include/uapi/asm/unistd.h
index 3802d2d..ab49d1d 100644
--- a/arch/s390/include/uapi/asm/unistd.h
+++ b/arch/s390/include/uapi/asm/unistd.h
@@ -283,7 +283,9 @@
  #define __NR_sched_setattr    345
  #define __NR_sched_getattr    346
  #define __NR_renameat2                347
-#define NR_syscalls 348
+#define __NR_s390_pci_mmio_write       348
+#define __NR_s390_pci_mmio_read                349
+#define NR_syscalls 350

  /*
   * There are some system calls that are not present on 64 bit, some
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 8c2518f..44e8fbb 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -62,6 +62,7 @@ ifdef CONFIG_64BIT
  obj-$(CONFIG_PERF_EVENTS)     += perf_event.o perf_cpum_cf.o
perf_cpum_sf.o \
                                                perf_cpum_cf_events.o
  obj-y                         += runtime_instr.o cache.o
+obj-y                          += pci_mmio.o
  endif

  # vdso
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 6ac7819..a36b6f9 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -70,4 +70,8 @@ struct old_sigaction;
  long sys_s390_personality(unsigned int personality);
  long sys_s390_runtime_instr(int command, int signum);

+long sys_s390_pci_mmio_write(const unsigned long mmio_addr,
+                            const void *user_buffer, const size_t length);
+long sys_s390_pci_mmio_read(const unsigned long mmio_addr,
+                           void *user_buffer, const size_t length);
  #endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/pci_mmio.c b/arch/s390/kernel/pci_mmio.c
new file mode 100644
index 0000000..4539d23
--- /dev/null
+++ b/arch/s390/kernel/pci_mmio.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright IBM Corp. 2014
+ */
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+
+union value_buffer {
+       u8 buf8;
+       u16 buf16;
+       u32 buf32;
+       u64 buf64;
+       u8 buf_large[64];
+};
+
+static long get_pfn(const unsigned long user_addr,
+                   const unsigned long access,
+                   unsigned long *pfn)
+{
+       struct vm_area_struct *vma = NULL;
+
+       if (!pfn)
+               return -EINVAL;
+
+       vma = find_vma(current->mm, user_addr);
+       if (!vma)
+               return -EINVAL;
+       if (!(vma->vm_flags & access))
+               return -EACCES;
+
+       return follow_pfn(vma, user_addr, pfn);
+}
+
+static inline int verify_page_addr(const unsigned long page_addr)
+{
+       return !(page_addr < ZPCI_IOMAP_ADDR_BASE ||
+           page_addr > (ZPCI_IOMAP_ADDR_BASE |
ZPCI_IOMAP_ADDR_IDX_MASK));
+}
+
+static long choose_buffer(const size_t length,
+                         union value_buffer *value,
+                         void **buf)
+{
+       long ret = 0UL;
+
+       if (length > sizeof(value->buf_large)) {
+               *buf = kmalloc(length, GFP_KERNEL);
+               if (!*buf)
+                       return -ENOMEM;
+               ret = 1;
+       } else {
+               *buf = value->buf_large;
+       }
+       return ret;
+}
+
+SYSCALL_DEFINE3(s390_pci_mmio_write,
+               const unsigned long, mmio_addr,
+               const void __user *, user_buffer,
+               const size_t, length)
+{

You need some security check in the flow here (i.e. capability or root).
If you don't do that, any user can do mmio to any address in the system, which 
seems like a very bad idea.

An additional security check on the MMIO address given with the syscall
is not required because it uses the address of a previously mapped
MMIO area. That area should be mapped only by a process that has
the appropriate permissions for working with the uverbs file. When the
syscall receives the user space address it tries to find corresponding
vma and checks required access to it, and if vma cannot be found or
access mode is invalid syscall returns an  error code. After the vma is
found, syscall verifies that the memory area belongs to the PCI memory
address space, and only if a correct address is received it writes the
data using the specified address. Therefore, as the address should be
previously mapped for a correct file and belong to the PCI memory
address space, the syscall does not work with any user space address,
and will work only with the addresses mapped from correct uverbs file
that belong to the PCI memory address space.

Compare it with a "normal" architecture, there is no additional security
check either, the user space process simply access the MMIO area
with read/write instructions directly. All required checks need to go
into the code that establishes the mapping.

+       long ret = 0L;
+       void *buf = NULL;
+       long buf_allocated = 0;
+       void __iomem *io_addr = NULL;
+       unsigned long pfn = 0UL;
+       unsigned long offset = 0UL;
+       unsigned long page_addr = 0UL;
+       union value_buffer value;
+
+       if (!length)
+               return -EINVAL;
+       if (!zpci_is_enabled())
+               return -ENODEV;
+
+       ret = get_pfn(mmio_addr, VM_WRITE, &pfn);
+       if (ret)
+               return ret;
+
+       page_addr = pfn << PAGE_SHIFT;
+       if (!verify_page_addr(page_addr))
+               return -EFAULT;
+
+       offset = mmio_addr & ~PAGE_MASK;
+       if (offset + length > PAGE_SIZE)
+               return -EINVAL;
+       io_addr = (void *)(page_addr | offset);
+
+       buf_allocated = choose_buffer(length, &value, &buf);
+       if (buf_allocated < 0L)
+               return -ENOMEM;
+
+       switch (length) {
+       case 1:
+               ret = get_user(value.buf8, ((u8 *)user_buffer));
+               break;
+       case 2:
+               ret = get_user(value.buf16, ((u16 *)user_buffer));
+               break;
+       case 4:
+               ret = get_user(value.buf32, ((u32 *)user_buffer));
+               break;
+       case 8:
+               ret = get_user(value.buf64, ((u64 *)user_buffer));
+               break;
+       default:
+               ret = copy_from_user(buf, user_buffer, length);
+       }
+       if (ret)
+               goto out;
+
+       switch (length) {
+       case 1:
+               __raw_writeb(value.buf8, io_addr);
+               break;
+       case 2:
+               __raw_writew(value.buf16, io_addr);
+               break;
+       case 4:
+               __raw_writel(value.buf32, io_addr);
+               break;
+       case 8:
+               __raw_writeq(value.buf64, io_addr);
+               break;
+       default:
+               memcpy_toio(io_addr, buf, length);
+       }
+out:
+       if (buf_allocated > 0L)
+               kfree(buf);
+       return ret;
+}
+
+SYSCALL_DEFINE3(s390_pci_mmio_read,
+               const unsigned long, mmio_addr,
+               void __user *, user_buffer,
+               const size_t, length)
+{
+       long ret = 0L;
+       void *buf = NULL;
+       long buf_allocated = 0L;
+       void __iomem *io_addr = NULL;
+       unsigned long pfn = 0UL;
+       unsigned long offset = 0UL;
+       unsigned long page_addr = 0UL;
+       union value_buffer value;
+
+       if (!length)
+               return -EINVAL;
+       if (!zpci_is_enabled())
+               return -ENODEV;
+
+       ret = get_pfn(mmio_addr, VM_READ, &pfn);
+       if (ret)
+               return ret;
+
+       page_addr = pfn << PAGE_SHIFT;
+       if (!verify_page_addr(page_addr))
+               return -EFAULT;
+
+       offset = mmio_addr & ~PAGE_MASK;
+       if (offset + length > PAGE_SIZE)
+               return -EINVAL;
+       io_addr = (void *)(page_addr | offset);
+
+       buf_allocated = choose_buffer(length, &value, &buf);
+       if (buf_allocated < 0L)
+               return -ENOMEM;
+
+       switch (length) {
+       case 1:
+               value.buf8 = __raw_readb(io_addr);
+               ret = put_user(value.buf8, ((u8 *)user_buffer));
+               break;
+       case 2:
+               value.buf16 = __raw_readw(io_addr);
+               ret = put_user(value.buf16, ((u16 *)user_buffer));
+               break;
+       case 4:
+               value.buf32 = __raw_readl(io_addr);
+               ret = put_user(value.buf32, ((u32 *)user_buffer));
+               break;
+       case 8:
+               value.buf64 = __raw_readq(io_addr);
+               ret = put_user(value.buf64, ((u64 *)user_buffer));
+               break;
+       default:
+               memcpy_fromio(buf, io_addr, length);
+               ret = copy_to_user(user_buffer, buf, length);
+       }
+       if (buf_allocated > 0L)
+               kfree(buf);
+       return ret;
+}
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index fe5cdf2..1faa942 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -356,3 +356,5 @@
SYSCALL(sys_finit_module,sys_finit_module,compat_sys_finit_module)
  SYSCALL(sys_sched_setattr,sys_sched_setattr,compat_sys_sched_setattr)
/* 345 */
  SYSCALL(sys_sched_getattr,sys_sched_getattr,compat_sys_sched_getattr)
  SYSCALL(sys_renameat2,sys_renameat2,compat_sys_renameat2)
+SYSCALL(sys_ni_syscall,sys_s390_pci_mmio_write,sys_ni_syscall)
+SYSCALL(sys_ni_syscall,sys_s390_pci_mmio_read,sys_ni_syscall)
--
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to