Re: [PATCH v2 1/1] KVM: PPC: Introduce KVM_CAP_PPC_HTM
Sam Bobroff writes: > Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to > determine if a PowerPC KVM guest should use HTM (Hardware Transactional Minor nit, "should" should be "can" IMHO. > Memory). > > This will be used by QEMU to populate the pa-features bits in the > guest's device tree. > > Signed-off-by: Sam Bobroff > --- > > v2: > > * Use CPU_FTR_TM_COMP instead of CPU_FTR_TM. Thanks. Acked-by: Michael Ellerman Or do you want me to merge this before Paul gets back? > * I didn't unbreak the line, as with the extra characters checkpatch will > complain if I do. I did move the break to a more usual place. I would just ignore checkpatch. But I don't mind that much. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH kernel v2 1/2] powerpc/iommu: Stop using @current in mm_iommu_xxx
In some situations the userspace memory context may live longer than the userspace process itself so if we need to do proper memory context cleanup, we better cache @mm and use it later when the process is gone (@current or @current->mm are NULL). This changes mm_iommu_xxx API to receive mm_struct instead of using one from @current. This is needed by the following patch to do proper cleanup in time. This depends on "powerpc/powernv/ioda: Fix endianness when reading TCEs" to do proper cleanup via tce_iommu_clear() patch. This should cause no behavioral change. Signed-off-by: Alexey Kardashevskiy--- arch/powerpc/include/asm/mmu_context.h | 15 ++-- arch/powerpc/mm/mmu_context_iommu.c| 45 +- drivers/vfio/vfio_iommu_spapr_tce.c| 41 +++ 3 files changed, 51 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 9d2cd0c..745b4bd 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -18,16 +18,17 @@ extern void destroy_context(struct mm_struct *mm); #ifdef CONFIG_SPAPR_TCE_IOMMU struct mm_iommu_table_group_mem_t; -extern bool mm_iommu_preregistered(void); -extern long mm_iommu_get(unsigned long ua, unsigned long entries, +extern bool mm_iommu_preregistered(struct mm_struct *mm); +extern long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, struct mm_iommu_table_group_mem_t **pmem); -extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem); +extern long mm_iommu_put(struct mm_struct *mm, + struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_init(mm_context_t *ctx); extern void mm_iommu_cleanup(mm_context_t *ctx); -extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua, - unsigned long size); -extern struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua, - unsigned long entries); +extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, + unsigned long ua, 
unsigned long size); +extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, + unsigned long ua, unsigned long entries); extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned long *hpa); extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index da6a216..65086bf 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -53,7 +53,7 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, } pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n", - current->pid, + current ? current->pid : 0, incr ? '+' : '-', npages << PAGE_SHIFT, mm->locked_vm << PAGE_SHIFT, @@ -63,28 +63,22 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm, return ret; } -bool mm_iommu_preregistered(void) +bool mm_iommu_preregistered(struct mm_struct *mm) { - if (!current || !current->mm) - return false; - - return !list_empty(>mm->context.iommu_group_mem_list); + return !list_empty(>context.iommu_group_mem_list); } EXPORT_SYMBOL_GPL(mm_iommu_preregistered); -long mm_iommu_get(unsigned long ua, unsigned long entries, +long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, struct mm_iommu_table_group_mem_t **pmem) { struct mm_iommu_table_group_mem_t *mem; long i, j, ret = 0, locked_entries = 0; struct page *page = NULL; - if (!current || !current->mm) - return -ESRCH; /* process exited */ - mutex_lock(_list_mutex); - list_for_each_entry_rcu(mem, >mm->context.iommu_group_mem_list, + list_for_each_entry_rcu(mem, >context.iommu_group_mem_list, next) { if ((mem->ua == ua) && (mem->entries == entries)) { ++mem->used; @@ -102,7 +96,7 @@ long mm_iommu_get(unsigned long ua, unsigned long entries, } - ret = mm_iommu_adjust_locked_vm(current->mm, entries, true); + ret = mm_iommu_adjust_locked_vm(mm, entries, true); if (ret) goto unlock_exit; @@ -142,11 +136,11 
@@ long mm_iommu_get(unsigned long ua, unsigned long entries, mem->entries = entries; *pmem = mem; - list_add_rcu(>next, >mm->context.iommu_group_mem_list); + list_add_rcu(>next, >context.iommu_group_mem_list); unlock_exit: if (locked_entries && ret) - mm_iommu_adjust_locked_vm(current->mm, locked_entries, false); +
[PATCH kernel v2 0/2] powerpc/mm/iommu: Put pages on process exit
This is a fix to a bug when guest memory stays Active after QEMU process exited. This happened because the QEMU memory context was not released in a short period of time after QEMU process exited. More details are in the commit logs. Please comment. Thanks. Alexey Kardashevskiy (2): powerpc/iommu: Stop using @current in mm_iommu_xxx powerpc/mm/iommu: Put pages on process exit arch/powerpc/include/asm/mmu_context.h | 16 +++--- arch/powerpc/mm/mmu_context_book3s64.c | 4 -- arch/powerpc/mm/mmu_context_iommu.c| 55 +++-- drivers/vfio/vfio_iommu_spapr_tce.c| 89 -- 4 files changed, 100 insertions(+), 64 deletions(-) -- 2.5.0.rc3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH kernel v2 2/2] powerpc/mm/iommu: Put pages on process exit
At the moment VFIO IOMMU SPAPR v2 driver pins all guest RAM pages when the userspace starts using VFIO. When the userspace process finishes, all the pinned pages need to be put; this is done as a part of the userspace memory context (MM) destruction which happens on the very last mmdrop(). This approach has a problem that a MM of the userspace process may live longer than the userspace process itself as kernel threads use userspace process MMs which was running on a CPU where the kernel thread was scheduled to. If this happened, the MM remains referenced until this exact kernel thread wakes up again and releases the very last reference to the MM, on an idle system this can take even hours. This references and caches MM once per container and adds tracking how many times each preregistered area was registered in a specific container. This way we do not depend on @current pointing to a valid task descriptor. This changes the userspace interface to return EBUSY if memory is already registered (mm_iommu_get() used to increment the counter); however it should not have any practical effect as the only userspace tool available now does register memory area once per container anyway. As tce_iommu_register_pages/tce_iommu_unregister_pages are called under container->lock, this does not need additional locking. 
Cc: David GibsonCc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Balbir Singh Cc: Nicholas Piggin Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/mmu_context.h | 1 - arch/powerpc/mm/mmu_context_book3s64.c | 4 --- arch/powerpc/mm/mmu_context_iommu.c| 10 --- drivers/vfio/vfio_iommu_spapr_tce.c| 52 +- 4 files changed, 51 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 745b4bd..90338fd 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -24,7 +24,6 @@ extern long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long e extern long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_init(mm_context_t *ctx); -extern void mm_iommu_cleanup(mm_context_t *ctx); extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, unsigned long ua, unsigned long size); extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 227b2a6..5c67d1c 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -159,10 +159,6 @@ static inline void destroy_pagetable_page(struct mm_struct *mm) void destroy_context(struct mm_struct *mm) { -#ifdef CONFIG_SPAPR_TCE_IOMMU - mm_iommu_cleanup(>context); -#endif - #ifdef CONFIG_PPC_ICSWX drop_cop(mm->context.acop, mm); kfree(mm->context.cop_lockp); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 65086bf..901773d 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -293,13 +293,3 @@ void mm_iommu_init(mm_context_t *ctx) { INIT_LIST_HEAD_RCU(>iommu_group_mem_list); } - -void mm_iommu_cleanup(mm_context_t *ctx) -{ - struct mm_iommu_table_group_mem_t *mem, *tmp; - - list_for_each_entry_safe(mem, 
tmp, >iommu_group_mem_list, next) { - list_del_rcu(>next); - mm_iommu_do_free(mem); - } -} diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 9752e77..40e71a0 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -89,6 +89,15 @@ struct tce_iommu_group { }; /* + * A container needs to remember which preregistered areas and how many times + * it has referenced to do proper cleanup at the userspace process exit. + */ +struct tce_iommu_prereg { + struct list_head next; + struct mm_iommu_table_group_mem_t *mem; +}; + +/* * The container descriptor supports only a single group per container. * Required by the API as the container is not supplied with the IOMMU group * at the moment of initialization. @@ -101,12 +110,26 @@ struct tce_container { struct mm_struct *mm; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; struct list_head group_list; + struct list_head prereg_list; }; +static long tce_iommu_prereg_free(struct tce_container *container, + struct tce_iommu_prereg *tcemem) +{ + long ret; + + list_del(>next); + ret = mm_iommu_put(container->mm, tcemem->mem); + kfree(tcemem); + + return ret; +} + static long tce_iommu_unregister_pages(struct
[PATCH kernel] powerpc/powernv/ioda: Fix endianness when reading TCEs
The iommu_table_ops::exchange() callback writes new TCE to the table and returns old value and permission mask. The old TCE value is correctly converted from BE to CPU endian; however permission mask was calculated from BE value and therefore always returned DMA_NONE which could cause memory leak on LE systems using VFIO SPAPR TCE IOMMU v1 driver. This fixes pnv_tce_xchg() to have @oldtce a CPU endian. Fixes: 05c6cfb9dce0d13d37e9d007ee6a4af36f1c0a58 Cc: sta...@vger.kernel.org # 4.2+ Signed-off-by: Alexey Kardashevskiy--- arch/powerpc/platforms/powernv/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 1d92bd9..7b17f88 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -620,8 +620,8 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, if (newtce & TCE_PCI_WRITE) newtce |= TCE_PCI_READ; - oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)); - *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE); + oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce))); + *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); *direction = iommu_tce_direction(oldtce); return 0; -- 2.5.0.rc3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [v2,1/2] refactor code parsing size based on memory range
Ping.. On Friday 24 June 2016 10:45 PM, Hari Bathini wrote: On 06/24/2016 10:56 AM, Michael Ellerman wrote: On Wed, 2016-22-06 at 19:25:26 UTC, Hari Bathini wrote: Currently, crashkernel parameter supports the below syntax to parse size based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters with similar syntax. So, move this code to a more generic place for code reuse. Cc: Eric BiedermanCc: Vivek Goyal Cc: Rusty Russell Cc: ke...@lists.infradead.org Signed-off-by: Hari Bathini Hari, it's not immediately clear that this makes no change to the logic in the kexec code. Can you reply with a longer change log explaining why the old & new logic is the same for kexec. Hi Michael, Please consider this changelog for this patch: -- crashkernel parameter supports different syntaxes to specify the amount of memory to be reserved for kdump kernel. Below is one of the supported syntaxes that needs parsing to find the memory size to reserve, based on memory range: crashkernel=:[,:,...] While such parsing is implemented for crashkernel parameter, it applies to other parameters, like fadump_reserve_mem, which could use similar syntax. So, to reuse code, moving the code that checks if the parameter syntax is as above and also the code that parses memory size to reserve, for this syntax. While the code is moved to kernel/params.c file, there is no change in logic for crashkernel parameter parsing as the moved code is invoked with function calls at appropriate places. 
-- Thanks Hari diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 94aa10f..72f55e5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -436,6 +436,11 @@ extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern bool __init is_param_range_based(const char *cmdline); +extern unsigned long long __init parse_mem_range_size(const char *param, + char **str, + unsigned long long system_ram); + extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 56b3ed0..d43f5cc 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1083,59 +1083,9 @@ static int __init parse_crashkernel_mem(char *cmdline, char *cur = cmdline, *tmp; /* for each entry of the comma-separated list */ -do { -unsigned long long start, end = ULLONG_MAX, size; - -/* get the start of the range */ -start = memparse(cur, ); -if (cur == tmp) { -pr_warn("crashkernel: Memory value expected\n"); -return -EINVAL; -} -cur = tmp; -if (*cur != '-') { -pr_warn("crashkernel: '-' expected\n"); -return -EINVAL; -} -cur++; - -/* if no ':' is here, than we read the end */ -if (*cur != ':') { -end = memparse(cur, ); -if (cur == tmp) { -pr_warn("crashkernel: Memory value expected\n"); -return -EINVAL; -} -cur = tmp; -if (end <= start) { -pr_warn("crashkernel: end <= start\n"); -return -EINVAL; -} -} - -if (*cur != ':') { -pr_warn("crashkernel: ':' expected\n"); -return -EINVAL; -} -cur++; - -size = memparse(cur, ); -if (cur == tmp) { -pr_warn("Memory value expected\n"); -return -EINVAL; -} -cur = tmp; -if (size >= system_ram) { -pr_warn("crashkernel: invalid size\n"); -return -EINVAL; -} - -/* match ? 
*/ -if (system_ram >= start && system_ram < end) { -*crash_size = size; -break; -} -} while (*cur++ == ','); +*crash_size = parse_mem_range_size("crashkernel", , system_ram); +if (cur == cmdline) +return -EINVAL; if (*crash_size > 0) { while (*cur && *cur != ' ' && *cur != '@') @@ -1272,7 +1222,6 @@ static int __init __parse_crashkernel(char *cmdline, const char *name, const char *suffix) { -char*first_colon, *first_space; char*ck_cmdline; BUG_ON(!crash_size || !crash_base); @@ -1290,12 +1239,10 @@ static int __init __parse_crashkernel(char *cmdline, return parse_crashkernel_suffix(ck_cmdline, crash_size, suffix); /* - * if the commandline contains a ':', then that's
Re: [PATCH v2 1/1] KVM: PPC: Introduce KVM_CAP_PPC_HTM
On 20/07/16 13:41, Sam Bobroff wrote: > Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to > determine if a PowerPC KVM guest should use HTM (Hardware Transactional > Memory). > > This will be used by QEMU to populate the pa-features bits in the > guest's device tree. > > Signed-off-by: Sam Bobroff > --- > Makes sense Acked-by: Balbir Singh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC 0/3] extend kexec_file_load system call
> > Command line options are not signed. I thought idea behind secureboot > was to execute only trusted code and command line options don't enforce > you to execute unsigned code. > >> >> You can set module.sig_enforce=0 and open up the system a bit assuming >> that you can get a module to load with another attack > > IIUC, sig_enforce bool_enable_only so it can only be enabled. Default > value of it is 0 if CONFIG_MODULE_SIG_FORCE=n. > > IOW, if your kernel forced signature verification, you should not be > able to do sig_enforce=0. If you kernel did not have > CONFIG_MODULE_SIG_FORCE=y, then sig_enforce should be 0 by default anyway > and you are not making it worse using command line. > OK.. I checked and you are right, but that is an example and there are other things like security=, thermal.*, nosmep, nosmap that need auditing for safety and might hurt the system security if used. I still think think that assuming you can pass any command line without breaking security is a broken argument. >> > So it sounds like different class of security problems which you are > referring to and not necessarily covered by secureboot or signed > kernel. Let me give you an example. You have a secure boot setup, where the firmware/ROM validates the boot loader. Good, the boot loader hasn't been tampered with. You interrupt the boot loader and are able to modify the command line for the booted kernel. The boot loader loads the kernel and verifies the kernel's signature. Good, the kernel hasn't been tampered with. The kernel starts running. You've plugged in a USB drive to the device, and specified a partition containing a root filesystem that you control to the kernel. The validated kernel finds the USB drive, and mounts it, and executes your own binaries on the USB drive. >>> You will require physical access to the machine to be able to >>> insert your usb drive. And IIRC, argument was that if attacker has >>> physical access to machine, all bets are off anyway. 
>>> >> >> You don't need physical access -- your machine controller BMC can >> do the magic for you. So its not always physical access, is it? > > Well, idea was that if you have physical access to machine, then all > bets are off. If BMC can do something which allows running unsigned > code at ring level 0, its a problem I think from secureboot model of > security. > >> You run a shell on the console. You now have control of the system, and can mount the real rootfs, inspect it, and work out what it does, etc. At this point, what use was all the validation that the secure boot has done? Absolutely useless. If you can change the command line arguments given to the kernel, you have no security, no matter how much you verify signatures. It's the illusion of security, nothing more, nothing less. >> >> I agree, if you can change command line arguments, all bets are of lesser >> value > > If changing command line allows execution of unsigned code at ring level > 0, then it is a problem. Otherwise we are talking of security issues which > are not covered by secure I agree that from what I can see/grep there is nothing that allows unsigned code to run at boot in ring0, but there are implications like the ones I've mentioned above. Attacks are typically built as a chain and every bit might matter. One could turn off features that might lead to the system being attacked at run-time Balbir Singh. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v2 1/1] KVM: PPC: Introduce KVM_CAP_PPC_HTM
Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to determine if a PowerPC KVM guest should use HTM (Hardware Transactional Memory). This will be used by QEMU to populate the pa-features bits in the guest's device tree. Signed-off-by: Sam Bobroff--- v2: * Use CPU_FTR_TM_COMP instead of CPU_FTR_TM. * I didn't unbreak the line, as with the extra characters checkpatch will complain if I do. I did move the break to a more usual place. arch/powerpc/kvm/powerpc.c | 4 include/uapi/linux/kvm.h | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 02416fe..5ebc8ff 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; #endif + case KVM_CAP_PPC_HTM: + r = cpu_has_feature(CPU_FTR_TM_COMP) && + is_kvmppc_hv_enabled(kvm); + break; default: r = 0; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 05ebf47..f421d0e 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ARM_PMU_V3 126 #define KVM_CAP_VCPU_ATTRIBUTES 127 #define KVM_CAP_MAX_VCPU_ID 128 +#define KVM_CAP_PPC_HTM 129 #ifdef KVM_CAP_IRQ_ROUTING -- 2.1.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2 1/2] powerpc/pseries: Implemented indexed-count hotplug memory add
On 07/18/2016 10:07 AM, Sahil Mehta wrote: > Indexed-count add for memory hotplug guarantees that a contiguous block > of lmbs beginning at a specified will be assigned (NOT > that lmbs will be added). Because of Qemu's per-DIMM memory > management, the addition of a contiguous block of memory currently > requires a series of individual calls. Indexed-count add reduces > this series into a single call. > > Signed-off-by: Sahil Mehta> --- > v2: -remove potential memory leak when parsing command > -use u32s drc_index and count instead of u32 ic[] >in dlpar_memory > > arch/powerpc/include/asm/rtas.h |2 > arch/powerpc/platforms/pseries/dlpar.c | 34 +++- > arch/powerpc/platforms/pseries/hotplug-memory.c | 100 > +-- > 3 files changed, 124 insertions(+), 12 deletions(-) > > diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h > index 51400ba..f46b271 100644 > --- a/arch/powerpc/include/asm/rtas.h > +++ b/arch/powerpc/include/asm/rtas.h > @@ -307,6 +307,7 @@ struct pseries_hp_errorlog { > union { > __be32 drc_index; > __be32 drc_count; > + __be32 indexed_count[2]; > chardrc_name[1]; > } _drc_u; > }; > @@ -322,6 +323,7 @@ struct pseries_hp_errorlog { > #define PSERIES_HP_ELOG_ID_DRC_NAME 1 > #define PSERIES_HP_ELOG_ID_DRC_INDEX 2 > #define PSERIES_HP_ELOG_ID_DRC_COUNT 3 > +#define PSERIES_HP_ELOG_ID_IC4 > > struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, > uint16_t section_id); > diff --git a/arch/powerpc/platforms/pseries/dlpar.c > b/arch/powerpc/platforms/pseries/dlpar.c > index 2b93ae8..2a6dc9e 100644 > --- a/arch/powerpc/platforms/pseries/dlpar.c > +++ b/arch/powerpc/platforms/pseries/dlpar.c > @@ -345,11 +345,17 @@ static int handle_dlpar_errorlog(struct > pseries_hp_errorlog *hp_elog) > switch (hp_elog->id_type) { > case PSERIES_HP_ELOG_ID_DRC_COUNT: > hp_elog->_drc_u.drc_count = > - be32_to_cpu(hp_elog->_drc_u.drc_count); > + be32_to_cpu(hp_elog->_drc_u.drc_count); > break; > case PSERIES_HP_ELOG_ID_DRC_INDEX: > 
hp_elog->_drc_u.drc_index = > - be32_to_cpu(hp_elog->_drc_u.drc_index); > + be32_to_cpu(hp_elog->_drc_u.drc_index); > + break; > + case PSERIES_HP_ELOG_ID_IC: > + hp_elog->_drc_u.indexed_count[0] = > + be32_to_cpu(hp_elog->_drc_u.indexed_count[0]); > + hp_elog->_drc_u.indexed_count[1] = > + be32_to_cpu(hp_elog->_drc_u.indexed_count[1]); > } > > switch (hp_elog->resource) { > @@ -409,7 +415,29 @@ static ssize_t dlpar_store(struct class *class, struct > class_attribute *attr, > goto dlpar_store_out; > } > > - if (!strncmp(arg, "index", 5)) { > + if (!strncmp(arg, "indexed-count", 13)) { > + u32 index, count; > + char *cstr, *istr; > + > + hp_elog->id_type = PSERIES_HP_ELOG_ID_IC; > + arg += strlen("indexed-count "); > + > + cstr = kstrdup(arg, GFP_KERNEL); > + istr = strchr(cstr, ' '); > + *istr++ = '\0'; > + > + if (kstrtou32(cstr, 0, ) || kstrtou32(istr, 0, )) { > + rc = -EINVAL; > + pr_err("Invalid index or count : \"%s\"\n", buf); > + kfree(cstr); > + goto dlpar_store_out; > + } > + > + kfree(cstr); > + > + hp_elog->_drc_u.indexed_count[0] = cpu_to_be32(count); > + hp_elog->_drc_u.indexed_count[1] = cpu_to_be32(index); > + } else if (!strncmp(arg, "index", 5)) { > u32 index; > > hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c > b/arch/powerpc/platforms/pseries/hotplug-memory.c > index 2ce1385..d7942ca 100644 > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c > @@ -701,6 +701,83 @@ static int dlpar_memory_add_by_index(u32 drc_index, > struct property *prop) > return rc; > } > > +static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index, > + struct property *prop) > +{ > + struct of_drconf_cell *lmbs; > + u32 num_lmbs, *p; > + int i, rc; > + int lmbs_available = 0, start_index = 0, end_index; > + > + pr_info("Attempting to hot-add %u LMB(s) at index %x\n", > + lmbs_to_add, drc_index); > + > + if (lmbs_to_add == 0) > + return -EINVAL; > 
+ > + p = prop->value; > + num_lmbs = *p++; > + lmbs = (struct
Re: [PATCH v3 02/11] mm: Hardened usercopy
On Tue, Jul 19, 2016 at 12:12 PM, Kees Cookwrote: > On Mon, Jul 18, 2016 at 6:52 PM, Laura Abbott wrote: >> On 07/15/2016 02:44 PM, Kees Cook wrote: >>> +static inline const char *check_heap_object(const void *ptr, unsigned >>> long n, >>> + bool to_user) >>> +{ >>> + struct page *page, *endpage; >>> + const void *end = ptr + n - 1; >>> + >>> + if (!virt_addr_valid(ptr)) >>> + return NULL; >>> + >> >> >> virt_addr_valid returns true on vmalloc addresses on arm64 which causes some >> intermittent false positives (tab completion in a qemu buildroot environment >> was showing it fairly reliably). I think this is an arm64 bug because >> virt_addr_valid should return true if and only if virt_to_page returns the >> corresponding page. We can work around this for now by explicitly >> checking against is_vmalloc_addr. > > Hrm, that's weird. Sounds like a bug too, but I'll add a check for > is_vmalloc_addr() to catch it for now. BTW, if you were testing against -next, KASAN moved things around in copy_*_user() in a way I wasn't expecting (__copy* and copy* now both call __arch_copy* instead of copy* calling __copy*). I'll have this fixed in the next version. -Kees -- Kees Cook Chrome OS & Brillo Security ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] linuxppc/devtree: Parse new DRC mem/cpu/dev device tree elements
Responses to your remarks about the patch. Note that I will repost it in smaller segments later this week. On 07/13/2016 03:41 PM, Nathan Fontenot wrote: > On 06/30/2016 04:44 PM, Michael Bringmann wrote: >> Several properties in the DRC device tree format are replaced by >> more compact representations to allow, for example, for the encoding >> of vast amounts of memory, and or reduced duplication of information >> in related data structures. >> >> "ibm,drc-info": This property, when present, replaces the following >> four properties: "ibm,drc-indexes", "ibm,drc-names", "ibm,drc-types" >> and "ibm,drc-power-domains". This property is defined for all >> dynamically reconfigurable platform nodes. The "ibm,drc-info" elements >> are intended to provide a more compact representation, and reduce some >> search overhead. >> >> "ibm,dynamic-memory-v2": This property replaces the "ibm,dynamic-memory" >> node representation within the "ibm,dynamic-reconfiguration-memory" >> property provided by the BMC. This element format is intended to provide > > BMC? Just a term for the underlying platform. I think that it came from a conversation with another developer. We can just use 'underlying platform'. >> +#define DRCONF_V2_CELL_OFFSET(i)((i) * DRCONF_V2_CELLS_LEN) >> +#define DRCONF_V2_CELL_POSITION(p, i) \ >> +(void *)(((char *)(p))+((i) * DRCONF_V2_CELLS_LEN)) >> +#define DYN_MEM_V2_LEN(entries) (((entries) * DRCONF_V2_CELLS_LEN) + \ >> + (1 * sizeof(unsigned int))) >> + > > These should probably be functions instead of #defines, makes debugging > the code easier. 6-of-1 or half-a-dozen to me. The main reason that I made them macros was to document the size calculation in one place, instead of having it embedded in multiple locations in the code as was done for the 'ibm,dynamic-memory' struct parsing. 
> >> +#define DRCONF_MEM_PRESERVED0x0001 >> +#define DRCONF_MEM_PRESERVABLE 0x0002 >> +#define DRCONF_MEM_PRESERVED_STATE 0x0004 >> +#define DRCONF_MEM_ASSIGNED 0x0008 >> +#define DRCONF_MEM_NO_H_MIGRATE_DATA0x0010 >> +#define DRCONF_MEM_DRC_INVALID 0x0020 >> +#define DRCONF_MEM_AI_INVALID 0x0040 >> +#define DRCONF_MEM_RESERVED 0x0080 >> +#define DRCONF_MEM_RESERVED_SW 0x8000 > > I'll let others chime in, but we don't use all of these flags, or plan > to at this point so I'm not sure we need to include definitions for them. I can cut down the list. 3 were previously defined in this file. > >> /* >> - * Retrieve and validate the ibm,dynamic-memory property of the device tree. >> + * Read the next memblock set entry from the ibm,dynamic-memory-v2 property > > Just saw this here, ans see that it is used elsewhere. You may want to avoid > using the term memblock, this already has a meaning in the kernel and may > cause some confusion. > > Still reviewing this patch, more comments as I review more. > > -Nathan 'memblock' was used by the original comments for 'ibm,dynamic-memory' structures. I will change them. > -- Michael W. Bringmann Linux Technology Center IBM Corporation Tie-Line 363-5196 External: (512) 286-5196 Cell: (512) 466-0650 m...@linux.vnet.ibm.com ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] mm: Add is_migrate_cma_page
On Tue, Jul 19, 2016 at 3:00 PM, Laura Abbott wrote: > Code such as hardened user copy[1] needs a way to tell if a > page is CMA or not. Add is_migrate_cma_page in a similar way > to is_migrate_isolate_page. > > [1]http://article.gmane.org/gmane.linux.kernel.mm/155238 > > Signed-off-by: Laura Abbott Great, thanks! > --- > Here's an explicit patch, slightly different than what I posted before. It can > be kept separate or folded in as needed. Assuming there's no objection, I'll add it to my tree and use the new macro. -Kees > --- > include/linux/mmzone.h | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 02069c2..c8478b2 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -68,8 +68,10 @@ extern char * const migratetype_names[MIGRATE_TYPES]; > > #ifdef CONFIG_CMA > # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) > +# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == > MIGRATE_CMA) > #else > # define is_migrate_cma(migratetype) false > +# define is_migrate_cma_page(_page) false > #endif > > #define for_each_migratetype_order(order, type) \ > -- > 2.7.4 > -- Kees Cook Chrome OS & Brillo Security ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] mm: Add is_migrate_cma_page
Code such as hardened user copy[1] needs a way to tell if a page is CMA or not. Add is_migrate_cma_page in a similar way to is_migrate_isolate_page. [1]http://article.gmane.org/gmane.linux.kernel.mm/155238 Signed-off-by: Laura Abbott --- Here's an explicit patch, slightly different than what I posted before. It can be kept separate or folded in as needed. --- include/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 02069c2..c8478b2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -68,8 +68,10 @@ extern char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) +# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false +# define is_migrate_cma_page(_page) false #endif #define for_each_migratetype_order(order, type) \ -- 2.7.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 00/14] Present useful limits to user (v2)
On 07/18/16 22:05, Doug Ledford wrote: > On 7/15/2016 12:35 PM, Topi Miettinen wrote: >> On 07/15/16 13:04, Balbir Singh wrote: >>> On Fri, Jul 15, 2016 at 01:35:47PM +0300, Topi Miettinen wrote: Hello, There are many basic ways to control processes, including capabilities, cgroups and resource limits. However, there are far fewer ways to find out useful values for the limits, except blind trial and error. This patch series attempts to fix that by giving at least a nice starting point from the highwater mark values of the resources in question. I looked where each limit is checked and added a call to update the mark nearby. Example run of program from Documentation/accounting/getdelauys.c: ./getdelays -R -p `pidof smartd` printing resource accounting RLIMIT_CPU=0 RLIMIT_FSIZE=0 RLIMIT_DATA=18198528 RLIMIT_STACK=135168 RLIMIT_CORE=0 RLIMIT_RSS=0 RLIMIT_NPROC=1 RLIMIT_NOFILE=55 RLIMIT_MEMLOCK=0 RLIMIT_AS=130879488 RLIMIT_LOCKS=0 RLIMIT_SIGPENDING=0 RLIMIT_MSGQUEUE=0 RLIMIT_NICE=0 RLIMIT_RTPRIO=0 RLIMIT_RTTIME=0 ./getdelays -R -C /sys/fs/cgroup/systemd/system.slice/smartd.service/ printing resource accounting sleeping 1, blocked 0, running 0, stopped 0, uninterruptible 0 RLIMIT_CPU=0 RLIMIT_FSIZE=0 RLIMIT_DATA=18198528 RLIMIT_STACK=135168 RLIMIT_CORE=0 RLIMIT_RSS=0 RLIMIT_NPROC=1 RLIMIT_NOFILE=55 RLIMIT_MEMLOCK=0 RLIMIT_AS=130879488 RLIMIT_LOCKS=0 RLIMIT_SIGPENDING=0 RLIMIT_MSGQUEUE=0 RLIMIT_NICE=0 RLIMIT_RTPRIO=0 RLIMIT_RTTIME=0 >>> >>> Does this mean that rlimit_data and rlimit_stack should be set to the >>> values as specified by the data above? >> >> My plan is that either system administrator, distro maintainer or even >> upstream developer can get reasonable values for the limits. They may >> still be wrong, but things would be better than without any help to >> configure the system. > > This is not necessarily true. It seems like there is a disconnect > between what these various values are for and what you are positioning > them as. 
Most of these limits are meant to protect the system from > resource starvation crashes. They aren't meant to be any sort of double > check on a specific application. The vast majority of applications can > have bugs, leak resources, and do all sorts of other bad things and > still not hit these limits. A program that leaks a file handle an hour > but only normally has 50 handles in use would take 950 hours of constant > leaking before these limits would kick in to bring the program under > control. That's over a month. What's more though, the kernel couldn't > really care less that a single application leaked files until it got to > 1000 open. The real point of the limit on file handles (since they are > cheap) is just not to let the system get brought down. Someone could > maliciously fire up 1000 processes, and they could all attempt to open > up as many files as possible in order to drown the system in open > inodes. The combination of the limit on maximum user processes and > maximum files per process are intended to prevent this. They are not > intended to prevent a single, properly running application from > operating. In fact, there are very few applications that are likely to > break the 1000 file per process limit. It is outrageously high for most > applications. They will leak files and do all sorts of bad things > without this ever stopping them. But it does stop malicious programs. > And the process limit stops malicious users too. The max locked memory > is used by almost no processes, and for the very few that use it, the > default is more than enough. The major exception is the RDMA stack, > which uses it so much that we just disable it on large systems because > it's impossible to predict how much we'll need and we don't want a job > to get killed because it couldn't get the memory it needs for buffers. 
> The limit on POSIX message queues is another one where it's more than > enough for most applications which don't use this feature at all, and > the few systems that use this feature adjust the limit to something sane > on their system (we can't make the default sane for these special > systems or else it becomes an avenue for Denial of Service attack, so > the default must stay low and servers that make extensive use of this > feature must up their limit on a case by case basis). > >>> >>> Do we expect a smart user space daemon to then tweak the RLIMIT values? >> >> Someone could write an autotuning daemon that checks if the system has >> changed (for example due to upgrade) and then run some tests to >> reconfigure the system. But the limits are a bit too fragile, or rather, >> applications can't handle failure, so I don't know if that would really >> work. > > This misses the point
Re: Suspected regression?
On Tue, 2016-07-19 at 12:00 +0200, Alessio Igor Bogani wrote: > Hi all, > > I have got two boards MVME5100 (MPC7410 cpu) and MVME7100 (MPC8641D > cpu) for which I use the same cross-compiler (ppc7400). > > I tested these against kernel HEAD to found that these don't boot > anymore (PID 1 crash). > > Bisecting results in first offending commit: > 7aef4136566b0539a1a98391181e188905e33401 > > Removing it from HEAD make boards boot properly again. > > A third system based on P2010 isn't affected at all. > > Is it a regression or I have made something wrong? I booted both my next branch, and Linus's master on MPC8641HPCN and didn't see this -- though possibly your RFS is doing something different. Maybe that's the difference with P2010 as well. Is there any way you can debug the cause of the crash? Or send me a minimal RFS that demonstrates the problem (ideally with debug symbols on the userspace binaries)? -Scott ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 02/11] mm: Hardened usercopy
On 07/19/2016 10:34 PM, Kees Cook wrote: [...] >> >> So what about for the CONFIG text: >> >>An architecture should select this if the kernel mapping has a >> secondary >>linear mapping of the kernel text - in other words more than one >> virtual >>kernel address that points to the kernel image. This is used to verify >>that kernel text exposures are not visible under >> CONFIG_HARDENED_USERCOPY. > > Sounds good, I've adjusted it for now. > >>> I wonder if I can avoid the CONFIG entirely if I just did a >>> __va(__pa(_stext)) != _stext test... would that break anyone? >> >> Can this be resolved on all platforms at compile time? > > Well, I think it still needs a runtime check (compile-time may not be > able to tell about kaslr, or who knows what else). I would really like > to avoid the CONFIG if possible, though. Would this do the right thing > on s390? This appears to work where I'm able to test it (32/64 x86, > 32/64 arm): > > unsigned long textlow = (unsigned long)_stext; > unsigned long texthigh = (unsigned long)_etext; > unsigned long textlow_linear = (unsigned long)__va(__pa(textlow); > unsigned long texthigh_linear = (unsigned long)__va(__pa(texthigh); > as we have #define PAGE_OFFSET 0x0UL #define __pa(x) (unsigned long)(x) #define __va(x) (void *)(unsigned long)(x) both should be identical on s390 as of today, so it should work fine and only do the check once > if (overlaps(ptr, n, textlow, texthigh)) > return ""; > > /* Check against possible secondary linear mapping as well. */ > if (textlow != textlow_linear && > overlaps(ptr, n, textlow_linear, texthigh_linear)) > return ""; > > return NULL; > > > -Kees > PS: Not sure how useful and flexible this offers is but you can get some temporary free access to an s390 on https://developer.ibm.com/linuxone/ ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 02/11] mm: Hardened usercopy
On Tue, Jul 19, 2016 at 1:14 PM, Christian Borntraegerwrote: > On 07/19/2016 09:31 PM, Kees Cook wrote: >> On Tue, Jul 19, 2016 at 2:21 AM, Christian Borntraeger >> wrote: >>> On 07/15/2016 11:44 PM, Kees Cook wrote: +config HAVE_ARCH_LINEAR_KERNEL_MAPPING + bool + help + An architecture should select this if it has a secondary linear + mapping of the kernel text. This is used to verify that kernel + text exposures are not visible under CONFIG_HARDENED_USERCOPY. >>> >>> I have trouble parsing this. (What does secondary linear mapping mean?) >> >> I likely need help clarifying this language... >> >>> So let me give an example below >>> + >>> [...] +/* Is this address range in the kernel text area? */ +static inline const char *check_kernel_text_object(const void *ptr, +unsigned long n) +{ + unsigned long textlow = (unsigned long)_stext; + unsigned long texthigh = (unsigned long)_etext; + + if (overlaps(ptr, n, textlow, texthigh)) + return ""; + +#ifdef HAVE_ARCH_LINEAR_KERNEL_MAPPING + /* Check against linear mapping as well. */ + if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)), + (unsigned long)__va(__pa(texthigh + return ""; +#endif + + return NULL; +} >>> >>> s390 has an address space for user (primary address space from 0..4TB/8PB) >>> and a separate >>> address space (home space from 0..4TB/8PB) for the kernel. In this home >>> space the kernel >>> mapping is virtual containing the physical memory as well as vmalloc memory >>> (creating aliases >>> into the physical one). The kernel text is mapped from _stext to _etext in >>> this mapping. >>> So I assume this would qualify for HAVE_ARCH_LINEAR_KERNEL_MAPPING ? >> >> If I understand your example, yes. In the home space you have two >> addresses that reference the kernel image? > > No, there is only one address that points to the kernel. > As we have no kernel ASLR yet, and the kernel mapping is > a 1:1 mapping from 0 to memory end and the kernel is only > from _stext to _etext. 
The vmalloc area contains modules > and vmalloc but not a 2nd kernel mapping. > > But thanks for your example, now I understood. If we have only > one address + if (overlaps(ptr, n, textlow, texthigh)) + return ""; > > This is just enough. > > So what about for the CONFIG text: > >An architecture should select this if the kernel mapping has a > secondary >linear mapping of the kernel text - in other words more than one > virtual >kernel address that points to the kernel image. This is used to verify >that kernel text exposures are not visible under > CONFIG_HARDENED_USERCOPY. Sounds good, I've adjusted it for now. >> I wonder if I can avoid the CONFIG entirely if I just did a >> __va(__pa(_stext)) != _stext test... would that break anyone? > > Can this be resolved on all platforms at compile time? Well, I think it still needs a runtime check (compile-time may not be able to tell about kaslr, or who knows what else). I would really like to avoid the CONFIG if possible, though. Would this do the right thing on s390? This appears to work where I'm able to test it (32/64 x86, 32/64 arm): unsigned long textlow = (unsigned long)_stext; unsigned long texthigh = (unsigned long)_etext; unsigned long textlow_linear = (unsigned long)__va(__pa(textlow); unsigned long texthigh_linear = (unsigned long)__va(__pa(texthigh); if (overlaps(ptr, n, textlow, texthigh)) return ""; /* Check against possible secondary linear mapping as well. */ if (textlow != textlow_linear && overlaps(ptr, n, textlow_linear, texthigh_linear)) return ""; return NULL; -Kees -- Kees Cook Chrome OS & Brillo Security ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 02/11] mm: Hardened usercopy
On 07/19/2016 09:31 PM, Kees Cook wrote: > On Tue, Jul 19, 2016 at 2:21 AM, Christian Borntraeger >wrote: >> On 07/15/2016 11:44 PM, Kees Cook wrote: >>> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING >>> + bool >>> + help >>> + An architecture should select this if it has a secondary linear >>> + mapping of the kernel text. This is used to verify that kernel >>> + text exposures are not visible under CONFIG_HARDENED_USERCOPY. >> >> I have trouble parsing this. (What does secondary linear mapping mean?) > > I likely need help clarifying this language... > >> So let me give an example below >> >>> + >> [...] >>> +/* Is this address range in the kernel text area? */ >>> +static inline const char *check_kernel_text_object(const void *ptr, >>> +unsigned long n) >>> +{ >>> + unsigned long textlow = (unsigned long)_stext; >>> + unsigned long texthigh = (unsigned long)_etext; >>> + >>> + if (overlaps(ptr, n, textlow, texthigh)) >>> + return ""; >>> + >>> +#ifdef HAVE_ARCH_LINEAR_KERNEL_MAPPING >>> + /* Check against linear mapping as well. */ >>> + if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)), >>> + (unsigned long)__va(__pa(texthigh >>> + return ""; >>> +#endif >>> + >>> + return NULL; >>> +} >> >> s390 has an address space for user (primary address space from 0..4TB/8PB) >> and a separate >> address space (home space from 0..4TB/8PB) for the kernel. In this home >> space the kernel >> mapping is virtual containing the physical memory as well as vmalloc memory >> (creating aliases >> into the physical one). The kernel text is mapped from _stext to _etext in >> this mapping. >> So I assume this would qualify for HAVE_ARCH_LINEAR_KERNEL_MAPPING ? > > If I understand your example, yes. In the home space you have two > addresses that reference the kernel image? No, there is only one address that points to the kernel. As we have no kernel ASLR yet, and the kernel mapping is a 1:1 mapping from 0 to memory end and the kernel is only from _stext to _etext. 
The vmalloc area contains modules and vmalloc but not a 2nd kernel mapping. But thanks for your example, now I understood. If we have only one address >>> + if (overlaps(ptr, n, textlow, texthigh)) >>> + return ""; This is just enough. So what about for the CONFIG text: An architecture should select this if the kernel mapping has a secondary linear mapping of the kernel text - in other words more than one virtual kernel address that points to the kernel image. This is used to verify that kernel text exposures are not visible under CONFIG_HARDENED_USERCOPY. > I wonder if I can avoid the CONFIG entirely if I just did a > __va(__pa(_stext)) != _stext test... would that break anyone? Can this be resolved on all platforms at compile time? ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 02/11] mm: Hardened usercopy
On Tue, Jul 19, 2016 at 2:21 AM, Christian Borntraegerwrote: > On 07/15/2016 11:44 PM, Kees Cook wrote: >> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING >> + bool >> + help >> + An architecture should select this if it has a secondary linear >> + mapping of the kernel text. This is used to verify that kernel >> + text exposures are not visible under CONFIG_HARDENED_USERCOPY. > > I have trouble parsing this. (What does secondary linear mapping mean?) I likely need help clarifying this language... > So let me give an example below > >> + > [...] >> +/* Is this address range in the kernel text area? */ >> +static inline const char *check_kernel_text_object(const void *ptr, >> +unsigned long n) >> +{ >> + unsigned long textlow = (unsigned long)_stext; >> + unsigned long texthigh = (unsigned long)_etext; >> + >> + if (overlaps(ptr, n, textlow, texthigh)) >> + return ""; >> + >> +#ifdef HAVE_ARCH_LINEAR_KERNEL_MAPPING >> + /* Check against linear mapping as well. */ >> + if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)), >> + (unsigned long)__va(__pa(texthigh >> + return ""; >> +#endif >> + >> + return NULL; >> +} > > s390 has an address space for user (primary address space from 0..4TB/8PB) > and a separate > address space (home space from 0..4TB/8PB) for the kernel. In this home space > the kernel > mapping is virtual containing the physical memory as well as vmalloc memory > (creating aliases > into the physical one). The kernel text is mapped from _stext to _etext in > this mapping. > So I assume this would qualify for HAVE_ARCH_LINEAR_KERNEL_MAPPING ? If I understand your example, yes. In the home space you have two addresses that reference the kernel image? The intent is that if __va(__pa(_stext)) != _stext, there's a linear mapping of physical memory in the virtual memory range. On x86_64, the kernel is visible in two locations in virtual memory. 
The kernel start in physical memory address 0x0100 maps to virtual address 0x88000100, and the "regular" virtual memory kernel address is at 0x8100: # grep Kernel /proc/iomem 0100-01a59767 : Kernel code 01a59768-0213d77f : Kernel data 0228-02fdefff : Kernel bss # grep startup_64 /proc/kallsyms 8100 T startup_64 # less /sys/kernel/debug/kernel_page_tables ... ---[ Low Kernel Mapping ]--- ... 0x88000100-0x880001a0 10M ro PSE GLB NX pmd 0x880001a0-0x880001a5c000 368K ro GLB NX pte 0x880001a5c000-0x880001c01680K RW GLB NX pte ... ---[ High Kernel Mapping ]--- ... 0x8100-0x81a0 10M ro PSE GLB x pmd 0x81a0-0x81a5c000 368K ro GLB x pte 0x81a5c000-0x81c01680K RW GLB NX pte ... I wonder if I can avoid the CONFIG entirely if I just did a __va(__pa(_stext)) != _stext test... would that break anyone? -Kees -- Kees Cook Chrome OS & Brillo Security ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V4 5/5] powerpc/kvm/stats: Implement existing and add new halt polling vcpu stats
On Tue, Jul 19, 2016 at 1:12 AM, Suraj Jitindar Singhwrote: > vcpu stats are used to collect information about a vcpu which can be viewed > in the debugfs. For example halt_attempted_poll and halt_successful_poll > are used to keep track of the number of times the vcpu attempts to and > successfully polls. These stats are currently not used on powerpc. > > Implement incrementation of the halt_attempted_poll and > halt_successful_poll vcpu stats for powerpc. Since these stats are summed > over all the vcpus for all running guests it doesn't matter which vcpu > they are attributed to, thus we choose the current runner vcpu of the > vcore. > > Also add new vcpu stats: halt_poll_success_ns, halt_poll_fail_ns and > halt_wait_ns to be used to accumulate the total time spend polling > successfully, polling unsuccessfully and waiting respectively, and > halt_successful_wait to accumulate the number of times the vcpu waits. > Given that halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns are > expressed in nanoseconds it is necessary to represent these as 64-bit > quantities, otherwise they would overflow after only about 4 seconds. > > Given that the total time spend either polling or waiting will be known and > the number of times that each was done, it will be possible to determine > the average poll and wait times. This will give the ability to tune the kvm > module parameters based on the calculated average wait and poll times. > > Signed-off-by: Suraj Jitindar Singh Reviewed-by: David Matlack > > --- > Change Log: > > V3 -> V4: > - Instead of accounting just wait and poll time, separate these > into successful_poll_time, failed_poll_time and wait_time. 
> --- > arch/powerpc/include/asm/kvm_host.h | 4 > arch/powerpc/kvm/book3s.c | 4 > arch/powerpc/kvm/book3s_hv.c| 36 +++- > 3 files changed, 39 insertions(+), 5 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_host.h > b/arch/powerpc/include/asm/kvm_host.h > index f6304c5..f15ffc0 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -114,8 +114,12 @@ struct kvm_vcpu_stat { > u64 emulated_inst_exits; > u64 dec_exits; > u64 ext_intr_exits; > + u64 halt_poll_success_ns; > + u64 halt_poll_fail_ns; > + u64 halt_wait_ns; > u64 halt_successful_poll; > u64 halt_attempted_poll; > + u64 halt_successful_wait; > u64 halt_poll_invalid; > u64 halt_wakeup; > u64 dbell_exits; > diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c > index 47018fc..71eb8f3 100644 > --- a/arch/powerpc/kvm/book3s.c > +++ b/arch/powerpc/kvm/book3s.c > @@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { > { "dec", VCPU_STAT(dec_exits) }, > { "ext_intr",VCPU_STAT(ext_intr_exits) }, > { "queue_intr", VCPU_STAT(queue_intr) }, > + { "halt_poll_success_ns", VCPU_STAT(halt_poll_success_ns) }, > + { "halt_poll_fail_ns", VCPU_STAT(halt_poll_fail_ns) }, > + { "halt_wait_ns", VCPU_STAT(halt_wait_ns) }, > { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, > { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, > + { "halt_successful_wait", VCPU_STAT(halt_successful_wait) }, > { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, > { "halt_wakeup", VCPU_STAT(halt_wakeup) }, > { "pf_storage", VCPU_STAT(pf_storage) }, > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index a9de1d4..81072f2 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -2679,15 +2679,16 @@ static int kvmppc_vcore_check_block(struct > kvmppc_vcore *vc) > */ > static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) > { > + ktime_t cur, start_poll, start_wait; > int do_sleep = 1; > 
- ktime_t cur, start; > u64 block_ns; > DECLARE_SWAITQUEUE(wait); > > /* Poll for pending exceptions and ceded state */ > - cur = start = ktime_get(); > + cur = start_poll = ktime_get(); > if (vc->halt_poll_ns) { > - ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns); > + ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns); > + ++vc->runner->stat.halt_attempted_poll; > > vc->vcore_state = VCORE_POLLING; > spin_unlock(>lock); > @@ -2703,8 +2704,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore > *vc) > spin_lock(>lock); > vc->vcore_state = VCORE_INACTIVE; > > - if (!do_sleep) > + if (!do_sleep) { > + ++vc->runner->stat.halt_successful_poll; > goto out;
Re: [PATCH V4 4/5] kvm/stats: Add provisioning for ulong vm stats and u64 vcpu stats
On Tue, Jul 19, 2016 at 1:12 AM, Suraj Jitindar Singhwrote: > vms and vcpus have statistics associated with them which can be viewed > within the debugfs. Currently it is assumed within the vcpu_stat_get() and > vm_stat_get() functions that all of these statistics are represented as > u32s, however the next patch adds some u64 vcpu statistics. > > Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly. > Since vcpu statistics are per vcpu, they will only be updated by a single > vcpu at a time so this shouldn't present a problem on 32-bit machines > which can't atomically increment 64-bit numbers. However vm statistics > could potentially be updated by multiple vcpus from that vm at a time. > To avoid the overhead of atomics make all vm statistics ulong such that > they are 64-bit on 64-bit systems where they can be atomically incremented > and are 32-bit on 32-bit systems which may not be able to atomically > increment 64-bit numbers. Modify vm_stat_get() to expect ulongs. > > Signed-off-by: Suraj Jitindar Singh Looks great, thanks. Reviewed-by: David Matlack > > --- > Change Log: > > V2 -> V3: > - Instead of implementing separate u32 and u64 functions keep the > generic functions and modify them to expect u64s. Thus update all > vm and vcpu statistics to u64s accordingly. 
> V3 -> V4: > - Change vm_stats from u64 to ulong > --- > arch/arm/include/asm/kvm_host.h | 12 ++-- > arch/arm64/include/asm/kvm_host.h | 12 ++-- > arch/mips/include/asm/kvm_host.h| 46 ++--- > arch/powerpc/include/asm/kvm_host.h | 60 - > arch/s390/include/asm/kvm_host.h| 128 > ++-- > arch/x86/include/asm/kvm_host.h | 72 ++-- > virt/kvm/kvm_main.c | 4 +- > 7 files changed, 167 insertions(+), 167 deletions(-) > > diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h > index 96387d4..c8e55b3b 100644 > --- a/arch/arm/include/asm/kvm_host.h > +++ b/arch/arm/include/asm/kvm_host.h > @@ -183,15 +183,15 @@ struct kvm_vcpu_arch { > }; > > struct kvm_vm_stat { > - u32 remote_tlb_flush; > + ulong remote_tlb_flush; > }; > > struct kvm_vcpu_stat { > - u32 halt_successful_poll; > - u32 halt_attempted_poll; > - u32 halt_poll_invalid; > - u32 halt_wakeup; > - u32 hvc_exit_stat; > + u64 halt_successful_poll; > + u64 halt_attempted_poll; > + u64 halt_poll_invalid; > + u64 halt_wakeup; > + u64 hvc_exit_stat; > u64 wfe_exit_stat; > u64 wfi_exit_stat; > u64 mmio_exit_user; > diff --git a/arch/arm64/include/asm/kvm_host.h > b/arch/arm64/include/asm/kvm_host.h > index 49095fc..b14c8bc 100644 > --- a/arch/arm64/include/asm/kvm_host.h > +++ b/arch/arm64/include/asm/kvm_host.h > @@ -291,15 +291,15 @@ struct kvm_vcpu_arch { > #endif > > struct kvm_vm_stat { > - u32 remote_tlb_flush; > + ulong remote_tlb_flush; > }; > > struct kvm_vcpu_stat { > - u32 halt_successful_poll; > - u32 halt_attempted_poll; > - u32 halt_poll_invalid; > - u32 halt_wakeup; > - u32 hvc_exit_stat; > + u64 halt_successful_poll; > + u64 halt_attempted_poll; > + u64 halt_poll_invalid; > + u64 halt_wakeup; > + u64 hvc_exit_stat; > u64 wfe_exit_stat; > u64 wfi_exit_stat; > u64 mmio_exit_user; > diff --git a/arch/mips/include/asm/kvm_host.h > b/arch/mips/include/asm/kvm_host.h > index 36a391d..9704888 100644 > --- a/arch/mips/include/asm/kvm_host.h > +++ b/arch/mips/include/asm/kvm_host.h > @@ -98,32 
+98,32 @@ extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn); > extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn); > > struct kvm_vm_stat { > - u32 remote_tlb_flush; > + ulong remote_tlb_flush; > }; > > struct kvm_vcpu_stat { > - u32 wait_exits; > - u32 cache_exits; > - u32 signal_exits; > - u32 int_exits; > - u32 cop_unusable_exits; > - u32 tlbmod_exits; > - u32 tlbmiss_ld_exits; > - u32 tlbmiss_st_exits; > - u32 addrerr_st_exits; > - u32 addrerr_ld_exits; > - u32 syscall_exits; > - u32 resvd_inst_exits; > - u32 break_inst_exits; > - u32 trap_inst_exits; > - u32 msa_fpe_exits; > - u32 fpe_exits; > - u32 msa_disabled_exits; > - u32 flush_dcache_exits; > - u32 halt_successful_poll; > - u32 halt_attempted_poll; > - u32 halt_poll_invalid; > - u32 halt_wakeup; > + u64 wait_exits; > + u64 cache_exits; > + u64 signal_exits; > + u64 int_exits; > + u64 cop_unusable_exits; > + u64 tlbmod_exits; > + u64 tlbmiss_ld_exits; > + u64 tlbmiss_st_exits; > + u64 addrerr_st_exits; > + u64 addrerr_ld_exits; > + u64
Re: [PATCH v3 02/11] mm: Hardened usercopy
On Mon, Jul 18, 2016 at 6:52 PM, Laura Abbottwrote: > On 07/15/2016 02:44 PM, Kees Cook wrote: >> >> This is the start of porting PAX_USERCOPY into the mainline kernel. This >> is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The >> work is based on code by PaX Team and Brad Spengler, and an earlier port >> from Casey Schaufler. Additional non-slab page tests are from Rik van >> Riel. >> >> This patch contains the logic for validating several conditions when >> performing copy_to_user() and copy_from_user() on the kernel object >> being copied to/from: >> - address range doesn't wrap around >> - address range isn't NULL or zero-allocated (with a non-zero copy size) >> - if on the slab allocator: >> - object size must be less than or equal to copy size (when check is >> implemented in the allocator, which appear in subsequent patches) >> - otherwise, object must not span page allocations >> - if on the stack >> - object must not extend before/after the current process task >> - object must be contained by the current stack frame (when there is >> arch/build support for identifying stack frames) >> - object must not overlap with kernel text >> >> Signed-off-by: Kees Cook >> Tested-By: Valdis Kletnieks >> Tested-by: Michael Ellerman >> --- >> arch/Kconfig| 7 ++ >> include/linux/slab.h| 12 +++ >> include/linux/thread_info.h | 15 +++ >> mm/Makefile | 4 + >> mm/usercopy.c | 234 >> >> security/Kconfig| 28 ++ >> 6 files changed, 300 insertions(+) >> create mode 100644 mm/usercopy.c >> >> diff --git a/arch/Kconfig b/arch/Kconfig >> index 5e2776562035..195ee4cc939a 100644 >> --- a/arch/Kconfig >> +++ b/arch/Kconfig >> @@ -433,6 +433,13 @@ config HAVE_ARCH_WITHIN_STACK_FRAMES >> and similar) by implementing an inline >> arch_within_stack_frames(), >> which is used by CONFIG_HARDENED_USERCOPY. 
>> >> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING >> + bool >> + help >> + An architecture should select this if it has a secondary linear >> + mapping of the kernel text. This is used to verify that kernel >> + text exposures are not visible under CONFIG_HARDENED_USERCOPY. >> + >> config HAVE_CONTEXT_TRACKING >> bool >> help >> diff --git a/include/linux/slab.h b/include/linux/slab.h >> index aeb3e6d00a66..96a16a3fb7cb 100644 >> --- a/include/linux/slab.h >> +++ b/include/linux/slab.h >> @@ -155,6 +155,18 @@ void kfree(const void *); >> void kzfree(const void *); >> size_t ksize(const void *); >> >> +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR >> +const char *__check_heap_object(const void *ptr, unsigned long n, >> + struct page *page); >> +#else >> +static inline const char *__check_heap_object(const void *ptr, >> + unsigned long n, >> + struct page *page) >> +{ >> + return NULL; >> +} >> +#endif >> + >> /* >> * Some archs want to perform DMA into kmalloc caches and need a >> guaranteed >> * alignment larger than the alignment of a 64-bit integer. 
>> diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h >> index 3d5c80b4391d..f24b99eac969 100644 >> --- a/include/linux/thread_info.h >> +++ b/include/linux/thread_info.h >> @@ -155,6 +155,21 @@ static inline int arch_within_stack_frames(const void >> * const stack, >> } >> #endif >> >> +#ifdef CONFIG_HARDENED_USERCOPY >> +extern void __check_object_size(const void *ptr, unsigned long n, >> + bool to_user); >> + >> +static inline void check_object_size(const void *ptr, unsigned long n, >> +bool to_user) >> +{ >> + __check_object_size(ptr, n, to_user); >> +} >> +#else >> +static inline void check_object_size(const void *ptr, unsigned long n, >> +bool to_user) >> +{ } >> +#endif /* CONFIG_HARDENED_USERCOPY */ >> + >> #endif /* __KERNEL__ */ >> >> #endif /* _LINUX_THREAD_INFO_H */ >> diff --git a/mm/Makefile b/mm/Makefile >> index 78c6f7dedb83..32d37247c7e5 100644 >> --- a/mm/Makefile >> +++ b/mm/Makefile >> @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n >> KCOV_INSTRUMENT_mmzone.o := n >> KCOV_INSTRUMENT_vmstat.o := n >> >> +# Since __builtin_frame_address does work as used, disable the warning. >> +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) >> + >> mmu-y := nommu.o >> mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ >>mlock.o mmap.o mprotect.o mremap.o msync.o >> rmap.o \ >> @@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o >> obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o >>
Re: [PATCH V4 3/5] kvm/ppc/book3s_hv: Implement halt polling in the kvm_hv kernel module
On Tue, Jul 19, 2016 at 1:12 AM, Suraj Jitindar Singhwrote: > This patch introduces new halt polling functionality into the kvm_hv kernel > module. When a vcore is idle it will poll for some period of time before > scheduling itself out. > > When all of the runnable vcpus on a vcore have ceded (and thus the vcore is > idle) we schedule ourselves out to allow something else to run. In the > event that we need to wake up very quickly (for example an interrupt > arrives), we are required to wait until we get scheduled again. > > Implement halt polling so that when a vcore is idle, and before scheduling > ourselves, we poll for vcpus in the runnable_threads list which have > pending exceptions or which leave the ceded state. If we poll successfully > then we can get back into the guest very quickly without ever scheduling > ourselves, otherwise we schedule ourselves out as before. > > Testing of this patch with a TCP round robin test between two guests with > virtio network interfaces has found a decrease in round trip time of ~15us > on average. A performance gain is only seen when going out of and > back into the guest often and quickly, otherwise there is no net benefit > from the polling. The polling interval is adjusted such that when we are > often scheduled out for long periods of time it is reduced, and when we > often poll successfully it is increased. The rate at which the polling > interval increases or decreases, and the maximum polling interval, can > be set through module parameters. > > Based on the implementation in the generic kvm module by Wanpeng Li and > Paolo Bonzini, and on direction from Paul Mackerras. 
> > Signed-off-by: Suraj Jitindar Singh > --- > arch/powerpc/include/asm/kvm_book3s.h | 1 + > arch/powerpc/include/asm/kvm_host.h | 1 + > arch/powerpc/kvm/book3s_hv.c | 116 > ++ > arch/powerpc/kvm/trace_hv.h | 22 +++ > 4 files changed, 126 insertions(+), 14 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_book3s.h > b/arch/powerpc/include/asm/kvm_book3s.h > index 151f817..c261f52 100644 > --- a/arch/powerpc/include/asm/kvm_book3s.h > +++ b/arch/powerpc/include/asm/kvm_book3s.h > @@ -102,6 +102,7 @@ struct kvmppc_vcore { > ulong pcr; > ulong dpdes;/* doorbell state (POWER8) */ > ulong conferring_threads; > + unsigned int halt_poll_ns; > }; > > struct kvmppc_vcpu_book3s { > diff --git a/arch/powerpc/include/asm/kvm_host.h > b/arch/powerpc/include/asm/kvm_host.h > index 02d06e9..610f393 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -294,6 +294,7 @@ struct kvm_arch { > #define VCORE_SLEEPING 3 > #define VCORE_RUNNING 4 > #define VCORE_EXITING 5 > +#define VCORE_POLLING 6 > > /* > * Struct used to manage memory for a virtual processor area > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 3bcf9e6..a9de1d4 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, _param_ops, > _ipi_redirect, > MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host > core"); > #endif > > +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ > +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; > +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); > +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); > + > +/* Factor by which the vcore halt poll interval is grown, default is to > double > + */ > +static unsigned int halt_poll_ns_grow = 2; > +module_param(halt_poll_ns_grow, int, S_IRUGO); > +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is 
grown by"); > + > +/* Factor by which the vcore halt poll interval is shrunk, default is to > reset > + */ > +static unsigned int halt_poll_ns_shrink; > +module_param(halt_poll_ns_shrink, int, S_IRUGO); > +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); > + > static void kvmppc_end_cede(struct kvm_vcpu *vcpu); > static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); > > @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore > *vc, > finish_wait(>arch.cpu_run, ); > } > > +static void grow_halt_poll_ns(struct kvmppc_vcore *vc) > +{ > + /* 10us base */ > + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) > + vc->halt_poll_ns = 1; > + else > + vc->halt_poll_ns *= halt_poll_ns_grow; > + > + if (vc->halt_poll_ns > halt_poll_max_ns) > + vc->halt_poll_ns = halt_poll_max_ns; > +} > + > +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) > +{ > + if (halt_poll_ns_shrink == 0) > + vc->halt_poll_ns = 0; > + else > + vc->halt_poll_ns /= halt_poll_ns_shrink; > +} > + > +/* Check
Re: [PATCH v3 02/11] mm: Hardened usercopy
On Mon, Jul 18, 2016 at 6:06 PM, Laura Abbottwrote: > On 07/15/2016 02:44 PM, Kees Cook wrote: >> >> This is the start of porting PAX_USERCOPY into the mainline kernel. This >> is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The >> work is based on code by PaX Team and Brad Spengler, and an earlier port >> from Casey Schaufler. Additional non-slab page tests are from Rik van >> Riel. >> >> This patch contains the logic for validating several conditions when >> performing copy_to_user() and copy_from_user() on the kernel object >> being copied to/from: >> - address range doesn't wrap around >> - address range isn't NULL or zero-allocated (with a non-zero copy size) >> - if on the slab allocator: >> - object size must be less than or equal to copy size (when check is >> implemented in the allocator, which appear in subsequent patches) >> - otherwise, object must not span page allocations >> - if on the stack >> - object must not extend before/after the current process task >> - object must be contained by the current stack frame (when there is >> arch/build support for identifying stack frames) >> - object must not overlap with kernel text >> >> Signed-off-by: Kees Cook >> Tested-By: Valdis Kletnieks >> Tested-by: Michael Ellerman >> --- >> arch/Kconfig| 7 ++ >> include/linux/slab.h| 12 +++ >> include/linux/thread_info.h | 15 +++ >> mm/Makefile | 4 + >> mm/usercopy.c | 234 >> >> security/Kconfig| 28 ++ >> 6 files changed, 300 insertions(+) >> create mode 100644 mm/usercopy.c >> >> diff --git a/arch/Kconfig b/arch/Kconfig >> index 5e2776562035..195ee4cc939a 100644 >> --- a/arch/Kconfig >> +++ b/arch/Kconfig >> @@ -433,6 +433,13 @@ config HAVE_ARCH_WITHIN_STACK_FRAMES >> and similar) by implementing an inline >> arch_within_stack_frames(), >> which is used by CONFIG_HARDENED_USERCOPY. 
>> >> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING >> + bool >> + help >> + An architecture should select this if it has a secondary linear >> + mapping of the kernel text. This is used to verify that kernel >> + text exposures are not visible under CONFIG_HARDENED_USERCOPY. >> + >> config HAVE_CONTEXT_TRACKING >> bool >> help >> diff --git a/include/linux/slab.h b/include/linux/slab.h >> index aeb3e6d00a66..96a16a3fb7cb 100644 >> --- a/include/linux/slab.h >> +++ b/include/linux/slab.h >> @@ -155,6 +155,18 @@ void kfree(const void *); >> void kzfree(const void *); >> size_t ksize(const void *); >> >> +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR >> +const char *__check_heap_object(const void *ptr, unsigned long n, >> + struct page *page); >> +#else >> +static inline const char *__check_heap_object(const void *ptr, >> + unsigned long n, >> + struct page *page) >> +{ >> + return NULL; >> +} >> +#endif >> + >> /* >> * Some archs want to perform DMA into kmalloc caches and need a >> guaranteed >> * alignment larger than the alignment of a 64-bit integer. 
>> diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h >> index 3d5c80b4391d..f24b99eac969 100644 >> --- a/include/linux/thread_info.h >> +++ b/include/linux/thread_info.h >> @@ -155,6 +155,21 @@ static inline int arch_within_stack_frames(const void >> * const stack, >> } >> #endif >> >> +#ifdef CONFIG_HARDENED_USERCOPY >> +extern void __check_object_size(const void *ptr, unsigned long n, >> + bool to_user); >> + >> +static inline void check_object_size(const void *ptr, unsigned long n, >> +bool to_user) >> +{ >> + __check_object_size(ptr, n, to_user); >> +} >> +#else >> +static inline void check_object_size(const void *ptr, unsigned long n, >> +bool to_user) >> +{ } >> +#endif /* CONFIG_HARDENED_USERCOPY */ >> + >> #endif /* __KERNEL__ */ >> >> #endif /* _LINUX_THREAD_INFO_H */ >> diff --git a/mm/Makefile b/mm/Makefile >> index 78c6f7dedb83..32d37247c7e5 100644 >> --- a/mm/Makefile >> +++ b/mm/Makefile >> @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n >> KCOV_INSTRUMENT_mmzone.o := n >> KCOV_INSTRUMENT_vmstat.o := n >> >> +# Since __builtin_frame_address does work as used, disable the warning. >> +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address) >> + >> mmu-y := nommu.o >> mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ >>mlock.o mmap.o mprotect.o mremap.o msync.o >> rmap.o \ >> @@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o >> obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o >>
Re: [PATCH v2 1/2] powerpc/pseries: Implemented indexed-count hotplug memory add
On 07/18/2016 08:07 AM, Sahil Mehta wrote: > Indexed-count add for memory hotplug guarantees that a contiguous block > of lmbs beginning at a specified will be assigned (NOT > that lmbs will be added). Because of Qemu's per-DIMM memory > management, the addition of a contiguous block of memory currently > requires a series of individual calls. Indexed-count add reduces > this series into a single call. > > Signed-off-by: Sahil Mehta> --- > v2: -remove potential memory leak when parsing command > -use u32s drc_index and count instead of u32 ic[] >in dlpar_memory > > arch/powerpc/include/asm/rtas.h |2 > arch/powerpc/platforms/pseries/dlpar.c | 34 +++- > arch/powerpc/platforms/pseries/hotplug-memory.c | 100 > +-- > 3 files changed, 124 insertions(+), 12 deletions(-) > > diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h > index 51400ba..f46b271 100644 > --- a/arch/powerpc/include/asm/rtas.h > +++ b/arch/powerpc/include/asm/rtas.h > @@ -307,6 +307,7 @@ struct pseries_hp_errorlog { > union { > __be32 drc_index; > __be32 drc_count; > + __be32 indexed_count[2]; > chardrc_name[1]; > } _drc_u; > }; > @@ -322,6 +323,7 @@ struct pseries_hp_errorlog { > #define PSERIES_HP_ELOG_ID_DRC_NAME 1 > #define PSERIES_HP_ELOG_ID_DRC_INDEX 2 > #define PSERIES_HP_ELOG_ID_DRC_COUNT 3 > +#define PSERIES_HP_ELOG_ID_IC4 For consistency it would be nice if this had the same prefix, namely PSERIES_HP_ELOG_ID_DRC_XXX, as the previous types. Otherwise, we need to remember that indexed count is named slightly different. 
-Tyrel > > struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, > uint16_t section_id); > diff --git a/arch/powerpc/platforms/pseries/dlpar.c > b/arch/powerpc/platforms/pseries/dlpar.c > index 2b93ae8..2a6dc9e 100644 > --- a/arch/powerpc/platforms/pseries/dlpar.c > +++ b/arch/powerpc/platforms/pseries/dlpar.c > @@ -345,11 +345,17 @@ static int handle_dlpar_errorlog(struct > pseries_hp_errorlog *hp_elog) > switch (hp_elog->id_type) { > case PSERIES_HP_ELOG_ID_DRC_COUNT: > hp_elog->_drc_u.drc_count = > - be32_to_cpu(hp_elog->_drc_u.drc_count); > + be32_to_cpu(hp_elog->_drc_u.drc_count); > break; > case PSERIES_HP_ELOG_ID_DRC_INDEX: > hp_elog->_drc_u.drc_index = > - be32_to_cpu(hp_elog->_drc_u.drc_index); > + be32_to_cpu(hp_elog->_drc_u.drc_index); > + break; > + case PSERIES_HP_ELOG_ID_IC: > + hp_elog->_drc_u.indexed_count[0] = > + be32_to_cpu(hp_elog->_drc_u.indexed_count[0]); > + hp_elog->_drc_u.indexed_count[1] = > + be32_to_cpu(hp_elog->_drc_u.indexed_count[1]); > } > > switch (hp_elog->resource) { > @@ -409,7 +415,29 @@ static ssize_t dlpar_store(struct class *class, struct > class_attribute *attr, > goto dlpar_store_out; > } > > - if (!strncmp(arg, "index", 5)) { > + if (!strncmp(arg, "indexed-count", 13)) { > + u32 index, count; > + char *cstr, *istr; > + > + hp_elog->id_type = PSERIES_HP_ELOG_ID_IC; > + arg += strlen("indexed-count "); > + > + cstr = kstrdup(arg, GFP_KERNEL); > + istr = strchr(cstr, ' '); > + *istr++ = '\0'; > + > + if (kstrtou32(cstr, 0, ) || kstrtou32(istr, 0, )) { > + rc = -EINVAL; > + pr_err("Invalid index or count : \"%s\"\n", buf); > + kfree(cstr); > + goto dlpar_store_out; > + } > + > + kfree(cstr); > + > + hp_elog->_drc_u.indexed_count[0] = cpu_to_be32(count); > + hp_elog->_drc_u.indexed_count[1] = cpu_to_be32(index); > + } else if (!strncmp(arg, "index", 5)) { > u32 index; > > hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c > 
b/arch/powerpc/platforms/pseries/hotplug-memory.c > index 2ce1385..d7942ca 100644 > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c > @@ -701,6 +701,83 @@ static int dlpar_memory_add_by_index(u32 drc_index, > struct property *prop) > return rc; > } > > +static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index, > + struct property *prop) > +{ > + struct of_drconf_cell *lmbs; > + u32 num_lmbs, *p; > + int i, rc; > + int lmbs_available = 0, start_index = 0, end_index; > + > + pr_info("Attempting to hot-add %u
Re: [PATCH v2 2/2] powerpc/pseries: Implemented indexed-count hotplug memory remove
On 07/18/2016 10:08 AM, Sahil Mehta wrote: > Indexed-count remove for memory hotplug guarantees that a contiguous block > of lmbs beginning at a specified will be unassigned (NOT > that lmbs will be removed). Because of Qemu's per-DIMM memory > management, the removal of a contiguous block of memory currently > requires a series of individual calls. Indexed-count remove reduces > this series into a single call. > > Signed-off-by: Sahil MehtaReviewed-by: Nathan Fontenot > --- > v2: -use u32s drc_index and count instead of u32 ic[] >in dlpar_memory > > arch/powerpc/platforms/pseries/hotplug-memory.c | 84 > +++ > 1 file changed, 84 insertions(+) > > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c > b/arch/powerpc/platforms/pseries/hotplug-memory.c > index d7942ca..244e1a8 100644 > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c > @@ -503,6 +503,86 @@ static int dlpar_memory_remove_by_index(u32 drc_index, > struct property *prop) > return rc; > } > > +static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index, > + struct property *prop) > +{ > + struct of_drconf_cell *lmbs; > + u32 num_lmbs, *p; > + int i, rc; > + int lmbs_available = 0, start_index = 0, end_index; > + > + pr_info("Attempting to hot-remove %u LMB(s) at %x\n", > + lmbs_to_remove, drc_index); > + > + if (lmbs_to_remove == 0) > + return -EINVAL; > + > + p = prop->value; > + num_lmbs = *p++; > + lmbs = (struct of_drconf_cell *)p; > + > + /* Navigate to drc_index */ > + while (start_index < num_lmbs) { > + if (lmbs[start_index].drc_index == drc_index) > + break; > + > + start_index++; > + } > + > + end_index = start_index + lmbs_to_remove; > + > + /* Validate that there are enough LMBs to satisfy the request */ > + for (i = start_index; i < end_index; i++) { > + if (lmbs[i].flags & DRCONF_MEM_RESERVED) > + break; > + > + lmbs_available++; > + } > + > + if (lmbs_available < lmbs_to_remove) > + return -EINVAL; > + > + for 
(i = 0; i < end_index; i++) { > + if (!(lmbs[i].flags & DRCONF_MEM_ASSIGNED)) > + continue; > + > + rc = dlpar_remove_lmb([i]); > + if (rc) > + break; > + > + lmbs[i].reserved = 1; > + } > + > + if (rc) { > + pr_err("Memory indexed-count-remove failed, adding any removed > LMBs\n"); > + > + for (i = start_index; i < end_index; i++) { > + if (!lmbs[i].reserved) > + continue; > + > + rc = dlpar_add_lmb([i]); > + if (rc) > + pr_err("Failed to add LMB, drc index %x\n", > +be32_to_cpu(lmbs[i].drc_index)); > + > + lmbs[i].reserved = 0; > + } > + rc = -EINVAL; > + } else { > + for (i = start_index; i < end_index; i++) { > + if (!lmbs[i].reserved) > + continue; > + > + pr_info("Memory at %llx (drc index %x) was > hot-removed\n", > + lmbs[i].base_addr, lmbs[i].drc_index); > + > + lmbs[i].reserved = 0; > + } > + } > + > + return rc; > +} > + > #else > static inline int pseries_remove_memblock(unsigned long base, > unsigned int memblock_size) > @@ -821,6 +901,10 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) > } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) { > drc_index = hp_elog->_drc_u.drc_index; > rc = dlpar_memory_remove_by_index(drc_index, prop); > + } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_IC) { > + count = hp_elog->_drc_u.indexed_count[0]; > + drc_index = hp_elog->_drc_u.indexed_count[1]; > + rc = dlpar_memory_remove_by_ic(count, drc_index, prop); > } else > rc = -EINVAL; > break; > > On 07/18/2016 10:04 AM, Sahil Mehta wrote: >> Indexed-count memory management allows addition and removal of contiguous >> lmb blocks with a single command. When compared to the series of calls >> previously required to manage contiguous blocks, indexed-count decreases >> command frequency and reduces risk of buffer overflow. >> >> Changes in v2: >> -- >> -[PATCH 1/2]:-remove potential memory leak when parsing command >> -use u32s drc_index and count instead of u32 ic[] >> in
Re: [PATCH v2 1/2] powerpc/pseries: Implemented indexed-count hotplug memory add
On 07/18/2016 10:07 AM, Sahil Mehta wrote: > Indexed-count add for memory hotplug guarantees that a contiguous block > of lmbs beginning at a specified will be assigned (NOT > that lmbs will be added). Because of Qemu's per-DIMM memory > management, the addition of a contiguous block of memory currently > requires a series of individual calls. Indexed-count add reduces > this series into a single call. > > Signed-off-by: Sahil MehtaReviewed-by: Nathan Fontenot > --- > v2: -remove potential memory leak when parsing command > -use u32s drc_index and count instead of u32 ic[] >in dlpar_memory > > arch/powerpc/include/asm/rtas.h |2 > arch/powerpc/platforms/pseries/dlpar.c | 34 +++- > arch/powerpc/platforms/pseries/hotplug-memory.c | 100 > +-- > 3 files changed, 124 insertions(+), 12 deletions(-) > > diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h > index 51400ba..f46b271 100644 > --- a/arch/powerpc/include/asm/rtas.h > +++ b/arch/powerpc/include/asm/rtas.h > @@ -307,6 +307,7 @@ struct pseries_hp_errorlog { > union { > __be32 drc_index; > __be32 drc_count; > + __be32 indexed_count[2]; > chardrc_name[1]; > } _drc_u; > }; > @@ -322,6 +323,7 @@ struct pseries_hp_errorlog { > #define PSERIES_HP_ELOG_ID_DRC_NAME 1 > #define PSERIES_HP_ELOG_ID_DRC_INDEX 2 > #define PSERIES_HP_ELOG_ID_DRC_COUNT 3 > +#define PSERIES_HP_ELOG_ID_IC4 > > struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, > uint16_t section_id); > diff --git a/arch/powerpc/platforms/pseries/dlpar.c > b/arch/powerpc/platforms/pseries/dlpar.c > index 2b93ae8..2a6dc9e 100644 > --- a/arch/powerpc/platforms/pseries/dlpar.c > +++ b/arch/powerpc/platforms/pseries/dlpar.c > @@ -345,11 +345,17 @@ static int handle_dlpar_errorlog(struct > pseries_hp_errorlog *hp_elog) > switch (hp_elog->id_type) { > case PSERIES_HP_ELOG_ID_DRC_COUNT: > hp_elog->_drc_u.drc_count = > - be32_to_cpu(hp_elog->_drc_u.drc_count); > + be32_to_cpu(hp_elog->_drc_u.drc_count); > break; > case 
PSERIES_HP_ELOG_ID_DRC_INDEX: > hp_elog->_drc_u.drc_index = > - be32_to_cpu(hp_elog->_drc_u.drc_index); > + be32_to_cpu(hp_elog->_drc_u.drc_index); > + break; > + case PSERIES_HP_ELOG_ID_IC: > + hp_elog->_drc_u.indexed_count[0] = > + be32_to_cpu(hp_elog->_drc_u.indexed_count[0]); > + hp_elog->_drc_u.indexed_count[1] = > + be32_to_cpu(hp_elog->_drc_u.indexed_count[1]); > } > > switch (hp_elog->resource) { > @@ -409,7 +415,29 @@ static ssize_t dlpar_store(struct class *class, struct > class_attribute *attr, > goto dlpar_store_out; > } > > - if (!strncmp(arg, "index", 5)) { > + if (!strncmp(arg, "indexed-count", 13)) { > + u32 index, count; > + char *cstr, *istr; > + > + hp_elog->id_type = PSERIES_HP_ELOG_ID_IC; > + arg += strlen("indexed-count "); > + > + cstr = kstrdup(arg, GFP_KERNEL); > + istr = strchr(cstr, ' '); > + *istr++ = '\0'; > + > + if (kstrtou32(cstr, 0, ) || kstrtou32(istr, 0, )) { > + rc = -EINVAL; > + pr_err("Invalid index or count : \"%s\"\n", buf); > + kfree(cstr); > + goto dlpar_store_out; > + } > + > + kfree(cstr); > + > + hp_elog->_drc_u.indexed_count[0] = cpu_to_be32(count); > + hp_elog->_drc_u.indexed_count[1] = cpu_to_be32(index); > + } else if (!strncmp(arg, "index", 5)) { > u32 index; > > hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c > b/arch/powerpc/platforms/pseries/hotplug-memory.c > index 2ce1385..d7942ca 100644 > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c > @@ -701,6 +701,83 @@ static int dlpar_memory_add_by_index(u32 drc_index, > struct property *prop) > return rc; > } > > +static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index, > + struct property *prop) > +{ > + struct of_drconf_cell *lmbs; > + u32 num_lmbs, *p; > + int i, rc; > + int lmbs_available = 0, start_index = 0, end_index; > + > + pr_info("Attempting to hot-add %u LMB(s) at index %x\n", > + lmbs_to_add, drc_index); > + > + if 
(lmbs_to_add == 0) > + return -EINVAL; > + > + p =
RE: [PATCH V2 7/7] thermal: qoriq: Add thermal management support
Hi Eduardo, Any comments on this patch? Thanks. -Hongtao. > -Original Message- > From: Jia Hongtao [mailto:hongtao@nxp.com] > Sent: Thursday, June 30, 2016 11:09 AM > To: edubez...@gmail.com; rui.zh...@intel.com; robh...@kernel.org; > ga...@codeaurora.org; Scott Wood; > shawn...@kernel.org > Cc: linux...@vger.kernel.org; devicet...@vger.kernel.org; linux- > ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-arm- > ker...@lists.infradead.org; Hongtao Jia > Subject: [PATCH V2 7/7] thermal: qoriq: Add thermal management support > > This driver add thermal management support by enabling TMU (Thermal > Monitoring Unit) on QorIQ platform. > > It's based on thermal of framework: > - Trip points defined in device tree. > - Cpufreq as cooling device registered in qoriq cpufreq driver. > > Signed-off-by: Jia Hongtao > --- > Changes of V2: > * Add HAS_IOMEM dependency to fix build error on UM > > drivers/thermal/Kconfig | 10 ++ > drivers/thermal/Makefile| 1 + > drivers/thermal/qoriq_thermal.c | 328 > > 3 files changed, 339 insertions(+) > create mode 100644 drivers/thermal/qoriq_thermal.c > > diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index > 2d702ca..56ef30d 100644 > --- a/drivers/thermal/Kconfig > +++ b/drivers/thermal/Kconfig > @@ -195,6 +195,16 @@ config IMX_THERMAL > cpufreq is used as the cooling device to throttle CPUs when the > passive trip is crossed. > > +config QORIQ_THERMAL > + tristate "QorIQ Thermal Monitoring Unit" > + depends on THERMAL_OF > + depends on HAS_IOMEM > + help > + Support for Thermal Monitoring Unit (TMU) found on QorIQ platforms. > + It supports one critical trip point and one passive trip point. The > + cpufreq is used as the cooling device to throttle CPUs when the > + passive trip is crossed. 
> + > config SPEAR_THERMAL > tristate "SPEAr thermal sensor driver" > depends on PLAT_SPEAR || COMPILE_TEST > diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index > 10b07c1..6662232 100644 > --- a/drivers/thermal/Makefile > +++ b/drivers/thermal/Makefile > @@ -37,6 +37,7 @@ obj-$(CONFIG_DB8500_THERMAL)+= db8500_thermal.o > obj-$(CONFIG_ARMADA_THERMAL) += armada_thermal.o > obj-$(CONFIG_TANGO_THERMAL) += tango_thermal.o > obj-$(CONFIG_IMX_THERMAL)+= imx_thermal.o > +obj-$(CONFIG_QORIQ_THERMAL) += qoriq_thermal.o > obj-$(CONFIG_DB8500_CPUFREQ_COOLING) += db8500_cpufreq_cooling.o > obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o > obj-$(CONFIG_X86_PKG_TEMP_THERMAL) += x86_pkg_temp_thermal.o > diff --git a/drivers/thermal/qoriq_thermal.c > b/drivers/thermal/qoriq_thermal.c new > file mode 100644 index 000..644ba52 > --- /dev/null > +++ b/drivers/thermal/qoriq_thermal.c > @@ -0,0 +1,328 @@ > +/* > + * Copyright 2016 Freescale Semiconductor, Inc. > + * > + * This program is free software; you can redistribute it and/or modify > +it > + * under the terms and conditions of the GNU General Public License, > + * version 2, as published by the Free Software Foundation. > + * > + * This program is distributed in the hope it will be useful, but > +WITHOUT > + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY > +or > + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public > +License for > + * more details. 
> + * > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "thermal_core.h" > + > +#define SITES_MAX16 > + > +/* > + * QorIQ TMU Registers > + */ > +struct qoriq_tmu_site_regs { > + u32 tritsr; /* Immediate Temperature Site Register */ > + u32 tratsr; /* Average Temperature Site Register */ > + u8 res0[0x8]; > +}; > + > +struct qoriq_tmu_regs { > + u32 tmr;/* Mode Register */ > +#define TMR_DISABLE 0x0 > +#define TMR_ME 0x8000 > +#define TMR_ALPF 0x0c00 > + u32 tsr;/* Status Register */ > + u32 tmtmir; /* Temperature measurement interval Register */ > +#define TMTMIR_DEFAULT 0x000f > + u8 res0[0x14]; > + u32 tier; /* Interrupt Enable Register */ > +#define TIER_DISABLE 0x0 > + u32 tidr; /* Interrupt Detect Register */ > + u32 tiscr; /* Interrupt Site Capture Register */ > + u32 ticscr; /* Interrupt Critical Site Capture Register */ > + u8 res1[0x10]; > + u32 tmhtcrh;/* High Temperature Capture Register */ > + u32 tmhtcrl;/* Low Temperature Capture Register */ > + u8 res2[0x8]; > + u32 tmhtitr;/* High Temperature Immediate Threshold */ > + u32 tmhtatr;/* High Temperature Average Threshold */ > + u32 tmhtactr; /* High Temperature Average
Re: [Patch v2] rpaphp: fix slot registration for multiple slots under a PHB
On 07/11/2016 05:16 PM, Tyrel Datwyler wrote: > PowerVM seems to only ever provide a single hotplug slot per PHB. > The underlying slot hotplug registration code assumed multiple slots, > but the actual implementation is broken for multiple slots. This went > unnoticed for years due to the nature of PowerVM as mentioned > previously. Under qemu/kvm the hotplug slot model aligns more with > x86 where multiple slots are presented under a single PHB. As seen > in the following each additional slot after the first fails to > register due to each slot always being compared against the first > child node of the PHB in the device tree. > > [6.492291] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1 > [6.492569] rpaphp: Slot [Slot 0] registered > [6.492577] rpaphp: pci_hp_register failed with error -16 > [6.493082] rpaphp: pci_hp_register failed with error -16 > [6.493138] rpaphp: pci_hp_register failed with error -16 > [6.493161] rpaphp: pci_hp_register failed with error -16 > > The registration logic is fixed so that each slot is compared > against the existing child devices of the PHB in the device tree to > determine present slots vs empty slots. 
> > [ 38.481750] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1 > [ 38.482004] rpaphp: Slot [C0] registered > [ 38.482127] rpaphp: Slot [C1] registered > [ 38.482241] rpaphp: Slot [C2] registered > [ 38.482356] rpaphp: Slot [C3] registered > [ 38.482495] rpaphp: Slot [C4] registered > > Signed-off-by: Tyrel DatwylerReviewed-by: Nathan Fontenot > --- > > Changes in v2: corrected ibm,my-drc-index property name > > --- > drivers/pci/hotplug/rpaphp_slot.c | 17 - > 1 file changed, 12 insertions(+), 5 deletions(-) > > diff --git a/drivers/pci/hotplug/rpaphp_slot.c > b/drivers/pci/hotplug/rpaphp_slot.c > index 6937c72..388c4d8 100644 > --- a/drivers/pci/hotplug/rpaphp_slot.c > +++ b/drivers/pci/hotplug/rpaphp_slot.c > @@ -117,8 +117,10 @@ EXPORT_SYMBOL_GPL(rpaphp_deregister_slot); > int rpaphp_register_slot(struct slot *slot) > { > struct hotplug_slot *php_slot = slot->hotplug_slot; > + struct device_node *child; > + u32 my_index; > int retval; > - int slotno; > + int slotno = -1; > > dbg("%s registering slot:path[%s] index[%x], name[%s] pdomain[%x] > type[%d]\n", > __func__, slot->dn->full_name, slot->index, slot->name, > @@ -130,10 +132,15 @@ int rpaphp_register_slot(struct slot *slot) > return -EAGAIN; > } > > - if (slot->dn->child) > - slotno = PCI_SLOT(PCI_DN(slot->dn->child)->devfn); > - else > - slotno = -1; > + for_each_child_of_node(slot->dn, child) { > + retval = of_property_read_u32(child, "ibm,my-drc-index", > _index); > + if (my_index == slot->index) { > + slotno = PCI_SLOT(PCI_DN(child)->devfn); > + of_node_put(child); > + break; > + } > + } > + > retval = pci_hp_register(php_slot, slot->bus, slotno, slot->name); > if (retval) { > err("pci_hp_register failed with error %d\n", retval); > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] crypto: vmx - Ignore generated files
Ignore assembly files generated by the perl script. Signed-off-by: Paulo Flabiano Smorigo --- drivers/crypto/vmx/.gitignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 drivers/crypto/vmx/.gitignore diff --git a/drivers/crypto/vmx/.gitignore b/drivers/crypto/vmx/.gitignore new file mode 100644 index 000..af4a7ce --- /dev/null +++ b/drivers/crypto/vmx/.gitignore @@ -0,0 +1,2 @@ +aesp8-ppc.S +ghashp8-ppc.S -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC 3/3] kexec: extend kexec_file_load system call
On Tue, Jul 19, 2016 at 01:47:28PM +0100, Mark Rutland wrote: > On Tue, Jul 19, 2016 at 08:24:06AM -0400, Vivek Goyal wrote: > > On Tue, Jul 19, 2016 at 11:52:00AM +0100, Mark Rutland wrote: > > > Regardless, this extended syscall changes some underlying assumptions > > > made with the development of kexec_file_load, and I think treating this > > > as an extension is not a great idea. From a user's perspective there is > > > little difference between passing an additional flag or using a > > > different syscall number, so I don't think that we gain much by altering > > > the existing prototype relative to allocating a new syscall number. > > > > If we are providing/opening up additional flags, I can't think what will > > it break. Same flag was invalid in old kernel but new kernel supports > > it and will accept it. So it sounds reasonable to me to add new flags. > > > > If existing users are not broken, then I think it might be a good idea > > to extend existing syscall. Otherwise userspace will have to be modified > > to understand a 3rd syscall also and an additional option will show up > > which asks users to specify which syscall to use. So extending existing > > syscall might keep it little simple for users. > > I don't follow. > > To use the new feature, you have to modify userspace anyway, as you > require userspace to pass information which it did not previously pass > (in the new arguments added to the syscall). > > The presence of a new syscall does not imply the absence of the old > syscall, so you can always use that be default unless the user asks for > asomething only the new syscall provides. Regardless of the > syscall/flags difference, you still have to detect whether the new > functionality is present somehow. > Hmm., so current idea is that we have two syscalls() which are *ideally* supposed to work for all arches. Difference between two is that first one does not support kernel signature verification while second one does. 
By default old syscall is used and user can force using new syscall using option --kexec-file-load. If a user DTB is present, I was hoping that it will continue to work the same way. Both the sycalls can be used and can handle DTB. If we introduce a 3rd syscall, that means only first and 3rd syscall can handle DTB and we need to introduce one more option which tells whether to use kexec_load() or use the 3rd new syscall. And that's what I am trying to avoid. Vivek > > BTW, does kexec_load() needs to be modified too to handle DT? > > No, at least for arm64. In the kexec_load case userspace provides the > DTB as a raw segment, and the user-provided purgatory sets up registers > to pass that to the new kernel. > > Thanks, > Mark. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC 3/3] kexec: extend kexec_file_load system call
On Tue, Jul 19, 2016 at 08:24:06AM -0400, Vivek Goyal wrote: > On Tue, Jul 19, 2016 at 11:52:00AM +0100, Mark Rutland wrote: > > Regardless, this extended syscall changes some underlying assumptions > > made with the development of kexec_file_load, and I think treating this > > as an extension is not a great idea. From a user's perspective there is > > little difference between passing an additional flag or using a > > different syscall number, so I don't think that we gain much by altering > > the existing prototype relative to allocating a new syscall number. > > If we are providing/opening up additional flags, I can't think what will > it break. Same flag was invalid in old kernel but new kernel supports > it and will accept it. So it sounds reasonable to me to add new flags. > > If existing users are not broken, then I think it might be a good idea > to extend existing syscall. Otherwise userspace will have to be modified > to understand a 3rd syscall also and an additional option will show up > which asks users to specify which syscall to use. So extending existing > syscall might keep it little simple for users. I don't follow. To use the new feature, you have to modify userspace anyway, as you require userspace to pass information which it did not previously pass (in the new arguments added to the syscall). The presence of a new syscall does not imply the absence of the old syscall, so you can always use that be default unless the user asks for asomething only the new syscall provides. Regardless of the syscall/flags difference, you still have to detect whether the new functionality is present somehow. > BTW, does kexec_load() needs to be modified too to handle DT? No, at least for arm64. In the kexec_load case userspace provides the DTB as a raw segment, and the user-provided purgatory sets up registers to pass that to the new kernel. Thanks, Mark. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC 3/3] kexec: extend kexec_file_load system call
On Tue, Jul 19, 2016 at 11:52:00AM +0100, Mark Rutland wrote: > On Tue, Jul 19, 2016 at 08:55:56AM +0800, Dave Young wrote: > > On 07/18/16 at 11:07am, Mark Rutland wrote: > > > On Mon, Jul 18, 2016 at 10:30:24AM +0800, Dave Young wrote: > > > > I do not think it is worth to add another syscall for extra fds. > > > > We have open(2) as an example for different numbers of arguments > > > > already. > > > > > > Did we change the syscall interface for that? > > > > > > I was under the impression that there was always one underlying syscall, > > > and the C library did the right thing to pass the expected information > > > to the underlying syscall. > > > > I'm not sure kexec_load and kexec_file_load were included in glibc, we use > > syscall directly in kexec-tools. > > > > kexec_load man pages says there are no wrappers for both kexec_load and > > kexec_file_load in glibc. > > For the above, I was talking about how open() was handled. > > If there are no userspace wrappers, then the two cases aren't comparable > in the first place... > > > > That's rather different to changing the underlying syscall. > > > > > > Regardless of how this is wrapped in userspace, I do not think modifying > > > the existing prototype is a good idea, and I think this kind of > > > extension needs to be a new syscall. > > > > Hmm, as I replied to Vivek, there is one case about the flags, previously > > the new flag will be regarded as invalid, but not we extend it it will be > > valid, this maybe the only potential bad case. > > It's true that adding suport for new flags will change the behaviour of > what used to be error cases. We generally expect real users to not be > making pointless calls for which they rely on an error being returned in > all cases. > > Regardless, this extended syscall changes some underlying assumptions > made with the development of kexec_file_load, and I think treating this > as an extension is not a great idea. 
From a user's perspective there is > little difference between passing an additional flag or using a > different syscall number, so I don't think that we gain much by altering > the existing prototype relative to allocating a new syscall number. If we are providing/opening up additional flags, I can't think what will it break. Same flag was invalid in old kernel but new kernel supports it and will accept it. So it sounds reasonable to me to add new flags. If existing users are not broken, then I think it might be a good idea to extend existing syscall. Otherwise userspace will have to be modified to understand a 3rd syscall also and an additional option will show up which asks users to specify which syscall to use. So extending existing syscall might keep it little simple for users. This is only if conclusion in the end is that DT needs to be passed in from user space. BTW, does kexec_load() needs to be modified too to handle DT? Vivek ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -next] wan/fsl_ucc_hdlc: remove .owner field for driver
From: Wei YongjunRemove .owner field if calls are used which set it automatically. Generated by: scripts/coccinelle/api/platform_no_drv_owner.cocci Signed-off-by: Wei Yongjun --- drivers/net/wan/fsl_ucc_hdlc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index b3861bf..10ca497 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1168,7 +1168,6 @@ static struct platform_driver ucc_hdlc_driver = { .probe = ucc_hdlc_probe, .remove = ucc_hdlc_remove, .driver = { - .owner = THIS_MODULE, .name = DRV_NAME, .pm = HDLC_PM_OPS, .of_match_table = fsl_ucc_hdlc_of_match, ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -next] wan/fsl_ucc_hdlc: use module_platform_driver to simplify the code
From: Wei Yongjunmodule_platform_driver() makes the code simpler by eliminating boilerplate code. Signed-off-by: Wei Yongjun --- drivers/net/wan/fsl_ucc_hdlc.c | 13 + 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c index b3861bf..3f6b218 100644 --- a/drivers/net/wan/fsl_ucc_hdlc.c +++ b/drivers/net/wan/fsl_ucc_hdlc.c @@ -1175,15 +1175,4 @@ static struct platform_driver ucc_hdlc_driver = { }, }; -static int __init ucc_hdlc_init(void) -{ - return platform_driver_register(_hdlc_driver); -} - -static void __exit ucc_hdlc_exit(void) -{ - platform_driver_unregister(_hdlc_driver); -} - -module_init(ucc_hdlc_init); -module_exit(ucc_hdlc_exit); +module_platform_driver(ucc_hdlc_driver); ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC 3/3] kexec: extend kexec_file_load system call
On Tue, Jul 19, 2016 at 08:55:56AM +0800, Dave Young wrote: > On 07/18/16 at 11:07am, Mark Rutland wrote: > > On Mon, Jul 18, 2016 at 10:30:24AM +0800, Dave Young wrote: > > > I do not think it is worth to add another syscall for extra fds. > > > We have open(2) as an example for different numbers of arguments > > > already. > > > > Did we change the syscall interface for that? > > > > I was under the impression that there was always one underlying syscall, > > and the C library did the right thing to pass the expected information > > to the underlying syscall. > > I'm not sure kexec_load and kexec_file_load were included in glibc, we use > syscall directly in kexec-tools. > > kexec_load man pages says there are no wrappers for both kexec_load and > kexec_file_load in glibc. For the above, I was talking about how open() was handled. If there are no userspace wrappers, then the two cases aren't comparable in the first place... > > That's rather different to changing the underlying syscall. > > > > Regardless of how this is wrapped in userspace, I do not think modifying > > the existing prototype is a good idea, and I think this kind of > > extension needs to be a new syscall. > > Hmm, as I replied to Vivek, there is one case about the flags, previously > the new flag will be regarded as invalid, but not we extend it it will be > valid, this maybe the only potential bad case. It's true that adding suport for new flags will change the behaviour of what used to be error cases. We generally expect real users to not be making pointless calls for which they rely on an error being returned in all cases. Regardless, this extended syscall changes some underlying assumptions made with the development of kexec_file_load, and I think treating this as an extension is not a great idea. 
From a user's perspective there is little difference between passing an additional flag or using a different syscall number, so I don't think that we gain much by altering the existing prototype relative to allocating a new syscall number. Thus, I think that if this is necessary it should be treated as a new syscall. Thanks, Mark. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Suspected regression?
Hi all, I have got two boards MVME5100 (MPC7410 cpu) and MVME7100 (MPC8641D cpu) for which I use the same cross-compiler (ppc7400). I tested these against kernel HEAD to find that these don't boot anymore (PID 1 crash). Bisecting results in first offending commit: 7aef4136566b0539a1a98391181e188905e33401 Removing it from HEAD makes boards boot properly again. A third system based on P2010 isn't affected at all. Is it a regression or have I made something wrong? Thanks! Ciao, Alessio ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/2] Automatically load the vmx_crypto module if supported
On Tue, Jul 19, 2016 at 07:13:24PM +1000, Michael Ellerman wrote: > > I'll assume patch 2 has your ack :) Sure, Acked-by: Herbert Xu Thanks, -- Email: Herbert Xu Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [1/1] KVM: PPC: Introduce KVM_CAP_PPC_HTM
Sam Bobroffwrites: > On Fri, Jul 08, 2016 at 08:49:49PM +1000, Michael Ellerman wrote: >> On Wed, 2016-06-07 at 06:05:54 UTC, Sam bobroff wrote: >> > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c >> > index 02416fe..06d79bc 100644 >> > --- a/arch/powerpc/kvm/powerpc.c >> > +++ b/arch/powerpc/kvm/powerpc.c >> > @@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, >> > long ext) >> >r = 1; >> >break; >> > #endif >> > + case KVM_CAP_PPC_HTM: >> > + r = cpu_has_feature(CPU_FTR_TM) >> > + && is_kvmppc_hv_enabled(kvm); >> >> I think it should be using CPU_FTR_TM_COMP. > > Oh, why is that? I'm happy to respin the patch I'm just curious. > > (I did it that way becuase that seems to be the way the other flags are used, > e.g. CPU_FTR_ALTIVEC). > > If I read the code correctly, using CPU_FTR_TM_COMP will work fine: it should > cause the cpu_has_feature() test to always return false if CPU_FTR_TM_COMP is > 0. CPU_FTR_TM says the CPU supports TM. CPU_FTR_TM_COMP says the CPU supports TM *and* the kernel is built with TM support. The distinction exists because currently the assembly patching macros don't deal correctly with a feature bit that is defined to 0. (And possibly other reasons I don't remember) We should fix that, but until we have, anything that is advertising support to userspace should be using the COMP bits, when they exist. cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3 02/11] mm: Hardened usercopy
On 07/15/2016 11:44 PM, Kees Cook wrote: > +config HAVE_ARCH_LINEAR_KERNEL_MAPPING > + bool > + help > + An architecture should select this if it has a secondary linear > + mapping of the kernel text. This is used to verify that kernel > + text exposures are not visible under CONFIG_HARDENED_USERCOPY. I have trouble parsing this. (What does secondary linear mapping mean?) So let me give an example below > + [...] > +/* Is this address range in the kernel text area? */ > +static inline const char *check_kernel_text_object(const void *ptr, > +unsigned long n) > +{ > + unsigned long textlow = (unsigned long)_stext; > + unsigned long texthigh = (unsigned long)_etext; > + > + if (overlaps(ptr, n, textlow, texthigh)) > + return ""; > + > +#ifdef HAVE_ARCH_LINEAR_KERNEL_MAPPING > + /* Check against linear mapping as well. */ > + if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)), > + (unsigned long)__va(__pa(texthigh > + return ""; > +#endif > + > + return NULL; > +} s390 has an address space for user (primary address space from 0..4TB/8PB) and a separate address space (home space from 0..4TB/8PB) for the kernel. In this home space the kernel mapping is virtual containing the physical memory as well as vmalloc memory (creating aliases into the physical one). The kernel text is mapped from _stext to _etext in this mapping. So I assume this would qualify for HAVE_ARCH_LINEAR_KERNEL_MAPPING ? ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/2] Automatically load the vmx_crypto module if supported
Herbert Xu writes: > On Tue, Jul 19, 2016 at 11:01:55AM +1000, Michael Ellerman wrote: >> >> Can you please ask for an ack before merging arch patches? >> >> That's a 70 line powerpc patch and a 6 line crypto patch. It has no >> reviews and no acks. I would have preferred it if we could take it via >> the powerpc tree. > > Sorry, I'll delete them from the crypto tree. Thanks. I'll assume patch 2 has your ack :) cheers ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V4 4/5] kvm/stats: Add provisioning for ulong vm stats and u64 vcpu stats
On 07/19/2016 10:12 AM, Suraj Jitindar Singh wrote: > vms and vcpus have statistics associated with them which can be viewed > within the debugfs. Currently it is assumed within the vcpu_stat_get() and > vm_stat_get() functions that all of these statistics are represented as > u32s, however the next patch adds some u64 vcpu statistics. > > Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly. > Since vcpu statistics are per vcpu, they will only be updated by a single > vcpu at a time so this shouldn't present a problem on 32-bit machines > which can't atomically increment 64-bit numbers. However vm statistics > could potentially be updated by multiple vcpus from that vm at a time. > To avoid the overhead of atomics make all vm statistics ulong such that > they are 64-bit on 64-bit systems where they can be atomically incremented > and are 32-bit on 32-bit systems which may not be able to atomically > increment 64-bit numbers. Modify vm_stat_get() to expect ulongs. > > Signed-off-by: Suraj Jitindar Singh Acked-by: Christian Borntraeger ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V4 5/5] powerpc/kvm/stats: Implement existing and add new halt polling vcpu stats
On 07/19/2016 10:12 AM, Suraj Jitindar Singh wrote: > Also add new vcpu stats: halt_poll_success_ns, halt_poll_fail_ns and > halt_wait_ns to be used to accumulate the total time spend polling > successfully, polling unsuccessfully and waiting respectively, and > halt_successful_wait to accumulate the number of times the vcpu waits. > Given that halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns are > expressed in nanoseconds it is necessary to represent these as 64-bit > quantities, otherwise they would overflow after only about 4 seconds. Paolo, would these new kvm_stats also be useful for the base implementation? ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V4 3/5] kvm/ppc/book3s_hv: Implement halt polling in the kvm_hv kernel module
On 07/19/2016 10:12 AM, Suraj Jitindar Singh wrote: > This patch introduces new halt polling functionality into the kvm_hv kernel > module. When a vcore is idle it will poll for some period of time before > scheduling itself out. Some wording on why you cannot use the common code might be useful. > > When all of the runnable vcpus on a vcore have ceded (and thus the vcore is > idle) we schedule ourselves out to allow something else to run. In the > event that we need to wake up very quickly (for example an interrupt > arrives), we are required to wait until we get scheduled again. > > Implement halt polling so that when a vcore is idle, and before scheduling > ourselves, we poll for vcpus in the runnable_threads list which have > pending exceptions or which leave the ceded state. If we poll successfully > then we can get back into the guest very quickly without ever scheduling > ourselves, otherwise we schedule ourselves out as before. > > Testing of this patch with a TCP round robin test between two guests with > virtio network interfaces has found a decrease in round trip time of ~15us > on average. A performance gain is only seen when going out of and > back into the guest often and quickly, otherwise there is no net benefit > from the polling. The polling interval is adjusted such that when we are > often scheduled out for long periods of time it is reduced, and when we > often poll successfully it is increased. The rate at which the polling > interval increases or decreases, and the maximum polling interval, can > be set through module parameters. > > Based on the implementation in the generic kvm module by Wanpeng Li and > Paolo Bonzini, and on direction from Paul Mackerras. 
> > Signed-off-by: Suraj Jitindar Singh> --- > arch/powerpc/include/asm/kvm_book3s.h | 1 + > arch/powerpc/include/asm/kvm_host.h | 1 + > arch/powerpc/kvm/book3s_hv.c | 116 > ++ > arch/powerpc/kvm/trace_hv.h | 22 +++ > 4 files changed, 126 insertions(+), 14 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_book3s.h > b/arch/powerpc/include/asm/kvm_book3s.h > index 151f817..c261f52 100644 > --- a/arch/powerpc/include/asm/kvm_book3s.h > +++ b/arch/powerpc/include/asm/kvm_book3s.h > @@ -102,6 +102,7 @@ struct kvmppc_vcore { > ulong pcr; > ulong dpdes;/* doorbell state (POWER8) */ > ulong conferring_threads; > + unsigned int halt_poll_ns; > }; > > struct kvmppc_vcpu_book3s { > diff --git a/arch/powerpc/include/asm/kvm_host.h > b/arch/powerpc/include/asm/kvm_host.h > index 02d06e9..610f393 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -294,6 +294,7 @@ struct kvm_arch { > #define VCORE_SLEEPING 3 > #define VCORE_RUNNING4 > #define VCORE_EXITING5 > +#define VCORE_POLLING6 > > /* > * Struct used to manage memory for a virtual processor area > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 3bcf9e6..a9de1d4 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, _param_ops, > _ipi_redirect, > MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host > core"); > #endif > > +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ > +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; > +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); > +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); > + > +/* Factor by which the vcore halt poll interval is grown, default is to > double > + */ > +static unsigned int halt_poll_ns_grow = 2; > +module_param(halt_poll_ns_grow, int, S_IRUGO); > +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is 
grown by"); > + > +/* Factor by which the vcore halt poll interval is shrunk, default is to > reset > + */ > +static unsigned int halt_poll_ns_shrink; > +module_param(halt_poll_ns_shrink, int, S_IRUGO); > +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); > + > static void kvmppc_end_cede(struct kvm_vcpu *vcpu); > static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); > > @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore > *vc, > finish_wait(>arch.cpu_run, ); > } > > +static void grow_halt_poll_ns(struct kvmppc_vcore *vc) > +{ > + /* 10us base */ > + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) > + vc->halt_poll_ns = 1; > + else > + vc->halt_poll_ns *= halt_poll_ns_grow; > + > + if (vc->halt_poll_ns > halt_poll_max_ns) > + vc->halt_poll_ns = halt_poll_max_ns; > +} > + > +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) > +{ > + if (halt_poll_ns_shrink == 0) > + vc->halt_poll_ns = 0; > + else > + vc->halt_poll_ns /=
[PATCH V4 5/5] powerpc/kvm/stats: Implement existing and add new halt polling vcpu stats
vcpu stats are used to collect information about a vcpu which can be viewed in the debugfs. For example halt_attempted_poll and halt_successful_poll are used to keep track of the number of times the vcpu attempts to and successfully polls. These stats are currently not used on powerpc. Implement incrementation of the halt_attempted_poll and halt_successful_poll vcpu stats for powerpc. Since these stats are summed over all the vcpus for all running guests it doesn't matter which vcpu they are attributed to, thus we choose the current runner vcpu of the vcore. Also add new vcpu stats: halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns to be used to accumulate the total time spend polling successfully, polling unsuccessfully and waiting respectively, and halt_successful_wait to accumulate the number of times the vcpu waits. Given that halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns are expressed in nanoseconds it is necessary to represent these as 64-bit quantities, otherwise they would overflow after only about 4 seconds. Given that the total time spend either polling or waiting will be known and the number of times that each was done, it will be possible to determine the average poll and wait times. This will give the ability to tune the kvm module parameters based on the calculated average wait and poll times. Signed-off-by: Suraj Jitindar Singh--- Change Log: V3 -> V4: - Instead of accounting just wait and poll time, separate these into successful_poll_time, failed_poll_time and wait_time. 
--- arch/powerpc/include/asm/kvm_host.h | 4 arch/powerpc/kvm/book3s.c | 4 arch/powerpc/kvm/book3s_hv.c| 36 +++- 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index f6304c5..f15ffc0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -114,8 +114,12 @@ struct kvm_vcpu_stat { u64 emulated_inst_exits; u64 dec_exits; u64 ext_intr_exits; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; + u64 halt_wait_ns; u64 halt_successful_poll; u64 halt_attempted_poll; + u64 halt_successful_wait; u64 halt_poll_invalid; u64 halt_wakeup; u64 dbell_exits; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 47018fc..71eb8f3 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr",VCPU_STAT(ext_intr_exits) }, { "queue_intr", VCPU_STAT(queue_intr) }, + { "halt_poll_success_ns", VCPU_STAT(halt_poll_success_ns) }, + { "halt_poll_fail_ns", VCPU_STAT(halt_poll_fail_ns) }, + { "halt_wait_ns", VCPU_STAT(halt_wait_ns) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, + { "halt_successful_wait", VCPU_STAT(halt_successful_wait) }, { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index a9de1d4..81072f2 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2679,15 +2679,16 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) */ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { + ktime_t cur, start_poll, start_wait; int do_sleep = 1; - ktime_t cur, start; u64 block_ns; DECLARE_SWAITQUEUE(wait); /* Poll for pending exceptions and ceded 
state */ - cur = start = ktime_get(); + cur = start_poll = ktime_get(); if (vc->halt_poll_ns) { - ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns); + ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns); + ++vc->runner->stat.halt_attempted_poll; vc->vcore_state = VCORE_POLLING; spin_unlock(>lock); @@ -2703,8 +2704,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) spin_lock(>lock); vc->vcore_state = VCORE_INACTIVE; - if (!do_sleep) + if (!do_sleep) { + ++vc->runner->stat.halt_successful_poll; goto out; + } } prepare_to_swait(>wq, , TASK_INTERRUPTIBLE); @@ -2712,9 +2715,14 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) if (kvmppc_vcore_check_block(vc)) { finish_swait(>wq, ); do_sleep = 0; + /* If we polled, count this as a successful poll
[PATCH V4 4/5] kvm/stats: Add provisioning for ulong vm stats and u64 vcpu stats
vms and vcpus have statistics associated with them which can be viewed within the debugfs. Currently it is assumed within the vcpu_stat_get() and vm_stat_get() functions that all of these statistics are represented as u32s, however the next patch adds some u64 vcpu statistics. Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly. Since vcpu statistics are per vcpu, they will only be updated by a single vcpu at a time so this shouldn't present a problem on 32-bit machines which can't atomically increment 64-bit numbers. However vm statistics could potentially be updated by multiple vcpus from that vm at a time. To avoid the overhead of atomics make all vm statistics ulong such that they are 64-bit on 64-bit systems where they can be atomically incremented and are 32-bit on 32-bit systems which may not be able to atomically increment 64-bit numbers. Modify vm_stat_get() to expect ulongs. Signed-off-by: Suraj Jitindar Singh--- Change Log: V2 -> V3: - Instead of implementing separate u32 and u64 functions keep the generic functions and modify them to expect u64s. Thus update all vm and vcpu statistics to u64s accordingly. 
V3 -> V4: - Change vm_stats from u64 to ulong --- arch/arm/include/asm/kvm_host.h | 12 ++-- arch/arm64/include/asm/kvm_host.h | 12 ++-- arch/mips/include/asm/kvm_host.h| 46 ++--- arch/powerpc/include/asm/kvm_host.h | 60 - arch/s390/include/asm/kvm_host.h| 128 ++-- arch/x86/include/asm/kvm_host.h | 72 ++-- virt/kvm/kvm_main.c | 4 +- 7 files changed, 167 insertions(+), 167 deletions(-) diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 96387d4..c8e55b3b 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -183,15 +183,15 @@ struct kvm_vcpu_arch { }; struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 hvc_exit_stat; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; u64 mmio_exit_user; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 49095fc..b14c8bc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -291,15 +291,15 @@ struct kvm_vcpu_arch { #endif struct kvm_vm_stat { - u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; - u32 hvc_exit_stat; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; + u64 hvc_exit_stat; u64 wfe_exit_stat; u64 wfi_exit_stat; u64 mmio_exit_user; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 36a391d..9704888 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -98,32 +98,32 @@ extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn); extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn); struct kvm_vm_stat { - 
u32 remote_tlb_flush; + ulong remote_tlb_flush; }; struct kvm_vcpu_stat { - u32 wait_exits; - u32 cache_exits; - u32 signal_exits; - u32 int_exits; - u32 cop_unusable_exits; - u32 tlbmod_exits; - u32 tlbmiss_ld_exits; - u32 tlbmiss_st_exits; - u32 addrerr_st_exits; - u32 addrerr_ld_exits; - u32 syscall_exits; - u32 resvd_inst_exits; - u32 break_inst_exits; - u32 trap_inst_exits; - u32 msa_fpe_exits; - u32 fpe_exits; - u32 msa_disabled_exits; - u32 flush_dcache_exits; - u32 halt_successful_poll; - u32 halt_attempted_poll; - u32 halt_poll_invalid; - u32 halt_wakeup; + u64 wait_exits; + u64 cache_exits; + u64 signal_exits; + u64 int_exits; + u64 cop_unusable_exits; + u64 tlbmod_exits; + u64 tlbmiss_ld_exits; + u64 tlbmiss_st_exits; + u64 addrerr_st_exits; + u64 addrerr_ld_exits; + u64 syscall_exits; + u64 resvd_inst_exits; + u64 break_inst_exits; + u64 trap_inst_exits; + u64 msa_fpe_exits; + u64 fpe_exits; + u64 msa_disabled_exits; + u64 flush_dcache_exits; + u64 halt_successful_poll; + u64 halt_attempted_poll; + u64 halt_poll_invalid; + u64 halt_wakeup; }; enum kvm_mips_exit_types { diff --git a/arch/powerpc/include/asm/kvm_host.h
[PATCH V4 3/5] kvm/ppc/book3s_hv: Implement halt polling in the kvm_hv kernel module
This patch introduces new halt polling functionality into the kvm_hv kernel module. When a vcore is idle it will poll for some period of time before scheduling itself out. When all of the runnable vcpus on a vcore have ceded (and thus the vcore is idle) we schedule ourselves out to allow something else to run. In the event that we need to wake up very quickly (for example an interrupt arrives), we are required to wait until we get scheduled again. Implement halt polling so that when a vcore is idle, and before scheduling ourselves, we poll for vcpus in the runnable_threads list which have pending exceptions or which leave the ceded state. If we poll successfully then we can get back into the guest very quickly without ever scheduling ourselves, otherwise we schedule ourselves out as before. Testing of this patch with a TCP round robin test between two guests with virtio network interfaces has found a decrease in round trip time of ~15us on average. A performance gain is only seen when going out of and back into the guest often and quickly, otherwise there is no net benefit from the polling. The polling interval is adjusted such that when we are often scheduled out for long periods of time it is reduced, and when we often poll successfully it is increased. The rate at which the polling interval increases or decreases, and the maximum polling interval, can be set through module parameters. Based on the implementation in the generic kvm module by Wanpeng Li and Paolo Bonzini, and on direction from Paul Mackerras. 
Signed-off-by: Suraj Jitindar Singh--- arch/powerpc/include/asm/kvm_book3s.h | 1 + arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_hv.c | 116 ++ arch/powerpc/kvm/trace_hv.h | 22 +++ 4 files changed, 126 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 151f817..c261f52 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -102,6 +102,7 @@ struct kvmppc_vcore { ulong pcr; ulong dpdes;/* doorbell state (POWER8) */ ulong conferring_threads; + unsigned int halt_poll_ns; }; struct kvmppc_vcpu_book3s { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 02d06e9..610f393 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -294,6 +294,7 @@ struct kvm_arch { #define VCORE_SLEEPING 3 #define VCORE_RUNNING 4 #define VCORE_EXITING 5 +#define VCORE_POLLING 6 /* * Struct used to manage memory for a virtual processor area diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3bcf9e6..a9de1d4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, _param_ops, _ipi_redirect, MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); + +/* Factor by which the vcore halt poll interval is grown, default is to double + */ +static unsigned int halt_poll_ns_grow = 2; +module_param(halt_poll_ns_grow, int, S_IRUGO); +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by"); + +/* Factor by which the vcore halt poll interval is shrunk, default is to reset + */ +static 
unsigned int halt_poll_ns_shrink; +module_param(halt_poll_ns_shrink, int, S_IRUGO); +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, finish_wait(>arch.cpu_run, ); } +static void grow_halt_poll_ns(struct kvmppc_vcore *vc) +{ + /* 10us base */ + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow) + vc->halt_poll_ns = 1; + else + vc->halt_poll_ns *= halt_poll_ns_grow; + + if (vc->halt_poll_ns > halt_poll_max_ns) + vc->halt_poll_ns = halt_poll_max_ns; +} + +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) +{ + if (halt_poll_ns_shrink == 0) + vc->halt_poll_ns = 0; + else + vc->halt_poll_ns /= halt_poll_ns_shrink; +} + +/* Check to see if any of the runnable vcpus on the vcore have pending + * exceptions or are no longer ceded + */ +static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu; + int i; + + for_each_runnable_thread(i, vcpu, vc) { + if (vcpu->arch.pending_exceptions ||
[PATCH V4 2/5] kvm/ppc/book3s_hv: Change vcore element runnable_threads from linked-list to array
The struct kvmppc_vcore is a structure used to store various information about a virtual core for a kvm guest. The runnable_threads element of the struct provides a list of all of the currently runnable vcpus on the core (those in the KVMPPC_VCPU_RUNNABLE state). The previous implementation of this list was a linked_list. The next patch requires that the list be able to be iterated over without holding the vcore lock. Reimplement the runnable_threads list in the kvmppc_vcore struct as an array. Implement function to iterate over valid entries in the array and update access sites accordingly. Signed-off-by: Suraj Jitindar Singh--- arch/powerpc/include/asm/kvm_book3s.h | 2 +- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/book3s_hv.c | 68 +-- 3 files changed, 43 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index a50c5fe..151f817 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -87,7 +87,7 @@ struct kvmppc_vcore { u8 vcore_state; u8 in_guest; struct kvmppc_vcore *master_vcore; - struct list_head runnable_threads; + struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS]; struct list_head preempt_list; spinlock_t lock; struct swait_queue_head wq; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 19c6731..02d06e9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -633,7 +633,6 @@ struct kvm_vcpu_arch { long pgfault_index; unsigned long pgfault_hpte[2]; - struct list_head run_list; struct task_struct *run_task; struct kvm_run *kvm_run; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e20beae..3bcf9e6 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "book3s.h" @@ -96,6 +97,26 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to 
a free host core"); static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, + int *ip) +{ + int i = *ip; + struct kvm_vcpu *vcpu; + + while (++i < MAX_SMT_THREADS) { + vcpu = READ_ONCE(vc->runnable_threads[i]); + if (vcpu) { + *ip = i; + return vcpu; + } + } + return NULL; +} + +/* Used to traverse the list of runnable threads for a given vcore */ +#define for_each_runnable_thread(i, vcpu, vc) \ + for (i = -1; (vcpu = next_runnable_thread(vc, )); ) + static bool kvmppc_ipi_thread(int cpu) { /* On POWER8 for IPIs to threads in the same core, use msgsnd */ @@ -1492,7 +1513,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) if (vcore == NULL) return NULL; - INIT_LIST_HEAD(>runnable_threads); spin_lock_init(>lock); spin_lock_init(>stoltb_lock); init_swait_queue_head(>wq); @@ -1801,7 +1821,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; spin_unlock_irq(>arch.tbacct_lock); --vc->n_runnable; - list_del(>arch.run_list); + WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL); } static int kvmppc_grab_hwthread(int cpu) @@ -2208,10 +2228,10 @@ static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip, static void prepare_threads(struct kvmppc_vcore *vc) { - struct kvm_vcpu *vcpu, *vnext; + int i; + struct kvm_vcpu *vcpu; - list_for_each_entry_safe(vcpu, vnext, >runnable_threads, -arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { if (signal_pending(vcpu->arch.run_task)) vcpu->arch.ret = -EINTR; else if (vcpu->arch.vpa.update_pending || @@ -2258,15 +2278,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads) static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) { - int still_running = 0; + int still_running = 0, i; u64 now; long ret; - struct kvm_vcpu *vcpu, *vnext; + struct kvm_vcpu 
*vcpu; spin_lock(>lock); now = get_tb(); - list_for_each_entry_safe(vcpu, vnext, >runnable_threads, -arch.run_list) { + for_each_runnable_thread(i, vcpu, vc) { /* cancel pending dec exception if dec is positive */ if (now < vcpu->arch.dec_expires &&
[PATCH V4 1/5] kvm/ppc/book3s: Move struct kvmppc_vcore from kvm_host.h to kvm_book3s.h
The next commit will introduce a member to the kvmppc_vcore struct which references MAX_SMT_THREADS which is defined in kvm_book3s_asm.h, however this file isn't included in kvm_host.h directly. Thus compiling for certain platforms such as pmac32_defconfig and ppc64e_defconfig with KVM fails due to MAX_SMT_THREADS not being defined. Move the struct kvmppc_vcore definition to kvm_book3s.h which explicitly includes kvm_book3s_asm.h. Signed-off-by: Suraj Jitindar Singh--- Change Log: V1 -> V2: - Added patch to series --- arch/powerpc/include/asm/kvm_book3s.h | 35 +++ arch/powerpc/include/asm/kvm_host.h | 35 --- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 8f39796..a50c5fe 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -69,6 +69,41 @@ struct hpte_cache { int pagesize; }; +/* + * Struct for a virtual core. + * Note: entry_exit_map combines a bitmap of threads that have entered + * in the bottom 8 bits and a bitmap of threads that have exited in the + * next 8 bits. This is so that we can atomically set the entry bit + * iff the exit map is 0 without taking a lock. 
+ */ +struct kvmppc_vcore { + int n_runnable; + int num_threads; + int entry_exit_map; + int napping_threads; + int first_vcpuid; + u16 pcpu; + u16 last_cpu; + u8 vcore_state; + u8 in_guest; + struct kvmppc_vcore *master_vcore; + struct list_head runnable_threads; + struct list_head preempt_list; + spinlock_t lock; + struct swait_queue_head wq; + spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ + u64 stolen_tb; + u64 preempt_tb; + struct kvm_vcpu *runner; + struct kvm *kvm; + u64 tb_offset; /* guest timebase - host timebase */ + ulong lpcr; + u32 arch_compat; + ulong pcr; + ulong dpdes;/* doorbell state (POWER8) */ + ulong conferring_threads; +}; + struct kvmppc_vcpu_book3s { struct kvmppc_sid_map sid_map[SID_MAP_NUM]; struct { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ec35af3..19c6731 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -275,41 +275,6 @@ struct kvm_arch { #endif }; -/* - * Struct for a virtual core. - * Note: entry_exit_map combines a bitmap of threads that have entered - * in the bottom 8 bits and a bitmap of threads that have exited in the - * next 8 bits. This is so that we can atomically set the entry bit - * iff the exit map is 0 without taking a lock. 
- */ -struct kvmppc_vcore { - int n_runnable; - int num_threads; - int entry_exit_map; - int napping_threads; - int first_vcpuid; - u16 pcpu; - u16 last_cpu; - u8 vcore_state; - u8 in_guest; - struct kvmppc_vcore *master_vcore; - struct list_head runnable_threads; - struct list_head preempt_list; - spinlock_t lock; - struct swait_queue_head wq; - spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */ - u64 stolen_tb; - u64 preempt_tb; - struct kvm_vcpu *runner; - struct kvm *kvm; - u64 tb_offset; /* guest timebase - host timebase */ - ulong lpcr; - u32 arch_compat; - ulong pcr; - ulong dpdes;/* doorbell state (POWER8) */ - ulong conferring_threads; -}; - #define VCORE_ENTRY_MAP(vc)((vc)->entry_exit_map & 0xff) #define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8) #define VCORE_IS_EXITING(vc) (VCORE_EXIT_MAP(vc) != 0) -- 2.5.5 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2 1/2] crypto: vmx - Adding asm subroutines for XTS
On Mon, Jul 18, 2016 at 12:26:25PM -0300, Paulo Flabiano Smorigo wrote: > This patch adds XTS subroutines using the VMX-crypto driver. > > It gives a boost of 20 times using XTS. > > This code has been adopted from the OpenSSL project in collaboration > with the original author (Andy Polyakov). > > Signed-off-by: Leonidas S. Barbosa > Signed-off-by: Paulo Flabiano Smorigo Both patches applied. Thanks. -- Email: Herbert Xu Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev