[RFC v5 3/3] cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled
The disable callback can be used to compute timeout for other states whenever a state is enabled or disabled. We store the computed timeout in "timeout" defined in cpuidle state strucure. So, we compute timeout only when some state is enabled or disabled and not every time in the fast idle path. We also use the computed timeout to get timeout for snooze, thus getting rid of get_snooze_timeout for snooze loop. Signed-off-by: Abhishek Goel --- drivers/cpuidle/cpuidle-powernv.c | 35 +++ include/linux/cpuidle.h | 1 + 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index d7686ce6e..a75226f52 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -45,7 +45,6 @@ struct stop_psscr_table { static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly; static u64 default_snooze_timeout __read_mostly; -static bool snooze_timeout_en __read_mostly; static u64 forced_wakeup_timeout(struct cpuidle_device *dev, struct cpuidle_driver *drv, @@ -67,26 +66,13 @@ static u64 forced_wakeup_timeout(struct cpuidle_device *dev, return 0; } -static u64 get_snooze_timeout(struct cpuidle_device *dev, - struct cpuidle_driver *drv, - int index) +static void pnv_disable_callback(struct cpuidle_device *dev, +struct cpuidle_driver *drv) { int i; - if (unlikely(!snooze_timeout_en)) - return default_snooze_timeout; - - for (i = index + 1; i < drv->state_count; i++) { - struct cpuidle_state *s = &drv->states[i]; - struct cpuidle_state_usage *su = &dev->states_usage[i]; - - if (s->disabled || su->disable) - continue; - - return s->target_residency * tb_ticks_per_usec; - } - - return default_snooze_timeout; + for (i = 0; i < drv->state_count; i++) + drv->states[i].timeout = forced_wakeup_timeout(dev, drv, i); } static int snooze_loop(struct cpuidle_device *dev, @@ -94,16 +80,20 @@ static int snooze_loop(struct cpuidle_device *dev, int index) { u64 snooze_exit_time; 
+ u64 snooze_timeout = drv->states[index].timeout; + + if (!snooze_timeout) + snooze_timeout = default_snooze_timeout; set_thread_flag(TIF_POLLING_NRFLAG); local_irq_enable(); - snooze_exit_time = get_tb() + get_snooze_timeout(dev, drv, index); + snooze_exit_time = get_tb() + snooze_timeout; ppc64_runlatch_off(); HMT_very_low(); while (!need_resched()) { - if (likely(snooze_timeout_en) && get_tb() > snooze_exit_time) { + if (get_tb() > snooze_exit_time) { /* * Task has not woken up but we are exiting the polling * loop anyway. Require a barrier after polling is @@ -168,7 +158,7 @@ static int stop_loop(struct cpuidle_device *dev, u64 timeout_tb; bool forced_wakeup = false; - timeout_tb = forced_wakeup_timeout(dev, drv, index); + timeout_tb = drv->states[index].timeout; if (timeout_tb) { /* Ensure that the timeout is at least one microsecond @@ -263,6 +253,7 @@ static int powernv_cpuidle_driver_init(void) */ drv->cpumask = (struct cpumask *)cpu_present_mask; + drv->disable_callback = pnv_disable_callback; return 0; } @@ -422,8 +413,6 @@ static int powernv_idle_probe(void) /* Device tree can indicate more idle states */ max_idle_state = powernv_add_idle_states(); default_snooze_timeout = TICK_USEC * tb_ticks_per_usec; - if (max_idle_state > 1) - snooze_timeout_en = true; } else return -ENODEV; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 1729a497b..64195861b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -50,6 +50,7 @@ struct cpuidle_state { int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ + unsigned long long timeout; /* timeout for exiting out of a state */ int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, -- 2.17.1
[RFC v5 2/3] cpuidle : Add callback whenever a state usage is enabled/disabled
To force wakeup a cpu, we need to compute the timeout in the fast idle path as a state may be enabled or disabled but there did not exist a feedback to driver when a state is enabled or disabled. This patch adds a callback whenever a state_usage records a store for disable attribute Signed-off-by: Abhishek Goel --- drivers/cpuidle/sysfs.c | 15 ++- include/linux/cpuidle.h | 3 +++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 2bb2683b4..6c9bf2f7b 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -418,8 +418,21 @@ static ssize_t cpuidle_state_store(struct kobject *kobj, struct attribute *attr, struct cpuidle_state_attr *cattr = attr_to_stateattr(attr); struct cpuidle_device *dev = kobj_to_device(kobj); - if (cattr->store) + if (cattr->store) { ret = cattr->store(state, state_usage, buf, size); + if (ret == size && + strncmp(cattr->attr.name, "disable", + strlen("disable"))) { + struct kobject *cpuidle_kobj = kobj->parent; + struct cpuidle_device *dev = + to_cpuidle_device(cpuidle_kobj); + struct cpuidle_driver *drv = + cpuidle_get_cpu_driver(dev); + + if (drv->disable_callback) + drv->disable_callback(dev, drv); + } + } /* reset poll time cache */ dev->poll_limit_ns = 0; diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 4b6b5bea8..1729a497b 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -122,6 +122,9 @@ struct cpuidle_driver { /* the driver handles the cpus in cpumask */ struct cpumask *cpumask; + void (*disable_callback)(struct cpuidle_device *dev, +struct cpuidle_driver *drv); + /* preferred governor to switch at register time */ const char *governor; }; -- 2.17.1
[PATCH v5 1/3] cpuidle-powernv : forced wakeup for stop states
Currently, the cpuidle governors determine what idle state a idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU may end up in the shallow state. This is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states for hours, in absence of ticks or interrupts. To address this, We forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state. Signed-off-by: Abhishek Goel --- Auto-promotion v1 : started as auto promotion logic for cpuidle states in generic driver v2 : Removed timeout_needed and rebased the code to upstream kernel Forced-wakeup v1 : New patch with name of forced wakeup started v2 : Extending the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. 
v3 : Cleanly handle setting/resetting of decrementer so as to not break irq work v4 : Changed type and name of set/reset decrementer fucntion Handled irq_work_pending in try_set_dec_before_idle v5 : Removed forced wakeup for last stop state by changing the checking conditon of timeout_tb arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 +++ drivers/cpuidle/cpuidle-powernv.c | 40 3 files changed, 85 insertions(+) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 08dbe3e68..06a6a2314 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -184,6 +184,8 @@ static inline unsigned long tb_ticks_since(unsigned long tstamp) extern u64 mulhdu(u64, u64); #endif +extern bool try_set_dec_before_idle(u64 timeout); +extern void try_reset_dec_after_idle(void); extern void div128_by_32(u64 dividend_high, u64 dividend_low, unsigned divisor, struct div_result *dr); diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 694522308..d004c0d8e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -576,6 +576,49 @@ void arch_irq_work_raise(void) #endif /* CONFIG_IRQ_WORK */ +/* + * This function tries setting decrementer before entering into idle. + * Returns true if we have reprogrammed the decrementer for idle. + * Returns false if the decrementer is unchanged. + */ +bool try_set_dec_before_idle(u64 timeout) +{ + u64 *next_tb = this_cpu_ptr(&decrementers_next_tb); + u64 now = get_tb_or_rtc(); + + if (now + timeout > *next_tb) + return false; + + set_dec(timeout); + if (test_irq_work_pending()) + set_dec(1); + + return true; +} + +/* + * This function gets called if we have set decrementer before + * entering into idle. It tries to reset/restore the decrementer + * to its original value. 
+ */ +void try_reset_dec_after_idle(void) +{ + u64 now; + u64 *next_tb; + + if (test_irq_work_pending()) + return; + + now = get_tb_or_rtc(); + next_tb = this_cpu_ptr(&decrementers_next_tb); + if (now >= *next_tb) + return; + + set_dec(*next_tb - now); + if (test_irq_work_pending()) + set_dec(1); +} + /* * timer_interrupt - gets called when the decrementer overflows, * with interrupts disabled. diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 84b1ebe21..d7686ce6e 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * Expose only those Hardware idle states via the cpuidle framework @@ -46,6 +47,26 @@ static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX] __read_mostly static u64 default_snooze_timeout __read_mostly; static bool snooze_timeout_en __read_mostly; +static u64 forced_wakeup_timeout(struct cpuidle_device *dev, +struct cpuidle_driver *drv, +int index) +{ + int i; + + for (i = index + 1; i < drv->state_count; i++) { + struct cpuidle_state *s = &drv->states[i]; + struct cpuidle_state_usage *su = &dev->states_usage[i]; + + if (s->disabled || su->disable) + c
[PATCH v5 0/3] Forced-wakeup for stop states on Powernv
Currently, the cpuidle governors determine what idle state an idling CPU should enter into based on heuristics that depend on the idle history on that CPU. Given that no predictive heuristic is perfect, there are cases where the governor predicts a shallow idle state, hoping that the CPU will be busy soon. However, if no new workload is scheduled on that CPU in the near future, the CPU will end up in the shallow state. Motivation -- In case of POWER, this is problematic, when the predicted state in the aforementioned scenario is a shallow stop state on a tickless system. As we might get stuck into shallow states even for hours, in absence of ticks or interrupts. To address this, we forcefully wakeup the cpu by setting the decrementer. The decrementer is set to a value that corresponds with the residency of the next available state. Thus firing up a timer that will forcefully wakeup the cpu. Few such iterations will essentially train the governor to select a deeper state for that cpu, as the timer here corresponds to the next available cpuidle state residency. Thus, cpu will eventually end up in the deepest possible state and we won't get stuck in a shallow state for long duration. Experiment -- For earlier versions when this feature was meant to be only for shallow lite states, I performed experiments for three scenarios to collect some data. case 1 : Without this patch and without tick retained, i.e. in an upstream kernel, It would spend more than even a second to get out of stop0_lite. case 2 : With tick retained in an upstream kernel - Generally, we have a sched tick at 4ms (CONF_HZ = 250). Ideally I expected it to take 8 sched tick to get out of stop0_lite. Experimentally, observation was = sample min max 99percentile 20 4ms 12ms 4ms = It would take at least one sched tick to get out of stop0_lite. 
case 3 : With this patch (not stopping tick, but explicitly queuing a timer) sample min max 99percentile 20 144us 192us 144us Description of current implementation - We calculate timeout for the current idle state as the residency value of the next available idle state. If the decrementer is set to be greater than this timeout, we update the decrementer value with the residency of next available idle state. Thus, essentially training the governor to select the next available deeper state until we reach the deepest state. Hence, we won't get stuck unnecessarily in shallow states for longer duration. v1 of auto-promotion : https://lkml.org/lkml/2019/3/22/58 This patch was implemented only for shallow lite state in generic cpuidle driver. v2 : Removed timeout_needed and rebased to current upstream kernel Then, v1 of forced-wakeup : Moved the code to cpuidle powernv driver and started as forced wakeup instead of auto-promotion v2 : Extended the forced wakeup logic for all states. Setting the decrementer instead of queuing up a hrtimer to implement the logic. v3 : 1) Cleanly handle setting the decrementer after exiting out of stop states. 2) Added a disable_callback feature to compute timeout whenever a state is enabled or disabled instead of computing every time in fast idle path. 3) Use disable callback to recompute timeout whenever state usage is changed for a state. Also, cleaned up the get_snooze_timeout function. v4 : Changed the type and name of set/reset decrementer function. Handled irq work pending in try_set_dec_before_idle. No change in patch 2 and 3. v5 : Removed forced wakeup for the last state. We don't want to wake up unnecessarily when already in deepest state. It was a mistake in previous patches that was found out in recent experiments. No change in patch 2 and 3. 
Abhishek Goel (3): cpuidle-powernv : forced wakeup for stop states cpuidle : Add callback whenever a state usage is enabled/disabled cpuidle-powernv : Recompute the idle-state timeouts when state usage is enabled/disabled arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kernel/time.c| 43 drivers/cpuidle/cpuidle-powernv.c | 55 +++ drivers/cpuidle/sysfs.c | 15 - include/linux/cpuidle.h | 4 +++ 5 files changed, 105 insertions(+), 14 deletions(-) -- 2.17.1
Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM
On 10/3/19 10:26 AM, Jeremy Kerr wrote: Hi Vasant, Correct. We will have `ibm,prd-label` property. That's not the issue. It sure sounds like the issue - someone has represented a range that should be mapped by HBRT, but isn't appropriate for mapping by HBRT. Here issue is HBRT is loaded into NVDIMM memory. OK. How about we just don't do that? Yes. Hostboot will fix that. It will make sure that HBRT is loaded into regular memory. It sounds like we're just trying to work around an invalid representation of the mappings. It's not a workaround. It's an additional check. So that we don't mmap() if HBRT is not in system RAM and throw a proper error. So that debugging becomes easy. -Vasant
Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM
Hi Vasant, > Correct. We will have `ibm,prd-label` property. That's not the issue. It sure sounds like the issue - someone has represented a range that should be mapped by HBRT, but isn't appropriate for mapping by HBRT. > Here issue is HBRT is loaded into NVDIMM memory. OK. How about we just don't do that? It sounds like we're just trying to work around an invalid representation of the mappings. Cheers, Jeremy
Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM
On 10/3/19 7:17 AM, Jeremy Kerr wrote: Hi Vasant, Jeremy, Add check to validate whether requested page is part of system RAM or not before mmap() and error out if its not part of system RAM. opal_prd_range_is_valid() will return false if the reserved memory range does not have an ibm,prd-label property. If this you're getting invalid memory mapped through the PRD interface, that means the device tree is incorrectly describing those ranges. Correct. We will have `ibm,prd-label` property. That's not the issue. Here issue is HBRT is loaded into NVDIMM memory. Copy-pasting Vaidy's explanation from internal bugzilla here: -- The root-cause of the problem seem to be in HBRT using NVDIMM area/addresses for firmware operation. Linux maps the required address for HBRT to read, write and execute. This all works fine for normal RAM addresses. However NVDIMM is not normal RAM, it is device memory which can be used as RAM or through other layers and subsystem. Linux kernel memory manager set page table attributes as 0b10 non-idempotent I/O instead of normal RAM 0b00 since this is a special type of device memory initialized and used by a firmware device driver. This prevented instruction execution from that mapped page. Since instruction could not be fetched, opal-prd application could not complete init and start. -- Hostboot should detect NVDIMM areas and avoid using those areas for any firmware purposes including HBRT. Hostboot will fix this issue. In this patch we are adding additional check to make sure mmap() fails gracefully and we log proper error log. That way opal-prd will fail to start instead of looping forever . -Vasant
Re: [PATCH v5 10/11] mm/Kconfig: Adds config option to track lockless pagetable walks
> On Oct 2, 2019, at 9:36 PM, Leonardo Bras wrote: > > Adds config option LOCKLESS_PAGE_TABLE_WALK_TRACKING to make possible > enabling tracking lockless pagetable walks directly from kernel config. Can’t this name and all those new *lockless* function names be shorter? There are many functions name with *_locked, so how about dropping lockless at all, i.e., PAGE_TABLE_WALK_TRACKING blah blah? > > Signed-off-by: Leonardo Bras > --- > mm/Kconfig | 11 +++ > 1 file changed, 11 insertions(+) > > diff --git a/mm/Kconfig b/mm/Kconfig > index a5dae9a7eb51..00f487a0122f 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL > config ARCH_HAS_HUGEPD >bool > > +config LOCKLESS_PAGE_TABLE_WALK_TRACKING > +bool "Track (and optimize) lockless page table walks" > +default n > + > +help > + Maintain a reference count of active lockless page table > + walkers. This adds 4 bytes to struct mm size, and two atomic > + operations to calls such as get_user_pages_fast(). Some > + architectures can optimize lockless page table operations if > + this is enabled. > + > endmenu > -- > 2.20.1
Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM
Hi Vasant, > Add check to validate whether requested page is part of system RAM > or not before mmap() and error out if its not part of system RAM. opal_prd_range_is_valid() will return false if the reserved memory range does not have an ibm,prd-label property. If this you're getting invalid memory mapped through the PRD interface, that means the device tree is incorrectly describing those ranges. Or am I missing something? Cheers, Jeremy
[PATCH v5 10/11] mm/Kconfig: Adds config option to track lockless pagetable walks
Adds config option LOCKLESS_PAGE_TABLE_WALK_TRACKING to make possible enabling tracking lockless pagetable walks directly from kernel config. Signed-off-by: Leonardo Bras --- mm/Kconfig | 11 +++ 1 file changed, 11 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index a5dae9a7eb51..00f487a0122f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -736,4 +736,15 @@ config ARCH_HAS_PTE_SPECIAL config ARCH_HAS_HUGEPD bool +config LOCKLESS_PAGE_TABLE_WALK_TRACKING + bool "Track (and optimize) lockless page table walks" + default n + + help + Maintain a reference count of active lockless page table + walkers. This adds 4 bytes to struct mm size, and two atomic + operations to calls such as get_user_pages_fast(). Some + architectures can optimize lockless page table operations if + this is enabled. + endmenu -- 2.20.1
[PATCH v5 05/11] powerpc/perf: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring lockless pgtable walks on read_user_stack_slow. local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk, so there is no need to repeat it here. Variable that saves the irq mask was renamed from flags to irq_mask so it doesn't lose meaning now it's not directly passed to local_irq_* functions. Signed-off-by: Leonardo Bras --- arch/powerpc/perf/callchain.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index c84bbd4298a0..43e49d8be344 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -116,14 +116,14 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb) unsigned shift; unsigned long addr = (unsigned long) ptr; unsigned long offset; - unsigned long pfn, flags; + unsigned long pfn, irq_mask; void *kaddr; pgdir = current->mm->pgd; if (!pgdir) return -EFAULT; - local_irq_save(flags); + irq_mask = begin_lockless_pgtbl_walk(current->mm); ptep = find_current_mm_pte(pgdir, addr, NULL, &shift); if (!ptep) goto err_out; @@ -145,7 +145,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb) memcpy(buf, kaddr + offset, nb); ret = 0; err_out: - local_irq_restore(flags); + end_lockless_pgtbl_walk(current->mm, irq_mask); return ret; } -- 2.20.1
[PATCH v5 11/11] powerpc/mm/book3s64/pgtable: Uses counting method to skip serializing
Skips slow part of serialize_against_pte_lookup if there is no running lockless pagetable walk. Signed-off-by: Leonardo Bras --- arch/powerpc/mm/book3s64/pgtable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index ae557fdce9a3..0fef9400f210 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -95,7 +95,8 @@ static void do_nothing(void *unused) void serialize_against_pte_lookup(struct mm_struct *mm) { smp_mb(); - smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); + if (running_lockless_pgtbl_walk(mm)) + smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); } /* -- 2.20.1
[PATCH v5 09/11] powerpc/kvm/book3s_64: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring all book3s_64-related functions that do lockless pagetable walks. Adds comments explaining that some lockless pagetable walks don't need protection due to guest pgd not being a target of THP collapse/split, or due to being called from Realmode + MSR_EE = 0. Given that some of these functions always are called in realmode, we use __{begin,end}_lockless_pgtbl_walk so we can decide when to disable interrupts. local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk, so there is no need to repeat it here. Variable that saves the irq mask was renamed from flags to irq_mask so it doesn't lose meaning now it's not directly passed to local_irq_* functions. There are also a function that uses local_irq_{en,dis}able, so the return value of begin_lockless_pgtbl_walk() is ignored and we pass IRQS_ENABLED to end_lockless_pgtbl_walk() to mimic the effect of local_irq_enable(). Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/book3s_64_mmu_hv.c| 6 ++--- arch/powerpc/kvm/book3s_64_mmu_radix.c | 34 +++--- arch/powerpc/kvm/book3s_64_vio_hv.c| 7 +- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 9a75f0e1933b..d8f374c979f5 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -615,12 +615,12 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* if the guest wants write access, see if that is OK */ if (!writing && hpte_is_writable(r)) { pte_t *ptep, pte; - unsigned long flags; + unsigned long irq_mask; /* * We need to protect against page table destruction * hugepage split and collapse. 
*/ - local_irq_save(flags); + irq_mask = begin_lockless_pgtbl_walk(kvm->mm); ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); if (ptep) { @@ -628,7 +628,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, if (__pte_write(pte)) write_ok = 1; } - local_irq_restore(flags); + end_lockless_pgtbl_walk(kvm->mm, irq_mask); } } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 2d415c36a61d..8ba9742e2fc8 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -813,20 +813,20 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, * Read the PTE from the process' radix tree and use that * so we get the shift and attribute bits. */ - local_irq_disable(); + begin_lockless_pgtbl_walk(kvm->mm); ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); /* * If the PTE disappeared temporarily due to a THP * collapse, just return and let the guest try again. */ if (!ptep) { - local_irq_enable(); + end_lockless_pgtbl_walk(kvm->mm, IRQS_ENABLED); if (page) put_page(page); return RESUME_GUEST; } pte = *ptep; - local_irq_enable(); + end_lockless_pgtbl_walk(kvm->mm, IRQS_ENABLED); /* If we're logging dirty pages, always map single pages */ large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES); @@ -972,10 +972,16 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; + /* +* We are walking the secondary (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. 
+*/ ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); + return 0; } @@ -989,6 +995,11 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, int ref = 0; unsigned long old, *rmapp; + /* +* We are walking the secondary (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); i
[PATCH v5 08/11] powerpc/kvm/book3s_hv: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring all book3s_hv related functions that do lockless pagetable walks. Adds comments explaining that some lockless pagetable walks don't need protection due to guest pgd not being a target of THP collapse/split, or due to being called from Realmode + MSR_EE = 0 kvmppc_do_h_enter: Fixes where local_irq_restore() must be placed (after the last usage of ptep). Given that some of these functions can be called in real mode, and others always are, we use __{begin,end}_lockless_pgtbl_walk so we can decide when to disable interrupts. Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/book3s_hv_nested.c | 22 ++-- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 32 - 2 files changed, 38 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index cdf30c6eaf54..89944c699fd6 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -803,7 +803,11 @@ static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap, if (!gp) return; - /* Find the pte */ + /* Find the pte: +* We are walking the nested guest (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); /* * If the pte is present and the pfn is still the same, update the pte. @@ -853,7 +857,11 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, if (!gp) return; - /* Find and invalidate the pte */ + /* Find and invalidate the pte: +* We are walking the nested guest (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. 
+*/ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); /* Don't spuriously invalidate ptes if the pfn has changed */ if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) @@ -921,6 +929,11 @@ static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu, int shift; spin_lock(&kvm->mmu_lock); + /* +* We are walking the nested guest (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); if (!shift) shift = PAGE_SHIFT; @@ -1362,6 +1375,11 @@ static long int __kvmhv_nested_page_fault(struct kvm_run *run, /* See if can find translation in our partition scoped tables for L1 */ pte = __pte(0); spin_lock(&kvm->mmu_lock); + /* +* We are walking the secondary (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (!shift) shift = PAGE_SHIFT; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 220305454c23..a8be42f5be1e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -210,7 +210,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, pte_t *ptep; unsigned int writing; unsigned long mmu_seq; - unsigned long rcbits, irq_flags = 0; + unsigned long rcbits, irq_mask = 0; if (kvm_is_radix(kvm)) return H_FUNCTION; @@ -252,12 +252,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, * If we had a page table table change after lookup, we would * retry via mmu_notifier_retry. */ - if (!realmode) - local_irq_save(irq_flags); - /* -* If called in real mode we have MSR_EE = 0. Otherwise -* we disable irq above. 
-*/ + irq_mask = __begin_lockless_pgtbl_walk(kvm->mm, !realmode); + ptep = __find_linux_pte(pgdir, hva, NULL, &hpage_shift); if (ptep) { pte_t pte; @@ -272,8 +268,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, * to <= host page size, if host is using hugepage */ if (host_pte_size < psize) { - if (!realmode) - local_irq_restore(flags); + __end_lockless_pgtbl_walk(kvm->mm, irq_mask, !realmode); return H_PARAMETER; } pte = kvmppc_read_update_linux_pte(ptep, writing); @@ -287,8 +282,6
[PATCH v5 07/11] powerpc/kvm/e500: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring lockless pgtable walks on kvmppc_e500_shadow_map(). Fixes the place where local_irq_restore() is called: previously, if ptep was NULL, local_irq_restore() would never be called. local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk, so there is no need to repeat it here. Variable that saves the irq mask was renamed from flags to irq_mask so it doesn't lose meaning now it's not directly passed to local_irq_* functions. Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/e500_mmu_host.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 321db0fdb9db..36f07c6a5f10 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -336,7 +336,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, pte_t *ptep; unsigned int wimg = 0; pgd_t *pgdir; - unsigned long flags; + unsigned long irq_mask; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_notifier_seq; @@ -473,7 +473,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, * We are holding kvm->mmu_lock so a notifier invalidate * can't run hence pfn won't change. 
*/ - local_irq_save(flags); + irq_mask = begin_lockless_pgtbl_walk(kvm->mm); ptep = find_linux_pte(pgdir, hva, NULL, NULL); if (ptep) { pte_t pte = READ_ONCE(*ptep); @@ -481,15 +481,16 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (pte_present(pte)) { wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - local_irq_restore(flags); } else { - local_irq_restore(flags); + end_lockless_pgtbl_walk(kvm->mm, irq_mask); pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n", __func__, (long)gfn, pfn); ret = -EINVAL; goto out; } } + end_lockless_pgtbl_walk(kvm->mm, irq_mask); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, -- 2.20.1
[PATCH v5 06/11] powerpc/mm/book3s64/hash: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring all hash-related functions that do lockless pagetable walks. hash_page_mm: Adds comment that explain that there is no need to local_int_disable/save given that it is only called from DataAccess interrupt, so interrupts are already disabled. local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk, so there is no need to repeat it here. Variable that saves the irq mask was renamed from flags to irq_mask so it doesn't lose meaning now it's not directly passed to local_irq_* functions. Signed-off-by: Leonardo Bras --- arch/powerpc/mm/book3s64/hash_tlb.c | 6 +++--- arch/powerpc/mm/book3s64/hash_utils.c | 27 +-- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 4a70d8dd39cd..b0ef67d8c88a 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -194,7 +194,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, { bool is_thp; int hugepage_shift; - unsigned long flags; + unsigned long irq_mask; start = _ALIGN_DOWN(start, PAGE_SIZE); end = _ALIGN_UP(end, PAGE_SIZE); @@ -209,7 +209,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, * to being hashed). This is not the most performance oriented * way to do things but is fine for our needs here. 
*/ - local_irq_save(flags); + irq_mask = begin_lockless_pgtbl_walk(mm); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { pte_t *ptep = find_current_mm_pte(mm->pgd, start, &is_thp, @@ -229,7 +229,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, hpte_need_flush(mm, start, ptep, pte, hugepage_shift); } arch_leave_lazy_mmu_mode(); - local_irq_restore(flags); + end_lockless_pgtbl_walk(mm, irq_mask); } void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 6c123760164e..7a01a12a19bb 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1313,12 +1313,16 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); #endif /* CONFIG_PPC_64K_PAGES */ - /* Get PTE and page size from page tables */ + /* Get PTE and page size from page tables : +* Called in from DataAccess interrupt (data_access_common: 0x300), +* interrupts are disabled here. 
+*/ + __begin_lockless_pgtbl_walk(mm, false); ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); rc = 1; - goto bail; + goto bail_pgtbl_walk; } /* Add _PAGE_PRESENT to the required access perm */ @@ -1331,7 +1335,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, if (!check_pte_access(access, pte_val(*ptep))) { DBG_LOW(" no access !\n"); rc = 1; - goto bail; + goto bail_pgtbl_walk; } if (hugeshift) { @@ -1355,7 +1359,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, if (current->mm == mm) check_paca_psize(ea, mm, psize, user_region); - goto bail; + goto bail_pgtbl_walk; } #ifndef CONFIG_PPC_64K_PAGES @@ -1429,6 +1433,8 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, #endif DBG_LOW(" -> rc=%d\n", rc); +bail_pgtbl_walk: + __end_lockless_pgtbl_walk(mm, 0, false); bail: exception_exit(prev_state); return rc; @@ -1517,7 +1523,7 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, unsigned long vsid; pgd_t *pgdir; pte_t *ptep; - unsigned long flags; + unsigned long irq_mask; int rc, ssize, update_flags = 0; unsigned long access = _PAGE_PRESENT | _PAGE_READ | (is_exec ? _PAGE_EXEC : 0); @@ -1539,11 +1545,12 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, vsid = get_user_vsid(&mm->context, ea, ssize); if (!vsid) return; + /* * Hash doesn't like irqs. Walking linux page table with irq disabled * saves us from holding multiple locks. */ - local_irq_save(flags); + irq_mask = begin_lockless_pgtbl_walk(mm); /* * THP pages use update_mmu_cache_pmd. We don't do @@ -1588,7 +1595,7 @@ static void hash_preload(struct mm_struct *mm, unsigned long ea, mm_ctx_user_psize(&mm->context)
[PATCH v5 04/11] powerpc/mce_power: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring lockless pgtable walks on addr_to_pfn(). local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk, so there is no need to repeat it here. Signed-off-by: Leonardo Bras --- arch/powerpc/kernel/mce_power.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 1cbf7f1a4e3d..7f888fb6f65c 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -29,7 +29,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { pte_t *ptep; unsigned int shift; - unsigned long pfn, flags; + unsigned long pfn, irq_mask; struct mm_struct *mm; if (user_mode(regs)) @@ -37,7 +37,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) else mm = &init_mm; - local_irq_save(flags); + irq_mask = begin_lockless_pgtbl_walk(mm); ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift); if (!ptep || pte_special(*ptep)) { @@ -53,7 +53,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) } out: - local_irq_restore(flags); + end_lockless_pgtbl_walk(mm, irq_mask); return pfn; } -- 2.20.1
[PATCH v5 01/11] asm-generic/pgtable: Adds generic functions to monitor lockless pgtable walks
It's necessary to monitor lockless pagetable walks, in order to avoid doing THP splitting/collapsing during them. Some methods rely on irq enable/disable, but that can be slow in cases where a lot of cpus are used for the process, given all these cpus have to run an IPI. In order to speedup some cases, I propose a refcount-based approach, that counts the number of lockless pagetable walks happening on the process. If this count is zero, it skips the irq-oriented method. Given that there are lockless pagetable walks on generic code, it's necessary to create documented generic functions that may be enough for most archs, but left open to arch-specific implementations. This method does not exclude the current irq-oriented method. It works as a complement to skip unnecessary waiting. begin_lockless_pgtbl_walk(mm) Insert before starting any lockless pgtable walk end_lockless_pgtbl_walk(mm) Insert after the end of any lockless pgtable walk (Mostly after the ptep is last used) running_lockless_pgtbl_walk(mm) Returns the number of lockless pgtable walks running While there is no config option, the method is disabled and these functions are only doing what was already needed for lockless pagetable walks (disabling interrupt). A memory barrier was also added just to make sure there is no speculative read outside the interrupt disabled area. 
Signed-off-by: Leonardo Bras --- include/asm-generic/pgtable.h | 58 +++ include/linux/mm_types.h | 11 +++ kernel/fork.c | 3 ++ 3 files changed, 72 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 818691846c90..3043ea9812d5 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1171,6 +1171,64 @@ static inline bool arch_has_pfn_modify_check(void) #endif #endif +#ifndef __HAVE_ARCH_LOCKLESS_PGTBL_WALK_CONTROL +static inline unsigned long begin_lockless_pgtbl_walk(struct mm_struct *mm) +{ + unsigned long irq_mask; + + if (IS_ENABLED(CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING)) + atomic_inc(&mm->lockless_pgtbl_walkers); + + /* +* Interrupts must be disabled during the lockless page table walk. +* That's because the deleting or splitting involves flushing TLBs, +* which in turn issues interrupts, that will block when disabled. +*/ + local_irq_save(irq_mask); + + /* +* This memory barrier pairs with any code that is either trying to +* delete page tables, or split huge pages. Without this barrier, +* the page tables could be read speculatively outside of interrupt +* disabling. +*/ + smp_mb(); + + return irq_mask; +} + +static inline void end_lockless_pgtbl_walk(struct mm_struct *mm, + unsigned long irq_mask) +{ + /* +* This memory barrier pairs with any code that is either trying to +* delete page tables, or split huge pages. Without this barrier, +* the page tables could be read speculatively outside of interrupt +* disabling. +*/ + smp_mb(); + + /* +* Interrupts must be disabled during the lockless page table walk. +* That's because the deleting or splitting involves flushing TLBs, +* which in turn issues interrupts, that will block when disabled. 
+*/ + local_irq_restore(irq_mask); + + if (IS_ENABLED(CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING)) + atomic_dec(&mm->lockless_pgtbl_walkers); +} + +static inline int running_lockless_pgtbl_walk(struct mm_struct *mm) +{ + if (IS_ENABLED(CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING)) + return atomic_read(&mm->lockless_pgtbl_walkers); + + /* If disabled, must return > 0, so it falls back to sync method */ + return 1; +} +#endif + /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fa795284..277462f0b4fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -521,6 +521,17 @@ struct mm_struct { struct work_struct async_put_work; } __randomize_layout; +#ifdef CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING + /* +* Number of callers who are doing a lockless walk of the +* page tables. Typically arches might enable this in order to +* help optimize performance, by possibly avoiding expensive +* IPIs at the wrong times. +*/ + atomic_t lockless_pgtbl_walkers; + +#endif + /* * The mm_cpumask needs to be at the end of mm_struct, because it * is dynamically sized based on nr_cpu_ids. diff --git a/kernel/fork.c b/kernel/fork.c index f9572f416126..2cbca867f5a5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -102
[PATCH v5 00/11] Introduces new count-based method for tracking lockless pagetable walks
If a process (qemu) with a lot of CPUs (128) tries to munmap() a large chunk of memory (496GB) mapped with THP, it takes an average of 275 seconds, which can cause a lot of problems for the load (in qemu case, the guest will lock for this time). Trying to find the source of this bug, I found out most of this time is spent on serialize_against_pte_lookup(). This function will take a lot of time in smp_call_function_many() if there are more than a couple of CPUs running the user process. Since it has to happen for all mapped THPs, it will take a very long time for large amounts of memory. By the docs, serialize_against_pte_lookup() is needed in order to avoid pmd_t to pte_t casting inside find_current_mm_pte(), or any lockless pagetable walk, to happen concurrently with THP splitting/collapsing. It does so by calling a do_nothing() on each CPU in mm->cpu_bitmap[], after interrupts are re-enabled. Since interrupts are (usually) disabled during lockless pagetable walk, and serialize_against_pte_lookup will only return after interrupts are enabled, it is protected. So, by what I could understand, if there is no lockless pagetable walk running, there is no need to call serialize_against_pte_lookup(). So, to avoid the cost of running serialize_against_pte_lookup(), I propose a counter that keeps track of how many find_current_mm_pte() are currently running, and if there is none, just skip smp_call_function_many(). The related functions are: begin_lockless_pgtbl_walk() Insert before starting any lockless pgtable walk end_lockless_pgtbl_walk() Insert after the end of any lockless pgtable walk (Mostly after the ptep is last used) running_lockless_pgtbl_walk() Returns the number of lockless pgtable walks running On my workload (qemu), I could see munmap's time reduction from 275 seconds to 418ms. Also, I documented some lockless pagetable walks in which it's not necessary to keep track, given they work on init_mm or guest pgd. 
The patchset works by focusing all steps needed to begin/end lockless pagetable walks on the above functions, and then adding the config option to enable the tracking of these functions using the counting method. Changes since v4: Rebased on top of v5.4-rc1 Declared real generic functions instead of dummies start_lockless_pgtbl_walk renamed to begin_lockless_pgtbl_walk Interrupt {dis,en}able is now inside of {begin,end}_lockless_pgtbl_walk Power implementation has option to not {dis,en}able interrupt More documentation inside the functions. Some irq mask variables renamed Removed some proxy mm_structs Few typos fixed Changes since v3: Explain (comments) why some lockless pgtbl walks don't need local_irq_disable (real mode + MSR_EE=0) Explain (comments) places where counting method is not needed (guest pgd, which is not touched by THP) Fixes some misplaced local_irq_restore() Link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=132417 Changes since v2: Rebased to v5.3 Adds support on __get_user_pages_fast Adds usage description to *_lockless_pgtbl_walk() Better style to dummy functions Link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=131839 Changes since v1: Isolated atomic operations in functions *_lockless_pgtbl_walk() Fixed behavior of decrementing before last ptep was used Link: http://patchwork.ozlabs.org/patch/1163093/ Leonardo Bras (11): asm-generic/pgtable: Adds generic functions to monitor lockless pgtable walks powerpc/mm: Adds counting method to monitor lockless pgtable walks mm/gup: Applies counting method to monitor gup_pgd_range powerpc/mce_power: Applies counting method to monitor lockless pgtbl walks powerpc/perf: Applies counting method to monitor lockless pgtbl walks powerpc/mm/book3s64/hash: Applies counting method to monitor lockless pgtbl walks powerpc/kvm/e500: Applies counting method to monitor lockless pgtbl walks powerpc/kvm/book3s_hv: Applies counting method to monitor lockless pgtbl walks 
powerpc/kvm/book3s_64: Applies counting method to monitor lockless pgtbl walks mm/Kconfig: Adds config option to track lockless pagetable walks powerpc/mm/book3s64/pgtable: Uses counting method to skip serializing arch/powerpc/include/asm/book3s/64/pgtable.h | 9 ++ arch/powerpc/kernel/mce_power.c | 6 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 6 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 34 +- arch/powerpc/kvm/book3s_64_vio_hv.c | 7 +- arch/powerpc/kvm/book3s_hv_nested.c | 22 +++- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 32 ++--- arch/powerpc/kvm/e500_mmu_host.c | 9 +- arch/powerpc/mm/book3s64/hash_tlb.c | 6 +- arch/powerpc/mm/book3s64/hash_utils.c| 27 +++-- arch/powerpc/mm/book3s64/pgtable.c | 120 ++- arch/powerpc/perf/callchain.c| 6 +- include/asm-generic/pgtable.h
[PATCH v5 03/11] mm/gup: Applies counting method to monitor gup_pgd_range
As described, gup_pgd_range is a lockless pagetable walk. So, in order to monitor against THP split/collapse with the counting method, it's necessary to bound it with {begin,end}_lockless_pgtbl_walk. local_irq_{save,restore} is already inside {begin,end}_lockless_pgtbl_walk, so there is no need to repeat it here. Signed-off-by: Leonardo Bras --- mm/gup.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 23a9f9c9d377..52e53b4f39d8 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2319,7 +2319,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { unsigned long len, end; - unsigned long flags; + unsigned long irq_mask; int nr = 0; start = untagged_addr(start) & PAGE_MASK; @@ -2345,9 +2345,9 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { - local_irq_save(flags); + irq_mask = begin_lockless_pgtbl_walk(current->mm); gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); - local_irq_restore(flags); + end_lockless_pgtbl_walk(current->mm, irq_mask); } return nr; @@ -2414,9 +2414,9 @@ int get_user_pages_fast(unsigned long start, int nr_pages, if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { - local_irq_disable(); + begin_lockless_pgtbl_walk(current->mm); gup_pgd_range(addr, end, gup_flags, pages, &nr); - local_irq_enable(); + end_lockless_pgtbl_walk(current->mm, IRQS_ENABLED); ret = nr; } -- 2.20.1
[PATCH v5 02/11] powerpc/mm: Adds counting method to monitor lockless pgtable walks
It's necessary to monitor lockless pagetable walks, in order to avoid doing THP splitting/collapsing during them. On powerpc, we need to do some lockless pagetable walks from functions that already have disabled interrupts, specially from real mode with MSR[EE=0]. In these contexts, disabling/enabling interrupts can be very troubling. So, this arch-specific implementation features functions with an extra argument that allows interrupt enable/disable to be skipped: __begin_lockless_pgtbl_walk() and __end_lockless_pgtbl_walk(). Functions similar to the generic ones are also exported, by calling the above functions with parameter *able_irq = false. While there is no config option, the method is disabled and these functions are only doing what was already needed to lockless pagetable walks (disabling interrupt). A memory barrier was also added just to make sure there is no speculative read outside the interrupt disabled area. Signed-off-by: Leonardo Bras --- arch/powerpc/include/asm/book3s/64/pgtable.h | 9 ++ arch/powerpc/mm/book3s64/pgtable.c | 117 +++ 2 files changed, 126 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index b01624e5c467..8330b35cd28d 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1372,5 +1372,14 @@ static inline bool pgd_is_leaf(pgd_t pgd) return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); } +#define __HAVE_ARCH_LOCKLESS_PGTBL_WALK_CONTROL +unsigned long begin_lockless_pgtbl_walk(struct mm_struct *mm); +unsigned long __begin_lockless_pgtbl_walk(struct mm_struct *mm, + bool disable_irq); +void end_lockless_pgtbl_walk(struct mm_struct *mm, unsigned long irq_mask); +void __end_lockless_pgtbl_walk(struct mm_struct *mm, unsigned long irq_mask, + bool enable_irq); +int running_lockless_pgtbl_walk(struct mm_struct *mm); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git 
a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 75483b40fcb1..ae557fdce9a3 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -98,6 +98,123 @@ void serialize_against_pte_lookup(struct mm_struct *mm) smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); } +/* + * Counting method to monitor lockless pagetable walks: + * Uses begin_lockless_pgtbl_walk and end_lockless_pgtbl_walk to track the + * number of lockless pgtable walks happening, and + * running_lockless_pgtbl_walk to return this value. + */ + +/* begin_lockless_pgtbl_walk: Must be inserted before a function call that does + * lockless pagetable walks, such as __find_linux_pte(). + * This version allows setting disable_irq=false, so irqs are not touched, which + * is quite useful for running when ints are already disabled (like real-mode) + */ + +inline unsigned long __begin_lockless_pgtbl_walk(struct mm_struct *mm, +bool disable_irq) +{ + unsigned long irq_mask = 0; + + if (IS_ENABLED(CONFIG_LOCKLESS_PAGE_TABLE_WALK_TRACKING)) + atomic_inc(&mm->lockless_pgtbl_walkers); + + /* +* Interrupts must be disabled during the lockless page table walk. +* That's because the deleting or splitting involves flushing TLBs, +* which in turn issues interrupts, that will block when disabled. +* +* When this function is called from realmode with MSR[EE=0], +* it's not needed to touch irq, since it's already disabled. +*/ + if (disable_irq) + local_irq_save(irq_mask); + + /* +* This memory barrier pairs with any code that is either trying to +* delete page tables, or split huge pages. Without this barrier, +* the page tables could be read speculatively outside of interrupt +* disabling or reference counting. +*/ + smp_mb(); + + return irq_mask; +} +EXPORT_SYMBOL(__begin_lockless_pgtbl_walk); + +/* begin_lockless_pgtbl_walk: Must be inserted before a function call that does + * lockless pagetable walks, such as __find_linux_pte(). 
+ * This version is used by generic code, and always assume irqs being disabled + */ +unsigned long begin_lockless_pgtbl_walk(struct mm_struct *mm) +{ + return __begin_lockless_pgtbl_walk(mm, true); +} +EXPORT_SYMBOL(begin_lockless_pgtbl_walk); + +/* + * __end_lockless_pgtbl_walk: Must be inserted after the last use of a pointer + * returned by a lockless pagetable walk, such as __find_linux_pte() + * This version allows setting enable_irq=false, so irqs are not touched, which + * is quite useful for running when ints are already disabled (like real-mode) + */ +inline void __end_lockless_pgtbl_walk(struct mm_struct *mm, + unsi
Re: [PATCH v6 6/9] ima: make process_buffer_measurement() non static
[Cc'ing Prakhar] On Fri, 2019-09-27 at 10:25 -0400, Nayna Jain wrote: > To add the support for checking against blacklist, it would be needed > to add an additional measurement record that identifies the record > as blacklisted. > > This patch modifies the process_buffer_measurement() and makes it > non static to be used by blacklist functionality. It modifies the > function to handle more than just the KEXEC_CMDLINE. > > Signed-off-by: Nayna Jain Making process_buffer_measurement() non static is the end result, not the reason for the change. The reason for changing process_buffer_measurement() is to make it more generic. The blacklist measurement record is the usecase. Please rewrite the patch description. thanks, Mimi
Re: [PATCH v6 3/9] powerpc: add support to initialize ima policy rules
On Tue, 2019-10-01 at 12:07 -0400, Nayna wrote: > > On 09/30/2019 09:04 PM, Thiago Jung Bauermann wrote: > > Hello, > > Hi, > > > > >> diff --git a/arch/powerpc/kernel/ima_arch.c > >> b/arch/powerpc/kernel/ima_arch.c > >> new file mode 100644 > >> index ..39401b67f19e > >> --- /dev/null > >> +++ b/arch/powerpc/kernel/ima_arch.c > >> @@ -0,0 +1,33 @@ > >> +// SPDX-License-Identifier: GPL-2.0 > >> +/* > >> + * Copyright (C) 2019 IBM Corporation > >> + * Author: Nayna Jain > >> + */ > >> + > >> +#include > >> +#include > >> + > >> +bool arch_ima_get_secureboot(void) > >> +{ > >> + return is_powerpc_os_secureboot_enabled(); > >> +} > >> + > >> +/* Defines IMA appraise rules for secureboot */ > >> +static const char *const arch_rules[] = { > >> + "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig", > >> +#if !IS_ENABLED(CONFIG_MODULE_SIG) > >> + "appraise func=MODULE_CHECK appraise_type=imasig|modsig", > >> +#endif > >> + NULL > >> +}; > >> + > >> +/* > >> + * Returns the relevant IMA arch policies based on the system secureboot > >> state. > >> + */ > >> +const char *const *arch_get_ima_policy(void) > >> +{ > >> + if (is_powerpc_os_secureboot_enabled()) > >> + return arch_rules; > >> + > >> + return NULL; > >> +} > > If CONFIG_MODULE_SIG is enabled but module signatures aren't enforced, > > then IMA won't enforce module signature either. x86's > > arch_get_ima_policy() calls set_module_sig_enforced(). Doesn't the > > powerpc version need to do that as well? > > > > On the flip side, if module signatures are enforced by the module > > subsystem then IMA will verify the signature a second time since there's > > no sharing of signature verification results between the module > > subsystem and IMA (this was observed by Mimi). > > > > IMHO this is a minor issue, since module loading isn't a hot path and > > the duplicate work shouldn't impact anything. 
But it could be avoided by > > having a NULL entry in arch_rules, which arch_get_ima_policy() would > > dynamically update with the "appraise func=MODULE_CHECK" rule if > > is_module_sig_enforced() is true. > > Thanks Thiago for reviewing. I am wondering that this will give two > meanings for NULL. Can we do something like below, there are possibly > two options ? > > 1. Set IMA_APPRAISED in the iint->flags if is_module_sig_enforced(). > > OR > > 2. Let ima_get_action() check for is_module_sig_enforced() when policy > is appraise and func is MODULE_CHECK. I'm a bit hesitant about mixing the module subsystem signature verification method with the IMA measure "template=ima-modsig" rules. Does it actually work? We can at least limit verifying the same appended signature twice to when "module.sig_enforce" is specified on the boot command line, by changing "!IS_ENABLED(CONFIG_MODULE_SIG)" to test "CONFIG_MODULE_SIG_FORCE". Mimi
[PATCH] macintosh/ams-input: switch to using input device polling mode
Now that instances of input_dev support polling mode natively, we no longer need to create input_polled_dev instance. Signed-off-by: Dmitry Torokhov --- drivers/macintosh/Kconfig | 1 - drivers/macintosh/ams/ams-input.c | 37 +++ drivers/macintosh/ams/ams.h | 4 ++-- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index 574e122ae105..da6a943ad746 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -247,7 +247,6 @@ config PMAC_RACKMETER config SENSORS_AMS tristate "Apple Motion Sensor driver" depends on PPC_PMAC && !PPC64 && INPUT && ((ADB_PMU && I2C = y) || (ADB_PMU && !I2C) || I2C) - select INPUT_POLLDEV help Support for the motion sensor included in PowerBooks. Includes implementations for PMU and I2C. diff --git a/drivers/macintosh/ams/ams-input.c b/drivers/macintosh/ams/ams-input.c index 06a96b3f11de..0da493d449b2 100644 --- a/drivers/macintosh/ams/ams-input.c +++ b/drivers/macintosh/ams/ams-input.c @@ -25,9 +25,8 @@ MODULE_PARM_DESC(invert, "Invert input data on X and Y axis"); static DEFINE_MUTEX(ams_input_mutex); -static void ams_idev_poll(struct input_polled_dev *dev) +static void ams_idev_poll(struct input_dev *idev) { - struct input_dev *idev = dev->input; s8 x, y, z; mutex_lock(&ams_info.lock); @@ -59,14 +58,10 @@ static int ams_input_enable(void) ams_info.ycalib = y; ams_info.zcalib = z; - ams_info.idev = input_allocate_polled_device(); - if (!ams_info.idev) + input = input_allocate_device(); + if (!input) return -ENOMEM; - ams_info.idev->poll = ams_idev_poll; - ams_info.idev->poll_interval = 25; - - input = ams_info.idev->input; input->name = "Apple Motion Sensor"; input->id.bustype = ams_info.bustype; input->id.vendor = 0; @@ -75,28 +70,32 @@ static int ams_input_enable(void) input_set_abs_params(input, ABS_X, -50, 50, 3, 0); input_set_abs_params(input, ABS_Y, -50, 50, 3, 0); input_set_abs_params(input, ABS_Z, -50, 50, 3, 0); + input_set_capability(input, EV_KEY, 
BTN_TOUCH); - set_bit(EV_ABS, input->evbit); - set_bit(EV_KEY, input->evbit); - set_bit(BTN_TOUCH, input->keybit); + error = input_setup_polling(input, ams_idev_poll); + if (error) + goto err_free_input; - error = input_register_polled_device(ams_info.idev); - if (error) { - input_free_polled_device(ams_info.idev); - ams_info.idev = NULL; - return error; - } + input_set_poll_interval(input, 25); + error = input_register_device(input); + if (error) + goto err_free_input; + + ams_info.idev = input; joystick = true; return 0; + +err_free_input: + input_free_device(input); + return error; } static void ams_input_disable(void) { if (ams_info.idev) { - input_unregister_polled_device(ams_info.idev); - input_free_polled_device(ams_info.idev); + input_unregister_device(ams_info.idev); ams_info.idev = NULL; } diff --git a/drivers/macintosh/ams/ams.h b/drivers/macintosh/ams/ams.h index fe8d596f9845..935bdd9cd9a6 100644 --- a/drivers/macintosh/ams/ams.h +++ b/drivers/macintosh/ams/ams.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include -#include +#include #include #include #include @@ -51,7 +51,7 @@ struct ams { #endif /* Joystick emulation */ - struct input_polled_dev *idev; + struct input_dev *idev; __u16 bustype; /* calibrated null values */ -- 2.23.0.444.g18eeb5a265-goog -- Dmitry
Re: [PATCH v6 8/9] ima: deprecate permit_directio, instead use appraise_flag
Hi Nayna, On Fri, 2019-09-27 at 10:25 -0400, Nayna Jain wrote: > This patch deprecates the existing permit_directio flag, instead adds > it as possible value to appraise_flag parameter. > For eg. > appraise_flag=permit_directio Defining a generic "appraise_flag=", which supports different options, is the right direction. I would really like to depreciate the "permit_directio" flag, not just change the policy syntax. For now, let's drop this change. Mimi
Re: [PATCH v6 7/9] ima: check against blacklisted hashes for files with modsig
On Fri, 2019-09-27 at 10:25 -0400, Nayna Jain wrote: > Asymmetric private keys are used to sign multiple files. The kernel > currently support checking against the blacklisted keys. However, if the > public key is blacklisted, any file signed by the blacklisted key will > automatically fail signature verification. We might not want to blacklist > all the files signed by a particular key, but just a single file. > Blacklisting the public key is not fine enough granularity. > > This patch adds support for blacklisting binaries with appended signatures, > based on the IMA policy. Defined is a new policy option > "appraise_flag=check_blacklist". > > Signed-off-by: Nayna Jain > --- > Documentation/ABI/testing/ima_policy | 1 + > security/integrity/ima/ima.h | 12 + > security/integrity/ima/ima_appraise.c | 35 +++ > security/integrity/ima/ima_main.c | 8 -- > security/integrity/ima/ima_policy.c | 10 ++-- > security/integrity/integrity.h| 1 + > 6 files changed, 63 insertions(+), 4 deletions(-) > > diff --git a/Documentation/ABI/testing/ima_policy > b/Documentation/ABI/testing/ima_policy > index 29ebe9afdac4..4c97afcc0f3c 100644 > --- a/Documentation/ABI/testing/ima_policy > +++ b/Documentation/ABI/testing/ima_policy > @@ -25,6 +25,7 @@ Description: > lsm:[[subj_user=] [subj_role=] [subj_type=] >[obj_user=] [obj_role=] [obj_type=]] > option: [[appraise_type=]] [template=] [permit_directio] > + [appraise_flag=[check_blacklist]] > base: func:= > [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK] > [FIRMWARE_CHECK] > [KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK] > diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h > index 9bf509217e8e..2c034728b239 100644 > --- a/security/integrity/ima/ima.h > +++ b/security/integrity/ima/ima.h > @@ -254,6 +254,9 @@ int ima_policy_show(struct seq_file *m, void *v); > #define IMA_APPRAISE_KEXEC 0x40 > > #ifdef CONFIG_IMA_APPRAISE > +int ima_blacklist_measurement(struct integrity_iint_cache *iint, > + const struct 
modsig *modsig, int action, > + int pcr, struct ima_template_desc *template_desc); > int ima_appraise_measurement(enum ima_hooks func, >struct integrity_iint_cache *iint, >struct file *file, const unsigned char *filename, > @@ -269,6 +272,15 @@ int ima_read_xattr(struct dentry *dentry, > struct evm_ima_xattr_data **xattr_value); > > #else > +static inline int ima_blacklist_measurement(struct integrity_iint_cache > *iint, > + const struct modsig *modsig, > + int action, int pcr, > + struct ima_template_desc > + *template_desc) > +{ > + return 0; > +} > + > static inline int ima_appraise_measurement(enum ima_hooks func, > struct integrity_iint_cache *iint, > struct file *file, > diff --git a/security/integrity/ima/ima_appraise.c > b/security/integrity/ima/ima_appraise.c > index 136ae4e0ee92..a5a82e870e24 100644 > --- a/security/integrity/ima/ima_appraise.c > +++ b/security/integrity/ima/ima_appraise.c > @@ -12,6 +12,7 @@ > #include > #include > #include > +#include > > #include "ima.h" > > @@ -303,6 +304,40 @@ static int modsig_verify(enum ima_hooks func, const > struct modsig *modsig, > return rc; > } > > +/* > + * ima_blacklist_measurement - checks if the file measurement is blacklisted > + * > + * Returns -EKEYREJECTED if the hash is blacklisted. > + */ In the function comment, please mention that blacklisted binaries are included in the IMA measurement list. 
> +int ima_blacklist_measurement(struct integrity_iint_cache *iint, > + const struct modsig *modsig, int action, int pcr, > + struct ima_template_desc *template_desc) > +{ > + enum hash_algo hash_algo; > + const u8 *digest = NULL; > + u32 digestsize = 0; > + u32 secid; > + int rc = 0; > + > + if (!(iint->flags & IMA_CHECK_BLACKLIST)) > + return 0; > + > + if (iint->flags & IMA_MODSIG_ALLOWED) { > + security_task_getsecid(current, &secid); > + ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); > + > + rc = is_hash_blacklisted(digest, digestsize, "bin"); > + > + /* Returns -EKEYREJECTED on blacklisted hash found */ is_hash_blacklisted() returning -EKEYREJECTED makes sense if the key is blacklisted, not so much for a binary. It would make more sense to define is_binary_blacklis
[PATCH 3.16 76/87] perf/ioctl: Add check for the sample_period value
3.16.75-rc1 review patch. If anyone has any objections, please let me know. -- From: Ravi Bangoria commit 913a90bc5a3a06b1f04c337320e9aeee2328dd77 upstream. perf_event_open() limits the sample_period to 63 bits. See: 0819b2e30ccb ("perf: Limit perf_event_attr::sample_period to 63 bits") Make ioctl() consistent with it. Also on PowerPC, negative sample_period could cause a recursive PMIs leading to a hang (reported when running perf-fuzzer). Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: a...@kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: ma...@linux.vnet.ibm.com Cc: m...@ellerman.id.au Fixes: 0819b2e30ccb ("perf: Limit perf_event_attr::sample_period to 63 bits") Link: https://lkml.kernel.org/r/20190604042953.914-1-ravi.bango...@linux.ibm.com Signed-off-by: Ingo Molnar [bwh: Backported to 3.16: adjust context] Signed-off-by: Ben Hutchings --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3823,6 +3823,9 @@ static int perf_event_period(struct perf if (perf_event_check_period(event, value)) return -EINVAL; + if (!event->attr.freq && (value & (1ULL << 63))) + return -EINVAL; + task = ctx->task; pe.value = value;
Re: [PATCH v4 0/5] Early node associativity
Hi Srikar, Srikar Dronamraju writes: > Abdul reported a warning on a shared lpar. > "WARNING: workqueue cpumask: online intersect > possible intersect". > This is because per node workqueue possible mask is set very early in the > boot process even before the system was querying the home node > associativity. However per node workqueue online cpumask gets updated > dynamically. Hence there is a chance when per node workqueue online cpumask > is a superset of per node workqueue possible mask. Sorry for the delay in following up on these. The series looks good to me, and I've given it a little testing with LPM and DLPAR. I've also verified that the cpu assignments occur early as intended on an LPAR where that workqueue warning had been triggered.
Re: [PATCH v4 5/5] powerpc/numa: Remove late request for home node associativity
Srikar Dronamraju writes: > With commit ("powerpc/numa: Early request for home node associativity"), > commit 2ea626306810 ("powerpc/topology: Get topology for shared > processors at boot") which was requesting home node associativity > becomes redundant. > > Hence remove the late request for home node associativity. Reviewed-by: Nathan Lynch
Re: [PATCH v4 4/5] powerpc/numa: Early request for home node associativity
Srikar Dronamraju writes: > Currently the kernel detects if its running on a shared lpar platform > and requests home node associativity before the scheduler sched_domains > are setup. However between the time NUMA setup is initialized and the > request for home node associativity, workqueue initializes its per node > cpumask. The per node workqueue possible cpumask may turn invalid > after home node associativity resulting in weird situations like > workqueue possible cpumask being a subset of workqueue online cpumask. > > This can be fixed by requesting home node associativity earlier just > before NUMA setup. However at the NUMA setup time, kernel may not be in > a position to detect if its running on a shared lpar platform. So > request for home node associativity and if the request fails, fallback > on the device tree property. Reviewed-by: Nathan Lynch
Re: [PATCH v4 3/5] powerpc/numa: Use cpu node map of first sibling thread
Srikar Dronamraju writes: > All the sibling threads of a core have to be part of the same node. > To ensure that all the sibling threads map to the same node, always > lookup/update the cpu-to-node map of the first thread in the core. Reviewed-by: Nathan Lynch
Re: [PATCH v4 2/5] powerpc/numa: Handle extra hcall_vphn error cases
Srikar Dronamraju writes: > Currently code handles H_FUNCTION, H_SUCCESS, H_HARDWARE return codes. > However hcall_vphn can return other return codes. Now it also handles > H_PARAMETER return code. Also the rest of the return codes are handled under the > default case. Reviewed-by: Nathan Lynch However: > + case H_PARAMETER: > + pr_err_ratelimited("hcall_vphn() was passed an invalid > parameter. " > + "Disabling polling...\n"); Consider including the bad parameter value in the message?
Re: [PATCH v4 1/5] powerpc/vphn: Check for error from hcall_vphn
Srikar Dronamraju writes: > There is no value in unpacking associativity, if > H_HOME_NODE_ASSOCIATIVITY hcall has returned an error. Reviewed-by: Nathan Lynch
Re: [PATCH v8 1/5] kasan: support backing vmalloc space with real shadow memory
On Wed, Oct 02, 2019 at 11:23:06AM +1000, Daniel Axtens wrote: > Hi, > > >>/* > >> * Find a place in the tree where VA potentially will be > >> * inserted, unless it is merged with its sibling/siblings. > >> @@ -741,6 +752,10 @@ merge_or_add_vmap_area(struct vmap_area *va, > >>if (sibling->va_end == va->va_start) { > >>sibling->va_end = va->va_end; > >> > >> + kasan_release_vmalloc(orig_start, orig_end, > >> +sibling->va_start, > >> +sibling->va_end); > >> + > > The same. > > The call to kasan_release_vmalloc() is a static inline no-op if > CONFIG_KASAN_VMALLOC is not defined, which I thought was the preferred > way to do things rather than sprinkling the code with ifdefs? > I agree that is totally correct. > The compiler should be smart enough to eliminate all the > orig_state/orig_end stuff at compile time because it can see that it's > not used, so there's no cost in the binary. > It should. I was more thinking about if those two variables can be considered as unused, resulting in compile warning like "set but not used". But that is theory and in case of having any warning the test robot will notify anyway about that. So, I am totally fine with that if compiler does not complain. If so, please ignore my comments :) -- Vlad Rezki
Re: [PATCH v2 00/21] Refine memblock API
On Wed, Oct 2, 2019 at 2:36 AM Mike Rapoport wrote: > > Hi Adam, > > On Tue, Oct 01, 2019 at 07:14:13PM -0500, Adam Ford wrote: > > On Sun, Sep 29, 2019 at 8:33 AM Adam Ford wrote: > > > > > > I am attaching two logs. I now the mailing lists will be unhappy, but > > > don't want to try and spam a bunch of log through the mailing liast. > > > The two logs show the differences between the working and non-working > > > imx6q 3D accelerator when trying to run a simple glmark2-es2-drm demo. > > > > > > The only change between them is the 2 line code change you suggested. > > > > > > In both cases, I have cma=128M set in my bootargs. Historically this > > > has been sufficient, but cma=256M has not made a difference. > > > > > > > Mike any suggestions on how to move forward? > > I was hoping to get the fixes tested and pushed before 5.4 is released > > if at all possible > > I have a fix (below) that kinda restores the original behaviour, but I > still would like to double check to make sure it's not a band aid and I > haven't missed the actual root cause. > > Can you please send me your device tree definition and the output of > > cat /sys/kernel/debug/memblock/memory > > and > > cat /sys/kernel/debug/memblock/reserved > > Thanks! 
> Before the patch: # cat /sys/kernel/debug/memblock/memory 0: 0x1000..0x8fff # cat /sys/kernel/debug/memblock/reserved 0: 0x10004000..0x10007fff 1: 0x1010..0x11ab141f 2: 0x1fff1000..0x1fffcfff 3: 0x2ee4..0x2ef53fff 4: 0x2ef56940..0x2ef56c43 5: 0x2ef56c48..0x2fffefff 6: 0x20c0..0x24d8 7: 0x2500..0x255f 8: 0x2580..0x2703 9: 0x2740..0x2918 10: 0x2940..0x29cf 11: 0x2a00..0x2a0f 12: 0x2a40..0x2a43 13: 0x2a80..0x2ad5 14: 0x2b00..0x2b55 15: 0x2b80..0x2bd5 16: 0x2c00..0x2c4e 17: 0x2c50..0x2c6a 18: 0x2c6c..0x2ce6 19: 0x2ce8..0x2d02 20: 0x2d04..0x2d1e 21: 0x2d20..0x2d3a 22: 0x2d3c..0x2d56 23: 0x2d58..0x2e30 24: 0x2e34..0x2e4c 25: 0x2e50..0x2e68 26: 0x2e6c..0x2e84 27: 0x2e88..0x2ea0 28: 0x2ea4..0x2ebc 29: 0x2ec0..0x2edf 30: 0x2ee4..0x2efc 31: 0x2f00..0x2f13 32: 0x2f28..0x2f4b 33: 0x2f50..0x2f84 34: 0x2f88..0x3fff After the patch: # cat /sys/kernel/debug/memblock/memory 0: 0x1000..0x8fff # cat /sys/kernel/debug/memblock/reserved 0: 0x10004000..0x10007fff 1: 0x1010..0x11ab141f 2: 0x1fff1000..0x1fffcfff 3: 0x3eec..0x3efd3fff 4: 0x3efd6940..0x3efd6c43 5: 0x3efd6c48..0x3fffbfff 6: 0x3fffc0c0..0x3fffc4d8 7: 0x3fffc500..0x3fffc55f 8: 0x3fffc580..0x3fffc703 9: 0x3fffc740..0x3fffc918 10: 0x3fffc940..0x3fffc9cf 11: 0x3fffca00..0x3fffca0f 12: 0x3fffca40..0x3fffca43 13: 0x3fffca80..0x3fffca83 14: 0x3fffcac0..0x3fffcb15 15: 0x3fffcb40..0x3fffcb95 16: 0x3fffcbc0..0x3fffcc15 17: 0x3fffcc28..0x3fffcc72 18: 0x3fffcc74..0x3fffcc8e 19: 0x3fffcc90..0x3fffcd0a 20: 0x3fffcd0c..0x3fffcd26 21: 0x3fffcd28..0x3fffcd42 22: 0x3fffcd44..0x3fffcd5e 23: 0x3fffcd60..0x3fffcd7a 24: 0x3fffcd7c..0x3fffce54 25: 0x3fffce58..0x3fffce70 26: 0x3fffce74..0x3fffce8c 27: 0x3fffce90..0x3fffcea8 28: 0x3fffceac..0x3fffcec4 29: 0x3fffcec8..0x3fffcee0 30: 0x3fffcee4..0x3fffcefc 31: 0x3fffcf00..0x3fffcf1f 32: 0x3fffcf28..0x3fffcf53 33: 0x3fffcf68..0x3fffcf8b 34: 0x3fffcf90..0x3fffcfac 35: 0x3fffcfb0..0x3fff 36: 0x8000..0x8fff > From 06529f861772b7dea2912fc2245debe4690139b8 Mon Sep 17 00:00:00 2001 > From: Mike Rapoport > 
Date: Wed, 2 Oct 2019 10:14:17 +0300 > Subject: [PATCH] mm: memblock: do not enforce current limit for memblock_phys* > family > > Until commit 92d12f9544b7 ("memblock: refactor internal allocation > functions") the maximal address for memblock allocations was forced to > memblock.current_limit only for the allocation functions returning virtual > address. The changes introduced by that commit moved the limit enforcement > into the allocation core and as a result the allocation functions returning > physical address also started to limit allocations to > memblock.current_limit. > > This caused breakage of etnaviv GPU driver: > > [3.682347] etnaviv etnaviv: bound 13.gpu (ops gpu_ops) > [3.688669] etnaviv etnaviv: bound 134000.gpu (ops gpu_ops) > [3.695099] etnaviv etnaviv: bound 2204000.gpu (ops gpu_ops) > [3.700800] etnaviv-gpu 13.gpu: model: GC2000, revision: 5108 > [3.723013] etnaviv-gpu 13.gpu: command buffer outside valid > memory window > [3.731308] etnaviv-gpu 134000.gpu: model: GC320, revision: 5007 > [3.752437] etnaviv-gpu 134000.gpu: command buffer outside valid > memory window > [3.760583] etnaviv-gpu 2204000.gpu: model: GC355, revision: 1215 > [3.766766] etnaviv-gpu 2204000.gpu: Ignoring GPU with VG and FE2.0 > > Resto
Re: [PATCH v1 1/2] PCI/AER: Use for_each_set_bit()
Andy Shevchenko wrote: > > but I confess to being a little ambivalent. It's > > arguably a little easier to read, > > I have another opinion here. Instead of parsing body of for-loop, the name of > the function tells you exactly what it's done. Besides the fact that reading > and parsing two lines, with zero conditionals, is faster. > > > but it's not nearly as efficient > > (not a great concern here) > > David, do you know why for_each_set_bit() has no optimization for the cases > when nbits <= BITS_PER_LONG? (Actually find_*bit() family of functions) I've not had anything to do with for_each_set_bit() itself. By 'nbits', I presume you mean the size parameter - max in the sample bit of code. It would need per-arch optimisation. Some arches have an instruction to find the next bit and some don't. Using for_each_set_bit() like this is definitely suboptimal, since find_first_bit() and find_next_bit() may well be out of line. It should probably be using something like __ffs() if size <= BITS_PER_LONG. David
Re: [PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM
* Vasant Hegde [2019-10-02 13:18:56]: > Add check to validate whether requested page is part of system RAM > or not before mmap() and error out if its not part of system RAM. > > cat /proc/iomem: > - > -27 : System RAM > 28-2f : namespace0.0 > 2000-2027 : System RAM > 2028-202f : namespace1.0 > 6-6003fbfff : pciex@600c3c000 > 60040-6007f7fff : pciex@600c3c010 > > > > Sample dmesg output with this fix: > -- > [ 160.371911] opal-prd: mmap: Requested page is not part of system RAM (addr > : 0x202ffcfc, size : 0x0057) > [ 160.665366] opal-prd: mmap: Requested page is not part of system RAM (addr > : 0x202ffcfc, size : 0x0057) > [ 160.914627] opal-prd: mmap: Requested page is not part of system RAM (addr > : 0x202ffcfc, size : 0x0057) > [ 161.165253] opal-prd: mmap: Requested page is not part of system RAM (addr > : 0x202ffcfc, size : 0x0057) > [ 161.414604] opal-prd: mmap: Requested page is not part of system RAM (addr > : 0x202ffcfc, size : 0x0057) Thanks Vasant for the patch. HBRT pages landing in NVDIMM area caused the opal-prd failure. Good debug and root-cause. Thanks to Aneesh for the tip to look into ATT bits of the PTE mapping. 
> CC: Aneesh Kumar K.V > CC: Jeremy Kerr > CC: Vaidyanathan Srinivasan > Signed-off-by: Vasant Hegde Signed-off-by: Vaidyanathan Srinivasan > --- > arch/powerpc/platforms/powernv/opal-prd.c | 16 +++- > 1 file changed, 15 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/platforms/powernv/opal-prd.c > b/arch/powerpc/platforms/powernv/opal-prd.c > index 45f4223a790f..0f88752302a2 100644 > --- a/arch/powerpc/platforms/powernv/opal-prd.c > +++ b/arch/powerpc/platforms/powernv/opal-prd.c > @@ -3,7 +3,7 @@ > * OPAL Runtime Diagnostics interface driver > * Supported on POWERNV platform > * > - * Copyright IBM Corporation 2015 > + * Copyright IBM Corporation 2015-2019 > */ > > #define pr_fmt(fmt) "opal-prd: " fmt > @@ -47,6 +47,20 @@ static bool opal_prd_range_is_valid(uint64_t addr, > uint64_t size) > if (addr + size < addr) > return false; > > + /* > + * Check if region is in system RAM and error out if the address > + * belongs to special devices like NVDIMM. phys_mem_access_prot() > + * routine will change ATT bits to non cachable if page is not in > + * RAM, causing HBRT to not fetch and execute in the mapped memory > + * and fail. Page permissions can be left at default since all > + * firmware component should be in system RAM only. > + */ > + if (!page_is_ram(addr >> PAGE_SHIFT)) { > + pr_warn("mmap: Requested page is not part of system RAM " > + "(addr : 0x%016llx, size : 0x%016llx)\n", addr, size); > + return false; > + } > + > parent = of_find_node_by_path("/reserved-memory"); > if (!parent) > return false; > -- > 2.21.0 >
Re: [RFC PATCH 23/27] powerpc/64: system call implement the bulk of the logic in C
On Sun, Sep 15, 2019 at 11:28:09AM +1000, Nicholas Piggin wrote: > System call entry and particularly exit code is beyond the limit of what > is reasonable to implement in asm. > > This conversion moves all conditional branches out of the asm code, > except for the case that all GPRs should be restored at exit. > > Null syscall test is about 5% faster after this patch, because the exit > work is handled under local_irq_disable, and the hard mask and pending > interrupt replay is handled after that, which avoids games with MSR. > > Signed-off-by: Nicholas Piggin > > v3: > - Fix !KUAP build [mpe] > - Fix BookE build/boot [mpe] > - Don't trace irqs with MSR[RI]=0 > - Don't allow syscall_exit_prepare to be ftraced, because function > graph tracing which traces exits barfs after the IRQ state is > prepared for kernel exit. > - Fix BE syscall table to use normal function descriptors now that they > are called from C. > - Comment syscall_exit_prepare. ... > -#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && defined(CONFIG_PPC_SPLPAR) > -BEGIN_FW_FTR_SECTION > - /* see if there are any DTL entries to process */ > - ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ > - ld r11,PACA_DTL_RIDX(r13) /* get log read index */ > - addir10,r10,LPPACA_DTLIDX > - LDX_BE r10,0,r10 /* get log write index */ > - cmpdr11,r10 > - beq+33f > - bl accumulate_stolen_time > - REST_GPR(0,r1) > - REST_4GPRS(3,r1) > - REST_2GPRS(7,r1) > - addir9,r1,STACK_FRAME_OVERHEAD > -33: > -END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) > -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE && CONFIG_PPC_SPLPAR */ ... 
> diff --git a/arch/powerpc/kernel/syscall_64.c > b/arch/powerpc/kernel/syscall_64.c > new file mode 100644 > index ..1d2529824588 > --- /dev/null > +++ b/arch/powerpc/kernel/syscall_64.c > @@ -0,0 +1,195 @@ > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +extern void __noreturn tabort_syscall(void); > + > +typedef long (*syscall_fn)(long, long, long, long, long, long); > + > +long system_call_exception(long r3, long r4, long r5, long r6, long r7, long > r8, unsigned long r0, struct pt_regs *regs) > +{ > + unsigned long ti_flags; > + syscall_fn f; > + > + BUG_ON(!(regs->msr & MSR_PR)); > + > + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && > + unlikely(regs->msr & MSR_TS_T)) > + tabort_syscall(); > + > + account_cpu_user_entry(); > + > +#ifdef CONFIG_PPC_SPLPAR > + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && > + firmware_has_feature(FW_FEATURE_SPLPAR)) { > + struct lppaca *lp = get_lppaca(); > + > + if (unlikely(local_paca->dtl_ridx != lp->dtl_idx)) sparse complains about this, and in time.c this it converted like this: if (unlikely(local_paca->dtl_ridx != be64_to_cpu(lp->dtl_idx))) The removed code has a LDX_BE there so there should be some conversion involved, right? Thanks Michal
Re: [RFC PATCH 27/27] powerpc/64s: system call support for scv/rfscv instructions
On Sun, Sep 15, 2019 at 11:28:13AM +1000, Nicholas Piggin wrote: > Add support for the scv instruction on POWER9 and later CPUs. > > For now this implements the zeroth scv vector 'scv 0', as identical > to 'sc' system calls, with the exception that lr is not preserved, and > it is 64-bit only. There may yet be changes made to this ABI, so it's > for testing only. > > This also doesn't yet properly handle PR KVM, or the case where a guest > is denied AIL=3 mode. I haven't added real mode entry points, so scv > must not be used when AIL=0, but I haven't cleared the FSCR in this > case. > > This also implements a strange hack to handle the non-implemented > vectors, scheduling a decrementer and expecting it to interrupt and > replay pending soft masked interrupts. This is unlikely to be a good > idea, and needs to actually do a proper handler and replay in case an > interrupt is pending. > > It may also require some errata handling before it can be safely used > on all POWER9 CPUs, I have to look that up. > > rfscv is implemented to return from scv type system calls. It can not > be used to return from sc system calls because those are defined to > preserve lr. > > In a comparison of getpid syscall, the test program had scv taking > about 3 more cycles in user mode (92 vs 89 for sc), due to lr handling. > Total cycles taken for a getpid system call on POWER9 are improved from > 436 to 345 (26.3% faster), mostly due to reducing mtmsr and mtspr. ... 
> diff --git a/arch/powerpc/kernel/syscall_64.c > b/arch/powerpc/kernel/syscall_64.c > index 034b52d3d78c..3e8aa5ae8ec8 100644 > --- a/arch/powerpc/kernel/syscall_64.c > +++ b/arch/powerpc/kernel/syscall_64.c > @@ -15,6 +15,77 @@ extern void __noreturn tabort_syscall(void); > > typedef long (*syscall_fn)(long, long, long, long, long, long); > > +#ifdef CONFIG_PPC_BOOK3S > +long system_call_vectored_exception(long r3, long r4, long r5, long r6, long > r7, long r8, unsigned long r0, struct pt_regs *regs) > +{ > + unsigned long ti_flags; > + syscall_fn f; > + > + BUG_ON(!(regs->msr & MSR_RI)); > + BUG_ON(!(regs->msr & MSR_PR)); > + BUG_ON(!FULL_REGS(regs)); > + BUG_ON(regs->softe != IRQS_ENABLED); > + > + if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) && > + unlikely(regs->msr & MSR_TS_T)) > + tabort_syscall(); > + > + account_cpu_user_entry(); > + > +#ifdef CONFIG_PPC_SPLPAR > + if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE) && > + firmware_has_feature(FW_FEATURE_SPLPAR)) { > + struct lppaca *lp = get_lppaca(); > + > + if (unlikely(local_paca->dtl_ridx != lp->dtl_idx)) This adds another instance of the lack of endian conversion issue. > + accumulate_stolen_time(); > + } > +#endif Thanks Michal
Re: [RFC PATCH 24/27] powerpc/64s: interrupt return in C
On Sun, Sep 15, 2019 at 11:28:10AM +1000, Nicholas Piggin wrote: > Implement the bulk of interrupt return logic in C. The asm return code > must handle a few cases: restoring full GPRs, and emulating stack store. > > The asm return code is moved into 64e for now. The new logic has made > allowance for 64e, but I don't have a full environment that works well > to test it, and even booting in emulated qemu is not great for stress > testing. 64e shouldn't be too far off working with this, given a bit > more testing and auditing of the logic. > > This is slightly faster on a POWER9 (page fault speed increases about > 1.1%), probably due to reduced mtmsrd. > > Signed-off-by: Nicholas Piggin ... > diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c > index 24621e7e5033..45c1524b6c9e 100644 > --- a/arch/powerpc/kernel/process.c > +++ b/arch/powerpc/kernel/process.c > @@ -524,6 +524,7 @@ void giveup_all(struct task_struct *tsk) > } > EXPORT_SYMBOL(giveup_all); > > +#ifdef CONFIG_PPC_BOOK3S_64 This fails build on !BOOK3S_64 because restore_fpu and restore_altivec are used exclusively from restore_math which is now BOOK3S_64 only. > /* > * The exception exit path calls restore_math() with interrupts hard disabled > * but the soft irq state not "reconciled". ftrace code that calls > @@ -564,6 +565,7 @@ void notrace restore_math(struct pt_regs *regs) > > regs->msr = msr; > } > +#endif > > static void save_all(struct task_struct *tsk) > { Thanks Michal
Re: [RFC PATCH 00/27] current interrupt series plus scv syscall
Hello, can you mark the individual patches with RFC rather than the whole series? Thanks Michal On Sun, Sep 15, 2019 at 11:27:46AM +1000, Nicholas Piggin wrote: > My interrupt entry patches have finally collided with syscall and > interrupt exit patches, so I'll merge the series. Most patches have > been seen already, however there have been a number of small changes > and fixes throughout the series. > > The final two patches add support for 'scv' and 'rfscv' instructions. > > I'm posting this out now so we can start considering ABI and userspace > support. We have the PPC_FEATURE2_SCV hwcap for this. > > Thanks, > Nick > > Nicholas Piggin (27): > powerpc/64s/exception: Introduce INT_DEFINE parameter block for code > generation > powerpc/64s/exception: Add GEN_COMMON macro that uses INT_DEFINE > parameters > powerpc/64s/exception: Add GEN_KVM macro that uses INT_DEFINE > parameters > powerpc/64s/exception: Expand EXC_COMMON and EXC_COMMON_ASYNC macros > powerpc/64s/exception: Move all interrupt handlers to new style code > gen macros > powerpc/64s/exception: Remove old INT_ENTRY macro > powerpc/64s/exception: Remove old INT_COMMON macro > powerpc/64s/exception: Remove old INT_KVM_HANDLER > powerpc/64s/exception: Add ISIDE option > powerpc/64s/exception: move real->virt switch into the common handler > powerpc/64s/exception: move soft-mask test to common code > powerpc/64s/exception: move KVM test to common code > powerpc/64s/exception: remove confusing IEARLY option > powerpc/64s/exception: remove the SPR saving patch code macros > powerpc/64s/exception: trim unused arguments from KVMTEST macro > powerpc/64s/exception: hdecrementer avoid touching the stack > powerpc/64s/exception: re-inline some handlers > powerpc/64s/exception: Clean up SRR specifiers > powerpc/64s/exception: add more comments for interrupt handlers > powerpc/64s/exception: only test KVM in SRR interrupts when PR KVM is > supported > powerpc/64s/exception: soft nmi interrupt should not use > 
ret_from_except > powerpc/64: system call remove non-volatile GPR save optimisation > powerpc/64: system call implement the bulk of the logic in C > powerpc/64s: interrupt return in C > powerpc/64s/exception: remove lite interrupt return > powerpc/64s/exception: treat NIA below __end_interrupts as soft-masked > powerpc/64s: system call support for scv/rfscv instructions > > arch/powerpc/include/asm/asm-prototypes.h | 11 - > .../powerpc/include/asm/book3s/64/kup-radix.h | 24 +- > arch/powerpc/include/asm/cputime.h| 24 + > arch/powerpc/include/asm/exception-64s.h |4 - > arch/powerpc/include/asm/head-64.h|2 +- > arch/powerpc/include/asm/hw_irq.h |4 + > arch/powerpc/include/asm/ppc_asm.h|2 + > arch/powerpc/include/asm/processor.h |2 +- > arch/powerpc/include/asm/ptrace.h |3 + > arch/powerpc/include/asm/signal.h |3 + > arch/powerpc/include/asm/switch_to.h | 11 + > arch/powerpc/include/asm/time.h |4 +- > arch/powerpc/kernel/Makefile |3 +- > arch/powerpc/kernel/cpu_setup_power.S |2 +- > arch/powerpc/kernel/dt_cpu_ftrs.c |1 + > arch/powerpc/kernel/entry_64.S| 964 ++-- > arch/powerpc/kernel/exceptions-64e.S | 254 +- > arch/powerpc/kernel/exceptions-64s.S | 2046 - > arch/powerpc/kernel/process.c |2 + > arch/powerpc/kernel/signal.h |2 - > arch/powerpc/kernel/syscall_64.c | 422 > arch/powerpc/kernel/syscalls/syscall.tbl | 22 +- > arch/powerpc/kernel/systbl.S |9 +- > arch/powerpc/kernel/time.c|9 - > arch/powerpc/kernel/vector.S |2 +- > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 - > arch/powerpc/kvm/book3s_segment.S |7 - > 27 files changed, 2458 insertions(+), 1392 deletions(-) > create mode 100644 arch/powerpc/kernel/syscall_64.c > > -- > 2.23.0 >
[PATCH] powerpc/powernv/prd: Validate whether address to be mapped is part of system RAM
Add check to validate whether requested page is part of system RAM or not before mmap() and error out if its not part of system RAM. cat /proc/iomem: - -27 : System RAM 28-2f : namespace0.0 2000-2027 : System RAM 2028-202f : namespace1.0 6-6003fbfff : pciex@600c3c000 60040-6007f7fff : pciex@600c3c010 Sample dmesg output with this fix: -- [ 160.371911] opal-prd: mmap: Requested page is not part of system RAM (addr : 0x202ffcfc, size : 0x0057) [ 160.665366] opal-prd: mmap: Requested page is not part of system RAM (addr : 0x202ffcfc, size : 0x0057) [ 160.914627] opal-prd: mmap: Requested page is not part of system RAM (addr : 0x202ffcfc, size : 0x0057) [ 161.165253] opal-prd: mmap: Requested page is not part of system RAM (addr : 0x202ffcfc, size : 0x0057) [ 161.414604] opal-prd: mmap: Requested page is not part of system RAM (addr : 0x202ffcfc, size : 0x0057) CC: Aneesh Kumar K.V CC: Jeremy Kerr CC: Vaidyanathan Srinivasan Signed-off-by: Vasant Hegde --- arch/powerpc/platforms/powernv/opal-prd.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal-prd.c b/arch/powerpc/platforms/powernv/opal-prd.c index 45f4223a790f..0f88752302a2 100644 --- a/arch/powerpc/platforms/powernv/opal-prd.c +++ b/arch/powerpc/platforms/powernv/opal-prd.c @@ -3,7 +3,7 @@ * OPAL Runtime Diagnostics interface driver * Supported on POWERNV platform * - * Copyright IBM Corporation 2015 + * Copyright IBM Corporation 2015-2019 */ #define pr_fmt(fmt) "opal-prd: " fmt @@ -47,6 +47,20 @@ static bool opal_prd_range_is_valid(uint64_t addr, uint64_t size) if (addr + size < addr) return false; + /* +* Check if region is in system RAM and error out if the address +* belongs to special devices like NVDIMM. phys_mem_access_prot() +* routine will change ATT bits to non cachable if page is not in +* RAM, causing HBRT to not fetch and execute in the mapped memory +* and fail. 
Page permissions can be left at default since all +* firmware component should be in system RAM only. +*/ + if (!page_is_ram(addr >> PAGE_SHIFT)) { + pr_warn("mmap: Requested page is not part of system RAM " + "(addr : 0x%016llx, size : 0x%016llx)\n", addr, size); + return false; + } + parent = of_find_node_by_path("/reserved-memory"); if (!parent) return false; -- 2.21.0
Re: [PATCH v2 00/21] Refine memblock API
Hi Adam, On Tue, Oct 01, 2019 at 07:14:13PM -0500, Adam Ford wrote: > On Sun, Sep 29, 2019 at 8:33 AM Adam Ford wrote: > > > > I am attaching two logs. I now the mailing lists will be unhappy, but > > don't want to try and spam a bunch of log through the mailing liast. > > The two logs show the differences between the working and non-working > > imx6q 3D accelerator when trying to run a simple glmark2-es2-drm demo. > > > > The only change between them is the 2 line code change you suggested. > > > > In both cases, I have cma=128M set in my bootargs. Historically this > > has been sufficient, but cma=256M has not made a difference. > > > > Mike any suggestions on how to move forward? > I was hoping to get the fixes tested and pushed before 5.4 is released > if at all possible I have a fix (below) that kinda restores the original behaviour, but I still would like to double check to make sure it's not a band aid and I haven't missed the actual root cause. Can you please send me your device tree definition and the output of cat /sys/kernel/debug/memblock/memory and cat /sys/kernel/debug/memblock/reserved Thanks! >From 06529f861772b7dea2912fc2245debe4690139b8 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 2 Oct 2019 10:14:17 +0300 Subject: [PATCH] mm: memblock: do not enforce current limit for memblock_phys* family Until commit 92d12f9544b7 ("memblock: refactor internal allocation functions") the maximal address for memblock allocations was forced to memblock.current_limit only for the allocation functions returning virtual address. The changes introduced by that commit moved the limit enforcement into the allocation core and as a result the allocation functions returning physical address also started to limit allocations to memblock.current_limit. 
This caused breakage of etnaviv GPU driver: [3.682347] etnaviv etnaviv: bound 13.gpu (ops gpu_ops) [3.688669] etnaviv etnaviv: bound 134000.gpu (ops gpu_ops) [3.695099] etnaviv etnaviv: bound 2204000.gpu (ops gpu_ops) [3.700800] etnaviv-gpu 13.gpu: model: GC2000, revision: 5108 [3.723013] etnaviv-gpu 13.gpu: command buffer outside valid memory window [3.731308] etnaviv-gpu 134000.gpu: model: GC320, revision: 5007 [3.752437] etnaviv-gpu 134000.gpu: command buffer outside valid memory window [3.760583] etnaviv-gpu 2204000.gpu: model: GC355, revision: 1215 [3.766766] etnaviv-gpu 2204000.gpu: Ignoring GPU with VG and FE2.0 Restore the behaviour of memblock_phys* family so that these functions will not enforce memblock.current_limit. Fixes: 92d12f9544b7 ("memblock: refactor internal allocation functions") Reported-by: Adam Ford Signed-off-by: Mike Rapoport --- mm/memblock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 7d4f61a..c4b16ca 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1356,9 +1356,6 @@ static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, align = SMP_CACHE_BYTES; } - if (end > memblock.current_limit) - end = memblock.current_limit; - again: found = memblock_find_in_range_node(size, align, start, end, nid, flags); @@ -1469,6 +1466,9 @@ static void * __init memblock_alloc_internal( if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, nid); + if (max_addr > memblock.current_limit) + max_addr = memblock.current_limit; + alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid); /* retry allocation without lower limit */ -- 2.7.4 > > adam > > > > On Sat, Sep 28, 2019 at 2:33 AM Mike Rapoport wrote: > > > > > > On Thu, Sep 26, 2019 at 02:35:53PM -0500, Adam Ford wrote: > > > > On Thu, Sep 26, 2019 at 11:04 AM Mike Rapoport > > > > wrote: > > > > > > > > > > Hi, > > > > > > > > > > On Thu, Sep 26, 2019 at 08:09:52AM -0500, Adam Ford wrote: > > > > 
> > On Wed, Sep 25, 2019 at 10:17 AM Fabio Estevam > > > > > > wrote: > > > > > > > > > > > > > > On Wed, Sep 25, 2019 at 9:17 AM Adam Ford > > > > > > > wrote: > > > > > > > > > > > > > > > I tried cma=256M and noticed the cma dump at the beginning > > > > > > > > didn't > > > > > > > > change. Do we need to setup a reserved-memory node like > > > > > > > > imx6ul-ccimx6ulsom.dtsi did? > > > > > > > > > > > > > > I don't think so. > > > > > > > > > > > > > > Were you able to identify what was the exact commit that caused > > > > > > > such regression? > > > > > > > > > > > > I was able to narrow it down the 92d12f9544b7 ("memblock: refactor > > > > > > internal allocation functions") that caused the regression with > > > > > > Etnaviv. > > > > > > > > > > > > > > > Can you please test with this change: > > > > > > > > > > > > > That appears to have fixed my issue. I am not sure what the impact > > > > is, but is this a safe option? > > > > > > It'
Re: [PATCH v8 1/5] kasan: support backing vmalloc space with real shadow memory
Daniel Axtens a écrit : Hi, /* * Find a place in the tree where VA potentially will be * inserted, unless it is merged with its sibling/siblings. @@ -741,6 +752,10 @@ merge_or_add_vmap_area(struct vmap_area *va, if (sibling->va_end == va->va_start) { sibling->va_end = va->va_end; + kasan_release_vmalloc(orig_start, orig_end, + sibling->va_start, + sibling->va_end); + The same. The call to kasan_release_vmalloc() is a static inline no-op if CONFIG_KASAN_VMALLOC is not defined, which I thought was the preferred way to do things rather than sprinkling the code with ifdefs? The compiler should be smart enough to eliminate all the orig_start/orig_end stuff at compile time because it can see that it's not used, so there's no cost in the binary. Daniel, You are entirely right in your way to do it, that's fully in line with Linux kernel coding style https://www.kernel.org/doc/html/latest/process/coding-style.html#conditional-compilation Christophe
Re: [PATCH v5 04/10] mm/memory_hotplug: Don't access uninitialized memmaps in shrink_zone_span()
On 02.10.19 02:06, kbuild test robot wrote: > Hi David, > > I love your patch! Perhaps something to improve: > > [auto build test WARNING on mmotm/master] > > url: > https://github.com/0day-ci/linux/commits/David-Hildenbrand/mm-memory_hotplug-Shrink-zones-before-removing-memory/20191002-054310 > base: git://git.cmpxchg.org/linux-mmotm.git master > config: x86_64-randconfig-b002-201939 (attached as .config) > compiler: gcc-7 (Debian 7.4.0-13) 7.4.0 > reproduce: > # save the attached .config to linux build tree > make ARCH=x86_64 > > If you fix the issue, kindly add following tag > Reported-by: kbuild test robot > > All warnings (new ones prefixed by >>): > >In file included from include/asm-generic/bug.h:5:0, > from arch/x86/include/asm/bug.h:83, > from include/linux/bug.h:5, > from include/linux/mmdebug.h:5, > from include/linux/mm.h:9, > from mm/memory_hotplug.c:9: >mm/memory_hotplug.c: In function '__remove_zone': >mm/memory_hotplug.c:471:24: error: 'ZONE_DEVICE' undeclared (first use in > this function); did you mean 'ZONE_MOVABLE'? > if (zone_idx(zone) == ZONE_DEVICE) >^ >include/linux/compiler.h:58:52: note: in definition of macro > '__trace_if_var' > #define __trace_if_var(cond) (__builtin_constant_p(cond) ? (cond) : > __trace_if_value(cond)) >^~~~ >>> mm/memory_hotplug.c:471:2: note: in expansion of macro 'if' > if (zone_idx(zone) == ZONE_DEVICE) > ^~ >mm/memory_hotplug.c:471:24: note: each undeclared identifier is reported > only once for each function it appears in > if (zone_idx(zone) == ZONE_DEVICE) >^ >include/linux/compiler.h:58:52: note: in definition of macro > '__trace_if_var' > #define __trace_if_var(cond) (__builtin_constant_p(cond) ? 
(cond) : > __trace_if_value(cond)) >^~~~ >>> mm/memory_hotplug.c:471:2: note: in expansion of macro 'if' > if (zone_idx(zone) == ZONE_DEVICE) > ^~ > > vim +/if +471 mm/memory_hotplug.c > >459 >460static void __remove_zone(struct zone *zone, unsigned long > start_pfn, >461unsigned long nr_pages) >462{ >463struct pglist_data *pgdat = zone->zone_pgdat; >464unsigned long flags; >465 >466/* >467 * Zone shrinking code cannot properly deal with > ZONE_DEVICE. So >468 * we will not try to shrink the zones - which is okay > as >469 * set_zone_contiguous() cannot deal with ZONE_DEVICE > either way. >470 */ > > 471if (zone_idx(zone) == ZONE_DEVICE) >472return; >473 >474pgdat_resize_lock(zone->zone_pgdat, &flags); >475shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); >476update_pgdat_span(pgdat); >477pgdat_resize_unlock(zone->zone_pgdat, &flags); >478} >479 > > --- > 0-DAY kernel test infrastructureOpen Source Technology Center > https://lists.01.org/pipermail/kbuild-all Intel Corporation > That should be easy to fix with some ifdef-ery :) -- Thanks, David / dhildenb