[PATCH v2 4/4] ARM: dts: imx: use dual-fifo sdma script for ssi
Use dual-fifo sdma scripts instead of shared scripts for ssi on i.MX series.

Signed-off-by: Nicolin Chen
---
 arch/arm/boot/dts/imx51.dtsi   |  4 ++--
 arch/arm/boot/dts/imx53.dtsi   |  4 ++--
 arch/arm/boot/dts/imx6qdl.dtsi | 12 ++--
 arch/arm/boot/dts/imx6sl.dtsi  | 12 ++--
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/arm/boot/dts/imx51.dtsi b/arch/arm/boot/dts/imx51.dtsi
index 54cee65..1a71eac 100644
--- a/arch/arm/boot/dts/imx51.dtsi
+++ b/arch/arm/boot/dts/imx51.dtsi
@@ -154,8 +154,8 @@
 				reg = <0x70014000 0x4000>;
 				interrupts = <30>;
 				clocks = <&clks 49>;
-				dmas = <&sdma 24 1 0>,
-				       <&sdma 25 1 0>;
+				dmas = <&sdma 24 22 0>,
+				       <&sdma 25 22 0>;
 				dma-names = "rx", "tx";
 				fsl,fifo-depth = <15>;
 				fsl,ssi-dma-events = <25 24 23 22>; /* TX0 RX0 TX1 RX1 */
diff --git a/arch/arm/boot/dts/imx53.dtsi b/arch/arm/boot/dts/imx53.dtsi
index 4307e80..7208fde 100644
--- a/arch/arm/boot/dts/imx53.dtsi
+++ b/arch/arm/boot/dts/imx53.dtsi
@@ -153,8 +153,8 @@
 				reg = <0x50014000 0x4000>;
 				interrupts = <30>;
 				clocks = <&clks 49>;
-				dmas = <&sdma 24 1 0>,
-				       <&sdma 25 1 0>;
+				dmas = <&sdma 24 22 0>,
+				       <&sdma 25 22 0>;
 				dma-names = "rx", "tx";
 				fsl,fifo-depth = <15>;
 				fsl,ssi-dma-events = <25 24 23 22>; /* TX0 RX0 TX1 RX1 */
diff --git a/arch/arm/boot/dts/imx6qdl.dtsi b/arch/arm/boot/dts/imx6qdl.dtsi
index 57e9c38..6e096ca 100644
--- a/arch/arm/boot/dts/imx6qdl.dtsi
+++ b/arch/arm/boot/dts/imx6qdl.dtsi
@@ -223,8 +223,8 @@
 				reg = <0x02028000 0x4000>;
 				interrupts = <0 46 0x04>;
 				clocks = <&clks 178>;
-				dmas = <&sdma 37 1 0>,
-				       <&sdma 38 1 0>;
+				dmas = <&sdma 37 22 0>,
+				       <&sdma 38 22 0>;
 				dma-names = "rx", "tx";
 				fsl,fifo-depth = <15>;
 				fsl,ssi-dma-events = <38 37>;
@@ -236,8 +236,8 @@
 				reg = <0x0202c000 0x4000>;
 				interrupts = <0 47 0x04>;
 				clocks = <&clks 179>;
-				dmas = <&sdma 41 1 0>,
-				       <&sdma 42 1 0>;
+				dmas = <&sdma 41 22 0>,
+				       <&sdma 42 22 0>;
 				dma-names = "rx", "tx";
 				fsl,fifo-depth = <15>;
 				fsl,ssi-dma-events = <42 41>;
@@ -249,8 +249,8 @@
 				reg = <0x02030000 0x4000>;
 				interrupts = <0 48 0x04>;
 				clocks = <&clks 180>;
-				dmas = <&sdma 45 1 0>,
-				       <&sdma 46 1 0>;
+				dmas = <&sdma 45 22 0>,
+				       <&sdma 46 22 0>;
 				dma-names = "rx", "tx";
 				fsl,fifo-depth = <15>;
 				fsl,ssi-dma-events = <46 45>;
diff --git a/arch/arm/boot/dts/imx6sl.dtsi b/arch/arm/boot/dts/imx6sl.dtsi
index c46651e..b32ba99 100644
--- a/arch/arm/boot/dts/imx6sl.dtsi
+++ b/arch/arm/boot/dts/imx6sl.dtsi
@@ -195,8 +195,8 @@
 				reg = <0x02028000 0x4000>;
 				interrupts = <0 46 0x04>;
 				clocks = <&clks IMX6SL_CLK_SSI1>;
-				dmas = <&sdma 37 1 0>,
-				       <&sdma 38 1 0>;
+				dmas = <&sdma 37 22 0>,
+				       <&sdma 38 22 0>;
 				dma-names = "rx", "tx";
[PATCH v2 2/4] dma: imx-sdma: Add new dma type for ssi dual fifo script
This patch adds a new DMA_TYPE for the SSI dual FIFO script, which is
included in SDMA firmware version 2. This script allows the SSI to use
dual FIFO mode to transmit/receive data without occasional hardware
underruns/overruns.

Signed-off-by: Nicolin Chen
---
 Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt | 1 +
 drivers/dma/imx-sdma.c                                 | 4 ++++
 include/linux/platform_data/dma-imx.h                  | 1 +
 3 files changed, 6 insertions(+)

diff --git a/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt b/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
index 4fa814d..68b83ec 100644
--- a/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
+++ b/Documentation/devicetree/bindings/dma/fsl-imx-sdma.txt
@@ -42,6 +42,7 @@ The full ID of peripheral types can be found below.
 	19	IPU Memory
 	20	ASRC
 	21	ESAI
+	22	SSI Dual FIFO	(needs firmware ver >= 2)
 
 The third cell specifies the transfer priority as below.
 
diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index c7ece8d..efaa9a9 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -725,6 +725,10 @@ static void sdma_get_pc(struct sdma_channel *sdmac,
 		per_2_emi = sdma->script_addrs->app_2_mcu_addr;
 		emi_2_per = sdma->script_addrs->mcu_2_app_addr;
 		break;
+	case IMX_DMATYPE_SSI_DUAL:
+		per_2_emi = sdma->script_addrs->ssish_2_mcu_addr;
+		emi_2_per = sdma->script_addrs->mcu_2_ssish_addr;
+		break;
 	case IMX_DMATYPE_SSI_SP:
 	case IMX_DMATYPE_MMC:
 	case IMX_DMATYPE_SDHC:
diff --git a/include/linux/platform_data/dma-imx.h b/include/linux/platform_data/dma-imx.h
index beac6b8..bcbc6c3 100644
--- a/include/linux/platform_data/dma-imx.h
+++ b/include/linux/platform_data/dma-imx.h
@@ -39,6 +39,7 @@ enum sdma_peripheral_type {
 	IMX_DMATYPE_IPU_MEMORY,	/* IPU Memory */
 	IMX_DMATYPE_ASRC,	/* ASRC */
 	IMX_DMATYPE_ESAI,	/* ESAI */
+	IMX_DMATYPE_SSI_DUAL,	/* SSI Dual FIFO */
 };
 
 enum imx_dma_prio {
-- 
1.8.4
[PATCH v2 3/4] ASoC: fsl_ssi: Add dual fifo mode support
Enabling dual FIFO mode allows the SSI to achieve better performance,
transmitting/receiving data without occasional hardware
underruns/overruns.

[ Passed compile-test with mpc85xx_defconfig ]

Signed-off-by: Nicolin Chen
---
 sound/soc/fsl/fsl_ssi.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c
index 35e2773..0dfb7d6 100644
--- a/sound/soc/fsl/fsl_ssi.c
+++ b/sound/soc/fsl/fsl_ssi.c
@@ -143,6 +143,7 @@ struct fsl_ssi_private {
 	bool ssi_on_imx;
 	bool imx_ac97;
 	bool use_dma;
+	bool use_dual_fifo;
 	struct clk *clk;
 	struct snd_dmaengine_dai_dma_data dma_params_tx;
 	struct snd_dmaengine_dai_dma_data dma_params_rx;
@@ -413,6 +414,16 @@ static int fsl_ssi_setup(struct fsl_ssi_private *ssi_private)
 		write_ssi(CCSR_SSI_SOR_WAIT(3), &ssi->sor);
 	}
 
+	if (ssi_private->use_dual_fifo) {
+		write_ssi_mask(&ssi->srcr, 0, CCSR_SSI_SRCR_RFEN1);
+		write_ssi_mask(&ssi->stcr, 0, CCSR_SSI_STCR_TFEN1);
+		write_ssi_mask(&ssi->scr, 0, CCSR_SSI_SCR_TCH_EN);
+	} else {
+		write_ssi_mask(&ssi->srcr, CCSR_SSI_SRCR_RFEN1, 0);
+		write_ssi_mask(&ssi->stcr, CCSR_SSI_STCR_TFEN1, 0);
+		write_ssi_mask(&ssi->scr, CCSR_SSI_SCR_TCH_EN, 0);
+	}
+
 	return 0;
 }
 
@@ -947,7 +958,7 @@ static int fsl_ssi_probe(struct platform_device *pdev)
 		ssi_private->fifo_depth = 8;
 
 	if (of_device_is_compatible(pdev->dev.of_node, "fsl,imx21-ssi")) {
-		u32 dma_events[2];
+		u32 dma_events[2], dmas[4];
 		ssi_private->ssi_on_imx = true;
 
 		ssi_private->clk = devm_clk_get(&pdev->dev, NULL);
@@ -1001,6 +1012,17 @@ static int fsl_ssi_probe(struct platform_device *pdev)
 			dma_events[0], shared ? IMX_DMATYPE_SSI_SP : IMX_DMATYPE_SSI);
 		imx_pcm_dma_params_init_data(&ssi_private->filter_data_rx,
 			dma_events[1], shared ? IMX_DMATYPE_SSI_SP : IMX_DMATYPE_SSI);
+		if (!of_property_read_u32_array(pdev->dev.of_node, "dmas", dmas, 4)
+		    && dmas[2] == IMX_DMATYPE_SSI_DUAL) {
+			ssi_private->use_dual_fifo = true;
+			/* When using dual fifo mode, we need to keep watermark
+			 * as even numbers due to dma script limitation.
+			 */
+			ssi_private->dma_params_tx.maxburst /= 2;
+			ssi_private->dma_params_tx.maxburst *= 2;
+			ssi_private->dma_params_rx.maxburst /= 2;
+			ssi_private->dma_params_rx.maxburst *= 2;
+		}
 	} else if (ssi_private->use_dma) {
 		/* The 'name' should not have any slashes in it. */
 		ret = devm_request_irq(&pdev->dev, ssi_private->irq,
-- 
1.8.4
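As an aside, the divide-then-multiply pair above relies on integer
truncation to round maxburst down to an even value, as the dual-FIFO
SDMA script requires. A minimal sketch of the same idea (clearing the
low bit is equivalent for non-negative integers and arguably clearer):

	/* Round a watermark/burst size down to an even value; the
	 * dual-fifo SDMA script cannot cope with odd watermarks.
	 * Equivalent to (x / 2) * 2 for unsigned x.
	 */
	static inline unsigned int round_down_even(unsigned int x)
	{
		return x & ~1U;
	}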
[PATCH v2 1/4] dma: imx-sdma: Add sdma firmware version 2 support
On the i.MX5/6 series, SDMA uses a new firmware version to support the
SSI dual FIFO feature and HDMI audio (i.MX6Q/DL only). Add support for
it.

Signed-off-by: Nicolin Chen
---
 drivers/dma/imx-sdma.c                     | 15 ++-
 include/linux/platform_data/dma-imx-sdma.h |  3 +++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c
index fc43603..c7ece8d 100644
--- a/drivers/dma/imx-sdma.c
+++ b/drivers/dma/imx-sdma.c
@@ -323,6 +323,7 @@ struct sdma_engine {
 	struct clk			*clk_ipg;
 	struct clk			*clk_ahb;
 	spinlock_t			channel_0_lock;
+	u32				script_number;
 	struct sdma_script_start_addrs	*script_addrs;
 	const struct sdma_driver_data	*drvdata;
 };
@@ -1238,6 +1239,7 @@ static void sdma_issue_pending(struct dma_chan *chan)
 }
 
 #define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1	34
+#define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V2	38
 
 static void sdma_add_scripts(struct sdma_engine *sdma,
 		const struct sdma_script_start_addrs *addr)
@@ -1246,7 +1248,7 @@ static void sdma_add_scripts(struct sdma_engine *sdma,
 	s32 *saddr_arr = (u32 *)sdma->script_addrs;
 	int i;
 
-	for (i = 0; i < SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1; i++)
+	for (i = 0; i < sdma->script_number; i++)
 		if (addr_arr[i] > 0)
 			saddr_arr[i] = addr_arr[i];
 }
@@ -1272,6 +1274,17 @@ static void sdma_load_firmware(const struct firmware *fw, void *context)
 		goto err_firmware;
 	if (header->ram_code_start + header->ram_code_size > fw->size)
 		goto err_firmware;
+	switch (header->version_major) {
+	case 1:
+		sdma->script_number = SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1;
+		break;
+	case 2:
+		sdma->script_number = SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V2;
+		break;
+	default:
+		dev_err(sdma->dev, "unknown firmware version\n");
+		return;
+	}
 
 	addr = (void *)header + header->script_addrs_start;
 	ram_code = (void *)header + header->ram_code_start;
diff --git a/include/linux/platform_data/dma-imx-sdma.h b/include/linux/platform_data/dma-imx-sdma.h
index 3a39428..14ca582 100644
--- a/include/linux/platform_data/dma-imx-sdma.h
+++ b/include/linux/platform_data/dma-imx-sdma.h
@@ -43,6 +43,9 @@ struct sdma_script_start_addrs {
 	s32 dptc_dvfs_addr;
 	s32 utra_addr;
 	s32 ram_code_start_addr;
+	s32 mcu_2_ssish_addr;
+	s32 ssish_2_mcu_addr;
+	s32 hdmi_dma_addr;
 };
 
 /**
-- 
1.8.4
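For context, the version check above parses the firmware image header.
A sketch of that header: version_major, script_addrs_start,
ram_code_start and ram_code_size are the fields the hunks above
dereference; the remaining fields and the exact layout follow
dma-imx-sdma.h from memory and are marked as assumptions:

	/* SDMA firmware image header, as consumed by sdma_load_firmware().
	 * Fields marked (assumed) are not visible in the hunks above.
	 */
	struct sdma_firmware_header {
		u32	magic;			/* (assumed) image identifier */
		u32	version_major;		/* 1, or 2 with this patch */
		u32	version_minor;		/* (assumed) */
		u32	script_addrs_start;	/* offset of script address table */
		u32	num_script_addrs;	/* (assumed) */
		u32	ram_code_start;		/* offset of RAM code */
		u32	ram_code_size;		/* size of RAM code */
	};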
[PATCH v2 0/4] Add dual-fifo mode support of i.MX ssi
Changelog
v2:
 * Instead of adding rogue scripts to the current SDMA driver based on
   firmware V1, we define the new SDMA firmware as version 2 and bisect
   PATCH-1 into two patches: the first adds version check code to the
   SDMA driver; the second adds the SSI dual FIFO DMATYPE.
 * Nothing changes for the last two patches.

v1:
 * SSI can reduce the hardware overrun/underrun possibility when using
   dual fifo mode. To support this mode, we need to first update the
   sdma script list, then enable the dual fifo bit in the SSI driver,
   and last update the DT bindings of the i.MX series.
 * ! This series of patches has a direct dependency between them. When
   ! applying them, we need to apply them in one single branch.
   ! Otherwise, it would break current branches.

Nicolin Chen (4):
  dma: imx-sdma: Add sdma firmware version 2 support
  dma: imx-sdma: Add new dma type for ssi dual fifo script
  ASoC: fsl_ssi: Add dual fifo mode support
  ARM: dts: imx: use dual-fifo sdma script for ssi

 .../devicetree/bindings/dma/fsl-imx-sdma.txt |  1 +
 arch/arm/boot/dts/imx51.dtsi                 |  4 ++--
 arch/arm/boot/dts/imx53.dtsi                 |  4 ++--
 arch/arm/boot/dts/imx6qdl.dtsi               | 12 +--
 arch/arm/boot/dts/imx6sl.dtsi                | 12 +--
 drivers/dma/imx-sdma.c                       | 19 -
 include/linux/platform_data/dma-imx-sdma.h   |  3 +++
 include/linux/platform_data/dma-imx.h        |  1 +
 sound/soc/fsl/fsl_ssi.c                      | 24 +-
 9 files changed, 62 insertions(+), 18 deletions(-)

-- 
1.8.4
Re: [PATCH 2/2] of: move definition of of_find_next_cache_node into common code.
On Wed, 2013-09-18 at 11:53 +0100, Sudeep KarkadaNagesha wrote:
> From: Sudeep KarkadaNagesha
>
> Since the definition of_find_next_cache_node is architecture independent,
> the existing definition in powerpc can be moved to driver/of/base.c
>
> Cc: Benjamin Herrenschmidt
> Cc: Grant Likely
> Cc: Rob Herring
> Signed-off-by: Sudeep KarkadaNagesha

I've seen no follow up on that, I'm happy to stick it in powerpc-next
with some other late stuff.

Cheers,
Ben.

> ---
>  arch/powerpc/include/asm/prom.h |  3 ---
>  arch/powerpc/kernel/prom.c      | 31 ---
>  drivers/of/base.c               | 31 +++
>  include/linux/of.h              |  2 ++
>  4 files changed, 33 insertions(+), 34 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
> index 7d0c7f3..bf09e5a 100644
> --- a/arch/powerpc/include/asm/prom.h
> +++ b/arch/powerpc/include/asm/prom.h
> @@ -44,9 +44,6 @@ void of_parse_dma_window(struct device_node *dn, const __be32 *dma_window,
>
>  extern void kdump_move_device_tree(void);
>
> -/* cache lookup */
> -struct device_node *of_find_next_cache_node(struct device_node *np);
> -
>  #ifdef CONFIG_NUMA
>  extern int of_node_to_nid(struct device_node *device);
>  #else
> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> index 09be275..4432fd8 100644
> --- a/arch/powerpc/kernel/prom.c
> +++ b/arch/powerpc/kernel/prom.c
> @@ -761,37 +761,6 @@ void __init early_init_devtree(void *params)
>   ***/
>
>  /**
> - * of_find_next_cache_node - Find a node's subsidiary cache
> - * @np:	node of type "cpu" or "cache"
> - *
> - * Returns a node pointer with refcount incremented, use
> - * of_node_put() on it when done.  Caller should hold a reference
> - * to np.
> - */
> -struct device_node *of_find_next_cache_node(struct device_node *np)
> -{
> -	struct device_node *child;
> -	const phandle *handle;
> -
> -	handle = of_get_property(np, "l2-cache", NULL);
> -	if (!handle)
> -		handle = of_get_property(np, "next-level-cache", NULL);
> -
> -	if (handle)
> -		return of_find_node_by_phandle(be32_to_cpup(handle));
> -
> -	/* OF on pmac has nodes instead of properties named "l2-cache"
> -	 * beneath CPU nodes.
> -	 */
> -	if (!strcmp(np->type, "cpu"))
> -		for_each_child_of_node(np, child)
> -			if (!strcmp(child->type, "cache"))
> -				return child;
> -
> -	return NULL;
> -}
> -
> -/**
>   * of_get_ibm_chip_id - Returns the IBM "chip-id" of a device
>   * @np: device node of the device
>   *
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index 865d3f6..b2cee3d 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -1884,3 +1884,34 @@ int of_device_is_stdout_path(struct device_node *dn)
>  	return of_stdout == dn;
>  }
>  EXPORT_SYMBOL_GPL(of_device_is_stdout_path);
> +
> +/**
> + * of_find_next_cache_node - Find a node's subsidiary cache
> + * @np:	node of type "cpu" or "cache"
> + *
> + * Returns a node pointer with refcount incremented, use
> + * of_node_put() on it when done.  Caller should hold a reference
> + * to np.
> + */
> +struct device_node *of_find_next_cache_node(const struct device_node *np)
> +{
> +	struct device_node *child;
> +	const phandle *handle;
> +
> +	handle = of_get_property(np, "l2-cache", NULL);
> +	if (!handle)
> +		handle = of_get_property(np, "next-level-cache", NULL);
> +
> +	if (handle)
> +		return of_find_node_by_phandle(be32_to_cpup(handle));
> +
> +	/* OF on pmac has nodes instead of properties named "l2-cache"
> +	 * beneath CPU nodes.
> +	 */
> +	if (!strcmp(np->type, "cpu"))
> +		for_each_child_of_node(np, child)
> +			if (!strcmp(child->type, "cache"))
> +				return child;
> +
> +	return NULL;
> +}
> diff --git a/include/linux/of.h b/include/linux/of.h
> index f95aee3..c08c07e 100644
> --- a/include/linux/of.h
> +++ b/include/linux/of.h
> @@ -226,6 +226,8 @@ static inline int of_get_child_count(const struct device_node *np)
>  	return num;
>  }
>
> +/* cache lookup */
> +extern struct device_node *of_find_next_cache_node(const struct device_node *);
>  extern struct device_node *of_find_node_with_property(
>  	struct device_node *from, const char *prop_name);
>  #define for_each_node_with_property(dn, prop_name) \
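As an aside, a minimal sketch of a caller walking down the cache
hierarchy with this helper, following the refcounting contract in its
kerneldoc; "cpu_np" is a hypothetical node of type "cpu":

	static void walk_cache_levels(struct device_node *cpu_np)
	{
		struct device_node *np = of_node_get(cpu_np);

		while (np) {
			struct device_node *next = of_find_next_cache_node(np);

			/* ... inspect this cache level's properties here ... */
			of_node_put(np);	/* helper returned a new reference */
			np = next;
		}
	}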
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 03:28:54PM +0200, Victor Kaplansky wrote:
> "Paul E. McKenney" wrote on 10/30/2013 11:27:25 AM:
>
> > If you were to back up that insistence with a description of the orderings
> > you are relying on, why other orderings are not important, and how the
> > important orderings are enforced, I might be tempted to pay attention
> > to your opinion.
> >
> > 							Thanx, Paul
>
> NP, though, I feel too embarrassed to explain things about memory barriers
> when one of the authors of Documentation/memory-barriers.txt is on cc: list ;-)
>
> Disclaimer: it is anyway impossible to prove lack of *any* problem.

If you want to play the "omit memory barriers" game, then proving a
negative is in fact the task before you.

> Having said that, lets look into an example in
> Documentation/circular-buffers.txt:

And the correctness of this code has been called into question. :-(
An embarrassingly long time ago -- I need to get this either proven
or fixed.

> > THE PRODUCER
> >
> >
> > The producer will look something like this:
> >
> > 	spin_lock(&producer_lock);
> >
> > 	unsigned long head = buffer->head;
> > 	unsigned long tail = ACCESS_ONCE(buffer->tail);
> >
> > 	if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
> > 		/* insert one item into the buffer */
> > 		struct item *item = buffer[head];
> >
> > 		produce_item(item);
> >
> > 		smp_wmb(); /* commit the item before incrementing the head */
> >
> > 		buffer->head = (head + 1) & (buffer->size - 1);
> >
> > 		/* wake_up() will make sure that the head is committed before
> > 		 * waking anyone up */
> > 		wake_up(consumer);
> > 	}
> >
> > 	spin_unlock(&producer_lock);
>
> We can see that authors of the document didn't put any memory barrier
> after "buffer->tail" read and before "produce_item(item)" and I think they have
> a good reason.
>
> Lets consider an imaginary smp_mb() right before "produce_item(item);".
> Such a barrier will ensure that -
>
>     - the memory read on "buffer->tail" is completed
> 	before store to memory pointed by "item" is committed.
>
> However, the store to "buffer->tail" anyway cannot be completed before
> conditional branch implied by "if ()" is proven to execute body statement
> of the if(). And the latter cannot be proven before read of "buffer->tail"
> is completed.
>
> Lets see this other way. Lets imagine that somehow a store to the data
> pointed by "item" is completed before we read "buffer->tail". That would
> mean, that the store was completed speculatively. But speculative execution
> of conditional stores is prohibited by C/C++ standard, otherwise any
> conditional store at any random place of code could pollute shared memory.

Before C/C++11, the closest thing to such a prohibition is use of
volatile, for example, ACCESS_ONCE().  Even in C/C++11, you have to
use atomics to get anything resembling this prohibition.

If you just use normal variables, the compiler is within its rights
to transform something like the following:

	if (a)
		b = 1;
	else
		b = 42;

Into:

	b = 42;
	if (a)
		b = 1;

Many other similar transformations are permitted.  Some are used to
allow vector instructions to be used -- the compiler can do a write
with an overly wide vector instruction, then clean up the clobbered
variables later, if it wishes.  Again, if the variables are not marked
volatile, or, in C/C++11, atomic.

> On the other hand, if compiler or processor can prove that condition in
> above if() is going to be true (or if speculative store writes the same
> value as it was before write), the speculative store *is* allowed. In this
> case we should not be bothered by the fact that memory pointed by "item"
> is written before a read from "buffer->tail" is completed.

The compilers don't always know as much as they might about the
underlying hardware's memory model.  Of course, if this code is
architecture specific, it can avoid DEC Alpha's fun and games, which
could also violate your assumptions in the above paragraph:

	http://www.openvms.compaq.com/wizard/wiz_2637.html

Anyway, proving or fixing the code in Documentation/circular-buffers.txt
has been on my list for too long, so I will take a closer look at it.

							Thanx, Paul
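As an aside, a minimal sketch of the invented-store hazard Paul
describes and the volatile-based fix he names: with plain stores the
compiler may write 42 unconditionally and patch it up afterwards;
wrapping the stores in ACCESS_ONCE() (a volatile cast) forbids that.
"a" and "b" are hypothetical variables:

	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

	int a;
	int b;

	void update(void)
	{
		if (a)
			ACCESS_ONCE(b) = 1;	/* exactly one conditional store */
		else
			ACCESS_ONCE(b) = 42;	/* may not be hoisted above the test */
	}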
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 04:51:16PM +0100, Peter Zijlstra wrote:
> On Wed, Oct 30, 2013 at 03:28:54PM +0200, Victor Kaplansky wrote:
> > one of the authors of Documentation/memory-barriers.txt is on cc: list ;-)
> >
> > Disclaimer: it is anyway impossible to prove lack of *any* problem.
> >
> > Having said that, lets look into an example in
> > Documentation/circular-buffers.txt:
>
> > We can see that authors of the document didn't put any memory barrier
>
> Note that both documents have the same author list ;-)
>
> Anyway, I didn't know about the circular thing, I suppose I should use
> CIRC_SPACE() thing :-)

Interesting that we didn't seem to supply a definition... ;-)

							Thanx, Paul
[Suggestion] drivers: powercap: 'dev_attrs' has already removed from 'struct class'
Hello Maintainers

It is removed by "bcc8edb driver core: remove dev_attrs from struct class"
in Oct 5 2013. But "75d2364 PowerCap: Add class driver" still uses it in
Oct 11 2013.

The related error (for powerpc with allmodconfig):

    CC      drivers/powercap/powercap_sys.o
  drivers/powercap/powercap_sys.c:484:2: error: unknown field ‘dev_attrs’ specified in initializer
  drivers/powercap/powercap_sys.c:484:2: warning: initialization from incompatible pointer type [enabled by default]
  drivers/powercap/powercap_sys.c:484:2: warning: (near initialization for ‘powercap_class.suspend’) [enabled by default]

Please give it a check, thanks.

Thanks.
--
Chen Gang
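As an aside, a sketch of one plausible fix, assuming the driver core's
dev_groups replacement for the removed dev_attrs field: the attribute
array becomes an attribute group and the class points at it.
powercap_class is the real name from powercap_sys.c; the attribute list
here is illustrative:

	static struct attribute *powercap_attrs[] = {
		/* &dev_attr_foo.attr, ... existing attributes go here */
		NULL,
	};
	ATTRIBUTE_GROUPS(powercap);		/* defines powercap_groups */

	static struct class powercap_class = {
		.name		= "powercap",
		.dev_groups	= powercap_groups,	/* replaces .dev_attrs */
	};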
Re: [PATCH] offb: make the screen properties endian safe
On Wed, 2013-10-30 at 17:14 +0100, Cédric Le Goater wrote:
> @@ -552,25 +552,25 @@ static void __init offb_init_nodriver(struct device_node *dp, int no_real_node)
>  	if (pp == NULL)
>  		pp = of_get_property(dp, "depth", &len);
>  	if (pp && len == sizeof(u32))
> -		depth = *pp;
> +		depth = be32_to_cpu(*pp);

This is usually written as

	depth = be32_to_cpup(pp);

It used to be that the latter generated better code but that might not
be the case anymore, however it's still a better alternative.

Cheers,
Ben.
Re: [PATCH] ADB_PMU_LED_IDE selects LEDS_TRIGGER_IDE_DISK which has unmet direct dependencies
On Tue, 2013-10-29 at 21:25 -0700, Christian Kujau wrote:
> On Wed, 30 Oct 2013 at 10:13, Benjamin Herrenschmidt wrote:
> > You probably want to do that to the ADB_PMU_LED_IDE entry not the
> > ADB_PMU_LED one which doesn't have a dependency and isn't the one
> > selecting LEDS_TRIGGER_IDE_DISK :-)
>
> Right you are, sorry for the mixup. Let me try again:

And this time without a changeset comment ? :-) I've copied it from the
previous version but next time, re-send the complete commit.

Cheers,
Ben.

> Signed-off-by: Christian Kujau
>
> diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
> index 696238b..d26a312 100644
> --- a/drivers/macintosh/Kconfig
> +++ b/drivers/macintosh/Kconfig
> @@ -103,6 +103,7 @@ config ADB_PMU_LED_IDE
>  	bool "Use front LED as IDE LED by default"
>  	depends on ADB_PMU_LED
>  	depends on LEDS_CLASS
> +	depends on IDE_GD_ATA
>  	select LEDS_TRIGGERS
>  	select LEDS_TRIGGER_IDE_DISK
>  	help
>
> C.
> --
> BOFH excuse #378:
>
> Operators killed by year 2000 bug bite.
Re: [PATCH v2] powerpc/mpc512x: remove unnecessary #if
On Fri, 11 Oct 2013 10:37:38 -0700 Brian Norris wrote:

> Several functions are only ever referenced locally, so make them static.
> Of those functions, many of them are protected by an #if. However, the
> code can compile fine in either case.
>
> Now that (1) the unneeded code is marked 'static' and (2) the code is
> only used under a C 'if (IS_ENABLED(CONFIG_FB_FSL_DIU))', the compiler
> can automatically remove the unneeded code, and we don't need the #if or
> the empty stub functions.
>
> Signed-off-by: Brian Norris
> ---
> v2: left out a diff I was holding locally (to remove an #if/#endif
> completely). Sorry for the noise.
>
> Based off of Gerhard Sittig's patch:
>   powerpc/mpc512x: silence build warning upon disabled DIU
>
> Compile-tested with CONFIG_FB_FSL_DIU=n
>
>  arch/powerpc/platforms/512x/mpc512x_shared.c | 21 +++--
>  arch/powerpc/sysdev/fsl_soc.h                |  3 ---
>  2 files changed, 7 insertions(+), 17 deletions(-)

Applied, thanks!

Anatolij
Re: [PATCH 1/3] powerpc: Enable emulate_step In Little Endian Mode
On 10/30/2013 2:43 PM, Geert Uytterhoeven wrote:
> On Wed, Oct 30, 2013 at 8:35 PM, Tom Musta wrote:
>> On 10/30/2013 12:43 PM, Andreas Schwab wrote:
>>> Tom Musta writes:
>>>> +#ifdef __LITTLE_ENDIAN__
>>>> +	if (!regs->msr & MSR_LE)
>>>
>>> That won't work.
>>>
>>> Andreas.
>>
>> Please elaborate.
>
> You want to test for "!(regs->msr & MSR_LE)".

Thanks Andreas and Geert.  I will fix and resubmit.
[PATCH v4 2/7] ppc/cell: use get_unused_fd_flags(0) instead of get_unused_fd()
Macro get_unused_fd() is used to allocate a file descriptor with default
flags. Those default flags (0) can be "unsafe": O_CLOEXEC must be used
by default to not leak file descriptors across exec().

Instead of the macro get_unused_fd(), functions anon_inode_getfd() or
get_unused_fd_flags() should be used with flags given by userspace. If
not possible, flags should be set to O_CLOEXEC to provide userspace with
a default safe behavior.

In a further patch, get_unused_fd() will be removed so that new code
starts using anon_inode_getfd() or get_unused_fd_flags() with correct
flags.

This patch replaces calls to get_unused_fd() with equivalent calls to
get_unused_fd_flags(0) to preserve the current behavior for existing
code. The hard coded flag value (0) should be reviewed on a
per-subsystem basis, and, if possible, set to O_CLOEXEC.

Signed-off-by: Yann Droneaud
Link: http://lkml.kernel.org/r/cover.1383121137.git.ydrone...@opteya.com
---
 arch/powerpc/platforms/cell/spufs/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c
index 87ba7cf..51effce 100644
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -301,7 +301,7 @@ static int spufs_context_open(struct path *path)
 	int ret;
 	struct file *filp;
 
-	ret = get_unused_fd();
+	ret = get_unused_fd_flags(0);
 	if (ret < 0)
 		return ret;
 
@@ -518,7 +518,7 @@ static int spufs_gang_open(struct path *path)
 	int ret;
 	struct file *filp;
 
-	ret = get_unused_fd();
+	ret = get_unused_fd_flags(0);
 	if (ret < 0)
 		return ret;
 
-- 
1.8.3.1
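As an aside, a sketch of the per-subsystem follow-up the commit message
invites: allocate the descriptor with O_CLOEXEC so it cannot leak across
a concurrent exec(), and publish the file only once fully set up.
"make_file" is a hypothetical constructor:

	static int open_with_cloexec(void)
	{
		struct file *filp;
		int fd = get_unused_fd_flags(O_CLOEXEC);

		if (fd < 0)
			return fd;

		filp = make_file();		/* hypothetical */
		if (IS_ERR(filp)) {
			put_unused_fd(fd);
			return PTR_ERR(filp);
		}

		fd_install(fd, filp);
		return fd;
	}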
[PATCH v4 0/7] Getting rid of get_unused_fd()
Hi,

Please find the fourth revision of my patchset to remove the
get_unused_fd() macro in order to encourage subsystems to use
get_unused_fd_flags() or anon_inode_getfd() with open flags set to
O_CLOEXEC where appropriate.

The patchset converts all calls to get_unused_fd() to
get_unused_fd_flags(0) before removing the get_unused_fd() macro.

Without the get_unused_fd() macro, more subsystems are likely to use
anon_inode_getfd() and be taught to provide an API that lets userspace
choose the opening flags of the file descriptor.

Not using O_CLOEXEC by default or not letting userspace provide the
"open" flags should be considered bad practice from a security point of
view: in most cases O_CLOEXEC must be used to not leak file descriptors
across exec(). Using O_CLOEXEC by default when flags are not provided by
userspace allows userspace to set, using fcntl(), without any risk of
race, whether the file descriptor is going to be inherited or not across
exec().

Status: In linux-next tag 20131029, there are currently:

- 32 calls to fd_install()
- 23 calls to get_unused_fd_flags()
- 11 calls to anon_inode_getfd()
-  7 calls to get_unused_fd()

Changes from patchset v3 [PATCHSETv3]:
- industrialio: use anon_inode_getfd() with O_CLOEXEC flag
  DROPPED: applied upstream

Changes from patchset v2 [PATCHSETv2]:
- android/sw_sync: use get_unused_fd_flags(O_CLOEXEC) instead of get_unused_fd()
  DROPPED: applied upstream
- android/sync: use get_unused_fd_flags(O_CLOEXEC) instead of get_unused_fd()
  DROPPED: applied upstream
- vfio: use get_unused_fd_flags(0) instead of get_unused_fd()
  DROPPED: applied upstream. Additionally the subsystem maintainer applied
  another patch on top to set the flags to O_CLOEXEC.
- industrialio: use anon_inode_getfd() with O_CLOEXEC flag
  NEW: propose to use O_CLOEXEC as default flag.

Changes from patchset v1 [PATCHSETv1]:
- explicitly added subsystem maintainers as mail recipients.
- infiniband: use get_unused_fd_flags(0) instead of get_unused_fd()
  DROPPED: subsystem maintainer applied another patch using
  get_unused_fd_flags(O_CLOEXEC) as suggested.
- android/sw_sync: use get_unused_fd_flags(0) instead of get_unused_fd()
  MODIFIED: use get_unused_fd_flags(O_CLOEXEC) as suggested
- android/sync: use get_unused_fd_flags(0) instead of get_unused_fd()
  MODIFIED: use get_unused_fd_flags(O_CLOEXEC) as suggested
- xfs: use get_unused_fd_flags(0) instead of get_unused_fd()
  DROPPED: applied as-is by subsystem maintainer.
- sctp: use get_unused_fd_flags(0) instead of get_unused_fd()
  DROPPED: applied as-is by subsystem maintainer.

Links:
[PATCHSETv3] http://lkml.kernel.org/r/cover.1378460926.git.ydrone...@opteya.com
[PATCHSETv2] http://lkml.kernel.org/r/cover.1376327678.git.ydrone...@opteya.com
[PATCHSETv1] http://lkml.kernel.org/r/cover.1372777600.git.ydrone...@opteya.com

Yann Droneaud (7):
  ia64: use get_unused_fd_flags(0) instead of get_unused_fd()
  ppc/cell: use get_unused_fd_flags(0) instead of get_unused_fd()
  binfmt_misc: use get_unused_fd_flags(0) instead of get_unused_fd()
  file: use get_unused_fd_flags(0) instead of get_unused_fd()
  fanotify: use get_unused_fd_flags(0) instead of get_unused_fd()
  events: use get_unused_fd_flags(0) instead of get_unused_fd()
  file: remove get_unused_fd()

 arch/ia64/kernel/perfmon.c                | 2 +-
 arch/powerpc/platforms/cell/spufs/inode.c | 4 ++--
 fs/binfmt_misc.c                          | 2 +-
 fs/file.c                                 | 2 +-
 fs/notify/fanotify/fanotify_user.c        | 2 +-
 include/linux/file.h                      | 1 -
 kernel/events/core.c                      | 2 +-
 7 files changed, 7 insertions(+), 8 deletions(-)

-- 
1.8.3.1
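As an aside, a minimal userspace sketch of the race-free behavior the
cover letter describes: with O_CLOEXEC set by default, a process that
*wants* to pass a descriptor across exec() clears the flag explicitly,
with no window in which a concurrent fork()+exec() in another thread
could leak it:

	#include <fcntl.h>

	int keep_across_exec(int fd)
	{
		int flags = fcntl(fd, F_GETFD);

		if (flags == -1)
			return -1;
		return fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC);
	}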
Re: [PATCH 1/3] powerpc: Enable emulate_step In Little Endian Mode
On Wed, Oct 30, 2013 at 8:35 PM, Tom Musta wrote:
> On 10/30/2013 12:43 PM, Andreas Schwab wrote:
>> Tom Musta writes:
>>> +#ifdef __LITTLE_ENDIAN__
>>> +	if (!regs->msr & MSR_LE)
>>
>> That won't work.
>>
>> Andreas.
>
> Please elaborate.

You want to test for "!(regs->msr & MSR_LE)".

Gr{oetje,eeting}s,

						Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like
that.
						-- Linus Torvalds
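As an aside, a minimal sketch of the precedence bug Geert points out:
"!" binds tighter than "&", so the original expression tests the wrong
thing. MSR_LE is the powerpc little-endian mode bit (bit 0 of the MSR):

	#include <stdbool.h>

	#define MSR_LE	0x1UL

	/* Buggy form from the patch: computes (!msr) & MSR_LE, which is
	 * 0 for any nonzero msr value, regardless of the LE bit. */
	static bool msr_le_buggy(unsigned long msr)
	{
		return !msr & MSR_LE;
	}

	/* Corrected form: true only when the MSR_LE bit is clear. */
	static bool msr_is_big_endian(unsigned long msr)
	{
		return !(msr & MSR_LE);
	}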
Re: [PATCH 1/3] powerpc: Enable emulate_step In Little Endian Mode
On 10/30/2013 12:43 PM, Andreas Schwab wrote:
> Tom Musta writes:
>
>> +#ifdef __LITTLE_ENDIAN__
>> +	if (!regs->msr & MSR_LE)
>
> That won't work.
>
> Andreas.

Please elaborate.
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 07:29:30PM +0100, Peter Zijlstra wrote:
> +	page_shift = PAGE_SHIFT + page_order(rb);
> +
> +	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
> +
> +	offset &= page_shift - 1;

  offset &= (1UL << page_shift) - 1;

Weird that it even appeared to work.. /me wonders if he even booted the
right kernel.

> +
> +	handle->addr = rb->data_pages[handle->page] + offset;
> +	handle->size = (1 << page_shift) - offset;
Re: [PATCH 29/51] DMA-API: ata: pata_octeon_cf: convert to use dma_coerce_mask_and_coherent()
On Thu, Sep 19, 2013 at 11:54 PM, Russell King wrote:
> diff --git a/drivers/ata/pata_octeon_cf.c b/drivers/ata/pata_octeon_cf.c
> index c51bbb9..6231d43 100644
> --- a/drivers/ata/pata_octeon_cf.c
> +++ b/drivers/ata/pata_octeon_cf.c
> @@ -1014,8 +1014,9 @@ static int octeon_cf_probe(struct platform_device *pdev)
>  		}
>  		cf_port->c0 = ap->ioaddr.ctl_addr;
>
> -		pdev->dev.coherent_dma_mask = DMA_BIT_MASK(64);
> -		pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
> +		rv = dma_coerce_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
> +		if (rv)
> +			return ret;

return rv;

http://kisskb.ellerman.id.au/kisskb/buildresult/9959184/

Gr{oetje,eeting}s,

						Geert
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 04:51:16PM +0100, Peter Zijlstra wrote:
> On Wed, Oct 30, 2013 at 03:28:54PM +0200, Victor Kaplansky wrote:
> > one of the authors of Documentation/memory-barriers.txt is on cc: list ;-)
> >
> > Disclaimer: it is anyway impossible to prove lack of *any* problem.
> >
> > Having said that, lets look into an example in
> > Documentation/circular-buffers.txt:
>
> > We can see that authors of the document didn't put any memory barrier
>
> Note that both documents have the same author list ;-)
>
> Anyway, I didn't know about the circular thing, I suppose I should use
> CIRC_SPACE() thing :-)

The below removes 80 bytes from ring_buffer.o of which 50 bytes are from
perf_output_begin(), it also removes 30 lines of code, so yay! (x86_64
build)

And it appears to still work.. although I've not stressed the no-space
bits.

---
 kernel/events/ring_buffer.c | 74 ++---
 1 file changed, 22 insertions(+), 52 deletions(-)

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 9c2ddfbf4525..e4a51fa10595 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
 #include <linux/perf_event.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/circ_buf.h>
 
 #include "internal.h"
 
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
-			      unsigned long offset, unsigned long head)
-{
-	unsigned long sz = perf_data_size(rb);
-	unsigned long mask = sz - 1;
-
-	/*
-	 * check if user-writable
-	 * overwrite : over-write its own tail
-	 * !overwrite: buffer possibly drops events.
-	 */
-	if (rb->overwrite)
-		return true;
-
-	/*
-	 * verify that payload is not bigger than buffer
-	 * otherwise masking logic may fail to detect
-	 * the "not enough space" condition
-	 */
-	if ((head - offset) > sz)
-		return false;
-
-	offset = (offset - tail) & mask;
-	head   = (head   - tail) & mask;
-
-	if ((int)(head - offset) < 0)
-		return false;
-
-	return true;
-}
-
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->rb->poll, POLL_IN);
@@ -115,8 +85,7 @@ static void perf_output_put_handle(struct perf_output_handle *handle)
 	rb->user_page->data_head = head;
 
 	/*
-	 * Now check if we missed an update, rely on the (compiler)
-	 * barrier in atomic_dec_and_test() to re-read rb->head.
+	 * Now check if we missed an update.
 	 */
 	if (unlikely(head != local_read(&rb->head))) {
 		local_inc(&rb->nest);
@@ -135,7 +104,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 {
 	struct ring_buffer *rb;
 	unsigned long tail, offset, head;
-	int have_lost;
+	int have_lost, page_shift;
 	struct perf_sample_data sample_data;
 	struct {
 		struct perf_event_header header;
@@ -161,7 +130,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 		goto out;
 
 	have_lost = local_read(&rb->lost);
-	if (have_lost) {
+	if (unlikely(have_lost)) {
 		lost_event.header.size = sizeof(lost_event);
 		perf_event_header__init_id(&lost_event.header, &sample_data,
 					   event);
@@ -171,32 +140,33 @@ int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_get_handle(handle);
 
 	do {
-		/*
-		 * Userspace could choose to issue a mb() before updating the
-		 * tail pointer. So that all reads will be completed before the
-		 * write is issued.
-		 *
-		 * See perf_output_put_handle().
-		 */
 		tail = ACCESS_ONCE(rb->user_page->data_tail);
-		smp_mb();
 		offset = head = local_read(&rb->head);
-		head += size;
-		if (unlikely(!perf_output_space(rb, tail, offset, head)))
+		if (!rb->overwrite &&
+		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
 			goto fail;
+		head += size;
 	} while (local_cmpxchg(&rb->head, offset, head) != offset);
 
+	/*
+	 * Userspace SHOULD issue an MB before writing the tail; see
+	 * perf_output_put_handle().
+	 */
+	smp_mb();
+
 	if (head - local_read(&rb->wakeup) > rb->watermark)
 		local_add(rb->watermark, &rb->wakeup);
 
-	handle->page = offset >> (PAGE_SHIFT + page_order(rb));
-	handle->page &= rb->nr_pages - 1;
-	handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
-	handle->addr = rb->data_pages[handle->page];
-	handle->addr += handle->size;
-	handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+	page_shift = PAGE_SHIFT + page_order(rb);
+
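For reference, the CIRC_SPACE() helper the patch switches to, and the
counting macro it builds on, as defined in include/linux/circ_buf.h
(power-of-2 sized buffers assumed):

	/* Return count in buffer. */
	#define CIRC_CNT(head, tail, size)	(((head) - (tail)) & ((size) - 1))

	/* Return space available, 0..size-1; we always leave one free char. */
	#define CIRC_SPACE(head, tail, size)	CIRC_CNT((tail), ((head) + 1), (size))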
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 07:14:29PM +0200, Victor Kaplansky wrote:
> We need a complete rmb() here IMO. I think there is a fundamental
> difference between load and stores in this aspect. Load are allowed to
> be hoisted by compiler or executed speculatively by HW. To prevent
> load "*(ubuf->data + tail)" to be hoisted beyond "ubuf->head" load you
> would need something like this:

Indeed, we could compute and load ->data + tail the moment we've
completed the tail load but before we've completed the head load and
done the comparison.

So yes, full rmb() it is.

> void
> ubuf_read(void)
> {
> 	u64 head, tail;
>
> 	tail = ubuf->tail;
> 	head = ACCESS_ONCE(ubuf->head);
>
> 	/*
> 	 * Ensure we read the buffer boundaries before the actual buffer
> 	 * data...
> 	 */
>
> 	while (tail != head) {
> 		smp_read_barrier_depends(); /* for Alpha */
> 		obj = *(ubuf->data + head - 128);
> 		/* process obj */
> 		tail += obj->size;
> 		tail %= ubuf->size;
> 	}
>
> 	/*
> 	 * Ensure all data reads are complete before we issue the
> 	 * ubuf->tail update; once that update hits, kbuf_write() can
> 	 * observe and overwrite data.
> 	 */
> 	smp_mb(); /* D, matches with A */
>
> 	ubuf->tail = tail;
> }
>
> (note that "head" is part of address calculation of obj load now).

Right, explicit dependent loads; I was hoping the conditional in
between might be enough, but as argued above it is not.

The above cannot work in our case though, we must use tail to find the
obj since we have variable size objects.

> But, even in this demo example some "smp_read_barrier_depends()" before
> "obj = *(ubuf->data + head - 100);" is required for architectures
> like Alpha. Though, on more sane architectures "smp_read_barrier_depends()"
> will be translated to nothing.

Sure.. I know all about that.
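As an aside, a sketch of the consumer with the full barrier the thread
converges on: an smp_rmb() between the @head load and the data reads,
since with variable-size records the reads go through @tail and no
address dependency on @head exists. Types follow Victor's example and
are illustrative:

	struct record { u64 size; /* payload follows */ };

	void ubuf_read(void)
	{
		u64 head, tail;
		struct record *obj;

		tail = ubuf->tail;
		head = ACCESS_ONCE(ubuf->head);

		/* Ensure we read the buffer boundaries before the data. */
		smp_rmb();

		while (tail != head) {
			obj = (struct record *)(ubuf->data + tail);
			/* process obj */
			tail += obj->size;
			tail %= ubuf->size;
		}

		/*
		 * Ensure all data reads are complete before the tail update
		 * makes the space visible to the producer for overwriting.
		 */
		smp_mb();

		ubuf->tail = tail;
	}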
Re: [PATCH 1/3] powerpc: Enable emulate_step In Little Endian Mode
Tom Musta writes:

> +#ifdef __LITTLE_ENDIAN__
> +	if (!regs->msr & MSR_LE)

That won't work.

Andreas.

--
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5
"And now for something completely different."
Re: perf events ring buffer memory barrier on powerpc
Peter Zijlstra wrote on 10/30/2013 05:39:31 PM:

> Although I suppose speculative reads are allowed -- they don't have the
> destructive behaviour speculative writes have -- and thus we could in
> fact get reorder issues.

I agree.

> But since it is still a dependent load in that we do that @tail vs @head
> comparison before doing other loads, wouldn't a read_barrier_depends()
> be sufficient? Or do we still need a complete rmb?

We need a complete rmb() here IMO. I think there is a fundamental
difference between load and stores in this aspect. Load are allowed to
be hoisted by compiler or executed speculatively by HW. To prevent
load "*(ubuf->data + tail)" to be hoisted beyond "ubuf->head" load you
would need something like this:

void
ubuf_read(void)
{
	u64 head, tail;

	tail = ubuf->tail;
	head = ACCESS_ONCE(ubuf->head);

	/*
	 * Ensure we read the buffer boundaries before the actual buffer
	 * data...
	 */

	while (tail != head) {
		smp_read_barrier_depends(); /* for Alpha */
		obj = *(ubuf->data + head - 128);
		/* process obj */
		tail += obj->size;
		tail %= ubuf->size;
	}

	/*
	 * Ensure all data reads are complete before we issue the
	 * ubuf->tail update; once that update hits, kbuf_write() can
	 * observe and overwrite data.
	 */
	smp_mb(); /* D, matches with A */

	ubuf->tail = tail;
}

(note that "head" is part of address calculation of obj load now).

But, even in this demo example some "smp_read_barrier_depends()" before
"obj = *(ubuf->data + head - 100);" is required for architectures
like Alpha. Though, on more sane architectures "smp_read_barrier_depends()"
will be translated to nothing.

Regards,
-- Victor
[PATCH] offb: make the screen properties endian safe
The "screen" properties : depth, width, height, linebytes need to be converted to the host endian order when read from the device tree. Signed-off-by: Cédric Le Goater --- drivers/video/offb.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/video/offb.c b/drivers/video/offb.c index 0c4f343..c1cf591 100644 --- a/drivers/video/offb.c +++ b/drivers/video/offb.c @@ -536,7 +536,7 @@ static void __init offb_init_nodriver(struct device_node *dp, int no_real_node) unsigned int flags, rsize, addr_prop = 0; unsigned long max_size = 0; u64 rstart, address = OF_BAD_ADDR; - const u32 *pp, *addrp, *up; + const __be32 *pp, *addrp, *up; u64 asize; int foreign_endian = 0; @@ -552,25 +552,25 @@ static void __init offb_init_nodriver(struct device_node *dp, int no_real_node) if (pp == NULL) pp = of_get_property(dp, "depth", &len); if (pp && len == sizeof(u32)) - depth = *pp; + depth = be32_to_cpu(*pp); pp = of_get_property(dp, "linux,bootx-width", &len); if (pp == NULL) pp = of_get_property(dp, "width", &len); if (pp && len == sizeof(u32)) - width = *pp; + width = be32_to_cpu(*pp); pp = of_get_property(dp, "linux,bootx-height", &len); if (pp == NULL) pp = of_get_property(dp, "height", &len); if (pp && len == sizeof(u32)) - height = *pp; + height = be32_to_cpu(*pp); pp = of_get_property(dp, "linux,bootx-linebytes", &len); if (pp == NULL) pp = of_get_property(dp, "linebytes", &len); if (pp && len == sizeof(u32) && (*pp != 0xu)) - pitch = *pp; + pitch = be32_to_cpu(*pp); else pitch = width * ((depth + 7) / 8); -- 1.7.10.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 03:28:54PM +0200, Victor Kaplansky wrote:
> one of the authors of Documentation/memory-barriers.txt is on cc: list ;-)
>
> Disclaimer: it is anyway impossible to prove lack of *any* problem.
>
> Having said that, lets look into an example in
> Documentation/circular-buffers.txt:
>
> We can see that authors of the document didn't put any memory barrier

Note that both documents have the same author list ;-)

Anyway, I didn't know about the circular thing, I suppose I should use
CIRC_SPACE() thing :-)
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 04:52:05PM +0200, Victor Kaplansky wrote:
> Peter Zijlstra wrote on 10/30/2013 01:25:26 PM:
>
> > Also, I'm not entirely sure on C, that too seems like a dependency, we
> > simply cannot read the buffer @tail before we've read the tail itself,
> > now can we? Similarly we cannot compare tail to head without having the
> > head read completed.
>
> No, this one we cannot omit, because our problem on consumer side is not
> with @tail, which is written exclusively by consumer, but with @head.

Ah indeed, my argument was flawed in that @head is the important part.
But we still do a comparison of @tail against @head before we do further
reads.

Although I suppose speculative reads are allowed -- they don't have the
destructive behaviour speculative writes have -- and thus we could in
fact get reorder issues.

But since it is still a dependent load in that we do that @tail vs @head
comparison before doing other loads, wouldn't a read_barrier_depends()
be sufficient? Or do we still need a complete rmb?

> BTW, it is why you also don't need ACCESS_ONCE() around @tail, but only
> around @head read.

Agreed, the ACCESS_ONCE() around tail is superfluous since we're the one
updating tail, so there's no problem with the value changing
unexpectedly.
Re: perf events ring buffer memory barrier on powerpc
Peter Zijlstra wrote on 10/30/2013 01:25:26 PM:

> Also, I'm not entirely sure on C, that too seems like a dependency, we
> simply cannot read the buffer @tail before we've read the tail itself,
> now can we? Similarly we cannot compare tail to head without having the
> head read completed.

No, this one we cannot omit, because our problem on consumer side is not
with @tail, which is written exclusively by consumer, but with @head.

BTW, it is why you also don't need ACCESS_ONCE() around @tail, but only
around @head read.

-- Victor
[RFC PATCH v5 12/12] powerpc/powernv: Machine check exception handling.
From: Mahesh Salgaonkar

Add basic error handling in the machine check exception handler.

- If MSR_RI isn't set, we can not recover.
- Check if disposition is set to OpalMCE_DISPOSITION_RECOVERED.
- Check if the address at fault is inside the kernel address space; if
  not, send SIGBUS to the process if we hit the exception when in
  userspace.
- If the address at fault is not provided and we get a synchronous
  machine check while in userspace, kill the task.

Signed-off-by: Mahesh Salgaonkar
---
 arch/powerpc/include/asm/mce.h        |  1 +
 arch/powerpc/kernel/mce.c             | 27 +
 arch/powerpc/platforms/powernv/opal.c | 43 -
 3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 3276b40..a2b8c7b 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -193,5 +193,6 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_process_queued_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt);
+extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 1c6d157..c0c52ec 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -316,3 +316,30 @@ void machine_check_print_event_info(struct machine_check_event *evt)
 		break;
 	}
 }
+
+uint64_t get_mce_fault_addr(struct machine_check_event *evt)
+{
+	switch (evt->error_type) {
+	case MCE_ERROR_TYPE_UE:
+		if (evt->u.ue_error.effective_address_provided)
+			return evt->u.ue_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_SLB:
+		if (evt->u.slb_error.effective_address_provided)
+			return evt->u.slb_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_ERAT:
+		if (evt->u.erat_error.effective_address_provided)
+			return evt->u.erat_error.effective_address;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		if (evt->u.tlb_error.effective_address_provided)
+			return evt->u.tlb_error.effective_address;
+		break;
+	default:
+	case MCE_ERROR_TYPE_UNKNOWN:
+		break;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(get_mce_fault_addr);
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index d17e1e3..4a727d3 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -240,6 +241,44 @@ int opal_put_chars(uint32_t vtermno, const char *data, int total_len)
 	return written;
 }
 
+static int opal_recover_mce(struct pt_regs *regs,
+					struct machine_check_event *evt)
+{
+	int recovered = 0;
+	uint64_t ea = get_mce_fault_addr(evt);
+
+	if (!(regs->msr & MSR_RI)) {
+		/* If MSR_RI isn't set, we cannot recover */
+		recovered = 0;
+	} else if (evt->disposition == MCE_DISPOSITION_RECOVERED) {
+		/* Platform corrected itself */
+		recovered = 1;
+	} else if (ea && !is_kernel_addr(ea)) {
+		/*
+		 * Faulting address is not in kernel text. We should be fine.
+		 * We need to find which process uses this address.
+		 * For now, kill the task if we have received exception when
+		 * in userspace.
+		 *
+		 * TODO: Queue up this address for hwpoisoning later.
+		 */
+		if (user_mode(regs) && !is_global_init(current)) {
+			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+			recovered = 1;
+		} else
+			recovered = 0;
+	} else if (user_mode(regs) && !is_global_init(current) &&
+		   evt->severity == MCE_SEV_ERROR_SYNC) {
+		/*
+		 * If we have received a synchronous error when in userspace
+		 * kill the task.
+		 */
+		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
+		recovered = 1;
+	}
+	return recovered;
+}
+
 int opal_machine_check(struct pt_regs *regs)
 {
 	struct machine_check_event evt;
@@ -255,7 +294,9 @@ int opal_machine_check(struct pt_regs *regs)
 	}
 
 	machine_check_print_event_info(&evt);
-	return evt.severity == MCE_SEV_FATAL ? 0 : 1;
+	if (opal_recover_mce(regs, &evt))
+		return 1;
+	return 0;
 }
 
 static irqreturn_t opal_interrupt(int irq, void *data)
[RFC PATCH v5 11/12] powerpc/powernv: Remove machine check handling in OPAL.
From: Mahesh Salgaonkar

Now that we are ready to handle machine check directly in linux, do not
register with firmware to handle machine check exception.

Signed-off-by: Mahesh Salgaonkar
---
 arch/powerpc/platforms/powernv/opal.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 219ce65..d17e1e3 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -83,14 +83,10 @@ static int __init opal_register_exception_handlers(void)
 	if (!(powerpc_firmware_features & FW_FEATURE_OPAL))
 		return -ENODEV;
 
-	/* Hookup some exception handlers. We use the fwnmi area at 0x7000
-	 * to provide the glue space to OPAL
+	/* Hookup some exception handlers except machine check. We use the
+	 * fwnmi area at 0x7000 to provide the glue space to OPAL
 	 */
 	glue = 0x7000;
-	opal_register_exception_handler(OPAL_MACHINE_CHECK_HANDLER,
-					__pa(opal_mc_secondary_handler[0]),
-					glue);
-	glue += 128;
 	opal_register_exception_handler(OPAL_HYPERVISOR_MAINTENANCE_HANDLER,
 					0, glue);
 	glue += 128;
[RFC PATCH v5 10/12] powerpc/book3s: Queue up and process delayed MCE events.
From: Mahesh Salgaonkar

When the machine check real mode handler can not continue into the host
kernel in V mode, it returns from the interrupt and we lose the MCE
event, which never gets logged. In such a situation queue up the MCE
event so that we can log it later, when we get back into the host kernel
with r1 pointing to the kernel stack, e.g. during syscall exit.

Signed-off-by: Mahesh Salgaonkar
---
 arch/powerpc/include/asm/mce.h        |   3 +
 arch/powerpc/kernel/entry_64.S        |   5 +
 arch/powerpc/kernel/exceptions-64s.S  |   7 +-
 arch/powerpc/kernel/mce.c             | 154 +
 arch/powerpc/platforms/powernv/opal.c |  97 -
 5 files changed, 168 insertions(+), 98 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 87cad2a..3276b40 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -190,5 +190,8 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
 			   struct mce_error_info *mce_err, uint64_t addr);
 extern int get_mce_event(struct machine_check_event *mce, bool release);
 extern void release_mce_event(void);
+extern void machine_check_queue_event(void);
+extern void machine_check_process_queued_event(void);
+extern void machine_check_print_event_info(struct machine_check_event *evt);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index c04cdf7..30c11ac 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -184,6 +184,11 @@ syscall_exit:
 	bl	.do_show_syscall_exit
 	ld	r3,RESULT(r1)
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+BEGIN_FTR_SECTION
+	bl	.machine_check_process_queued_event
+END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
+#endif
 	CURRENT_THREAD_INFO(r12, r1)
 
 	ld	r8,_MSR(r1)
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9595a82..b7ca246 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -829,7 +829,8 @@ BEGIN_FTR_SECTION
 	/* Supervisor state loss */
 	li	r0,1
 	stb	r0,PACA_NAPSTATELOST(r13)
-3:	MACHINE_CHECK_HANDLER_WINDUP
+3:	bl	.machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
 	GET_PACA(r13)
 	ld	r1,PACAR1(r13)
 	b	.power7_enter_nap_mode
@@ -869,8 +870,10 @@ BEGIN_FTR_SECTION
 2:
 	/*
 	 * Return from MC interrupt.
-	 * TODO: Queue up the MCE event so that we can log it later.
+	 * Queue up the MCE event so that we can log it later, while
+	 * returning from kernel or opal call.
 	 */
+	bl	.machine_check_queue_event
 	MACHINE_CHECK_HANDLER_WINDUP
 	rfid
 9:
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index aeecdf1..1c6d157 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -31,6 +31,10 @@
 static DEFINE_PER_CPU(int, mce_nest_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
 
+/* Queue for delayed MCE events. */
+static DEFINE_PER_CPU(int, mce_queue_count);
+static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
+
 static void mce_set_error_info(struct machine_check_event *mce,
 			       struct mce_error_info *mce_err)
 {
@@ -162,3 +166,153 @@ void release_mce_event(void)
 {
 	get_mce_event(NULL, true);
 }
+
+/*
+ * Queue up the MCE event which then can be handled later.
+ */
+void machine_check_queue_event(void)
+{
+	int index;
+	struct machine_check_event evt;
+
+	if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
+		return;
+
+	index = __get_cpu_var(mce_queue_count)++;
+	/* If queue is full, just return for now. */
+	if (index >= MAX_MC_EVT) {
+		__get_cpu_var(mce_queue_count)--;
+		return;
+	}
+	__get_cpu_var(mce_event_queue[index]) = evt;
+}
+
+/*
+ * process pending MCE event from the mce event queue. This function will be
+ * called during syscall exit.
+ */
+void machine_check_process_queued_event(void)
+{
+	int index;
+
+	preempt_disable();
+	/*
+	 * For now just print it to console.
+	 * TODO: log this error event to FSP or nvram.
+	 */
+	while (__get_cpu_var(mce_queue_count) > 0) {
+		index = __get_cpu_var(mce_queue_count) - 1;
+		machine_check_print_event_info(
+				&__get_cpu_var(mce_event_queue[index]));
+		__get_cpu_var(mce_queue_count)--;
+	}
+	preempt_enable();
+}
+
+void machine_check_print_event_info(struct machine_check_event *evt)
+{
+	const char *level, *sevstr, *subtype;
+	static const char *mc_ue_types[] = {
+		"Indeterminate",
+		"Instruction fetch",
+		"Page table wal
[RFC PATCH v5 09/12] powerpc/book3s: Decode and save machine check event.
From: Mahesh Salgaonkar

Now that we handle machine check in linux, the MCE decoding should also
take place in the linux host. This info is crucial to log before we go
down in case we can not handle the machine check errors. This patch
decodes and populates a machine check event which contains high-level
meaningful MCE information.

We do this in real mode C code with the ME bit on. The MCE information
is still available on the emergency stack (in pt_regs structure format).
Even if we take another exception at this point the MCE early handler
will allocate a new stack frame on top of the current one. So when we
return back here we still have our MCE information safe on the current
stack.

We use a per cpu buffer to save high-level MCE information. Each per cpu
buffer is an array of machine check event structures indexed by the per
cpu counter mce_nest_count. The mce_nest_count is incremented every time
we enter the machine check early handler in real mode to get the current
free slot (index = mce_nest_count - 1). The mce_nest_count is
decremented once the MCE info is consumed by the virtual mode machine
check exception handler.

This patch provides save_mce_event(), get_mce_event() and
release_mce_event() generic routines that can be used by machine check
handlers to populate and retrieve the event. The routine
release_mce_event() will free the event slot so that it can be reused.
The caller can invoke get_mce_event() with a release flag either to
release the event slot immediately OR keep it so that it can be fetched
again. The event slot can also be released at any time by invoking
release_mce_event().

This patch also updates kvm code to invoke get_mce_event() to retrieve
the generic mce event rather than paca->opal_mce_evt.

The KVM code always calls get_mce_event() with the release flag set to
false so that the event is available for the linux host machine. If a
machine check occurs while we are in the guest, KVM tries to handle the
error. If KVM is able to handle the MC error successfully, it enters the
guest and delivers the machine check to the guest. If KVM is not able to
handle the MC error, it exits the guest and passes control to the linux
host machine check handler, which then logs the MC event and decides how
to handle it in the linux host. In the failure case, KVM needs to make
sure that the MC event is available for the linux host to consume. Hence
KVM always calls get_mce_event() with the release flag set to false and
later invokes release_mce_event() only if it succeeds in handling the
error.
Signed-off-by: Mahesh Salgaonkar --- arch/powerpc/include/asm/mce.h| 124 + arch/powerpc/kernel/Makefile |2 arch/powerpc/kernel/mce.c | 164 + arch/powerpc/kernel/mce_power.c | 116 ++- arch/powerpc/kvm/book3s_hv_ras.c | 32 -- arch/powerpc/platforms/powernv/opal.c | 35 +++ 6 files changed, 434 insertions(+), 39 deletions(-) create mode 100644 arch/powerpc/kernel/mce.c diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index e3ffa82..87cad2a 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -66,5 +66,129 @@ #define P8_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_SLB_ERRORS | \ P8_DSISR_MC_ERAT_MULTIHIT_SEC) +enum MCE_Version { + MCE_V1 = 1, +}; + +enum MCE_Severity { + MCE_SEV_NO_ERROR = 0, + MCE_SEV_WARNING = 1, + MCE_SEV_ERROR_SYNC = 2, + MCE_SEV_FATAL = 3, +}; + +enum MCE_Disposition { + MCE_DISPOSITION_RECOVERED = 0, + MCE_DISPOSITION_NOT_RECOVERED = 1, +}; + +enum MCE_Initiator { + MCE_INITIATOR_UNKNOWN = 0, + MCE_INITIATOR_CPU = 1, +}; + +enum MCE_ErrorType { + MCE_ERROR_TYPE_UNKNOWN = 0, + MCE_ERROR_TYPE_UE = 1, + MCE_ERROR_TYPE_SLB = 2, + MCE_ERROR_TYPE_ERAT = 3, + MCE_ERROR_TYPE_TLB = 4, +}; + +enum MCE_UeErrorType { + MCE_UE_ERROR_INDETERMINATE = 0, + MCE_UE_ERROR_IFETCH = 1, + MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH = 2, + MCE_UE_ERROR_LOAD_STORE = 3, + MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 4, +}; + +enum MCE_SlbErrorType { + MCE_SLB_ERROR_INDETERMINATE = 0, + MCE_SLB_ERROR_PARITY = 1, + MCE_SLB_ERROR_MULTIHIT = 2, +}; + +enum MCE_EratErrorType { + MCE_ERAT_ERROR_INDETERMINATE = 0, + MCE_ERAT_ERROR_PARITY = 1, + MCE_ERAT_ERROR_MULTIHIT = 2, +}; + +enum MCE_TlbErrorType { + MCE_TLB_ERROR_INDETERMINATE = 0, + MCE_TLB_ERROR_PARITY = 1, + MCE_TLB_ERROR_MULTIHIT = 2, +}; + +struct machine_check_event { + enum MCE_Versionversion:8; /* 0x00 */ + uint8_t in_use; /* 0x01 */ + enum MCE_Severity severity:8; /* 0x02 */ + enum MCE_Initiator initiator:8;/* 0x03 */ + enum MCE_ErrorType error_type:8; /* 0x04 */ + enum MCE_Dispositiondisposition:8; /* 0x05 */ + uint8_t reserved_1[2
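The mce_nest_count slot protocol described above can be modelled in a few lines of plain C (single CPU, no per-cpu accessors or atomics; those are exactly what the model leaves out). The guard in get_mce_event() against a negative or out-of-range index mirrors the behaviour the commit message describes; the event struct is a stand-in.

	#include <stdbool.h>
	#include <string.h>

	#define MAX_MC_EVT	100

	struct mce_evt { int error_type; };	/* stand-in struct */

	static struct mce_evt mce_event[MAX_MC_EVT];
	static int mce_nest_count;

	static void save_mce_event(const struct mce_evt *evt)
	{
		int index = mce_nest_count++;	/* current free slot */

		if (index >= MAX_MC_EVT)	/* nested too deep: drop it */
			return;
		memcpy(&mce_event[index], evt, sizeof(*evt));
	}

	static bool get_mce_event(struct mce_evt *mce, bool release)
	{
		int index = mce_nest_count - 1;	/* most recently saved */
		bool ret = false;

		if (index >= 0 && index < MAX_MC_EVT) {
			if (mce)
				*mce = mce_event[index];
			ret = true;
		}
		if (release && mce_nest_count > 0)
			mce_nest_count--;	/* free the slot */
		return ret;
	}

	static void release_mce_event(void)
	{
		get_mce_event(NULL, true);	/* same trick as the patch */
	}

	int main(void)
	{
		struct mce_evt e;

		save_mce_event(&(struct mce_evt){ 2 });	/* say, an SLB error */
		save_mce_event(&(struct mce_evt){ 4 });	/* nested TLB error */
		while (get_mce_event(&e, true))		/* consume newest-first */
			;
		release_mce_event();			/* harmless when empty */
		return 0;
	}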
[RFC PATCH v5 08/12] powerpc/book3s: Flush SLB/TLBs if we get SLB/TLB machine check errors on power8.
From: Mahesh Salgaonkar This patch handles the memory errors on power8. If we get a machine check exception due to SLB or TLB errors, then flush SLBs/TLBs and reload SLBs to recover. Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras --- arch/powerpc/include/asm/mce.h |3 +++ arch/powerpc/kernel/cputable.c |4 arch/powerpc/kernel/mce_power.c | 34 ++ 3 files changed, 41 insertions(+) diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 8157d4e..e3ffa82 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -64,4 +64,7 @@ P7_DSISR_MC_SLB_MULTIHIT | \ P7_DSISR_MC_SLB_MULTIHIT_PARITY) +#define P8_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_SLB_ERRORS | \ +P8_DSISR_MC_ERAT_MULTIHIT_SEC) + #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index c54188b..6c8dd5d 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -74,6 +74,7 @@ extern void __restore_cpu_a2(void); extern void __flush_tlb_power7(unsigned long inval_selector); extern void __flush_tlb_power8(unsigned long inval_selector); extern long __machine_check_early_realmode_p7(struct pt_regs *regs); +extern long __machine_check_early_realmode_p8(struct pt_regs *regs); #endif /* CONFIG_PPC64 */ #if defined(CONFIG_E500) extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec); @@ -462,6 +463,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_power8, .cpu_restore= __restore_cpu_power8, .flush_tlb = __flush_tlb_power8, + .machine_check_early= __machine_check_early_realmode_p8, .platform = "power8", }, { /* Power7 */ @@ -521,6 +523,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_power8, .cpu_restore= __restore_cpu_power8, .flush_tlb = __flush_tlb_power8, + .machine_check_early= __machine_check_early_realmode_p8, .platform = "power8", }, { /* Power8 */ @@ -540,6 +543,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .cpu_setup = __setup_cpu_power8, .cpu_restore= __restore_cpu_power8, .flush_tlb = __flush_tlb_power8, + .machine_check_early= __machine_check_early_realmode_p8, .platform = "power8", }, { /* Cell Broadband Engine */ diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 6905473..60a217f 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -148,3 +148,37 @@ long __machine_check_early_realmode_p7(struct pt_regs *regs) /* TODO: Decode machine check reason. */ return handled; } + +static long mce_handle_ierror_p8(uint64_t srr1) +{ + long handled = 0; + + handled = mce_handle_common_ierror(srr1); + + if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { + flush_and_reload_slb(); + handled = 1; + } + return handled; +} + +static long mce_handle_derror_p8(uint64_t dsisr) +{ + return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS); +} + +long __machine_check_early_realmode_p8(struct pt_regs *regs) +{ + uint64_t srr1; + long handled = 1; + + srr1 = regs->msr; + + if (P7_SRR1_MC_LOADSTORE(srr1)) + handled = mce_handle_derror_p8(regs->dsisr); + else + handled = mce_handle_ierror_p8(srr1); + + /* TODO: Decode machine check reason. */ + return handled; +} ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH v5 07/12] powerpc/book3s: Flush SLB/TLBs if we get SLB/TLB machine check errors on power7.
From: Mahesh Salgaonkar If we get a machine check exception due to SLB or TLB errors, then flush SLBs/TLBs and reload SLBs to recover. We do this in real mode before turning on MMU. Otherwise we would run into nested machine checks. If we get a machine check when we are in guest, then just flush the SLBs and continue. This patch handles errors for power7. The next patch will handle errors for power8 Signed-off-by: Mahesh Salgaonkar Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/bitops.h |5 + arch/powerpc/include/asm/mce.h| 67 + arch/powerpc/kernel/Makefile |1 arch/powerpc/kernel/cputable.c|4 + arch/powerpc/kernel/mce_power.c | 150 + 5 files changed, 227 insertions(+) create mode 100644 arch/powerpc/include/asm/mce.h create mode 100644 arch/powerpc/kernel/mce_power.c diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 910194e..a5e9a7d 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -46,6 +46,11 @@ #include #include +/* PPC bit number conversion */ +#define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be)) +#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit)) +#define PPC_BITMASK(bs, be)((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs)) + /* * clear_bit doesn't imply a memory barrier */ diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h new file mode 100644 index 000..8157d4e --- /dev/null +++ b/arch/powerpc/include/asm/mce.h @@ -0,0 +1,67 @@ +/* + * Machine check exception header file. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright 2013 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#ifndef __ASM_PPC64_MCE_H__ +#define __ASM_PPC64_MCE_H__ + +#include + +/* + * Machine Check bits on power7 and power8 + */ +#define P7_SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42)) /* P8 too */ + +/* SRR1 bits for machine check (On Power7 and Power8) */ +#define P7_SRR1_MC_IFETCH(srr1)((srr1) & PPC_BITMASK(43, 45)) /* P8 too */ + +#define P7_SRR1_MC_IFETCH_UE (0x1 << PPC_BITLSHIFT(45)) /* P8 too */ +#define P7_SRR1_MC_IFETCH_SLB_PARITY (0x2 << PPC_BITLSHIFT(45)) /* P8 too */ +#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT (0x3 << PPC_BITLSHIFT(45)) /* P8 too */ +#define P7_SRR1_MC_IFETCH_SLB_BOTH (0x4 << PPC_BITLSHIFT(45)) +#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT (0x5 << PPC_BITLSHIFT(45)) /* P8 too */ +#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD(0x6 << PPC_BITLSHIFT(45)) /* P8 too */ +#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL (0x7 << PPC_BITLSHIFT(45)) + +/* SRR1 bits for machine check (On Power8) */ +#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT(0x4 << PPC_BITLSHIFT(45)) + +/* DSISR bits for machine check (On Power7 and Power8) */ +#define P7_DSISR_MC_UE (PPC_BIT(48)) /* P8 too */ +#define P7_DSISR_MC_UE_TABLEWALK (PPC_BIT(49)) /* P8 too */ +#define P7_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52)) /* P8 too */ +#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53)) /* P8 too */ +#define P7_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55)) /* P8 too */ +#define P7_DSISR_MC_SLB_MULTIHIT (PPC_BIT(56)) /* P8 too */ +#define P7_DSISR_MC_SLB_MULTIHIT_PARITY(PPC_BIT(57)) /* P8 too */ + +/* + * DSISR bits for machine check (Power8) in addition to above. + * Secondary DERAT Multihit + */ +#define P8_DSISR_MC_ERAT_MULTIHIT_SEC (PPC_BIT(54)) + +/* SLB error bits */ +#define P7_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_ERAT_MULTIHIT | \ +P7_DSISR_MC_SLB_PARITY_MFSLB | \ +P7_DSISR_MC_SLB_MULTIHIT | \ +P7_DSISR_MC_SLB_MULTIHIT_PARITY) + +#endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 445cb6e..07c63d0 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_power.o +obj-$(CONFIG_PPC_BOOK3S_64)+=
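The PPC_BIT macros added to bitops.h use IBM (big-endian) bit numbering, where bit 0 is the most significant bit of the 64-bit word, which is easy to trip over. A small stand-alone check of the arithmetic, with the macro bodies copied from the hunk above and the expected expansions noted in comments:

	#include <stdio.h>

	#define BITS_PER_LONG 64
	#define PPC_BITLSHIFT(be)	(BITS_PER_LONG - 1 - (be))
	#define PPC_BIT(bit)		(1UL << PPC_BITLSHIFT(bit))
	#define PPC_BITMASK(bs, be)	((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))

	int main(void)
	{
		printf("PPC_BIT(0)          = 0x%016lx\n", PPC_BIT(0));  /* MSB */
		printf("PPC_BIT(63)         = 0x%016lx\n", PPC_BIT(63)); /* LSB */
		/* The P7_SRR1_MC_IFETCH field, SRR1 bits 43:45: 0x1c0000 */
		printf("PPC_BITMASK(43, 45) = 0x%016lx\n", PPC_BITMASK(43, 45));
		return 0;
	}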
[RFC PATCH v5 06/12] powerpc/book3s: Add flush_tlb operation in cpu_spec.
From: Mahesh Salgaonkar This patch introduces flush_tlb operation in cpu_spec structure. This will help us to invoke appropriate CPU-side flush tlb routine. This patch adds the foundation to invoke CPU specific flush routine for respective architectures. Currently this patch introduce flush_tlb for p7 and p8. Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras --- arch/powerpc/include/asm/cputable.h |5 arch/powerpc/kernel/cpu_setup_power.S | 38 +++-- arch/powerpc/kernel/cputable.c|8 +++ arch/powerpc/kvm/book3s_hv_ras.c | 18 +++- 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index c5b107a..617cc76 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -97,6 +97,11 @@ struct cpu_spec { */ long(*machine_check_early)(struct pt_regs *regs); + /* +* Processor specific routine to flush tlbs. +*/ + void(*flush_tlb)(unsigned long inval_selector); + }; extern struct cpu_spec *cur_cpu_spec; diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 18b5b9c..37d1bb0 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -29,7 +29,7 @@ _GLOBAL(__setup_cpu_power7) mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR bl __init_LPCR - bl __init_TLB + bl __init_tlb_power7 mtlrr11 blr @@ -42,7 +42,7 @@ _GLOBAL(__restore_cpu_power7) mtspr SPRN_LPID,r0 mfspr r3,SPRN_LPCR bl __init_LPCR - bl __init_TLB + bl __init_tlb_power7 mtlrr11 blr @@ -59,7 +59,7 @@ _GLOBAL(__setup_cpu_power8) orisr3, r3, LPCR_AIL_3@h bl __init_LPCR bl __init_HFSCR - bl __init_TLB + bl __init_tlb_power8 bl __init_PMU_HV mtlrr11 blr @@ -78,7 +78,7 @@ _GLOBAL(__restore_cpu_power8) orisr3, r3, LPCR_AIL_3@h bl __init_LPCR bl __init_HFSCR - bl __init_TLB + bl __init_tlb_power8 bl __init_PMU_HV mtlrr11 blr @@ -134,15 +134,31 @@ __init_HFSCR: mtspr SPRN_HFSCR,r3 blr -__init_TLB: - /* -* Clear the TLB using the "IS 3" form of tlbiel instruction -* (invalidate by congruence class). P7 has 128 CCs, P8 has 512 -* so we just always do 512 -*/ +/* + * Clear the TLB using the specified IS form of tlbiel instruction + * (invalidate by congruence class). P7 has 128 CCs., P8 has 512. 
+ * + * r3 = IS field + */ +__init_tlb_power7: + li r3,0xc00/* IS field = 0b11 */ +_GLOBAL(__flush_tlb_power7) + li r6,128 + mtctr r6 + mr r7,r3 /* IS field */ + ptesync +2: tlbiel r7 + addir7,r7,0x1000 + bdnz2b + ptesync +1: blr + +__init_tlb_power8: + li r3,0xc00/* IS field = 0b11 */ +_GLOBAL(__flush_tlb_power8) li r6,512 mtctr r6 - li r7,0xc00/* IS field = 0b11 */ + mr r7,r3 /* IS field */ ptesync 2: tlbiel r7 addir7,r7,0x1000 diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 597d954..55d0f9c 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -71,6 +71,8 @@ extern void __restore_cpu_power7(void); extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec); extern void __restore_cpu_power8(void); extern void __restore_cpu_a2(void); +extern void __flush_tlb_power7(unsigned long inval_selector); +extern void __flush_tlb_power8(unsigned long inval_selector); #endif /* CONFIG_PPC64 */ #if defined(CONFIG_E500) extern void __setup_cpu_e5500(unsigned long offset, struct cpu_spec* spec); @@ -440,6 +442,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .oprofile_cpu_type = "ppc64/ibm-compat-v1", .cpu_setup = __setup_cpu_power7, .cpu_restore= __restore_cpu_power7, + .flush_tlb = __flush_tlb_power7, .platform = "power7", }, { /* 2.07-compliant processor, i.e. Power8 "architected" mode */ @@ -456,6 +459,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .oprofile_cpu_type = "ppc64/ibm-compat-v1", .cpu_setup = __setup_cpu_power8, .cpu_restore= __restore_cpu_power8, + .flush_tlb = __flush_tlb_power8, .platform = "power8", }, { /* Power7 */
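What the rewritten tlbiel loop does is clearer in C. A user-space model of the flush, purely illustrative: the IS field rides in the low bits of the operand, the congruence-class index advances by 0x1000 per iteration, and the set count is 128 on P7 versus 512 on P8.

	static void tlbiel(unsigned long rb)
	{
		(void)rb;	/* stand-in for the hardware instruction */
	}

	static void flush_tlb(unsigned long inval_selector, int nsets)
	{
		unsigned long rb = inval_selector;	/* e.g. 0xc00: IS = 0b11 */
		int i;

		for (i = 0; i < nsets; i++) {
			tlbiel(rb);
			rb += 0x1000;			/* next congruence class */
		}
	}

	int main(void)
	{
		flush_tlb(0xc00, 128);	/* __flush_tlb_power7's iteration count */
		flush_tlb(0xc00, 512);	/* __flush_tlb_power8's */
		return 0;
	}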
[RFC PATCH v5 05/12] powerpc/book3s: Introduce a early machine check hook in cpu_spec.
From: Mahesh Salgaonkar This patch adds the early machine check function pointer in cputable for CPU specific early machine check handling. The early machine check handler routine will be called in real mode to handle SLB and TLB errors. We cannot reuse the existing machine_check hook because it is always invoked in kernel virtual mode and we would already be in trouble if we get SLB or TLB errors. This patch just sets up a mechanism to invoke a CPU specific handler. The subsequent patches will populate the function pointer. Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras --- arch/powerpc/include/asm/cputable.h |7 +++ arch/powerpc/kernel/traps.c |7 +-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 0d4939b..c5b107a 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -90,6 +90,13 @@ struct cpu_spec { * if the error is fatal, 1 if it was fully recovered and 0 to * pass up (not CPU originated) */ int (*machine_check)(struct pt_regs *regs); + + /* +* Processor specific early machine check handler which is +* called in real mode to handle SLB and TLB errors. +*/ + long(*machine_check_early)(struct pt_regs *regs); + }; extern struct cpu_spec *cur_cpu_spec; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 0c32fa0..008e5c5 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -293,8 +293,11 @@ void system_reset_exception(struct pt_regs *regs) */ long machine_check_early(struct pt_regs *regs) { - /* TODO: handle/decode machine check reason */ - return 0; + long handled = 0; + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + return handled; } #endif
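This patch and the previous one together give cpu_spec two per-processor hooks. A self-contained user-space model of the dispatch pattern they set up; the names mirror the kernel's, the bodies are stand-ins, and the guarded call in machine_check_early() has the same shape as the traps.c hunk above:

	#include <stdio.h>

	struct pt_regs;				/* opaque in this model */

	struct cpu_spec {
		const char *platform;
		long (*machine_check_early)(struct pt_regs *regs);
		void (*flush_tlb)(unsigned long inval_selector);
	};

	static long mc_early_power7(struct pt_regs *regs)
	{
		(void)regs;
		return 1;			/* pretend the error was handled */
	}

	static void flush_tlb_power7(unsigned long is)
	{
		printf("power7 TLB flush, IS field 0x%lx\n", is);
	}

	static struct cpu_spec power7_spec = {
		.platform		= "power7",
		.machine_check_early	= mc_early_power7,
		.flush_tlb		= flush_tlb_power7,
	};

	static struct cpu_spec *cur_cpu_spec = &power7_spec;

	/* Same guarded dispatch as the traps.c change above. */
	static long machine_check_early(struct pt_regs *regs)
	{
		long handled = 0;

		if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
			handled = cur_cpu_spec->machine_check_early(regs);
		return handled;
	}

	int main(void)
	{
		printf("handled = %ld\n", machine_check_early(NULL));
		if (cur_cpu_spec->flush_tlb)
			cur_cpu_spec->flush_tlb(0xc00);	/* IS = 0b11: whole TLB */
		return 0;
	}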
[RFC PATCH v5 04/12] powerpc/book3s: Return from interrupt if coming from evil context.
From: Mahesh Salgaonkar We can get machine checks from any context. We need to make sure that we handle all of them correctly. If we are coming from hypervisor user-space, we can continue in the host kernel in virtual mode to deliver the MC event. If we got woken up from power-saving mode then we may come in with one of the following states: a. No state loss b. Supervisor state loss c. Hypervisor state loss For (a) and (b), we go back to nap again. State (c) is fatal, keep spinning. For all other contexts which we are not sure of, queue up the MCE event and return from the interrupt. Signed-off-by: Mahesh Salgaonkar --- arch/powerpc/kernel/exceptions-64s.S | 82 ++ arch/powerpc/kernel/idle_power7.S|1 2 files changed, 83 insertions(+) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fffb16f..9595a82 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -155,6 +155,24 @@ machine_check_pSeries_1: */ HMT_MEDIUM_PPR_DISCARD SET_SCRATCH0(r13) /* save r13 */ +#ifdef CONFIG_PPC_P7_NAP +BEGIN_FTR_SECTION + /* Running native on arch 2.06 or later, check if we are +* waking up from nap. We only handle no state loss and +* supervisor state loss. We do -not- handle hypervisor +* state loss at this time. +*/ + mfspr r13,SPRN_SRR1 + rlwinm. r13,r13,47-31,30,31 + beq 9f + + /* waking up from powersave (nap) state */ + cmpwi cr1,r13,2 + /* Total loss of HV state is fatal. let's just stay stuck here */ + bgt cr1,. +9: +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) +#endif /* CONFIG_PPC_P7_NAP */ EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION b machine_check_pSeries_early @@ -792,6 +810,70 @@ BEGIN_FTR_SECTION bl .save_nvgprs addir3,r1,STACK_FRAME_OVERHEAD bl .machine_check_early + ld r12,_MSR(r1) +#ifdef CONFIG_PPC_P7_NAP + /* +* Check if thread was in power saving mode. We come here when any +* of the following is true: +* a. thread wasn't in power saving mode +* b. thread was in power saving mode with no state loss or +*supervisor state loss +* +* Go back to nap again if (b) is true. +*/ + rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */ + beq 4f /* No, it wasn't */ + /* Thread was in power saving mode. Go back to nap again. */ + cmpwi r11,2 + bne 3f + /* Supervisor state loss */ + li r0,1 + stb r0,PACA_NAPSTATELOST(r13) +3: MACHINE_CHECK_HANDLER_WINDUP + GET_PACA(r13) + ld r1,PACAR1(r13) + b .power7_enter_nap_mode +4: +#endif + /* +* Check if we are coming from hypervisor userspace. If yes then we +* continue in host kernel in V mode to deliver the MC event. +*/ + rldicl. r11,r12,4,63/* See if MC hit while in HV mode. */ + beq 5f + andi. r11,r12,MSR_PR /* See if coming from user. */ + bne 9f /* continue in V mode if we are. */ + +5: +#ifdef CONFIG_KVM_BOOK3S_64_HV + /* +* We are coming from kernel context. Check if we are coming from +* guest. if yes, then we can continue. We will fall through +* do_kvm_200->kvmppc_interrupt to deliver the MC event to guest. +*/ + lbz r11,HSTATE_IN_GUEST(r13) + cmpwi r11,0 /* Check if coming from guest */ + bne 9f /* continue if we are. */ +#endif + /* +* At this point we are not sure about what context we come from. +* Queue up the MCE event and return from the interrupt. +* But before that, check if this is an un-recoverable exception. +* If yes, then stay on emergency stack and panic. +*/ + andi. r11,r12,MSR_RI + bne 2f +1: addir3,r1,STACK_FRAME_OVERHEAD + bl .unrecoverable_exception + b 1b +2: + /* +* Return from MC interrupt. 
+* TODO: Queue up the MCE event so that we can log it later. +*/ + MACHINE_CHECK_HANDLER_WINDUP + rfid +9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP b machine_check_pSeries diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S index e11863f..c4b384d 100644 --- a/arch/powerpc/kernel/idle_power7.S +++ b/arch/powerpc/kernel/idle_power7.S @@ -84,6 +84,7 @@ _GLOBAL(power7_nap) std r9,_MSR(r1) std r1,PACAR1(r13) +_GLOBAL(power7_enter_nap_mode) #ifdef CONFIG_KVM_BOOK3S_64_HV /* Tell KVM we're napping */ li r4,KVM_HWTHREAD_IN_NA
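The two rlwinm instructions in this patch both extract SRR1 bits 46:47 (IBM numbering), the power-saving wakeup state. A plain-C model of that decode and of how the handler above classifies the result: 0 means the thread was not waking from nap, 1 no state loss, 2 supervisor state loss, and anything greater is hypervisor state loss, treated as fatal.

	#include <stdio.h>

	/* C equivalent of "rlwinm. rX,rY,47-31,30,31": SRR1 bits 46:47 */
	static unsigned int srr1_wake_state(unsigned long srr1)
	{
		return (srr1 >> 16) & 3;
	}

	int main(void)
	{
		unsigned long srr1 = 2UL << 16;		/* supervisor state loss */
		unsigned int state = srr1_wake_state(srr1);

		if (state == 0)
			printf("not waking from nap\n");
		else if (state <= 2)
			printf("go back to nap (state %u)\n", state);
		else
			printf("hypervisor state loss: fatal, keep spinning\n");
		return 0;
	}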
[RFC PATCH v5 03/12] powerpc/book3s: handle machine check in Linux host.
From: Mahesh Salgaonkar Move the machine check entry point into Linux. So far we were dependent on firmware to decode the MCE error details and hand over the high level info to the OS. This patch introduces an early machine check routine that saves the MCE information (srr1, srr0, dar and dsisr) to the emergency stack. We allocate a stack frame on the emergency stack and set r1 accordingly. This allows us to be prepared to take another exception without losing context. One thing to note here is that if we get another machine check while the ME bit is off then we risk a checkstop. Hence we restrict ourselves to saving only the MCE information and the registers saved in the PACA_EXMC save area before we turn the ME bit on. We use the paca->in_mce flag to differentiate between a first entry and a nested machine check entry, which helps proper use of the emergency stack. We increment paca->in_mce every time we enter the early machine check handler and decrement it while leaving. When we enter the machine check early handler for the first time (paca->in_mce == 0), we are sure nobody is using the MC emergency stack and allocate a stack frame at the start of the emergency stack. During a subsequent entry (paca->in_mce > 0), we know that r1 points inside the emergency stack and we allocate a separate stack frame accordingly. This prevents us from clobbering MCE information during nested machine checks. The early machine check handler changes are placed under a CPU_FTR_HVMODE section. This makes sure that the early machine check handler will get executed only in the hypervisor kernel. This is the code flow:

		Machine Check Interrupt
			|
			V
		   0x200 vector				ME=0, IR=0, DR=0
			|
			V
	+-----------------------------------------------+
	| machine_check_pSeries_early:			| ME=0, IR=0, DR=0
	|   Alloc frame on emergency stack		|
	|   Save srr1, srr0, dar and dsisr on stack	|
	+-----------------------------------------------+
			|
		(ME=1, IR=0, DR=0, RFID)
			|
			V
	machine_check_handle_early			ME=1, IR=0, DR=0
			|
			V
	+-----------------------------------------------+
	| machine_check_early (r3=pt_regs)		| ME=1, IR=0, DR=0
	|   Things to do: (in next patches)		|
	|	Flush SLB for SLB errors		|
	|	Flush TLB for TLB errors		|
	|	Decode and save MCE info		|
	+-----------------------------------------------+
			|
	(Fall through existing exception handler routine.)
			|
			V
	machine_check_pSeries				ME=1, IR=0, DR=0
			|
		(ME=1, IR=1, DR=1, RFID)
			|
			V
	machine_check_common				ME=1, IR=1, DR=1
			.
			.
			.
Signed-off-by: Mahesh Salgaonkar --- arch/powerpc/kernel/asm-offsets.c|4 + arch/powerpc/kernel/exceptions-64s.S | 111 ++ arch/powerpc/kernel/traps.c | 12 3 files changed, 127 insertions(+) diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 502c7a4..64fe3bf 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -239,6 +239,10 @@ int main(void) DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); #endif /* CONFIG_PPC_STD_MMU_64 */ DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); +#ifdef CONFIG_PPC_BOOK3S_64 + DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp)); + DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce)); +#endif DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id)); DEFINE(PACAKEXECSTATE, offsetof(struct paca_struct, kexec_state)); DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime)); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 3a9ed6a..fffb16f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -156,7 +156,11 @@ machine_check_pSeries_1: HMT_MEDIUM_PPR_DISCARD SET_SCRATCH0(r13) /* save r13 */ EXCEPTION_PROLOG_0(PACA_EXMC) +BEGIN_FTR_SECTION + b machine_check_pSeries_early +FTR_SECTION_ELSE b machine_check_pSeries_0 +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) . = 0x300 .globl data_access_pSeries @@ -405,6 +409,64 @@ denorm_exception_hv: .align 7 /* moved from 0x200 */ +machine_check_pSer
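The paca->in_mce discipline described above can be sketched in user space: the first entry starts at the top of the dedicated MC emergency stack, nested entries carve a new frame below the current one, so earlier MCE state is never clobbered. THREAD_SIZE and the frame size here are illustrative values, not the kernel's.

	#include <stdio.h>

	#define THREAD_SIZE	16384
	#define INT_FRAME_SIZE	448	/* illustrative, not the real value */

	static char mc_emergency_stack[THREAD_SIZE];
	static int in_mce;
	static char *r1;		/* stand-in for the stack pointer */

	static void machine_check_entry(void)
	{
		if (in_mce++ == 0)	/* first entry: top of emergency stack */
			r1 = mc_emergency_stack + THREAD_SIZE - INT_FRAME_SIZE;
		else			/* nested: stack a frame on the current one */
			r1 -= INT_FRAME_SIZE;
		printf("in_mce=%d frame at %p\n", in_mce, (void *)r1);
	}

	static void machine_check_exit(void)
	{
		r1 += INT_FRAME_SIZE;
		in_mce--;
	}

	int main(void)
	{
		machine_check_entry();
		machine_check_entry();	/* nested MCE: earlier frame preserved */
		machine_check_exit();
		machine_check_exit();
		return 0;
	}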
[RFC PATCH v5 01/12] powerpc/book3s: Split the common exception prolog logic into two section.
From: Mahesh Salgaonkar This patch splits the common exception prolog logic into three parts to facilitate reuse of existing code in the next patch. This patch also re-arranges few instructions in such a way that the second part now deals with saving register values from paca save area to stack frame, and the third part deals with saving current register values to stack frame. The second and third part will be reused in the machine check exception routine in the subsequent patch. Please note that this patch does not introduce or change existing code logic. Instead it is just a code movement and instruction re-ordering. Patch Acked-by Paul. But made some minor modification (explained above) to address Paul's comment in the later patch(3). Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras --- arch/powerpc/include/asm/exception-64s.h | 21 ++--- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index cca12f0..306eda9 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -282,9 +282,12 @@ do_kvm_##n: \ beq 4f; /* if from kernel mode */ \ ACCOUNT_CPU_USER_ENTRY(r9, r10); \ SAVE_PPR(area, r9, r10); \ -4: std r2,GPR2(r1);/* save r2 in stackframe*/ \ - SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ - SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe*/ \ +4: EXCEPTION_PROLOG_COMMON_2(area)\ + EXCEPTION_PROLOG_COMMON_3(n) \ + ACCOUNT_STOLEN_TIME + +/* Save original regs values from save area to stack frame. */ +#define EXCEPTION_PROLOG_COMMON_2(area) \ ld r9,area+EX_R9(r13); /* move r9, r10 to stackframe */ \ ld r10,area+EX_R10(r13); \ std r9,GPR9(r1); \ @@ -299,11 +302,16 @@ do_kvm_##n: \ ld r10,area+EX_CFAR(r13); \ std r10,ORIG_GPR3(r1); \ END_FTR_SECTION_NESTED(CPU_FTR_CFAR, CPU_FTR_CFAR, 66);\ + GET_CTR(r10, area);\ + std r10,_CTR(r1); + +#define EXCEPTION_PROLOG_COMMON_3(n) \ + std r2,GPR2(r1);/* save r2 in stackframe*/ \ + SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ + SAVE_2GPRS(7, r1); /* save r7, r8 in stackframe*/ \ mflrr9; /* Get LR, later save to stack */ \ ld r2,PACATOC(r13);/* get kernel TOC into r2 */ \ std r9,_LINK(r1); \ - GET_CTR(r10, area);\ - std r10,_CTR(r1); \ lbz r10,PACASOFTIRQEN(r13);\ mfspr r11,SPRN_XER; /* save XER in stackframe */ \ std r10,SOFTE(r1); \ @@ -313,8 +321,7 @@ do_kvm_##n: \ li r10,0; \ ld r11,exception_marker@toc(r2); \ std r10,RESULT(r1); /* clear regs->result */ \ - std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ - ACCOUNT_STOLEN_TIME + std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ /* * Exception vectors. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH v5 02/12] powerpc/book3s: Introduce exclusive emergency stack for machine check exception.
From: Mahesh Salgaonkar This patch introduces exclusive emergency stack for machine check exception. We use emergency stack to handle machine check exception so that we can save MCE information (srr1, srr0, dar and dsisr) before turning on ME bit and be ready for re-entrancy. This helps us to prevent clobbering of MCE information in case of nested machine checks. The reason for using emergency stack over normal kernel stack is that the machine check might occur in the middle of setting up a stack frame which may result into improper use of kernel stack. Signed-off-by: Mahesh Salgaonkar Acked-by: Paul Mackerras --- arch/powerpc/include/asm/paca.h |9 + arch/powerpc/kernel/setup_64.c | 10 +- arch/powerpc/xmon/xmon.c|4 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index a5954ce..2b98c96 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -152,6 +152,15 @@ struct paca_struct { */ struct opal_machine_check_event *opal_mc_evt; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + /* Exclusive emergency stack pointer for machine check exception. */ + void *mc_emergency_sp; + /* +* Flag to check whether we are in machine check early handler +* and already using emergency stack. +*/ + u16 in_mce; +#endif /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 278ca93..8754e0d 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -546,7 +546,8 @@ static void __init exc_lvl_early_init(void) /* * Stack space used when we detect a bad kernel stack pointer, and - * early in SMP boots before relocation is enabled. + * early in SMP boots before relocation is enabled. Exclusive emergency + * stack for machine checks. */ static void __init emergency_stack_init(void) { @@ -569,6 +570,13 @@ static void __init emergency_stack_init(void) sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); sp += THREAD_SIZE; paca[i].emergency_sp = __va(sp); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* emergency stack for machine check exception handling. */ + sp = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit); + sp += THREAD_SIZE; + paca[i].mc_emergency_sp = __va(sp); +#endif } } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index af9d346..a90731b 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2051,6 +2051,10 @@ static void dump_one_paca(int cpu) DUMP(p, stab_addr, "lx"); #endif DUMP(p, emergency_sp, "p"); +#ifdef CONFIG_PPC_BOOK3S_64 + DUMP(p, mc_emergency_sp, "p"); + DUMP(p, in_mce, "x"); +#endif DUMP(p, data_offset, "lx"); DUMP(p, hw_cpu_id, "x"); DUMP(p, cpu_start, "x"); ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH v5 00/12] Machine check handling in linux host.
Hi, Please find the patch set that performs the machine check handling inside the linux host. The design is to be able to handle re-entrancy, so that we do not clobber the machine check information during a nested machine check interrupt. Patch 2 introduces a separate emergency stack in the paca structure exclusively for machine check exception handling. Patch 3 implements the logic to save the raw MCE info onto the emergency stack and prepares to take another exception. Patches 5 and 6 add CPU-side hooks for the early machine check handler and TLB flush. Patches 7 and 8 are responsible for detecting SLB/TLB errors and flushing them off in real mode. Patch 9 implements the logic to decode and save high level MCE information to a per-cpu buffer without clobbering. Patch 10 implements a mechanism to queue up MCE events in cases where the early handler cannot deliver the event to the host kernel right away. Patch 12 adds the basic error handling to the high level C code with the MMU on. I have tested SLB multihit and MC coming from opal context on powernv. Please review and let me know your comments.

Changes in v5:
- Rebased to v3.12-rc7

Changes in v4:
- Split the prolog common macro in 3 parts in patch 1.
- Save the regs from the EXMC save area to the stack before turning on the ME bit.
- Set/Clear the MSR_RI bit at the right places.
- Handle a situation where a machine check comes in when the thread was in power saving mode.
- Queue up the MCE event and return from the interrupt if the MC is hit during a context which we are not sure of. Go to the kernel in V mode only if we are coming from hypervisor userspace (HV=1, PR=1).

Changes in v3:
- Rebased to v3.11-rc7
- Handle MCE coming from opal context, secondary thread nap and return from interrupt. Queue up the MCE event in this scenario and log it later during the syscall exit path.

Changes in v2:
- Moved the early machine check handling code under a CPU_FTR_HVMODE section. This makes sure that the early machine check handler will get executed only in the hypervisor kernel.
- Add a dedicated emergency stack for machine check so that we don't end up disturbing others who use the same emergency stack.
- Fixed the machine check early handler where it used to assume that r1 always contains a valid stack pointer.
- Fixed an issue where the per-cpu mce_nest_count variable underflows when kvm fails to handle an MC error and exits the guest.
- Fixed the code to restore r13 before exiting the early handler.

Thanks, -Mahesh. --- Mahesh Salgaonkar (12):
  powerpc/book3s: Split the common exception prolog logic into two section.
  powerpc/book3s: Introduce exclusive emergency stack for machine check exception.
  powerpc/book3s: handle machine check in Linux host.
  powerpc/book3s: Return from interrupt if coming from evil context.
  powerpc/book3s: Introduce a early machine check hook in cpu_spec.
  powerpc/book3s: Add flush_tlb operation in cpu_spec.
  powerpc/book3s: Flush SLB/TLBs if we get SLB/TLB machine check errors on power7.
  powerpc/book3s: Flush SLB/TLBs if we get SLB/TLB machine check errors on power8.
  powerpc/book3s: Decode and save machine check event.
  powerpc/book3s: Queue up and process delayed MCE events.
  powerpc/powernv: Remove machine check handling in OPAL.
  powerpc/powernv: Machine check exception handling.
arch/powerpc/include/asm/bitops.h|5 arch/powerpc/include/asm/cputable.h | 12 + arch/powerpc/include/asm/exception-64s.h | 21 +- arch/powerpc/include/asm/mce.h | 198 + arch/powerpc/include/asm/paca.h |9 + arch/powerpc/kernel/Makefile |1 arch/powerpc/kernel/asm-offsets.c|4 arch/powerpc/kernel/cpu_setup_power.S| 38 ++- arch/powerpc/kernel/cputable.c | 16 + arch/powerpc/kernel/entry_64.S |5 arch/powerpc/kernel/exceptions-64s.S | 196 + arch/powerpc/kernel/idle_power7.S|1 arch/powerpc/kernel/mce.c| 345 ++ arch/powerpc/kernel/mce_power.c | 284 + arch/powerpc/kernel/setup_64.c | 10 + arch/powerpc/kernel/traps.c | 15 + arch/powerpc/kvm/book3s_hv_ras.c | 50 ++-- arch/powerpc/platforms/powernv/opal.c| 161 -- arch/powerpc/xmon/xmon.c |4 19 files changed, 1220 insertions(+), 155 deletions(-) create mode 100644 arch/powerpc/include/asm/mce.h create mode 100644 arch/powerpc/kernel/mce.c create mode 100644 arch/powerpc/kernel/mce_power.c -- -Mahesh
[PATCH 2/3] powerpc/nvram: fix endian issue when reading the NVRAM size
From: Cedric Le Goater Signed-off-by: Cedric Le Goater --- arch/powerpc/platforms/chrp/nvram.c|4 ++-- arch/powerpc/platforms/pseries/nvram.c |4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/chrp/nvram.c b/arch/powerpc/platforms/chrp/nvram.c index d3ceff0..9ef8cc3 100644 --- a/arch/powerpc/platforms/chrp/nvram.c +++ b/arch/powerpc/platforms/chrp/nvram.c @@ -66,7 +66,7 @@ static void chrp_nvram_write(int addr, unsigned char val) void __init chrp_nvram_init(void) { struct device_node *nvram; - const unsigned int *nbytes_p; + const __be32 *nbytes_p; unsigned int proplen; nvram = of_find_node_by_type(NULL, "nvram"); @@ -79,7 +79,7 @@ void __init chrp_nvram_init(void) return; } - nvram_size = *nbytes_p; + nvram_size = be32_to_cpup(nbytes_p); printk(KERN_INFO "CHRP nvram contains %u bytes\n", nvram_size); of_node_put(nvram); diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 2498d4d..057fc89 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -804,7 +804,7 @@ machine_arch_initcall(pseries, pseries_nvram_init_log_partitions); int __init pSeries_nvram_init(void) { struct device_node *nvram; - const unsigned int *nbytes_p; + const __be32 *nbytes_p; unsigned int proplen; nvram = of_find_node_by_type(NULL, "nvram"); @@ -817,7 +817,7 @@ int __init pSeries_nvram_init(void) return -EIO; } - nvram_size = *nbytes_p; + nvram_size = be32_to_cpup(nbytes_p); nvram_fetch = rtas_token("nvram-fetch"); nvram_store = rtas_token("nvram-store"); -- 1.7.10.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
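The bug this fixes only bites on little-endian hosts: the "#bytes" property is stored big-endian in the device tree, so dereferencing it as a plain unsigned int happens to work on big-endian kernels and silently breaks elsewhere. A user-space illustration of the correct read, using glibc's endian.h helpers in place of the kernel's be32_to_cpup:

	#include <endian.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* 32 KiB (0x00008000) laid out big-endian, as in the property */
		uint8_t prop[4] = { 0x00, 0x00, 0x80, 0x00 };
		uint32_t raw, nvram_size;

		memcpy(&raw, prop, sizeof(raw));	/* avoid aliasing issues */
		nvram_size = be32toh(raw);

		/* prints 32768 regardless of host endianness */
		printf("nvram size: %u bytes\n", (unsigned)nvram_size);
		return 0;
	}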
[PATCH 3/3] powerpc/nvram: fix endian issue when using the partition length
From: Cedric Le Goater When reading partitions, the length has to be translated from big endian to the endian order of the host. Similarly, when writing partitions, the length needs to be in big endian order. The userspace tool 'nvram' needs a similar fix as it is reading and writing partitions through /dev/nvram : http://sourceforge.net/p/powerpc-utils/mailman/message/31571277/ Signed-off-by: Cedric Le Goater --- arch/powerpc/kernel/nvram_64.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 8213ee1..fd82c28 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -223,9 +223,13 @@ static int __init nvram_write_header(struct nvram_partition * part) { loff_t tmp_index; int rc; - + struct nvram_header phead; + + memcpy(&phead, &part->header, NVRAM_HEADER_LEN); + phead.length = cpu_to_be16(phead.length); + tmp_index = part->index; - rc = ppc_md.nvram_write((char *)&part->header, NVRAM_HEADER_LEN, &tmp_index); + rc = ppc_md.nvram_write((char *)&phead, NVRAM_HEADER_LEN, &tmp_index); return rc; } @@ -505,6 +509,8 @@ int __init nvram_scan_partitions(void) memcpy(&phead, header, NVRAM_HEADER_LEN); + phead.length = be16_to_cpu(phead.length); + err = 0; c_sum = nvram_checksum(&phead); if (c_sum != phead.checksum) { -- 1.7.10.4
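Note the pattern in nvram_write_header() above: the header is copied into a scratch struct and only the copy is byte-swapped, so the in-memory partition list stays native-endian. A user-space sketch of the same idea; the field layout is illustrative, not the kernel's nvram_header:

	#include <endian.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	struct nvram_header {		/* illustrative layout only */
		char name[12];
		uint16_t length;	/* kept native-endian while in memory */
		uint16_t checksum;
	};

	static void write_header(const struct nvram_header *part)
	{
		struct nvram_header phead;

		memcpy(&phead, part, sizeof(phead));
		phead.length = htobe16(phead.length);	/* swap the copy only */
		/* ...the kernel would now ppc_md.nvram_write(&phead, ...)... */
		printf("on-media length field: 0x%04x\n", phead.length);
	}

	int main(void)
	{
		struct nvram_header h = { "oops-log", 16, 0 };

		write_header(&h);
		printf("in-memory length still: %u\n", h.length);	/* 16 */
		return 0;
	}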
[PATCH 0/3] nvram fixes
Hi, Here are a few fixes for nvram, mostly endian issues. The patches are based on 3.12-rc7. Cedric Le Goater (3): powerpc/nvram: scan partitions only once powerpc/nvram: fix endian issue when reading the NVRAM size powerpc/nvram: fix endian issue when reading the partition length arch/powerpc/kernel/nvram_64.c | 10 -- arch/powerpc/platforms/chrp/nvram.c|4 ++-- arch/powerpc/platforms/pseries/nvram.c | 10 +- 3 files changed, 15 insertions(+), 9 deletions(-) Cheers, C. -- 1.7.10.4
[PATCH 1/3] powerpc/nvram: scan partitions only once
From: Cedric Le Goater nvram_scan_partitions() is called twice when initializing the "lnx,oops-log" partition and the "ibm,rtas-log" partition. This fills the partition list with duplicate entries. This patch moves the partition scan in the init routine pseries_nvram_init_log_partitions() which is called only once. Signed-off-by: Cedric Le Goater --- arch/powerpc/platforms/pseries/nvram.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index d276cd3..2498d4d 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -429,9 +429,6 @@ static int __init pseries_nvram_init_os_partition(struct nvram_os_partition loff_t p; int size; - /* Scan nvram for partitions */ - nvram_scan_partitions(); - /* Look for ours */ p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size); @@ -795,6 +792,9 @@ static int __init pseries_nvram_init_log_partitions(void) { int rc; + /* Scan nvram for partitions */ + nvram_scan_partitions(); + rc = pseries_nvram_init_os_partition(&rtas_log_partition); nvram_init_oops_partition(rc == 0); return 0; -- 1.7.10.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: perf events ring buffer memory barrier on powerpc
"Paul E. McKenney" wrote on 10/30/2013 11:27:25 AM: > If you were to back up that insistence with a description of the orderings > you are relying on, why other orderings are not important, and how the > important orderings are enforced, I might be tempted to pay attention > to your opinion. > > Thanx, Paul NP, though, I feel too embarrassed to explain things about memory barriers when one of the authors of Documentation/memory-barriers.txt is on cc: list ;-) Disclaimer: it is anyway impossible to prove lack of *any* problem. Having said that, lets look into an example in Documentation/circular-buffers.txt: > THE PRODUCER > > > The producer will look something like this: > > spin_lock(&producer_lock); > > unsigned long head = buffer->head; > unsigned long tail = ACCESS_ONCE(buffer->tail); > > if (CIRC_SPACE(head, tail, buffer->size) >= 1) { > /* insert one item into the buffer */ > struct item *item = buffer[head]; > > produce_item(item); > > smp_wmb(); /* commit the item before incrementing the head */ > > buffer->head = (head + 1) & (buffer->size - 1); > > /* wake_up() will make sure that the head is committed before >* waking anyone up */ > wake_up(consumer); > } > > spin_unlock(&producer_lock); We can see that authors of the document didn't put any memory barrier after "buffer->tail" read and before "produce_item(item)" and I think they have a good reason. Lets consider an imaginary smp_mb() right before "produce_item(item);". Such a barrier will ensure that - - the memory read on "buffer->tail" is completed before store to memory pointed by "item" is committed. However, the store to "buffer->tail" anyway cannot be completed before conditional branch implied by "if ()" is proven to execute body statement of the if(). And the latter cannot be proven before read of "buffer->tail" is completed. Lets see this other way. Lets imagine that somehow a store to the data pointed by "item" is completed before we read "buffer->tail". That would mean, that the store was completed speculatively. But speculative execution of conditional stores is prohibited by C/C++ standard, otherwise any conditional store at any random place of code could pollute shared memory. On the other hand, if compiler or processor can prove that condition in above if() is going to be true (or if speculative store writes the same value as it was before write), the speculative store *is* allowed. In this case we should not be bothered by the fact that memory pointed by "item" is written before a read from "buffer->tail" is completed. Regards, -- Victor ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 11:48:44AM +, James Hogan wrote: > Hi Peter, > > On 30/10/13 10:42, Peter Zijlstra wrote: > > Subject: perf, tool: Add required memory barriers > > > > To match patch bf378d341e48 ("perf: Fix perf ring buffer memory > > ordering") change userspace to also adhere to the ordering outlined. > > > > Most barrier implementations were gleaned from > > arch/*/include/asm/barrier.h and with the exception of metag I'm fairly > > sure they're correct. > > Yeh... > > Short answer: > For Meta you're probably best off assuming > CONFIG_METAG_SMP_WRITE_REORDERING=n and just using compiler barriers. Thanks, fixed it that way. > Long answer: > The issue with write reordering between Meta's hardware threads beyond > the cache is only with a particular SoC, and SMP is not used in > production on it. > It is possible to make the LINSYSEVENT_WR_COMBINE_FLUSH register > writable to userspace (it's in a non-mappable region already) but even > then the write to that register needs odd placement to be effective > (before the shmem write rather than after - which isn't a place any > existing barriers are guaranteed to be placed). I'm fairly confident we > get away with it in the kernel, and userland normally just uses linked > load/store instructions for atomicity which works fine. Urgh.. sounds like way 'fun' for you ;-) ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: perf events ring buffer memory barrier on powerpc
Hi Peter, On 30/10/13 10:42, Peter Zijlstra wrote: > Subject: perf, tool: Add required memory barriers > > To match patch bf378d341e48 ("perf: Fix perf ring buffer memory > ordering") change userspace to also adhere to the ordering outlined. > > Most barrier implementations were gleaned from > arch/*/include/asm/barrier.h and with the exception of metag I'm fairly > sure they're correct. Yeh... Short answer: For Meta you're probably best off assuming CONFIG_METAG_SMP_WRITE_REORDERING=n and just using compiler barriers. Long answer: The issue with write reordering between Meta's hardware threads beyond the cache is only with a particular SoC, and SMP is not used in production on it. It is possible to make the LINSYSEVENT_WR_COMBINE_FLUSH register writable to userspace (it's in a non-mappable region already) but even then the write to that register needs odd placement to be effective (before the shmem write rather than after - which isn't a place any existing barriers are guaranteed to be placed). I'm fairly confident we get away with it in the kernel, and userland normally just uses linked load/store instructions for atomicity which works fine. Cheers James ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: perf events ring buffer memory barrier on powerpc
On Wed, Oct 30, 2013 at 02:27:25AM -0700, Paul E. McKenney wrote: > On Mon, Oct 28, 2013 at 10:58:58PM +0200, Victor Kaplansky wrote: > > Oleg Nesterov wrote on 10/28/2013 10:17:35 PM: > > > > > mb(); // : do we really need it? I think yes. > > > > Oh, it is hard to argue with feelings. Also, it is easy to be on > > conservative side and put the barrier here just in case. > > But I still insist that the barrier is redundant in your example. > > If you were to back up that insistence with a description of the orderings > you are relying on, why other orderings are not important, and how the > important orderings are enforced, I might be tempted to pay attention > to your opinion. OK, so let me try.. a slightly less convoluted version of the code in kernel/events/ring_buffer.c coupled with a userspace consumer would look something like the below. One important detail is that the kbuf part and the kbuf_writer() are strictly per cpu and we can thus rely on implicit ordering for those. Only the userspace consumer can possibly run on another cpu, and thus we need to ensure data consistency for those. struct buffer { u64 size; u64 tail; u64 head; void *data; }; struct buffer *kbuf, *ubuf; /* * Determine there's space in the buffer to store data at @offset to * @head without overwriting data at @tail. */ bool space(u64 tail, u64 offset, u64 head) { offset = (offset - tail) % kbuf->size; head = (head - tail) % kbuf->size; return (s64)(head - offset) >= 0; } /* * If there's space in the buffer; store the data @buf; otherwise * discard it. */ void kbuf_write(int sz, void *buf) { u64 tail = ACCESS_ONCE(ubuf->tail); /* last location userspace read */ u64 offset = kbuf->head; /* we already know where we last wrote */ u64 head = offset + sz; if (!space(tail, offset, head)) { /* discard @buf */ return; } /* * Ensure that if we see the userspace tail (ubuf->tail) such * that there is space to write @buf without overwriting data * userspace hasn't seen yet, we won't in fact store data before * that read completes. */ smp_mb(); /* A, matches with D */ write(kbuf->data + offset, buf, sz); kbuf->head = head % kbuf->size; /* * Ensure that we write all the @buf data before we update the * userspace visible ubuf->head pointer. */ smp_wmb(); /* B, matches with C */ ubuf->head = kbuf->head; } /* * Consume the buffer data and update the tail pointer to indicate to * kernel space there's 'free' space. */ void ubuf_read(void) { u64 head, tail; tail = ACCESS_ONCE(ubuf->tail); head = ACCESS_ONCE(ubuf->head); /* * Ensure we read the buffer boundaries before the actual buffer * data... */ smp_rmb(); /* C, matches with B */ while (tail != head) { obj = ubuf->data + tail; /* process obj */ tail += obj->size; tail %= ubuf->size; } /* * Ensure all data reads are complete before we issue the * ubuf->tail update; once that update hits, kbuf_write() can * observe and overwrite data. */ smp_mb(); /* D, matches with A */ ubuf->tail = tail; } Now the whole crux of the question is if we need barrier A at all, since the STORES issued by the @buf writes are dependent on the ubuf->tail read. If the read shows no available space, we simply will not issue those writes -- therefore we could argue we can avoid the memory barrier. However, that leaves D unpaired and me confused. We must have D because otherwise the CPU could reorder that write into the reads previous and the kernel could start overwriting data we're still reading.. which seems like a bad deal. 
Also, I'm not entirely sure on C, that too seems like a dependency, we simply cannot read the buffer @tail before we've read the tail itself, now can we? Similarly we cannot compare tail to head without having the head read completed. Could we replace A and C with an smp_read_barrier_depends()?
Re: perf events ring buffer memory barrier on powerpc
On Tue, Oct 29, 2013 at 03:27:05PM -0400, Vince Weaver wrote: > On Tue, 29 Oct 2013, Peter Zijlstra wrote: > > > On Tue, Oct 29, 2013 at 11:21:31AM +0100, Peter Zijlstra wrote: > > --- linux-2.6.orig/include/uapi/linux/perf_event.h > > +++ linux-2.6/include/uapi/linux/perf_event.h > > @@ -479,13 +479,15 @@ struct perf_event_mmap_page { > > /* > > * Control data for the mmap() data buffer. > > * > > -* User-space reading the @data_head value should issue an rmb(), on > > -* SMP capable platforms, after reading this value -- see > > -* perf_event_wakeup(). > > +* User-space reading the @data_head value should issue an smp_rmb(), > > +* after reading this value. > > so where's the patch fixing perf to use the new recommendations? Fair enough, thanks for reminding me about that. See below. > Is this purely a performance thing or a correctness change? Correctness, although I suppose on most archs you'd be hard pushed to notice. > A change like this a bit of a pain, especially as userspace doesn't really > have nice access to smb_mb() defines so a lot of cut-and-pasting will > ensue for everyone who's trying to parse the mmap buffer. Agreed; we should maybe push for a user visible asm/barrier.h or so. --- Subject: perf, tool: Add required memory barriers To match patch bf378d341e48 ("perf: Fix perf ring buffer memory ordering") change userspace to also adhere to the ordering outlined. Most barrier implementations were gleaned from arch/*/include/asm/barrier.h and with the exception of metag I'm fairly sure they're correct. Cc: James Hogan Signed-off-by: Peter Zijlstra --- tools/perf/perf.h| 39 +-- tools/perf/util/evlist.h | 2 +- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index f61c230beec4..1b8a0a2a63d4 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -4,6 +4,8 @@ #include #if defined(__i386__) +#define mb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory") +#define wmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory") #define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory") #define cpu_relax()asm volatile("rep; nop" ::: "memory"); #define CPUINFO_PROC "model name" @@ -13,6 +15,8 @@ #endif #if defined(__x86_64__) +#define mb() asm volatile("mfence" ::: "memory") +#define wmb() asm volatile("sfence" ::: "memory") #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax()asm volatile("rep; nop" ::: "memory"); #define CPUINFO_PROC "model name" @@ -23,20 +27,28 @@ #ifdef __powerpc__ #include "../../arch/powerpc/include/uapi/asm/unistd.h" +#define mb() asm volatile ("sync" ::: "memory") +#define wmb() asm volatile ("sync" ::: "memory") #define rmb() asm volatile ("sync" ::: "memory") #define cpu_relax()asm volatile ("" ::: "memory"); #define CPUINFO_PROC "cpu" #endif #ifdef __s390__ +#define mb() asm volatile("bcr 15,0" ::: "memory") +#define wmb() asm volatile("bcr 15,0" ::: "memory") #define rmb() asm volatile("bcr 15,0" ::: "memory") #define cpu_relax()asm volatile("" ::: "memory"); #endif #ifdef __sh__ #if defined(__SH4A__) || defined(__SH5__) +# define mb() asm volatile("synco" ::: "memory") +# define wmb() asm volatile("synco" ::: "memory") # define rmb() asm volatile("synco" ::: "memory") #else +# define mb() asm volatile("" ::: "memory") +# define wmb() asm volatile("" ::: "memory") # define rmb() asm volatile("" ::: "memory") #endif #define cpu_relax()asm volatile("" ::: "memory") @@ -44,24 +56,38 @@ #endif #ifdef __hppa__ +#define mb() asm volatile("" ::: "memory") +#define wmb() asm volatile("" ::: 
"memory") #define rmb() asm volatile("" ::: "memory") #define cpu_relax()asm volatile("" ::: "memory"); #define CPUINFO_PROC "cpu" #endif #ifdef __sparc__ +#ifdef __LP64__ +#define mb() asm volatile("ba,pt %%xcc, 1f\n"\ +"membar #StoreLoad\n" \ +"1:\n"":::"memory") +#else +#define mb() asm volatile("":::"memory") +#endif +#define wmb() asm volatile("":::"memory") #define rmb() asm volatile("":::"memory") #define cpu_relax()asm volatile("":::"memory") #define CPUINFO_PROC "cpu" #endif #ifdef __alpha__ +#define mb() asm volatile("mb" ::: "memory") +#define wmb() asm volatile("wmb" ::: "memory") #define rmb() asm volatile("mb" ::: "memory") #define cpu_relax()asm volatile("" ::: "memory") #define CPUINFO_PROC "cpu model" #endif #ifdef __ia64__ +#define mb() asm volatile ("mf" ::: "memory") +#define wmb() asm volatile ("mf" ::: "memory") #define rmb() asm volat
Re: [PATCH V2 2/2] sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus
Hi Kamalesh, On 10/30/2013 02:53 PM, Kamalesh Babulal wrote: > Hi Preeti, > >> nr_busy_cpus parameter is used by nohz_kick_needed() to find out the number >> of busy cpus in a sched domain which has SD_SHARE_PKG_RESOURCES flag set. >> Therefore instead of updating nr_busy_cpus at every level of sched domain, >> since it is irrelevant, we can update this parameter only at the parent >> domain of the sd which has this flag set. Introduce a per-cpu parameter >> sd_busy which represents this parent domain. >> >> In nohz_kick_needed() we directly query the nr_busy_cpus parameter >> associated with the groups of sd_busy. >> >> By associating sd_busy with the highest domain which has >> SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which could >> have this flag set and trigger nohz_idle_balancing if any of the levels have >> more than one busy cpu. >> >> sd_busy is irrelevant for asymmetric load balancing. However sd_asym has been >> introduced to represent the highest sched domain which has SD_ASYM_PACKING >> flag set >> so that it can be queried directly when required. >> >> While we are at it, we might as well change the nohz_idle parameter to be >> updated at the sd_busy domain level alone and not the base domain level of a >> CPU. >> This will unify the concept of busy cpus at just one level of sched domain >> where it is currently used. >> >> Signed-off-by: Preeti U Murthy >> --- >> kernel/sched/core.c |6 ++ >> kernel/sched/fair.c | 38 -- >> kernel/sched/sched.h |2 ++ >> 3 files changed, 28 insertions(+), 18 deletions(-) >> >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c >> index c06b8d3..e6a6244 100644 >> --- a/kernel/sched/core.c >> +++ b/kernel/sched/core.c >> @@ -5271,6 +5271,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc); >> DEFINE_PER_CPU(int, sd_llc_size); >> DEFINE_PER_CPU(int, sd_llc_id); >> DEFINE_PER_CPU(struct sched_domain *, sd_numa); >> +DEFINE_PER_CPU(struct sched_domain *, sd_busy); >> +DEFINE_PER_CPU(struct sched_domain *, sd_asym); >> >> static void update_top_cache_domain(int cpu) >> { >> @@ -5282,6 +5284,7 @@ static void update_top_cache_domain(int cpu) >> if (sd) { >> id = cpumask_first(sched_domain_span(sd)); >> size = cpumask_weight(sched_domain_span(sd)); >> +rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); >> } > > > consider a machine with single socket, dual core with HT enabled. The top most > domain is also the highest domain with SD_SHARE_PKG_RESOURCES flag set, > i.e MC domain (the machine toplogy consist of SIBLING and MC domain). > > # lstopo-no-graphics --no-bridges --no-io > Machine (7869MB) + Socket L#0 + L3 L#0 (3072KB) > L2 L#0 (256KB) + L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0 > PU L#0 (P#0) > PU L#1 (P#1) > L2 L#1 (256KB) + L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1 > PU L#2 (P#2) > PU L#3 (P#3) > > With this approach parent of MC domain is NULL and given that sd_busy is NULL, > nr_busy_cpus of sched domain sd_busy will never be incremented/decremented. > Resulting is nohz_kick_needed returning 0. Right and it *should* return 0. There is no sibling domain that can offload tasks from it. Therefore there is no point kicking nohz idle balance. Regards Preeti U Murthy > > Thanks, > Kamalesh. > ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[b42...@freescale.com: Re: [PATCH 1/3] dma: imx-sdma: Add ssi dual fifo script support]
Just found that I missed Sascha's mail address in the TO list of my last
reply. So resending it, and sorry for the duplicated mails.

- Forwarded message from Nicolin Chen -

Date: Wed, 30 Oct 2013 12:48:48 +0800
From: Nicolin Chen
Subject: Re: [PATCH 1/3] dma: imx-sdma: Add ssi dual fifo script support
User-Agent: Mutt/1.5.21 (2010-09-15)

Hi Sascha,

On Tue, Oct 29, 2013 at 02:51:43PM +0100, Sascha Hauer wrote:
> Look at drivers/dma/imx-sdma.c:
>
> > /**
> >  * struct sdma_firmware_header - Layout of the firmware image
> >  *
> >  * @magic		"SDMA"
> >  * @version_major	increased whenever layout of struct
> >  *			sdma_script_start_addrs changes.
>
> Can you imagine why this firmware has a version field? Right, it's because
> it encodes the layout of struct sdma_script_start_addrs.
>
> As the comment clearly states, you have to *increase this field* when you
> add scripts.
>
> Obviously you missed that, as the firmware posted recently on lkml
> shows:
>
> > 0000: 414d4453 0001 0001 001c  SDMA
>
> Still '1'
>
> > 0010: 0026 00b4 067a 0282  &...z...
> > 0020:
> > 0030:
> > 0040: 1a6a  j...
> > 0050: 02eb 18bb 0408
> > 0060: 03c0
> > 0070: 02ab 037b  {...
> > 0080: 044c 046e  L...n...
> > 0090: 1800
> > 00a0: 1800 1862 1a16  b...
>           ^
>           new script addresses introduced
>
> > -#define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1	34
> > +#define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1	37
>
> And no, this is not a bug. It's your firmware header that is buggy.

I wasn't aware that the problem is far more complicated than I thought.
And thank you for telling me all this.

> What you need is:
>
> #define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V2	37
>
> You (you as a company, not you as a person) knew that it was me who
> created this firmware format. So it was absolutely unnecessary to create
> an incompatible firmware instead of dropping me a short note.
>
> Please add a version check to the driver as necessary and provide a proper
> firmware.

Currently it's not easy for me to create a new proper firmware, and I've
been told that besides this version number, it also lacks decent license
info. So may I just refine this patch as you suggested, adding a version
check and those new scripts first?

Thank you,
Nicolin Chen

> Sascha
>
> --
> Pengutronix e.K.                           |                             |
> Industrial Linux Solutions                 | http://www.pengutronix.de/  |
> Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0    |
> Amtsgericht Hildesheim, HRA 2686           | Fax:   +49-5121-206917-     |

- End forwarded message -
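For illustration, a minimal sketch of the version check Sascha asks for, assuming the sdma_firmware_header layout quoted above from drivers/dma/imx-sdma.c; the helper name sdma_script_array_size() is hypothetical, and this is not necessarily the fix that was merged.

	/*
	 * Sketch: select the script-address array size from the firmware
	 * header's version_major field instead of hardcoding one size.
	 */
	#define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1	34
	#define SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V2	37

	static int sdma_script_array_size(const struct sdma_firmware_header *header)
	{
		switch (header->version_major) {
		case 1:
			return SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1;	/* original layout */
		case 2:
			return SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V2;	/* adds dual-FIFO scripts */
		default:
			return -EINVAL;	/* unknown layout: reject the firmware */
		}
	}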
Re: perf events ring buffer memory barrier on powerpc
On Mon, Oct 28, 2013 at 10:58:58PM +0200, Victor Kaplansky wrote:
> Oleg Nesterov wrote on 10/28/2013 10:17:35 PM:
>
> > 	mb();	// : do we really need it? I think yes.
>
> Oh, it is hard to argue with feelings. Also, it is easy to be on the
> conservative side and put the barrier here just in case.
> But I still insist that the barrier is redundant in your example.

If you were to back up that insistence with a description of the orderings
you are relying on, why other orderings are not important, and how the
important orderings are enforced, I might be tempted to pay attention to
your opinion.

							Thanx, Paul
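The orderings at stake are those between the kernel writing records and publishing data_head, and userspace consuming records and writing data_tail back. A sketch of that protocol follows; the struct, the x86-flavored barrier defines, and the copy_record()/read_records() helpers are stand-ins for illustration, not kernel source.

	#include <stdint.h>

	/* Stand-in for the perf mmap control page (illustrative only). */
	struct perf_page {
		volatile uint64_t data_head;	/* written by the kernel  */
		volatile uint64_t data_tail;	/* written by userspace   */
	};

	/* x86-flavored barrier stand-ins, assumptions for this sketch. */
	#define smp_wmb()	__asm__ __volatile__("" ::: "memory")
	#define smp_rmb()	__asm__ __volatile__("" ::: "memory")
	#define smp_mb()	__asm__ __volatile__("mfence" ::: "memory")

	void copy_record(void);				/* hypothetical */
	void read_records(uint64_t tail, uint64_t head);	/* hypothetical */

	/* kernel side (producer): write the record, then publish the head */
	void producer(struct perf_page *pg, uint64_t new_head)
	{
		copy_record();			/* A: write the data       */
		smp_wmb();			/* order A before B        */
		pg->data_head = new_head;	/* B: publish the new head */
	}

	/* userspace side (consumer): read up to head, then free the space */
	void consumer(struct perf_page *pg)
	{
		uint64_t head = pg->data_head;		/* C: read the head    */
		smp_rmb();				/* order C before D    */
		read_records(pg->data_tail, head);	/* D: read the records */
		smp_mb();	/* order D before E: the disputed barrier      */
		pg->data_tail = head;			/* E: free the space   */
	}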
Re: [PATCH V2 2/2] sched: Remove un-necessary iteration over sched domains to update nr_busy_cpus
Hi Preeti,

> nr_busy_cpus parameter is used by nohz_kick_needed() to find out the number
> of busy cpus in a sched domain which has SD_SHARE_PKG_RESOURCES flag set.
> Therefore instead of updating nr_busy_cpus at every level of sched domain,
> since it is irrelevant, we can update this parameter only at the parent
> domain of the sd which has this flag set. Introduce a per-cpu parameter
> sd_busy which represents this parent domain.
>
> In nohz_kick_needed() we directly query the nr_busy_cpus parameter
> associated with the groups of sd_busy.
>
> By associating sd_busy with the highest domain which has
> SD_SHARE_PKG_RESOURCES flag set, we cover all lower level domains which
> could have this flag set and trigger nohz_idle_balancing if any of the
> levels have more than one busy cpu.
>
> sd_busy is irrelevant for asymmetric load balancing. However sd_asym has
> been introduced to represent the highest sched domain which has
> SD_ASYM_PACKING flag set so that it can be queried directly when required.
>
> While we are at it, we might as well change the nohz_idle parameter to be
> updated at the sd_busy domain level alone and not the base domain level of
> a CPU. This will unify the concept of busy cpus at just one level of sched
> domain where it is currently used.
>
> Signed-off-by: Preeti U Murthy
> ---
>  kernel/sched/core.c  |  6 ++
>  kernel/sched/fair.c  | 38 --
>  kernel/sched/sched.h |  2 ++
>  3 files changed, 28 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c06b8d3..e6a6244 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5271,6 +5271,8 @@ DEFINE_PER_CPU(struct sched_domain *, sd_llc);
>  DEFINE_PER_CPU(int, sd_llc_size);
>  DEFINE_PER_CPU(int, sd_llc_id);
>  DEFINE_PER_CPU(struct sched_domain *, sd_numa);
> +DEFINE_PER_CPU(struct sched_domain *, sd_busy);
> +DEFINE_PER_CPU(struct sched_domain *, sd_asym);
>
>  static void update_top_cache_domain(int cpu)
>  {
> @@ -5282,6 +5284,7 @@ static void update_top_cache_domain(int cpu)
>  	if (sd) {
>  		id = cpumask_first(sched_domain_span(sd));
>  		size = cpumask_weight(sched_domain_span(sd));
> +		rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
>  	}

Consider a machine with a single socket, dual core with HT enabled. The
topmost domain is also the highest domain with the SD_SHARE_PKG_RESOURCES
flag set, i.e. the MC domain (the machine topology consists of the SIBLING
and MC domains).

# lstopo-no-graphics --no-bridges --no-io
Machine (7869MB) + Socket L#0 + L3 L#0 (3072KB)
  L2 L#0 (256KB) + L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
    PU L#0 (P#0)
    PU L#1 (P#1)
  L2 L#1 (256KB) + L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
    PU L#2 (P#2)
    PU L#3 (P#3)

With this approach the parent of the MC domain is NULL, and given that
sd_busy is NULL, nr_busy_cpus of sched domain sd_busy will never be
incremented/decremented. The result is nohz_kick_needed() returning 0.

Thanks,
Kamalesh.
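To make the observation concrete, here is a sketch of where nr_busy_cpus is updated after the patch, paraphrased from the quoted diff rather than quoted verbatim; on the topology above, the sd_busy lookup returns NULL and the function bails out before ever touching the counter.

	/*
	 * Sketch of the busy-state update through sd_busy, paraphrased
	 * from the patch under review.
	 */
	static void set_cpu_sd_state_busy(void)
	{
		struct sched_domain *sd;
		int cpu = smp_processor_id();

		rcu_read_lock();
		sd = rcu_dereference(per_cpu(sd_busy, cpu));

		/*
		 * On the single-socket topology above, MC is topmost, so
		 * sd->parent -- and hence sd_busy -- is NULL: the count
		 * below is never incremented.
		 */
		if (!sd || !sd->nohz_idle)
			goto unlock;
		sd->nohz_idle = 0;

		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
	unlock:
		rcu_read_unlock();
	}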