Re: [linux-sunxi] [PATCH 1/5] spi: sunxi: fix transfer timeout
Hi Michal, On Fri, May 27, 2016 at 5:25 AM, Michal Suchanekwrote: > The trasfer timeout is fixed at 1000 ms. Reading a 4Mbyte flash over > 1MHz SPI bus takes way longer than that. Calculate the timeout from the > actual time the transfer is supposed to take and multiply by 2 for good > measure. > > Signed-off-by: Michal Suchanek > --- > > v2: > - fix build error > - use unsigned instead of int in max_t > - use tfr->speed_hz instead of sspi->max_speed_hz > --- > drivers/spi/spi-sun4i.c | 11 ++- > drivers/spi/spi-sun6i.c | 11 ++- > 2 files changed, 20 insertions(+), 2 deletions(-) > > diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c > index 1ddd9e2..fe63bbd 100644 > --- a/drivers/spi/spi-sun4i.c > +++ b/drivers/spi/spi-sun4i.c > @@ -173,6 +173,7 @@ static int sun4i_spi_transfer_one(struct spi_master > *master, > { > struct sun4i_spi *sspi = spi_master_get_devdata(master); > unsigned int mclk_rate, div, timeout; > + unsigned int start, end, tx_time; > unsigned int tx_len = 0; > int ret = 0; > u32 reg; > @@ -279,9 +280,17 @@ static int sun4i_spi_transfer_one(struct spi_master > *master, > reg = sun4i_spi_read(sspi, SUN4I_CTL_REG); > sun4i_spi_write(sspi, SUN4I_CTL_REG, reg | SUN4I_CTL_XCH); > > + tx_time = max_t(unsigned, tfr->len * 8 * 2 / (tfr->speed_hz / 1000), You should probably use "unsigned int" instead of just "unsigned" here. > + 100); > + start = jiffies; > timeout = wait_for_completion_timeout(>done, > - msecs_to_jiffies(1000)); > + msecs_to_jiffies(tx_time)); > + end = jiffies; > if (!timeout) { > + dev_warn(>dev, > +"%s: timeout transferring %u bytes@%iHz for > %i(%i)ms", > +dev_name(>dev), tfr->len, tfr->speed_hz, > +jiffies_to_msecs(end - start), tx_time); Should the debug changes be in a separate patch? > ret = -ETIMEDOUT; > goto out; > } > diff --git a/drivers/spi/spi-sun6i.c b/drivers/spi/spi-sun6i.c > index 42e2c4b..8be5c5c 100644 > --- a/drivers/spi/spi-sun6i.c > +++ b/drivers/spi/spi-sun6i.c > @@ -162,6 +162,7 @@ static int sun6i_spi_transfer_one(struct spi_master > *master, > unsigned int mclk_rate, div, timeout; > unsigned int tx_len = 0; > int ret = 0; > + unsigned int start, end, tx_time; > u32 reg; > > /* We don't support transfer larger than the FIFO */ > @@ -269,9 +270,17 @@ static int sun6i_spi_transfer_one(struct spi_master > *master, > reg = sun6i_spi_read(sspi, SUN6I_TFR_CTL_REG); > sun6i_spi_write(sspi, SUN6I_TFR_CTL_REG, reg | SUN6I_TFR_CTL_XCH); > > + tx_time = max_t(unsigned, tfr->len * 8 * 2 / (tfr->speed_hz / 1000), Ditto, "unsigned int" instead of "unsigned"? > + 100); > + start = jiffies; > timeout = wait_for_completion_timeout(>done, > - msecs_to_jiffies(1000)); > + msecs_to_jiffies(tx_time)); > + end = jiffies; > if (!timeout) { > + dev_warn(>dev, > +"%s: timeout transferring %u bytes@%iHz for > %i(%i)ms", > +dev_name(>dev), tfr->len, tfr->speed_hz, > +jiffies_to_msecs(end - start), tx_time); Ditto, separate patch? Also, should the changes for the drivers be in two separate patches also? Thanks, -- Julian Calaby Email: julian.cal...@gmail.com Profile: http://www.google.com/profiles/julian.calaby/
Re: [linux-sunxi] [PATCH 1/5] spi: sunxi: fix transfer timeout
Hi Michal, On Fri, May 27, 2016 at 5:25 AM, Michal Suchanek wrote: > The trasfer timeout is fixed at 1000 ms. Reading a 4Mbyte flash over > 1MHz SPI bus takes way longer than that. Calculate the timeout from the > actual time the transfer is supposed to take and multiply by 2 for good > measure. > > Signed-off-by: Michal Suchanek > --- > > v2: > - fix build error > - use unsigned instead of int in max_t > - use tfr->speed_hz instead of sspi->max_speed_hz > --- > drivers/spi/spi-sun4i.c | 11 ++- > drivers/spi/spi-sun6i.c | 11 ++- > 2 files changed, 20 insertions(+), 2 deletions(-) > > diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c > index 1ddd9e2..fe63bbd 100644 > --- a/drivers/spi/spi-sun4i.c > +++ b/drivers/spi/spi-sun4i.c > @@ -173,6 +173,7 @@ static int sun4i_spi_transfer_one(struct spi_master > *master, > { > struct sun4i_spi *sspi = spi_master_get_devdata(master); > unsigned int mclk_rate, div, timeout; > + unsigned int start, end, tx_time; > unsigned int tx_len = 0; > int ret = 0; > u32 reg; > @@ -279,9 +280,17 @@ static int sun4i_spi_transfer_one(struct spi_master > *master, > reg = sun4i_spi_read(sspi, SUN4I_CTL_REG); > sun4i_spi_write(sspi, SUN4I_CTL_REG, reg | SUN4I_CTL_XCH); > > + tx_time = max_t(unsigned, tfr->len * 8 * 2 / (tfr->speed_hz / 1000), You should probably use "unsigned int" instead of just "unsigned" here. > + 100); > + start = jiffies; > timeout = wait_for_completion_timeout(>done, > - msecs_to_jiffies(1000)); > + msecs_to_jiffies(tx_time)); > + end = jiffies; > if (!timeout) { > + dev_warn(>dev, > +"%s: timeout transferring %u bytes@%iHz for > %i(%i)ms", > +dev_name(>dev), tfr->len, tfr->speed_hz, > +jiffies_to_msecs(end - start), tx_time); Should the debug changes be in a separate patch? > ret = -ETIMEDOUT; > goto out; > } > diff --git a/drivers/spi/spi-sun6i.c b/drivers/spi/spi-sun6i.c > index 42e2c4b..8be5c5c 100644 > --- a/drivers/spi/spi-sun6i.c > +++ b/drivers/spi/spi-sun6i.c > @@ -162,6 +162,7 @@ static int sun6i_spi_transfer_one(struct spi_master > *master, > unsigned int mclk_rate, div, timeout; > unsigned int tx_len = 0; > int ret = 0; > + unsigned int start, end, tx_time; > u32 reg; > > /* We don't support transfer larger than the FIFO */ > @@ -269,9 +270,17 @@ static int sun6i_spi_transfer_one(struct spi_master > *master, > reg = sun6i_spi_read(sspi, SUN6I_TFR_CTL_REG); > sun6i_spi_write(sspi, SUN6I_TFR_CTL_REG, reg | SUN6I_TFR_CTL_XCH); > > + tx_time = max_t(unsigned, tfr->len * 8 * 2 / (tfr->speed_hz / 1000), Ditto, "unsigned int" instead of "unsigned"? > + 100); > + start = jiffies; > timeout = wait_for_completion_timeout(>done, > - msecs_to_jiffies(1000)); > + msecs_to_jiffies(tx_time)); > + end = jiffies; > if (!timeout) { > + dev_warn(>dev, > +"%s: timeout transferring %u bytes@%iHz for > %i(%i)ms", > +dev_name(>dev), tfr->len, tfr->speed_hz, > +jiffies_to_msecs(end - start), tx_time); Ditto, separate patch? Also, should the changes for the drivers be in two separate patches also? Thanks, -- Julian Calaby Email: julian.cal...@gmail.com Profile: http://www.google.com/profiles/julian.calaby/
Re: [GIT PULL] xfs: updates for 4.7-rc1
On Thu, May 26, 2016 at 5:13 PM, Dave Chinnerwrote: > On Thu, May 26, 2016 at 10:19:13AM -0700, Linus Torvalds wrote: >> >> i'm ok with the late branches, it's not like xfs has been a problem spot. > > Still, I'll try to avoid them because it reduces testing time. Oh, 100% agreed. I'm just saying that you get a pass because you've historically been very reliable. >> Your pull request mentions the 'for-next' branch, but I think you >> *meant* to send me the "xfs-for-linus-4.7-rc1" tag which points to the >> same commit and has your summary in it. > > > > Mea culpa. I ran this command after creating the tag: > > $ git request-pull v4.6 > git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git > tags/xfs-for-linus-4.7-rc1 > ~/tmp/t.txt > > And I didn't check the output closely enough. I forgot to push the > tag to the upstream repo before running request-pull. > > Git often makes it very easy to make mistakes whilst simultaneously > making it hard to notice you've made a mistake. :( Actually, that one got fixed in git quite some time ago. You probably have a fairly old version of git, which had the "helpful" logic to pick another remote location if it couldn't find the exact one you had specified, but it could find another one that matched in content It was a misguided attempt at automatically just "doing the right thing" and being easier to use, but yes, it means that it does something subtly different from what you ask for if you haven't pushed out the remote you talked about yet. Bu that was all fixed about two years ago, and I think made v2.0. Do you perhaps happen to run something like debian-stable or similar that never upgrades major versions, so you're still at git-1.x? Building your own git is trivial, and a "make install" will just install in your ~/bin directory so you don't even have to uninstall the system one (assuming you have your own ~/bin before /usr/bin in the path, of course). Linus
Re: [GIT PULL] xfs: updates for 4.7-rc1
On Thu, May 26, 2016 at 5:13 PM, Dave Chinner wrote: > On Thu, May 26, 2016 at 10:19:13AM -0700, Linus Torvalds wrote: >> >> i'm ok with the late branches, it's not like xfs has been a problem spot. > > Still, I'll try to avoid them because it reduces testing time. Oh, 100% agreed. I'm just saying that you get a pass because you've historically been very reliable. >> Your pull request mentions the 'for-next' branch, but I think you >> *meant* to send me the "xfs-for-linus-4.7-rc1" tag which points to the >> same commit and has your summary in it. > > > > Mea culpa. I ran this command after creating the tag: > > $ git request-pull v4.6 > git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git > tags/xfs-for-linus-4.7-rc1 > ~/tmp/t.txt > > And I didn't check the output closely enough. I forgot to push the > tag to the upstream repo before running request-pull. > > Git often makes it very easy to make mistakes whilst simultaneously > making it hard to notice you've made a mistake. :( Actually, that one got fixed in git quite some time ago. You probably have a fairly old version of git, which had the "helpful" logic to pick another remote location if it couldn't find the exact one you had specified, but it could find another one that matched in content It was a misguided attempt at automatically just "doing the right thing" and being easier to use, but yes, it means that it does something subtly different from what you ask for if you haven't pushed out the remote you talked about yet. Bu that was all fixed about two years ago, and I think made v2.0. Do you perhaps happen to run something like debian-stable or similar that never upgrades major versions, so you're still at git-1.x? Building your own git is trivial, and a "make install" will just install in your ~/bin directory so you don't even have to uninstall the system one (assuming you have your own ~/bin before /usr/bin in the path, of course). Linus
[PATCH] serial: 8250_fintek: fix the mismatched IRQ mode
Some BIOS only use _OSI("Linux") to distinguish between Linux & Windows. Apply Level/Low to UART trigger mode if Windows, Edge/High mode otherwise. But since 2.6.23 the mainline kernel no longer returns true for _OSI(“Linux”). The default IRQ0~15 trigger mode in Linux is Edge/High mode without ACPI MADT override. It mismatches IRQ mode and makes UART malfunctional on such motherboard. This patch will check the current IRQ mode and apply correct mode to UART. The following link is F81216AD spec PDF: http://html.alldatasheet.com/html-pdf/257956/FINTEK/F81216AD/5569/ 25/F81216AD.html LDN0~3 70h: IRQ channel & Mode register Bit 6~5 : 00 : Active low level mode 01 : Active high edge mode Bit 4 : Sharing Flag (0: not share/1: share) Bit 3~0 : IRQ channel Signed-off-by: Ji-Ze Hong (Peter Hong)--- drivers/tty/serial/8250/8250_fintek.c | 36 ++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_fintek.c b/drivers/tty/serial/8250/8250_fintek.c index 870981d..737b4b3 100644 --- a/drivers/tty/serial/8250/8250_fintek.c +++ b/drivers/tty/serial/8250/8250_fintek.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "8250.h" #define ADDR_PORT 0 @@ -30,6 +31,12 @@ #define IO_ADDR2 0x60 #define LDN 0x7 +#define IRQ_MODE 0x70 +#define IRQ_SHARE BIT(4) +#define IRQ_MODE_MASK (BIT(6) | BIT(5)) +#define IRQ_LEVEL_LOW 0 +#define IRQ_EDGE_HIGH BIT(5) + #define RS485 0xF0 #define RTS_INVERT BIT(5) #define RS485_URA BIT(4) @@ -176,10 +183,37 @@ static int find_base_port(struct fintek_8250 *pdata, u16 io_address) return -ENODEV; } +static int fintek_8250_set_irq_mode(struct fintek_8250 *pdata, bool level_mode) +{ + int status; + u8 tmp; + + status = fintek_8250_enter_key(pdata->base_port, pdata->key); + if (status) + return status; + + outb(LDN, pdata->base_port + ADDR_PORT); + outb(pdata->index, pdata->base_port + DATA_PORT); + + outb(IRQ_MODE, pdata->base_port + ADDR_PORT); + tmp = inb(pdata->base_port + DATA_PORT); + + tmp &= ~IRQ_MODE_MASK; + tmp |= IRQ_SHARE; + if (!level_mode) + tmp |= IRQ_EDGE_HIGH; + + outb(tmp, pdata->base_port + DATA_PORT); + fintek_8250_exit_key(pdata->base_port); + return 0; +} + int fintek_8250_probe(struct uart_8250_port *uart) { struct fintek_8250 *pdata; struct fintek_8250 probe_data; + struct irq_data *irq_data = irq_get_irq_data(uart->port.irq); + bool level_mode = irqd_is_level_type(irq_data); if (find_base_port(_data, uart->port.iobase)) return -ENODEV; @@ -192,5 +226,5 @@ int fintek_8250_probe(struct uart_8250_port *uart) uart->port.rs485_config = fintek_8250_rs485_config; uart->port.private_data = pdata; - return 0; + return fintek_8250_set_irq_mode(pdata, level_mode); } -- 1.9.1
[PATCH] serial: 8250_fintek: fix the mismatched IRQ mode
Some BIOS only use _OSI("Linux") to distinguish between Linux & Windows. Apply Level/Low to UART trigger mode if Windows, Edge/High mode otherwise. But since 2.6.23 the mainline kernel no longer returns true for _OSI(“Linux”). The default IRQ0~15 trigger mode in Linux is Edge/High mode without ACPI MADT override. It mismatches IRQ mode and makes UART malfunctional on such motherboard. This patch will check the current IRQ mode and apply correct mode to UART. The following link is F81216AD spec PDF: http://html.alldatasheet.com/html-pdf/257956/FINTEK/F81216AD/5569/ 25/F81216AD.html LDN0~3 70h: IRQ channel & Mode register Bit 6~5 : 00 : Active low level mode 01 : Active high edge mode Bit 4 : Sharing Flag (0: not share/1: share) Bit 3~0 : IRQ channel Signed-off-by: Ji-Ze Hong (Peter Hong) --- drivers/tty/serial/8250/8250_fintek.c | 36 ++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/tty/serial/8250/8250_fintek.c b/drivers/tty/serial/8250/8250_fintek.c index 870981d..737b4b3 100644 --- a/drivers/tty/serial/8250/8250_fintek.c +++ b/drivers/tty/serial/8250/8250_fintek.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "8250.h" #define ADDR_PORT 0 @@ -30,6 +31,12 @@ #define IO_ADDR2 0x60 #define LDN 0x7 +#define IRQ_MODE 0x70 +#define IRQ_SHARE BIT(4) +#define IRQ_MODE_MASK (BIT(6) | BIT(5)) +#define IRQ_LEVEL_LOW 0 +#define IRQ_EDGE_HIGH BIT(5) + #define RS485 0xF0 #define RTS_INVERT BIT(5) #define RS485_URA BIT(4) @@ -176,10 +183,37 @@ static int find_base_port(struct fintek_8250 *pdata, u16 io_address) return -ENODEV; } +static int fintek_8250_set_irq_mode(struct fintek_8250 *pdata, bool level_mode) +{ + int status; + u8 tmp; + + status = fintek_8250_enter_key(pdata->base_port, pdata->key); + if (status) + return status; + + outb(LDN, pdata->base_port + ADDR_PORT); + outb(pdata->index, pdata->base_port + DATA_PORT); + + outb(IRQ_MODE, pdata->base_port + ADDR_PORT); + tmp = inb(pdata->base_port + DATA_PORT); + + tmp &= ~IRQ_MODE_MASK; + tmp |= IRQ_SHARE; + if (!level_mode) + tmp |= IRQ_EDGE_HIGH; + + outb(tmp, pdata->base_port + DATA_PORT); + fintek_8250_exit_key(pdata->base_port); + return 0; +} + int fintek_8250_probe(struct uart_8250_port *uart) { struct fintek_8250 *pdata; struct fintek_8250 probe_data; + struct irq_data *irq_data = irq_get_irq_data(uart->port.irq); + bool level_mode = irqd_is_level_type(irq_data); if (find_base_port(_data, uart->port.iobase)) return -ENODEV; @@ -192,5 +226,5 @@ int fintek_8250_probe(struct uart_8250_port *uart) uart->port.rs485_config = fintek_8250_rs485_config; uart->port.private_data = pdata; - return 0; + return fintek_8250_set_irq_mode(pdata, level_mode); } -- 1.9.1
Re: [PATCH v3 5/6] mm/cma: remove MIGRATE_CMA
Hi Joonsoo, On 2016/5/26 14:22, js1...@gmail.com wrote: > From: Joonsoo Kim> > Now, all reserved pages for CMA region are belong to the ZONE_CMA > and there is no other type of pages. Therefore, we don't need to > use MIGRATE_CMA to distinguish and handle differently for CMA pages > and ordinary pages. Remove MIGRATE_CMA. > > Unfortunately, this patch make free CMA counter incorrect because > we count it when pages are on the MIGRATE_CMA. It will be fixed > by next patch. I can squash next patch here but it makes changes > complicated and hard to review so I separate that. > > Signed-off-by: Joonsoo Kim > --- > include/linux/gfp.h| 3 +- > include/linux/mmzone.h | 22 - > include/linux/vmstat.h | 8 - > mm/cma.c | 2 +- > mm/compaction.c| 10 ++ > mm/hugetlb.c | 2 +- > mm/page_alloc.c| 87 > +- > mm/page_isolation.c| 5 ++- > mm/vmstat.c| 5 +-- > 9 files changed, 31 insertions(+), 113 deletions(-) > > diff --git a/include/linux/gfp.h b/include/linux/gfp.h > index 4d6c008..1a3b869 100644 > --- a/include/linux/gfp.h > +++ b/include/linux/gfp.h > @@ -559,8 +559,7 @@ static inline bool pm_suspended_storage(void) > > #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || > defined(CONFIG_CMA) > /* The below functions must be run on a range from a single zone. */ > -extern int alloc_contig_range(unsigned long start, unsigned long end, > - unsigned migratetype); > +extern int alloc_contig_range(unsigned long start, unsigned long end); > extern void free_contig_range(unsigned long pfn, unsigned nr_pages); > #endif > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 54c92a6..236d0bd 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -41,22 +41,6 @@ enum { > MIGRATE_RECLAIMABLE, > MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ > MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, > -#ifdef CONFIG_CMA > - /* > - * MIGRATE_CMA migration type is designed to mimic the way > - * ZONE_MOVABLE works. Only movable pages can be allocated > - * from MIGRATE_CMA pageblocks and page allocator never > - * implicitly change migration type of MIGRATE_CMA pageblock. > - * > - * The way to use it is to change migratetype of a range of > - * pageblocks to MIGRATE_CMA which can be done by > - * __free_pageblock_cma() function. What is important though > - * is that a range of pageblocks must be aligned to > - * MAX_ORDER_NR_PAGES should biggest page be bigger then > - * a single pageblock. > - */ > - MIGRATE_CMA, > -#endif > #ifdef CONFIG_MEMORY_ISOLATION > MIGRATE_ISOLATE,/* can't allocate from here */ > #endif > @@ -66,12 +50,6 @@ enum { > /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ > extern char * const migratetype_names[MIGRATE_TYPES]; > > -#ifdef CONFIG_CMA > -# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) > -#else > -# define is_migrate_cma(migratetype) false > -#endif > - > #define for_each_migratetype_order(order, type) \ > for (order = 0; order < MAX_ORDER; order++) \ > for (type = 0; type < MIGRATE_TYPES; type++) > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h > index 0aa613d..e0eb3e5 100644 > --- a/include/linux/vmstat.h > +++ b/include/linux/vmstat.h > @@ -264,14 +264,6 @@ static inline void drain_zonestat(struct zone *zone, > struct per_cpu_pageset *pset) { } > #endif /* CONFIG_SMP */ > > -static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, > - int migratetype) > -{ > - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); > - if (is_migrate_cma(migratetype)) > - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); > -} > - > extern const char * const vmstat_text[]; > > #endif /* _LINUX_VMSTAT_H */ > diff --git a/mm/cma.c b/mm/cma.c > index 8684f50..bd436e4 100644 > --- a/mm/cma.c > +++ b/mm/cma.c > @@ -444,7 +444,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, > unsigned int align) > > pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); > mutex_lock(_mutex); > - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); > + ret = alloc_contig_range(pfn, pfn + count); > mutex_unlock(_mutex); > if (ret == 0) { > page = pfn_to_page(pfn); > diff --git a/mm/compaction.c b/mm/compaction.c > index 1427366..acb1d1a 100644 > --- a/mm/compaction.c > +++ b/mm/compaction.c > @@ -76,7 +76,7 @@ static void map_pages(struct list_head *list) > > static inline bool
Re: [PATCH v3 5/6] mm/cma: remove MIGRATE_CMA
Hi Joonsoo, On 2016/5/26 14:22, js1...@gmail.com wrote: > From: Joonsoo Kim > > Now, all reserved pages for CMA region are belong to the ZONE_CMA > and there is no other type of pages. Therefore, we don't need to > use MIGRATE_CMA to distinguish and handle differently for CMA pages > and ordinary pages. Remove MIGRATE_CMA. > > Unfortunately, this patch make free CMA counter incorrect because > we count it when pages are on the MIGRATE_CMA. It will be fixed > by next patch. I can squash next patch here but it makes changes > complicated and hard to review so I separate that. > > Signed-off-by: Joonsoo Kim > --- > include/linux/gfp.h| 3 +- > include/linux/mmzone.h | 22 - > include/linux/vmstat.h | 8 - > mm/cma.c | 2 +- > mm/compaction.c| 10 ++ > mm/hugetlb.c | 2 +- > mm/page_alloc.c| 87 > +- > mm/page_isolation.c| 5 ++- > mm/vmstat.c| 5 +-- > 9 files changed, 31 insertions(+), 113 deletions(-) > > diff --git a/include/linux/gfp.h b/include/linux/gfp.h > index 4d6c008..1a3b869 100644 > --- a/include/linux/gfp.h > +++ b/include/linux/gfp.h > @@ -559,8 +559,7 @@ static inline bool pm_suspended_storage(void) > > #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || > defined(CONFIG_CMA) > /* The below functions must be run on a range from a single zone. */ > -extern int alloc_contig_range(unsigned long start, unsigned long end, > - unsigned migratetype); > +extern int alloc_contig_range(unsigned long start, unsigned long end); > extern void free_contig_range(unsigned long pfn, unsigned nr_pages); > #endif > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 54c92a6..236d0bd 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -41,22 +41,6 @@ enum { > MIGRATE_RECLAIMABLE, > MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ > MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, > -#ifdef CONFIG_CMA > - /* > - * MIGRATE_CMA migration type is designed to mimic the way > - * ZONE_MOVABLE works. Only movable pages can be allocated > - * from MIGRATE_CMA pageblocks and page allocator never > - * implicitly change migration type of MIGRATE_CMA pageblock. > - * > - * The way to use it is to change migratetype of a range of > - * pageblocks to MIGRATE_CMA which can be done by > - * __free_pageblock_cma() function. What is important though > - * is that a range of pageblocks must be aligned to > - * MAX_ORDER_NR_PAGES should biggest page be bigger then > - * a single pageblock. > - */ > - MIGRATE_CMA, > -#endif > #ifdef CONFIG_MEMORY_ISOLATION > MIGRATE_ISOLATE,/* can't allocate from here */ > #endif > @@ -66,12 +50,6 @@ enum { > /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ > extern char * const migratetype_names[MIGRATE_TYPES]; > > -#ifdef CONFIG_CMA > -# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) > -#else > -# define is_migrate_cma(migratetype) false > -#endif > - > #define for_each_migratetype_order(order, type) \ > for (order = 0; order < MAX_ORDER; order++) \ > for (type = 0; type < MIGRATE_TYPES; type++) > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h > index 0aa613d..e0eb3e5 100644 > --- a/include/linux/vmstat.h > +++ b/include/linux/vmstat.h > @@ -264,14 +264,6 @@ static inline void drain_zonestat(struct zone *zone, > struct per_cpu_pageset *pset) { } > #endif /* CONFIG_SMP */ > > -static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, > - int migratetype) > -{ > - __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); > - if (is_migrate_cma(migratetype)) > - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); > -} > - > extern const char * const vmstat_text[]; > > #endif /* _LINUX_VMSTAT_H */ > diff --git a/mm/cma.c b/mm/cma.c > index 8684f50..bd436e4 100644 > --- a/mm/cma.c > +++ b/mm/cma.c > @@ -444,7 +444,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, > unsigned int align) > > pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); > mutex_lock(_mutex); > - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); > + ret = alloc_contig_range(pfn, pfn + count); > mutex_unlock(_mutex); > if (ret == 0) { > page = pfn_to_page(pfn); > diff --git a/mm/compaction.c b/mm/compaction.c > index 1427366..acb1d1a 100644 > --- a/mm/compaction.c > +++ b/mm/compaction.c > @@ -76,7 +76,7 @@ static void map_pages(struct list_head *list) > > static inline bool migrate_async_suitable(int migratetype) > { > - return
[PATCH 1/2 v2] staging: dgnc: remove redundant NULL check for brd
the "brd" value cannot be NULL in dgnc_finalize_board_init(). Because "brd" as a parameter of this function was already checked for NULL. the dgnc_finalize_board_init() as a static function was called only from dgnc_found_board() function and brd->magic value was assigned once in dgnc_found_board(). So it doesn't need to check for DGNC_BOARD_MAGIC value. Signed-off-by: Daeseok Youn--- V2: Adds more comments to Change Log. drivers/staging/dgnc/dgnc_driver.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/staging/dgnc/dgnc_driver.c b/drivers/staging/dgnc/dgnc_driver.c index af2e835..22257d2 100644 --- a/drivers/staging/dgnc/dgnc_driver.c +++ b/drivers/staging/dgnc/dgnc_driver.c @@ -579,9 +579,6 @@ static int dgnc_finalize_board_init(struct dgnc_board *brd) { int rc = 0; - if (!brd || brd->magic != DGNC_BOARD_MAGIC) - return -ENODEV; - if (brd->irq) { rc = request_irq(brd->irq, brd->bd_ops->intr, IRQF_SHARED, "DGNC", brd); -- 1.9.1
[PATCH 1/2 v2] staging: dgnc: remove redundant NULL check for brd
the "brd" value cannot be NULL in dgnc_finalize_board_init(). Because "brd" as a parameter of this function was already checked for NULL. the dgnc_finalize_board_init() as a static function was called only from dgnc_found_board() function and brd->magic value was assigned once in dgnc_found_board(). So it doesn't need to check for DGNC_BOARD_MAGIC value. Signed-off-by: Daeseok Youn --- V2: Adds more comments to Change Log. drivers/staging/dgnc/dgnc_driver.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/staging/dgnc/dgnc_driver.c b/drivers/staging/dgnc/dgnc_driver.c index af2e835..22257d2 100644 --- a/drivers/staging/dgnc/dgnc_driver.c +++ b/drivers/staging/dgnc/dgnc_driver.c @@ -579,9 +579,6 @@ static int dgnc_finalize_board_init(struct dgnc_board *brd) { int rc = 0; - if (!brd || brd->magic != DGNC_BOARD_MAGIC) - return -ENODEV; - if (brd->irq) { rc = request_irq(brd->irq, brd->bd_ops->intr, IRQF_SHARED, "DGNC", brd); -- 1.9.1
[PATCH 2/2 V2] staging: dgnc: remove redundant null check in
the "brd" was already checked for NULL before calling dgnc_do_remap(). the dgnc_do_remap() function was called only from the dgnc_found_board() and the DGNC_BOARD_MAGIC value was assigned to "brd->magic" in dgcn_found_board(). So it doesn't need to check about magic value. Signed-off-by: Daeseok Youn--- V2: Adds more comments to Change Log. drivers/staging/dgnc/dgnc_driver.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/staging/dgnc/dgnc_driver.c b/drivers/staging/dgnc/dgnc_driver.c index 22257d2..1882ef5 100644 --- a/drivers/staging/dgnc/dgnc_driver.c +++ b/drivers/staging/dgnc/dgnc_driver.c @@ -599,9 +599,6 @@ static int dgnc_finalize_board_init(struct dgnc_board *brd) */ static void dgnc_do_remap(struct dgnc_board *brd) { - if (!brd || brd->magic != DGNC_BOARD_MAGIC) - return; - brd->re_map_membase = ioremap(brd->membase, 0x1000); } -- 1.9.1
[PATCH 2/2 V2] staging: dgnc: remove redundant null check in
the "brd" was already checked for NULL before calling dgnc_do_remap(). the dgnc_do_remap() function was called only from the dgnc_found_board() and the DGNC_BOARD_MAGIC value was assigned to "brd->magic" in dgcn_found_board(). So it doesn't need to check about magic value. Signed-off-by: Daeseok Youn --- V2: Adds more comments to Change Log. drivers/staging/dgnc/dgnc_driver.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/staging/dgnc/dgnc_driver.c b/drivers/staging/dgnc/dgnc_driver.c index 22257d2..1882ef5 100644 --- a/drivers/staging/dgnc/dgnc_driver.c +++ b/drivers/staging/dgnc/dgnc_driver.c @@ -599,9 +599,6 @@ static int dgnc_finalize_board_init(struct dgnc_board *brd) */ static void dgnc_do_remap(struct dgnc_board *brd) { - if (!brd || brd->magic != DGNC_BOARD_MAGIC) - return; - brd->re_map_membase = ioremap(brd->membase, 0x1000); } -- 1.9.1
[PATCH] usb: core: fix a double free in the usb driver
There is a double free problem in the usb driver. This is caused by delayed deregister for scsi device. <*> at Insert USB Storage - USB bus #1 register usb_create_hcd (primary-kref==1) * primary-bandwidth_mutex(alloc)) usb_get_hcd(primary-kref==2) - USB bus #2 register usb_create_hcd (second-kref==1) * second-bandwidth_mutex==primary-bandwidth_mutex usb_get_hcd(second-kref==2) - scsi_device_get usb_get_hcd(second-kref==3) <*> at remove USB Storage (Normal) - scsi_device_put usb_put_hcd(second-kref==2) - USB bus #2 deregister usb_release_dev(second-kref==1) usb_release_dev(second-kref==0) -> hcd_release() - USB bus #1 deregister usb_release_dev(primary-kref==1) usb_release_dev(primary-kref==0) -> hcd_release() *(primary-bandwidth_mutex free) at remove USB Storage - USB bus #2 deregister usb_release_dev(second-kref==2) usb_release_dev(second-kref==1) - USB bus #1 deregister usb_release_dev(primary-kref==1) usb_release_dev(primary-kref==0) -> hcd_release() *(primary-bandwidth_mutex free) - scsi_device_put usb_put_hcd(second-kref==0) -> hcd_release(*) * at this, second->primary==0 therefore try to free the primary-bandwidth_mutex.(already freed) To fix this problem kfree(hcd->bandwidth_mutex); should be executed at only (hcd->primary_hcd==hcd). Signed-off-by: Chunggeol Kim --- drivers/usb/core/hcd.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 34b837a..60077f3 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2608,7 +2608,7 @@ static void hcd_release(struct kref *kref) struct usb_hcd *hcd = container_of (kref, struct usb_hcd, kref); mutex_lock(_port_peer_mutex); - if (usb_hcd_is_primary_hcd(hcd)) { + if (hcd == hcd->primary_hcd) { kfree(hcd->address0_mutex); kfree(hcd->bandwidth_mutex); } -- 1.7.9.5
[PATCH] usb: core: fix a double free in the usb driver
There is a double free problem in the usb driver. This is caused by delayed deregister for scsi device. <*> at Insert USB Storage - USB bus #1 register usb_create_hcd (primary-kref==1) * primary-bandwidth_mutex(alloc)) usb_get_hcd(primary-kref==2) - USB bus #2 register usb_create_hcd (second-kref==1) * second-bandwidth_mutex==primary-bandwidth_mutex usb_get_hcd(second-kref==2) - scsi_device_get usb_get_hcd(second-kref==3) <*> at remove USB Storage (Normal) - scsi_device_put usb_put_hcd(second-kref==2) - USB bus #2 deregister usb_release_dev(second-kref==1) usb_release_dev(second-kref==0) -> hcd_release() - USB bus #1 deregister usb_release_dev(primary-kref==1) usb_release_dev(primary-kref==0) -> hcd_release() *(primary-bandwidth_mutex free) at remove USB Storage - USB bus #2 deregister usb_release_dev(second-kref==2) usb_release_dev(second-kref==1) - USB bus #1 deregister usb_release_dev(primary-kref==1) usb_release_dev(primary-kref==0) -> hcd_release() *(primary-bandwidth_mutex free) - scsi_device_put usb_put_hcd(second-kref==0) -> hcd_release(*) * at this, second->primary==0 therefore try to free the primary-bandwidth_mutex.(already freed) To fix this problem kfree(hcd->bandwidth_mutex); should be executed at only (hcd->primary_hcd==hcd). Signed-off-by: Chunggeol Kim --- drivers/usb/core/hcd.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 34b837a..60077f3 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2608,7 +2608,7 @@ static void hcd_release(struct kref *kref) struct usb_hcd *hcd = container_of (kref, struct usb_hcd, kref); mutex_lock(_port_peer_mutex); - if (usb_hcd_is_primary_hcd(hcd)) { + if (hcd == hcd->primary_hcd) { kfree(hcd->address0_mutex); kfree(hcd->bandwidth_mutex); } -- 1.7.9.5
Re: [PATCH] Use the correct size to set block max sectors
dOn Thu, 2016-05-26 at 17:08 -0700, Long Li wrote: > The block sector size should be in unit of 512 bytes, not in bytes. Thanks. The patch subject should use something like: [PATCH] sd: Use the correct size to set block max sectors to show what subsystem is being modified. > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c [] > @@ -2862,9 +2862,11 @@ static int sd_revalidate_disk(struct gendisk *disk) > if (sdkp->opt_xfer_blocks && > sdkp->opt_xfer_blocks <= dev_max && > sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS && > - sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) > - rw_max = q->limits.io_opt = > + sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) { > + q->limits.io_opt = > sdkp->opt_xfer_blocks * sdp->sector_size; > + rw_max = (q->limits.io_opt >> 9); > + } > else > rw_max = BLK_DEF_MAX_SECTORS; And style trivia: it'd be more kernel style consistent as: if (... sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) { q->limits.io_opt = sdkp->opt_xfer_blocks * sdp->sector_size; rw_max = q->limits.io_opt >> 9; } else { rw_max = BLK_DEF_MAX_SECTORS; } ie: no parentheses necessary around the shifted value and braces around both arms.
Re: [PATCH] Use the correct size to set block max sectors
dOn Thu, 2016-05-26 at 17:08 -0700, Long Li wrote: > The block sector size should be in unit of 512 bytes, not in bytes. Thanks. The patch subject should use something like: [PATCH] sd: Use the correct size to set block max sectors to show what subsystem is being modified. > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c [] > @@ -2862,9 +2862,11 @@ static int sd_revalidate_disk(struct gendisk *disk) > if (sdkp->opt_xfer_blocks && > sdkp->opt_xfer_blocks <= dev_max && > sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS && > - sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) > - rw_max = q->limits.io_opt = > + sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) { > + q->limits.io_opt = > sdkp->opt_xfer_blocks * sdp->sector_size; > + rw_max = (q->limits.io_opt >> 9); > + } > else > rw_max = BLK_DEF_MAX_SECTORS; And style trivia: it'd be more kernel style consistent as: if (... sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) { q->limits.io_opt = sdkp->opt_xfer_blocks * sdp->sector_size; rw_max = q->limits.io_opt >> 9; } else { rw_max = BLK_DEF_MAX_SECTORS; } ie: no parentheses necessary around the shifted value and braces around both arms.
Re: [PATCH 1/2] staging: dgnc: remove redundant NULL check for brd
2016-05-26 21:29 GMT+09:00 Luis de Bethencourt: > On 26/05/16 05:56, DaeSeok Youn wrote: >> 2016-05-26 6:48 GMT+09:00 Luis de Bethencourt : >>> On 20/05/16 10:51, Daeseok Youn wrote: the "brd" value cannot be NULL in dgnc_finalize_board_init(). Because "brd" as a parameter of this function was already checked for NULL. Signed-off-by: Daeseok Youn --- drivers/staging/dgnc/dgnc_driver.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/staging/dgnc/dgnc_driver.c b/drivers/staging/dgnc/dgnc_driver.c index af2e835..22257d2 100644 --- a/drivers/staging/dgnc/dgnc_driver.c +++ b/drivers/staging/dgnc/dgnc_driver.c @@ -579,9 +579,6 @@ static int dgnc_finalize_board_init(struct dgnc_board *brd) { int rc = 0; - if (!brd || brd->magic != DGNC_BOARD_MAGIC) - return -ENODEV; - if (brd->irq) { rc = request_irq(brd->irq, brd->bd_ops->intr, IRQF_SHARED, "DGNC", brd); >>> >>> This is partially correct, the check for brd being NULL is in line 371. >> Hi Luis, >> >> Yes, right. but also brd was assigned the value DGNC_BOARD_MAGIC in line 384. >> brd->magic = DGNC_BOARD_MAGIC; >> and also dgnc_finalize_board_init() as a static function is only >> called in dgnc_found_board(), right? >> >>> >>> But there is a second check for brd->magic != DGNC_BOARD_MAGIC. Do you want >>> to keep that one? >> So.. I think it doesn't need to check about DGNC_BOARD_MAGIC. > > This is good. I was asking just to make sure it was your intention. > > Please add the reason to drop that second check in the commit message as > well. So people > reading the git log can understand both parts. For both patches. OK. I will update the change log and resend it. Thanks. regards, Daeseok. > > Thanks for the fixes :) > > Luis > >> >>> >>> Also, how did you find this patch. It is useful to mention this in the >>> commit >>> message if it was through some static analysis tool. For people using these >>> tools >>> in the future. >> There are some static analysis tool for checking linux kernel code. >> But I didn't use >> those tools for this patch. sometimes, I usually run "smatch" tool for >> checking linux kernel >> code. >> >> thanks. >> regards, >> Daeseok. >> >>> >>> Thanks for the patch :) >>> Luis >
Re: [PATCH 1/2] staging: dgnc: remove redundant NULL check for brd
2016-05-26 21:29 GMT+09:00 Luis de Bethencourt : > On 26/05/16 05:56, DaeSeok Youn wrote: >> 2016-05-26 6:48 GMT+09:00 Luis de Bethencourt : >>> On 20/05/16 10:51, Daeseok Youn wrote: the "brd" value cannot be NULL in dgnc_finalize_board_init(). Because "brd" as a parameter of this function was already checked for NULL. Signed-off-by: Daeseok Youn --- drivers/staging/dgnc/dgnc_driver.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/staging/dgnc/dgnc_driver.c b/drivers/staging/dgnc/dgnc_driver.c index af2e835..22257d2 100644 --- a/drivers/staging/dgnc/dgnc_driver.c +++ b/drivers/staging/dgnc/dgnc_driver.c @@ -579,9 +579,6 @@ static int dgnc_finalize_board_init(struct dgnc_board *brd) { int rc = 0; - if (!brd || brd->magic != DGNC_BOARD_MAGIC) - return -ENODEV; - if (brd->irq) { rc = request_irq(brd->irq, brd->bd_ops->intr, IRQF_SHARED, "DGNC", brd); >>> >>> This is partially correct, the check for brd being NULL is in line 371. >> Hi Luis, >> >> Yes, right. but also brd was assigned the value DGNC_BOARD_MAGIC in line 384. >> brd->magic = DGNC_BOARD_MAGIC; >> and also dgnc_finalize_board_init() as a static function is only >> called in dgnc_found_board(), right? >> >>> >>> But there is a second check for brd->magic != DGNC_BOARD_MAGIC. Do you want >>> to keep that one? >> So.. I think it doesn't need to check about DGNC_BOARD_MAGIC. > > This is good. I was asking just to make sure it was your intention. > > Please add the reason to drop that second check in the commit message as > well. So people > reading the git log can understand both parts. For both patches. OK. I will update the change log and resend it. Thanks. regards, Daeseok. > > Thanks for the fixes :) > > Luis > >> >>> >>> Also, how did you find this patch. It is useful to mention this in the >>> commit >>> message if it was through some static analysis tool. For people using these >>> tools >>> in the future. >> There are some static analysis tool for checking linux kernel code. >> But I didn't use >> those tools for this patch. sometimes, I usually run "smatch" tool for >> checking linux kernel >> code. >> >> thanks. >> regards, >> Daeseok. >> >>> >>> Thanks for the patch :) >>> Luis >
RE: [Intel-wired-lan] [RFC PATCH net] e1000e: keep vlan interfaces functional after rxvlan off
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On > Behalf Of Jeff Kirsher > Sent: Wednesday, May 18, 2016 2:40 PM > To: Jarod Wilson; linux-kernel@vger.kernel.org; > Avargil, Raanan > Cc: net...@vger.kernel.org; intel-wired-...@lists.osuosl.org > Subject: Re: [Intel-wired-lan] [RFC PATCH net] e1000e: keep vlan interfaces > functional after rxvlan off > > On Tue, 2016-05-17 at 15:03 -0400, Jarod Wilson wrote: > > I've got a bug report about an e1000e interface, where a vlan interface > > is > > set up on top of it: > > > > $ ip link add link ens1f0 name ens1f0.99 type vlan id 99 > > $ ip link set ens1f0 up > > $ ip link set ens1f0.99 up > > $ ip addr add 192.168.99.92 dev ens1f0.99 > > > > At this point, I can ping another host on vlan 99, ip 192.168.99.91. > > However, if I do the following: > > > > $ ethtool -K ens1f0 rxvlan off > > > > Then no traffic passes on ens1f0.99. It comes back if I toggle rxvlan on > > again. I'm not sure if this is actually intended behavior, or if there's > > a > > lack of software vlan stripping fallback, or what, but things continue to > > work if I simply don't call e1000e_vlan_strip_disable() if there are > > active vlans (plagiarizing a function from the e1000 driver here) on the > > interface. > > > > Also slipped a related-ish fix to the kerneldoc text for > > e1000e_vlan_strip_disable here... > > > > CC: Jeff Kirsher > > CC: intel-wired-...@lists.osuosl.org > > CC: net...@vger.kernel.org > > Signed-off-by: Jarod Wilson > > --- > > drivers/net/ethernet/intel/e1000e/netdev.c | 15 +-- > > 1 file changed, 13 insertions(+), 2 deletions(-) > > Raanan, please review this patch. Even though it is an RFC I will be > adding it to my queue for testing. > http://patchwork.ozlabs.org/patch/623238/ Yup, without this patch disabling rxvlan offload does indeed break vlan connectivity and with the patch I can disable and re-enable rxvlan offloads as much as I care to. It also makes it through my regression tests without problems. Tested-by: Aaron Brown This is from functional - does it work - testing perspective so review is probably still in order.
RE: [Intel-wired-lan] [RFC PATCH net] e1000e: keep vlan interfaces functional after rxvlan off
> From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On > Behalf Of Jeff Kirsher > Sent: Wednesday, May 18, 2016 2:40 PM > To: Jarod Wilson ; linux-kernel@vger.kernel.org; > Avargil, Raanan > Cc: net...@vger.kernel.org; intel-wired-...@lists.osuosl.org > Subject: Re: [Intel-wired-lan] [RFC PATCH net] e1000e: keep vlan interfaces > functional after rxvlan off > > On Tue, 2016-05-17 at 15:03 -0400, Jarod Wilson wrote: > > I've got a bug report about an e1000e interface, where a vlan interface > > is > > set up on top of it: > > > > $ ip link add link ens1f0 name ens1f0.99 type vlan id 99 > > $ ip link set ens1f0 up > > $ ip link set ens1f0.99 up > > $ ip addr add 192.168.99.92 dev ens1f0.99 > > > > At this point, I can ping another host on vlan 99, ip 192.168.99.91. > > However, if I do the following: > > > > $ ethtool -K ens1f0 rxvlan off > > > > Then no traffic passes on ens1f0.99. It comes back if I toggle rxvlan on > > again. I'm not sure if this is actually intended behavior, or if there's > > a > > lack of software vlan stripping fallback, or what, but things continue to > > work if I simply don't call e1000e_vlan_strip_disable() if there are > > active vlans (plagiarizing a function from the e1000 driver here) on the > > interface. > > > > Also slipped a related-ish fix to the kerneldoc text for > > e1000e_vlan_strip_disable here... > > > > CC: Jeff Kirsher > > CC: intel-wired-...@lists.osuosl.org > > CC: net...@vger.kernel.org > > Signed-off-by: Jarod Wilson > > --- > > drivers/net/ethernet/intel/e1000e/netdev.c | 15 +-- > > 1 file changed, 13 insertions(+), 2 deletions(-) > > Raanan, please review this patch. Even though it is an RFC I will be > adding it to my queue for testing. > http://patchwork.ozlabs.org/patch/623238/ Yup, without this patch disabling rxvlan offload does indeed break vlan connectivity and with the patch I can disable and re-enable rxvlan offloads as much as I care to. It also makes it through my regression tests without problems. Tested-by: Aaron Brown This is from functional - does it work - testing perspective so review is probably still in order.
[RFT v3] drm: use late_initcall() for amdkfd and radeon
To get KFD support in radeon we need the following initialization to happen in this order, their respective driver file that has its init routine listed next to it: 0. AMD IOMMUv1:arch/x86/kernel/pci-dma.c 1. AMD IOMMUv2:drivers/iommu/amd_iommu_v2.c 2. AMD KFD:drivers/gpu/drm/amd/amdkfd/kfd_module.c 3. AMD Radeon: drivers/gpu/drm/radeon/radeon_drv.c Order is rather implicit, but these drivers can currently only do so much given the amount of leg room available. Below are the respective init routines and how they are initialized: arch/x86/kernel/pci-dma.c rootfs_initcall(pci_iommu_init); drivers/iommu/amd_iommu_v2.cmodule_init(amd_iommu_v2_init); drivers/gpu/drm/amd/amdkfd/kfd_module.c module_init(kfd_module_init); drivers/gpu/drm/radeon/radeon_drv.c module_init(radeon_init); When a driver is built-in module_init() folds to use device_initcall(), and we have the following possible orders: #define pure_initcall(fn)__define_initcall(fn, 0) #define core_initcall(fn)__define_initcall(fn, 1) #define postcore_initcall(fn)__define_initcall(fn, 2) #define arch_initcall(fn)__define_initcall(fn, 3) #define subsys_initcall(fn) __define_initcall(fn, 4) #define fs_initcall(fn) __define_initcall(fn, 5) - #define rootfs_initcall(fn) __define_initcall(fn, rootfs) #define device_initcall(fn) __define_initcall(fn, 6) #define late_initcall(fn)__define_initcall(fn, 7) Since we start off from rootfs_initcall(), it gives us 3 more levels of leg room to play with for order semantics, this isn't enough to address all required levels of dependencies, this is specially true given that AMD-KFD needs to be loaded before the radeon driver -- -but this it not enforced by symbols. If the AMD-KFD driver is not loaded prior to the radeon driver because otherwise the radeon driver will not initialize the AMD-KFD driver and you get no KFD functionality in userspace. Commit 1bacc894c227fad8a7 ("drivers: Move iommu/ before gpu/ in Makefile") works around some of the possibe races between the AMD IOMMU v2 and GPU drivers by changing the link order. This is fragile, however its the bets we can do, given that making the GPU drivers use late_initcall() would also implicate a similar race between them. That possible race is fortunatley addressed given that the drm Makefile currently has amdkfd linked prior to radeon: drivers/gpu/drm/Makefile ... obj-$(CONFIG_HSA_AMD) += amd/amdkfd/ obj-$(CONFIG_DRM_RADEON)+= radeon/ ... Changing amdkfd and radeon to late_initcall() however is still the right call in orde to annotate explicitly a delayed dependency requirement between the GPU drivers and the IOMMUs. We can't address the fragile nature of the link order right now, but in the future that might be possible. Signed-off-by: Luis R. Rodriguez--- Please note, the changes to drivers/Makefile are just for the sake of forcing the possible race to occur, if this works well the actual [PATCH] submission will skip those changes as its pointless to remove those work arounds as it stands, due to the limited nature of the levels available for addressing requirements. Also, if you are aware of further dependency hell things like these -- please do let me know as I am interested in looking at addressing them. drivers/Makefile| 6 ++ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 2 +- drivers/gpu/drm/radeon/radeon_drv.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index 0b6f3d60193d..0fbe3982041f 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -50,10 +50,7 @@ obj-$(CONFIG_RESET_CONTROLLER) += reset/ obj-y += tty/ obj-y += char/ -# iommu/ comes before gpu as gpu are using iommu controllers -obj-$(CONFIG_IOMMU_SUPPORT)+= iommu/ - -# gpu/ comes after char for AGP vs DRM startup and after iommu +# gpu/ comes after char for AGP vs DRM startup obj-y += gpu/ obj-$(CONFIG_CONNECTOR)+= connector/ @@ -147,6 +144,7 @@ obj-y += clk/ obj-$(CONFIG_MAILBOX) += mailbox/ obj-$(CONFIG_HWSPINLOCK) += hwspinlock/ +obj-$(CONFIG_IOMMU_SUPPORT)+= iommu/ obj-$(CONFIG_REMOTEPROC) += remoteproc/ obj-$(CONFIG_RPMSG)+= rpmsg/ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 850a5623661f..3d1dab8a31c7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -141,7 +141,7 @@ static void __exit kfd_module_exit(void) dev_info(kfd_device, "Removed module\n"); } -module_init(kfd_module_init); +late_initcall(kfd_module_init); module_exit(kfd_module_exit);
[RFT v3] drm: use late_initcall() for amdkfd and radeon
To get KFD support in radeon we need the following initialization to happen in this order, their respective driver file that has its init routine listed next to it: 0. AMD IOMMUv1:arch/x86/kernel/pci-dma.c 1. AMD IOMMUv2:drivers/iommu/amd_iommu_v2.c 2. AMD KFD:drivers/gpu/drm/amd/amdkfd/kfd_module.c 3. AMD Radeon: drivers/gpu/drm/radeon/radeon_drv.c Order is rather implicit, but these drivers can currently only do so much given the amount of leg room available. Below are the respective init routines and how they are initialized: arch/x86/kernel/pci-dma.c rootfs_initcall(pci_iommu_init); drivers/iommu/amd_iommu_v2.cmodule_init(amd_iommu_v2_init); drivers/gpu/drm/amd/amdkfd/kfd_module.c module_init(kfd_module_init); drivers/gpu/drm/radeon/radeon_drv.c module_init(radeon_init); When a driver is built-in module_init() folds to use device_initcall(), and we have the following possible orders: #define pure_initcall(fn)__define_initcall(fn, 0) #define core_initcall(fn)__define_initcall(fn, 1) #define postcore_initcall(fn)__define_initcall(fn, 2) #define arch_initcall(fn)__define_initcall(fn, 3) #define subsys_initcall(fn) __define_initcall(fn, 4) #define fs_initcall(fn) __define_initcall(fn, 5) - #define rootfs_initcall(fn) __define_initcall(fn, rootfs) #define device_initcall(fn) __define_initcall(fn, 6) #define late_initcall(fn)__define_initcall(fn, 7) Since we start off from rootfs_initcall(), it gives us 3 more levels of leg room to play with for order semantics, this isn't enough to address all required levels of dependencies, this is specially true given that AMD-KFD needs to be loaded before the radeon driver -- -but this it not enforced by symbols. If the AMD-KFD driver is not loaded prior to the radeon driver because otherwise the radeon driver will not initialize the AMD-KFD driver and you get no KFD functionality in userspace. Commit 1bacc894c227fad8a7 ("drivers: Move iommu/ before gpu/ in Makefile") works around some of the possibe races between the AMD IOMMU v2 and GPU drivers by changing the link order. This is fragile, however its the bets we can do, given that making the GPU drivers use late_initcall() would also implicate a similar race between them. That possible race is fortunatley addressed given that the drm Makefile currently has amdkfd linked prior to radeon: drivers/gpu/drm/Makefile ... obj-$(CONFIG_HSA_AMD) += amd/amdkfd/ obj-$(CONFIG_DRM_RADEON)+= radeon/ ... Changing amdkfd and radeon to late_initcall() however is still the right call in orde to annotate explicitly a delayed dependency requirement between the GPU drivers and the IOMMUs. We can't address the fragile nature of the link order right now, but in the future that might be possible. Signed-off-by: Luis R. Rodriguez --- Please note, the changes to drivers/Makefile are just for the sake of forcing the possible race to occur, if this works well the actual [PATCH] submission will skip those changes as its pointless to remove those work arounds as it stands, due to the limited nature of the levels available for addressing requirements. Also, if you are aware of further dependency hell things like these -- please do let me know as I am interested in looking at addressing them. drivers/Makefile| 6 ++ drivers/gpu/drm/amd/amdkfd/kfd_module.c | 2 +- drivers/gpu/drm/radeon/radeon_drv.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index 0b6f3d60193d..0fbe3982041f 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -50,10 +50,7 @@ obj-$(CONFIG_RESET_CONTROLLER) += reset/ obj-y += tty/ obj-y += char/ -# iommu/ comes before gpu as gpu are using iommu controllers -obj-$(CONFIG_IOMMU_SUPPORT)+= iommu/ - -# gpu/ comes after char for AGP vs DRM startup and after iommu +# gpu/ comes after char for AGP vs DRM startup obj-y += gpu/ obj-$(CONFIG_CONNECTOR)+= connector/ @@ -147,6 +144,7 @@ obj-y += clk/ obj-$(CONFIG_MAILBOX) += mailbox/ obj-$(CONFIG_HWSPINLOCK) += hwspinlock/ +obj-$(CONFIG_IOMMU_SUPPORT)+= iommu/ obj-$(CONFIG_REMOTEPROC) += remoteproc/ obj-$(CONFIG_RPMSG)+= rpmsg/ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c index 850a5623661f..3d1dab8a31c7 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c @@ -141,7 +141,7 @@ static void __exit kfd_module_exit(void) dev_info(kfd_device, "Removed module\n"); } -module_init(kfd_module_init); +late_initcall(kfd_module_init); module_exit(kfd_module_exit); MODULE_AUTHOR(KFD_DRIVER_AUTHOR);
Re: [PATCH v3 10/12] spi: add driver for J-Core SPI controller
On Wed, May 25, 2016 at 11:17:08AM +0100, Mark Brown wrote: > On Wed, May 25, 2016 at 05:43:03AM +, Rich Felker wrote: > > > changes based on ml discussion of the v2 patch. The chipselect change > > has not been made yet, except for rewriting the current logic to be > > more clear. If the decision is that it's needed, I can do it easily, > > but it needs testing still. > > You need to fix that, it's not good to just decide to ignore review > comments. It is astonishing that you are doing this while expecting to > bypass the normal process for merging things. I'm not going to look at > this version, please ensure you have addressed all the feedback from the > previous review before any new version you submit. I've made and tested a change that looks like what you want, with the hw->csReg's chipselect bits set initially high in the probe function, and only the requested bit being modified by the chipselect function. Would you prefer I continue to submit this driver with new versions of the patch series, or separate it out for further review on its own? Rich
Re: [PATCH v3 10/12] spi: add driver for J-Core SPI controller
On Wed, May 25, 2016 at 11:17:08AM +0100, Mark Brown wrote: > On Wed, May 25, 2016 at 05:43:03AM +, Rich Felker wrote: > > > changes based on ml discussion of the v2 patch. The chipselect change > > has not been made yet, except for rewriting the current logic to be > > more clear. If the decision is that it's needed, I can do it easily, > > but it needs testing still. > > You need to fix that, it's not good to just decide to ignore review > comments. It is astonishing that you are doing this while expecting to > bypass the normal process for merging things. I'm not going to look at > this version, please ensure you have addressed all the feedback from the > previous review before any new version you submit. I've made and tested a change that looks like what you want, with the hw->csReg's chipselect bits set initially high in the probe function, and only the requested bit being modified by the chipselect function. Would you prefer I continue to submit this driver with new versions of the patch series, or separate it out for further review on its own? Rich
Re: [RFC PATCH 1/2] mmc: dw_mmc: remove redundant of set_bit and clear_bit
Hi Shawn, On 05/26/2016 12:07 PM, Shawn Lin wrote: > dw_mci_get_cd have already dealed with these for > both of internal card-detect and gpio card-detect. s/dealed/dealt This patch looks good to me. Could you resend the patch? not RFC. Best Regards, Jaehoon Chung > > Signed-off-by: Shawn Lin> > --- > > drivers/mmc/host/dw_mmc.c | 5 + > 1 file changed, 1 insertion(+), 4 deletions(-) > > diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c > index 829a6ee..cb30e91 100644 > --- a/drivers/mmc/host/dw_mmc.c > +++ b/drivers/mmc/host/dw_mmc.c > @@ -2616,10 +2616,7 @@ static int dw_mci_init_slot(struct dw_mci *host, > unsigned int id) > mmc->max_seg_size = mmc->max_req_size; > } > > - if (dw_mci_get_cd(mmc)) > - set_bit(DW_MMC_CARD_PRESENT, >flags); > - else > - clear_bit(DW_MMC_CARD_PRESENT, >flags); > + dw_mci_get_cd(mmc); > > ret = mmc_add_host(mmc); > if (ret) >
Re: [RFC PATCH 2/2] mmc: dw_mmc: check card present before starting request
Hi Shawn, On 05/26/2016 12:08 PM, Shawn Lin wrote: > The main reason to add this check is to avoid unnecessary > mmc_request if the card is removed. Although we have already > check this in dw_mci_handle_cd for runtime usage of sd card and > dw_mci_init_slot for noremovable devices, but there is a timing > gap before it really calls dw_mci_get_cd as mmc_detect_change needs > some delay here. What's unnecessary mmc_request? > > Another gain here is that we could save some checkings of card status > after sd card been removed. > > Signed-off-by: Shawn Lin> --- > > drivers/mmc/host/dw_mmc.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c > index cb30e91..2940d24 100644 > --- a/drivers/mmc/host/dw_mmc.c > +++ b/drivers/mmc/host/dw_mmc.c > @@ -105,6 +105,7 @@ struct idmac_desc { > static bool dw_mci_reset(struct dw_mci *host); > static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset); > static int dw_mci_card_busy(struct mmc_host *mmc); > +static int dw_mci_get_cd(struct mmc_host *mmc); > > #if defined(CONFIG_DEBUG_FS) > static int dw_mci_req_show(struct seq_file *s, void *v) > @@ -1248,6 +1249,8 @@ static void dw_mci_request(struct mmc_host *mmc, struct > mmc_request *mrq) > > WARN_ON(slot->mrq); > > + dw_mci_get_cd(mmc); > + If my understanding is right, it can be replaced if (!test_bit(DW_MMC_CARD_PRESENT, >flags)) to if (!dw_mci_get_cd(mmc)) Best Regards, Jaehoon Chung > /* >* The check for card presence and queueing of the request must be >* atomic, otherwise the card could be removed in between and the >
Re: [RFC PATCH 1/2] mmc: dw_mmc: remove redundant of set_bit and clear_bit
Hi Shawn, On 05/26/2016 12:07 PM, Shawn Lin wrote: > dw_mci_get_cd have already dealed with these for > both of internal card-detect and gpio card-detect. s/dealed/dealt This patch looks good to me. Could you resend the patch? not RFC. Best Regards, Jaehoon Chung > > Signed-off-by: Shawn Lin > > --- > > drivers/mmc/host/dw_mmc.c | 5 + > 1 file changed, 1 insertion(+), 4 deletions(-) > > diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c > index 829a6ee..cb30e91 100644 > --- a/drivers/mmc/host/dw_mmc.c > +++ b/drivers/mmc/host/dw_mmc.c > @@ -2616,10 +2616,7 @@ static int dw_mci_init_slot(struct dw_mci *host, > unsigned int id) > mmc->max_seg_size = mmc->max_req_size; > } > > - if (dw_mci_get_cd(mmc)) > - set_bit(DW_MMC_CARD_PRESENT, >flags); > - else > - clear_bit(DW_MMC_CARD_PRESENT, >flags); > + dw_mci_get_cd(mmc); > > ret = mmc_add_host(mmc); > if (ret) >
Re: [RFC PATCH 2/2] mmc: dw_mmc: check card present before starting request
Hi Shawn, On 05/26/2016 12:08 PM, Shawn Lin wrote: > The main reason to add this check is to avoid unnecessary > mmc_request if the card is removed. Although we have already > check this in dw_mci_handle_cd for runtime usage of sd card and > dw_mci_init_slot for noremovable devices, but there is a timing > gap before it really calls dw_mci_get_cd as mmc_detect_change needs > some delay here. What's unnecessary mmc_request? > > Another gain here is that we could save some checkings of card status > after sd card been removed. > > Signed-off-by: Shawn Lin > --- > > drivers/mmc/host/dw_mmc.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c > index cb30e91..2940d24 100644 > --- a/drivers/mmc/host/dw_mmc.c > +++ b/drivers/mmc/host/dw_mmc.c > @@ -105,6 +105,7 @@ struct idmac_desc { > static bool dw_mci_reset(struct dw_mci *host); > static bool dw_mci_ctrl_reset(struct dw_mci *host, u32 reset); > static int dw_mci_card_busy(struct mmc_host *mmc); > +static int dw_mci_get_cd(struct mmc_host *mmc); > > #if defined(CONFIG_DEBUG_FS) > static int dw_mci_req_show(struct seq_file *s, void *v) > @@ -1248,6 +1249,8 @@ static void dw_mci_request(struct mmc_host *mmc, struct > mmc_request *mrq) > > WARN_ON(slot->mrq); > > + dw_mci_get_cd(mmc); > + If my understanding is right, it can be replaced if (!test_bit(DW_MMC_CARD_PRESENT, >flags)) to if (!dw_mci_get_cd(mmc)) Best Regards, Jaehoon Chung > /* >* The check for card presence and queueing of the request must be >* atomic, otherwise the card could be removed in between and the >
Re: [RFT v2] iommu/amd: use subsys_initcall() on amdv2 iommu
On Mon, Apr 25, 2016 at 12:23:51PM +0200, Joerg Roedel wrote: > On Mon, Apr 18, 2016 at 02:03:50PM +0200, Luis R. Rodriguez wrote: > > You said that with my patch you saw AMD IOMMUv2 kick off first, > > that was intentional as I thought that's what you needed. Can > > someone please describe the requirements? > > > > Also what does drm use that you say has a conflict already? What > > drm code are we talking about exactly ? > > The required order is: > > 1. AMD IOMMUv1 (in-kernel only, initialized at boot time after PCI) > 2. AMD IOMMUv2 (in-kernel or as module, provides demand paging > and uses v1 interfaces to talk to the IOMMU) > 3. AMD-KFD (Implements compute offloading to the GPU and > uses AMD IOMMUv2 functionality, also provides a > symbol for the radeon driver) > 4. DRM with(Checks if the symbol provided by AMD-KFD is > Radeon available at init time and does the KFD > initialization from there, because the GPU needs > to be up first) > > So AMD IOMMUv2 does not initialize (but it does load to have the symbols > available for drivers that optionally use its functionality) without the > AMD IOMMUv1 driver, as it can't access the IOMMU hardware then. > > AMD-KFD does not load without AMD IOMMUv2 being loaded, as it depends on > its symbols. AMD-KFD on the other hand needs to be loaded before the > radeon driver (but this it not enforced by symbols), because otherwise > the radeon driver will not initialize the AMD-KFD driver. > > When AMD-KFD is loaded and you load radeon then, you get the KFD > functionality in the kernel. Then you can move on to the fun getting the > userspace running and actually execute anything on the GPU. But thats > another story. > > I know what you think, and I agree: It's a big mess :) Its no concern for me that its a mess, frankly most drivers are. This however illustrates a semantic gap in the driver core when you have more than 3 dependencies after a rootfs_initcall(), given that the buck stops at late_initcall(), only 3 levels above. With 4 dependencies you run out of semantics to specify explicit requirements. Ties can be disputed through link order though, but this is obviously fragile, and the dependencies are then left implicit. Nothing vets for correctness, either in software or through static analysers. To summarize the specific code in question is (and order required): AMD IOMMUv1:arch/x86/kernel/pci-dma.c rootfs_initcall(pci_iommu_init); AMD IOMMUv2:drivers/iommu/amd_iommu_v2.c module_init(amd_iommu_v2_init); (device_initcall()) AMD KFD:drivers/gpu/drm/amd/amdkfd/kfd_module.c module_init(kfd_module_init); (device_initcall()) AMD Radeon: drivers/gpu/drm/radeon/radeon_drv.c module_init(radeon_init); (device_initcall()) Commit 1bacc894c227fad8a7 ("drivers: Move iommu/ before gpu/ in Makefile") resolved the race between IOMMU and GPU drivers this way, however an alternative is to make the dependencies in levels explicit by making the amdkfd and radeon drivers both use late_initcall(), this however is technically still racy specially since you note that the amdkfd driver is not loaded prior to radeon through symbol dependencies, if happens to be loaded it then you get KFD functionality, otherwise you don't. Making both amdkfd and radeon use late_initcall() would actually work now though but that's only because the link order also happens to match the dependency requirements: drivers/gpu/drm/Makefile ... obj-$(CONFIG_HSA_AMD) += amd/amdkfd/ obj-$(CONFIG_DRM_RADEON)+= radeon/ ... Since we currently lack proper semantics to define clear dependencies for all this reverting 1bacc894c227fad8a7 after using late_initcall() on both amdkfd and radeon would not be useful other than to just test the race fix, given that such work around would still be present also on drivers/gpu/drm/Makefile to account for the race between amdkfd and radeon. Reverting 1bacc894c227fad8a7 after using late_initcall() on both amdkfd and radeon would just bump the race work around another level. Modifying both amdkfd and radeon to use late_initcall() however seems well justified for now, and given the current state of affairs with link order one should be able to then correctly build all these things as built-in and it should work correctly. I'm still not satisfied with the issues on semantics here though. A fix for that, if desired, should be possible using linker-tables, currently in RFCv2 [0] stage. Linker tables offer practically infinite number (as long as a valid ELF section name, as the order level is part of the section name) of order levels. It also would offer the opportunity to build dependencies outside of the driver core layer, so you can customize the dependency map per subsystem as you see fit. So -- for now I'll
Re: [RFT v2] iommu/amd: use subsys_initcall() on amdv2 iommu
On Mon, Apr 25, 2016 at 12:23:51PM +0200, Joerg Roedel wrote: > On Mon, Apr 18, 2016 at 02:03:50PM +0200, Luis R. Rodriguez wrote: > > You said that with my patch you saw AMD IOMMUv2 kick off first, > > that was intentional as I thought that's what you needed. Can > > someone please describe the requirements? > > > > Also what does drm use that you say has a conflict already? What > > drm code are we talking about exactly ? > > The required order is: > > 1. AMD IOMMUv1 (in-kernel only, initialized at boot time after PCI) > 2. AMD IOMMUv2 (in-kernel or as module, provides demand paging > and uses v1 interfaces to talk to the IOMMU) > 3. AMD-KFD (Implements compute offloading to the GPU and > uses AMD IOMMUv2 functionality, also provides a > symbol for the radeon driver) > 4. DRM with(Checks if the symbol provided by AMD-KFD is > Radeon available at init time and does the KFD > initialization from there, because the GPU needs > to be up first) > > So AMD IOMMUv2 does not initialize (but it does load to have the symbols > available for drivers that optionally use its functionality) without the > AMD IOMMUv1 driver, as it can't access the IOMMU hardware then. > > AMD-KFD does not load without AMD IOMMUv2 being loaded, as it depends on > its symbols. AMD-KFD on the other hand needs to be loaded before the > radeon driver (but this it not enforced by symbols), because otherwise > the radeon driver will not initialize the AMD-KFD driver. > > When AMD-KFD is loaded and you load radeon then, you get the KFD > functionality in the kernel. Then you can move on to the fun getting the > userspace running and actually execute anything on the GPU. But thats > another story. > > I know what you think, and I agree: It's a big mess :) Its no concern for me that its a mess, frankly most drivers are. This however illustrates a semantic gap in the driver core when you have more than 3 dependencies after a rootfs_initcall(), given that the buck stops at late_initcall(), only 3 levels above. With 4 dependencies you run out of semantics to specify explicit requirements. Ties can be disputed through link order though, but this is obviously fragile, and the dependencies are then left implicit. Nothing vets for correctness, either in software or through static analysers. To summarize the specific code in question is (and order required): AMD IOMMUv1:arch/x86/kernel/pci-dma.c rootfs_initcall(pci_iommu_init); AMD IOMMUv2:drivers/iommu/amd_iommu_v2.c module_init(amd_iommu_v2_init); (device_initcall()) AMD KFD:drivers/gpu/drm/amd/amdkfd/kfd_module.c module_init(kfd_module_init); (device_initcall()) AMD Radeon: drivers/gpu/drm/radeon/radeon_drv.c module_init(radeon_init); (device_initcall()) Commit 1bacc894c227fad8a7 ("drivers: Move iommu/ before gpu/ in Makefile") resolved the race between IOMMU and GPU drivers this way, however an alternative is to make the dependencies in levels explicit by making the amdkfd and radeon drivers both use late_initcall(), this however is technically still racy specially since you note that the amdkfd driver is not loaded prior to radeon through symbol dependencies, if happens to be loaded it then you get KFD functionality, otherwise you don't. Making both amdkfd and radeon use late_initcall() would actually work now though but that's only because the link order also happens to match the dependency requirements: drivers/gpu/drm/Makefile ... obj-$(CONFIG_HSA_AMD) += amd/amdkfd/ obj-$(CONFIG_DRM_RADEON)+= radeon/ ... Since we currently lack proper semantics to define clear dependencies for all this reverting 1bacc894c227fad8a7 after using late_initcall() on both amdkfd and radeon would not be useful other than to just test the race fix, given that such work around would still be present also on drivers/gpu/drm/Makefile to account for the race between amdkfd and radeon. Reverting 1bacc894c227fad8a7 after using late_initcall() on both amdkfd and radeon would just bump the race work around another level. Modifying both amdkfd and radeon to use late_initcall() however seems well justified for now, and given the current state of affairs with link order one should be able to then correctly build all these things as built-in and it should work correctly. I'm still not satisfied with the issues on semantics here though. A fix for that, if desired, should be possible using linker-tables, currently in RFCv2 [0] stage. Linker tables offer practically infinite number (as long as a valid ELF section name, as the order level is part of the section name) of order levels. It also would offer the opportunity to build dependencies outside of the driver core layer, so you can customize the dependency map per subsystem as you see fit. So -- for now I'll
Re: [PATCH] ACPI / Thermal / video: fix max_level incorrect value
On Thu, 2016-05-26 at 13:15 +0800, Aaron Lu wrote: > On 05/26/2016 09:49 AM, valdis.kletni...@vt.edu wrote: > > On Wed, 25 May 2016 13:15:26 +0800, Aaron Lu said: > >> Valdis, can you please give the patch a try? Thanks. > > > > Sorry, had a few days where actual work commitments and other > > things got in the way... I tested this patch against next-20160524, > > and can report that the problem is fixed, so feel free to > > stick a "Tested-By:" on it > > Thanks a lot for the confirm, here is the updated patch with your > tested-by tag added. > > From: Aaron Lu> Date: Sat, 21 May 2016 15:30:46 +0800 > Subject: [PATCH] ACPI / Thermal / video: fix max_level incorrect value > > commit 059500940def("ACPI/video: export acpi_video_get_levels") > mistakenly dropped the correct value of max_level and that caused the > set_level function following failed and the acpi_video backlight interface > didn't get created. Fix this by passing back the correct max_level value. > > While at it, also fix the param used in acpi_video_device_lcd_query_levels > where acpi_handle is expected but acpi_video_device is passed. > > Reported-and-tested-by: Valdis Kletnieks > Signed-off-by: Aaron Lu Acked-by: Zhang Rui > --- > drivers/acpi/acpi_video.c | 9 ++--- > drivers/thermal/int340x_thermal/int3406_thermal.c | 2 +- > include/acpi/video.h | 6 -- > 3 files changed, 11 insertions(+), 6 deletions(-) > > diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c > index 3d5b8a099351..c1d138e128cb 100644 > --- a/drivers/acpi/acpi_video.c > +++ b/drivers/acpi/acpi_video.c > @@ -754,7 +754,8 @@ static int acpi_video_bqc_quirk(struct acpi_video_device > *device, > } > > int acpi_video_get_levels(struct acpi_device *device, > - struct acpi_video_device_brightness **dev_br) > + struct acpi_video_device_brightness **dev_br, > + int *pmax_level) > { > union acpi_object *obj = NULL; > int i, max_level = 0, count = 0, level_ac_battery = 0; > @@ -841,6 +842,8 @@ int acpi_video_get_levels(struct acpi_device *device, > > br->count = count; > *dev_br = br; > + if (pmax_level) > + *pmax_level = max_level; > > out: > kfree(obj); > @@ -869,7 +872,7 @@ acpi_video_init_brightness(struct acpi_video_device > *device) > struct acpi_video_device_brightness *br = NULL; > int result = -EINVAL; > > - result = acpi_video_get_levels(device->dev, ); > + result = acpi_video_get_levels(device->dev, , _level); > if (result) > return result; > device->brightness = br; > @@ -1737,7 +1740,7 @@ static void acpi_video_run_bcl_for_osi(struct > acpi_video_bus *video) > > mutex_lock(>device_list_lock); > list_for_each_entry(dev, >video_device_list, entry) { > - if (!acpi_video_device_lcd_query_levels(dev, )) > + if (!acpi_video_device_lcd_query_levels(dev->dev->handle, > )) > kfree(levels); > } > mutex_unlock(>device_list_lock); > diff --git a/drivers/thermal/int340x_thermal/int3406_thermal.c > b/drivers/thermal/int340x_thermal/int3406_thermal.c > index 13d431cbd29e..a578cd257db4 100644 > --- a/drivers/thermal/int340x_thermal/int3406_thermal.c > +++ b/drivers/thermal/int340x_thermal/int3406_thermal.c > @@ -177,7 +177,7 @@ static int int3406_thermal_probe(struct platform_device > *pdev) > return -ENODEV; > d->raw_bd = bd; > > - ret = acpi_video_get_levels(ACPI_COMPANION(>dev), >br); > + ret = acpi_video_get_levels(ACPI_COMPANION(>dev), >br, NULL); > if (ret) > return ret; > > diff --git a/include/acpi/video.h b/include/acpi/video.h > index 70a41f742037..5731ccb42585 100644 > --- a/include/acpi/video.h > +++ b/include/acpi/video.h > @@ -51,7 +51,8 @@ extern void acpi_video_set_dmi_backlight_type(enum > acpi_backlight_type type); > */ > extern bool acpi_video_handles_brightness_key_presses(void); > extern int acpi_video_get_levels(struct acpi_device *device, > - struct acpi_video_device_brightness **dev_br); > + struct acpi_video_device_brightness **dev_br, > + int *pmax_level); > #else > static inline int acpi_video_register(void) { return 0; } > static inline void acpi_video_unregister(void) { return; } > @@ -72,7 +73,8 @@ static inline bool > acpi_video_handles_brightness_key_presses(void) > return false; > } > static inline int acpi_video_get_levels(struct acpi_device *device, > - struct acpi_video_device_brightness **dev_br) > + struct acpi_video_device_brightness **dev_br, > + int *pmax_level) > { > return -ENODEV; > }
Re: [PATCH] ACPI / Thermal / video: fix max_level incorrect value
On Thu, 2016-05-26 at 13:15 +0800, Aaron Lu wrote: > On 05/26/2016 09:49 AM, valdis.kletni...@vt.edu wrote: > > On Wed, 25 May 2016 13:15:26 +0800, Aaron Lu said: > >> Valdis, can you please give the patch a try? Thanks. > > > > Sorry, had a few days where actual work commitments and other > > things got in the way... I tested this patch against next-20160524, > > and can report that the problem is fixed, so feel free to > > stick a "Tested-By:" on it > > Thanks a lot for the confirm, here is the updated patch with your > tested-by tag added. > > From: Aaron Lu > Date: Sat, 21 May 2016 15:30:46 +0800 > Subject: [PATCH] ACPI / Thermal / video: fix max_level incorrect value > > commit 059500940def("ACPI/video: export acpi_video_get_levels") > mistakenly dropped the correct value of max_level and that caused the > set_level function following failed and the acpi_video backlight interface > didn't get created. Fix this by passing back the correct max_level value. > > While at it, also fix the param used in acpi_video_device_lcd_query_levels > where acpi_handle is expected but acpi_video_device is passed. > > Reported-and-tested-by: Valdis Kletnieks > Signed-off-by: Aaron Lu Acked-by: Zhang Rui > --- > drivers/acpi/acpi_video.c | 9 ++--- > drivers/thermal/int340x_thermal/int3406_thermal.c | 2 +- > include/acpi/video.h | 6 -- > 3 files changed, 11 insertions(+), 6 deletions(-) > > diff --git a/drivers/acpi/acpi_video.c b/drivers/acpi/acpi_video.c > index 3d5b8a099351..c1d138e128cb 100644 > --- a/drivers/acpi/acpi_video.c > +++ b/drivers/acpi/acpi_video.c > @@ -754,7 +754,8 @@ static int acpi_video_bqc_quirk(struct acpi_video_device > *device, > } > > int acpi_video_get_levels(struct acpi_device *device, > - struct acpi_video_device_brightness **dev_br) > + struct acpi_video_device_brightness **dev_br, > + int *pmax_level) > { > union acpi_object *obj = NULL; > int i, max_level = 0, count = 0, level_ac_battery = 0; > @@ -841,6 +842,8 @@ int acpi_video_get_levels(struct acpi_device *device, > > br->count = count; > *dev_br = br; > + if (pmax_level) > + *pmax_level = max_level; > > out: > kfree(obj); > @@ -869,7 +872,7 @@ acpi_video_init_brightness(struct acpi_video_device > *device) > struct acpi_video_device_brightness *br = NULL; > int result = -EINVAL; > > - result = acpi_video_get_levels(device->dev, ); > + result = acpi_video_get_levels(device->dev, , _level); > if (result) > return result; > device->brightness = br; > @@ -1737,7 +1740,7 @@ static void acpi_video_run_bcl_for_osi(struct > acpi_video_bus *video) > > mutex_lock(>device_list_lock); > list_for_each_entry(dev, >video_device_list, entry) { > - if (!acpi_video_device_lcd_query_levels(dev, )) > + if (!acpi_video_device_lcd_query_levels(dev->dev->handle, > )) > kfree(levels); > } > mutex_unlock(>device_list_lock); > diff --git a/drivers/thermal/int340x_thermal/int3406_thermal.c > b/drivers/thermal/int340x_thermal/int3406_thermal.c > index 13d431cbd29e..a578cd257db4 100644 > --- a/drivers/thermal/int340x_thermal/int3406_thermal.c > +++ b/drivers/thermal/int340x_thermal/int3406_thermal.c > @@ -177,7 +177,7 @@ static int int3406_thermal_probe(struct platform_device > *pdev) > return -ENODEV; > d->raw_bd = bd; > > - ret = acpi_video_get_levels(ACPI_COMPANION(>dev), >br); > + ret = acpi_video_get_levels(ACPI_COMPANION(>dev), >br, NULL); > if (ret) > return ret; > > diff --git a/include/acpi/video.h b/include/acpi/video.h > index 70a41f742037..5731ccb42585 100644 > --- a/include/acpi/video.h > +++ b/include/acpi/video.h > @@ -51,7 +51,8 @@ extern void acpi_video_set_dmi_backlight_type(enum > acpi_backlight_type type); > */ > extern bool acpi_video_handles_brightness_key_presses(void); > extern int acpi_video_get_levels(struct acpi_device *device, > - struct acpi_video_device_brightness **dev_br); > + struct acpi_video_device_brightness **dev_br, > + int *pmax_level); > #else > static inline int acpi_video_register(void) { return 0; } > static inline void acpi_video_unregister(void) { return; } > @@ -72,7 +73,8 @@ static inline bool > acpi_video_handles_brightness_key_presses(void) > return false; > } > static inline int acpi_video_get_levels(struct acpi_device *device, > - struct acpi_video_device_brightness **dev_br) > + struct acpi_video_device_brightness **dev_br, > + int *pmax_level) > { > return -ENODEV; > }
Re: [PATCH] Use the correct size to set block max sectors
On Thu, 05/26 17:08, Long Li wrote: > The block sector size should be in unit of 512 bytes, not in bytes. > > Signed-off-by: Long Li> > --- > drivers/scsi/sd.c | 6 -- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c > index 428c03e..4bce17e 100644 > --- a/drivers/scsi/sd.c > +++ b/drivers/scsi/sd.c > @@ -2862,9 +2862,11 @@ static int sd_revalidate_disk(struct gendisk *disk) > if (sdkp->opt_xfer_blocks && > sdkp->opt_xfer_blocks <= dev_max && > sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS && > - sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) > - rw_max = q->limits.io_opt = > + sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) { > + q->limits.io_opt = > sdkp->opt_xfer_blocks * sdp->sector_size; > + rw_max = (q->limits.io_opt >> 9); Other than the superfluous parenthesis, looks good to me. Fam > + } > else > rw_max = BLK_DEF_MAX_SECTORS; > > -- > 1.8.5.6 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-scsi" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Use the correct size to set block max sectors
On Thu, 05/26 17:08, Long Li wrote: > The block sector size should be in unit of 512 bytes, not in bytes. > > Signed-off-by: Long Li > > --- > drivers/scsi/sd.c | 6 -- > 1 file changed, 4 insertions(+), 2 deletions(-) > > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c > index 428c03e..4bce17e 100644 > --- a/drivers/scsi/sd.c > +++ b/drivers/scsi/sd.c > @@ -2862,9 +2862,11 @@ static int sd_revalidate_disk(struct gendisk *disk) > if (sdkp->opt_xfer_blocks && > sdkp->opt_xfer_blocks <= dev_max && > sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS && > - sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) > - rw_max = q->limits.io_opt = > + sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) { > + q->limits.io_opt = > sdkp->opt_xfer_blocks * sdp->sector_size; > + rw_max = (q->limits.io_opt >> 9); Other than the superfluous parenthesis, looks good to me. Fam > + } > else > rw_max = BLK_DEF_MAX_SECTORS; > > -- > 1.8.5.6 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-scsi" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 01/23] all: syscall wrappers: add documentation
On Thu, May 26, 2016 at 11:29:45PM +0100, Catalin Marinas wrote: > On Thu, May 26, 2016 at 11:48:19PM +0300, Yury Norov wrote: > > On Wed, May 25, 2016 at 02:28:21PM -0700, David Miller wrote: > > > From: Arnd Bergmann> > > Date: Wed, 25 May 2016 23:01:06 +0200 > > > > > > > On Wednesday, May 25, 2016 1:50:39 PM CEST David Miller wrote: > > > >> From: Arnd Bergmann > > > >> Date: Wed, 25 May 2016 22:47:33 +0200 > > > >> > > > >> > If we use the normal calling conventions, we could remove these > > > >> > overrides > > > >> > along with the respective special-case handling in glibc. None of > > > >> > them > > > >> > look particularly performance-sensitive, but I could be wrong there. > > > >> > > > >> You could set the lowest bit in the system call entry pointer to > > > >> indicate > > > >> the upper-half clears should be elided. > > > > > > > > Right, but that would introduce an extra conditional branch in the > > > > syscall > > > > hotpath, and likely eliminate the gains from passing the loff_t > > > > arguments > > > > in a single register instead of a pair. > > > > > > Ok, then, how much are you really gaining from avoiding a 'shift' and > > > an 'or' to build the full 64-bit value? 3 cycles? Maybe 4? > > > > 4 cycles in kernel and ~same cost in glibc to create a pair. > > It would take a single instruction per argument in the kernel to do > shift+or and maybe 1-2 more instructions to move the remaining arguments > in place (we do this for a few wrappers in arch/arm64/kernel/entry32.S). > And the glibc counterpart. > > > And 8 'mov's that exist for every syscall, even yield(). > > > > > And the executing the wrappers, those have a non-trivial cost too. > > > > The cost is pretty trivial though. See kernel/compat_wrapper.o: > > COMPAT_SYSCALL_WRAP2(creat, const char __user *, pathname, umode_t, mode); > > 0: a9bf7bfdstp x29, x30, [sp,#-16]! > > 4: 910003fdmov x29, sp > > 8: 2a0003e0mov w0, w0 > > c: 9400bl 0 > > 10: a8c17bfdldp x29, x30, [sp],#16 > > 14: d65f03c0ret > > I would say the above could be more expensive than 8 movs (16 bytes to > write, read, a branch and a ret). You can also add the I-cache locality, > having wrappers for each syscalls instead of a single place for zeroing > the upper half (where no other wrapper is necessary). > > Can we trick the compiler into doing a tail call optimisation. This > could have simply been: > > COMPAT_SYSCALL_WRAP2(creat, ...): > mov w0, w0 > b What you talk about was in my initial version. But Heiko insisted on having all wrappers together. http://www.spinics.net/lists/linux-s390/msg11593.html Grep your email for discussion. > > > > Cost wise, this seems like it all cancels out in the end, but what > > > do I know? > > > > I think you know something, and I also think Heiko and other s390 guys > > know something as well. So I'd like to listen their arguments here. > > > > For me spark64 way is looking reasonable only because it's really simple > > and takes less coding. I'll try it on some branch and share here what > > happened. > > The kernel code will definitely look simpler ;). It would be good to see > if there actually is any performance impact. Even with 16 more cycles on > syscall entry, would they be lost in the noise? You don't need a full > implementation, just some dummy mov x0, x0 on the entry path. > > -- > Catalin
Re: [PATCH 01/23] all: syscall wrappers: add documentation
On Thu, May 26, 2016 at 11:29:45PM +0100, Catalin Marinas wrote: > On Thu, May 26, 2016 at 11:48:19PM +0300, Yury Norov wrote: > > On Wed, May 25, 2016 at 02:28:21PM -0700, David Miller wrote: > > > From: Arnd Bergmann > > > Date: Wed, 25 May 2016 23:01:06 +0200 > > > > > > > On Wednesday, May 25, 2016 1:50:39 PM CEST David Miller wrote: > > > >> From: Arnd Bergmann > > > >> Date: Wed, 25 May 2016 22:47:33 +0200 > > > >> > > > >> > If we use the normal calling conventions, we could remove these > > > >> > overrides > > > >> > along with the respective special-case handling in glibc. None of > > > >> > them > > > >> > look particularly performance-sensitive, but I could be wrong there. > > > >> > > > >> You could set the lowest bit in the system call entry pointer to > > > >> indicate > > > >> the upper-half clears should be elided. > > > > > > > > Right, but that would introduce an extra conditional branch in the > > > > syscall > > > > hotpath, and likely eliminate the gains from passing the loff_t > > > > arguments > > > > in a single register instead of a pair. > > > > > > Ok, then, how much are you really gaining from avoiding a 'shift' and > > > an 'or' to build the full 64-bit value? 3 cycles? Maybe 4? > > > > 4 cycles in kernel and ~same cost in glibc to create a pair. > > It would take a single instruction per argument in the kernel to do > shift+or and maybe 1-2 more instructions to move the remaining arguments > in place (we do this for a few wrappers in arch/arm64/kernel/entry32.S). > And the glibc counterpart. > > > And 8 'mov's that exist for every syscall, even yield(). > > > > > And the executing the wrappers, those have a non-trivial cost too. > > > > The cost is pretty trivial though. See kernel/compat_wrapper.o: > > COMPAT_SYSCALL_WRAP2(creat, const char __user *, pathname, umode_t, mode); > > 0: a9bf7bfdstp x29, x30, [sp,#-16]! > > 4: 910003fdmov x29, sp > > 8: 2a0003e0mov w0, w0 > > c: 9400bl 0 > > 10: a8c17bfdldp x29, x30, [sp],#16 > > 14: d65f03c0ret > > I would say the above could be more expensive than 8 movs (16 bytes to > write, read, a branch and a ret). You can also add the I-cache locality, > having wrappers for each syscalls instead of a single place for zeroing > the upper half (where no other wrapper is necessary). > > Can we trick the compiler into doing a tail call optimisation. This > could have simply been: > > COMPAT_SYSCALL_WRAP2(creat, ...): > mov w0, w0 > b What you talk about was in my initial version. But Heiko insisted on having all wrappers together. http://www.spinics.net/lists/linux-s390/msg11593.html Grep your email for discussion. > > > > Cost wise, this seems like it all cancels out in the end, but what > > > do I know? > > > > I think you know something, and I also think Heiko and other s390 guys > > know something as well. So I'd like to listen their arguments here. > > > > For me spark64 way is looking reasonable only because it's really simple > > and takes less coding. I'll try it on some branch and share here what > > happened. > > The kernel code will definitely look simpler ;). It would be good to see > if there actually is any performance impact. Even with 16 more cycles on > syscall entry, would they be lost in the noise? You don't need a full > implementation, just some dummy mov x0, x0 on the entry path. > > -- > Catalin
[PATCH 2/4] f2fs: inject to produce some orphan inodes
Signed-off-by: Jaegeuk Kim--- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/inode.c | 5 + fs/f2fs/super.c | 1 + 3 files changed, 9 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5daba19..c4697b7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -45,6 +45,7 @@ enum { FAULT_ORPHAN, FAULT_BLOCK, FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, FAULT_MAX, }; @@ -74,6 +75,8 @@ static inline bool time_to_inject(int type) return false; else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type)) return false; + else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type)) + return false; atomic_inc(_fault.inject_ops); if (atomic_read(_fault.inject_ops) >= f2fs_fault.inject_rate) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bdd814d..11cb60a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -345,6 +345,11 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5b5739..27f76819e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", }; static void f2fs_build_fault_attr(unsigned int rate) -- 2.6.3
[PATCH 2/4] f2fs: inject to produce some orphan inodes
Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/inode.c | 5 + fs/f2fs/super.c | 1 + 3 files changed, 9 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5daba19..c4697b7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -45,6 +45,7 @@ enum { FAULT_ORPHAN, FAULT_BLOCK, FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, FAULT_MAX, }; @@ -74,6 +75,8 @@ static inline bool time_to_inject(int type) return false; else if (type == FAULT_DIR_DEPTH && !IS_FAULT_SET(type)) return false; + else if (type == FAULT_EVICT_INODE && !IS_FAULT_SET(type)) + return false; atomic_inc(_fault.inject_ops); if (atomic_read(_fault.inject_ops) >= f2fs_fault.inject_rate) { diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bdd814d..11cb60a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -345,6 +345,11 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a5b5739..27f76819e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", }; static void f2fs_build_fault_attr(unsigned int rate) -- 2.6.3
[PATCH 4/4] f2fs: remove two steps to flush dirty data pages
If there is no cold page, we don't need to do a loop to flush dirty data pages. On /dev/pmem0, 1. dd if=/dev/zero of=/mnt/test/testfile bs=1M count=2048 conv=fsync Before : 1.1 GB/s After : 1.2 GB/s 2. dd if=/dev/zero of=/mnt/test/testfile bs=1M count=2048 Before : 2.2 GB/s After : 2.3 GB/s Signed-off-by: Jaegeuk Kim--- fs/f2fs/data.c | 11 +-- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 85ceb2b..5dcd8db 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1326,10 +1326,9 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; pagevec_init(, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1384,9 +1383,6 @@ continue_unlock: goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) f2fs_wait_on_page_writeback(page, @@ -1421,11 +1417,6 @@ continue_unlock: cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; -- 2.6.3
[PATCH 4/4] f2fs: remove two steps to flush dirty data pages
If there is no cold page, we don't need to do a loop to flush dirty data pages. On /dev/pmem0, 1. dd if=/dev/zero of=/mnt/test/testfile bs=1M count=2048 conv=fsync Before : 1.1 GB/s After : 1.2 GB/s 2. dd if=/dev/zero of=/mnt/test/testfile bs=1M count=2048 Before : 2.2 GB/s After : 2.3 GB/s Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +-- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 85ceb2b..5dcd8db 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1326,10 +1326,9 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; pagevec_init(, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1384,9 +1383,6 @@ continue_unlock: goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) f2fs_wait_on_page_writeback(page, @@ -1421,11 +1417,6 @@ continue_unlock: cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; -- 2.6.3
[PATCH 3/4] f2fs: do not skip writing data pages
For data pages, let's try to flush as much as possible in background. On /dev/pmem0, 1. dd if=/dev/zero of=/mnt/test/testfile bs=1M count=2048 conv=fsync Before : 800 MB/s After : 1.1 GB/s 2. dd if=/dev/zero of=/mnt/test/testfile bs=1M count=2048 Before : 1.3 GB/s After : 2.2 GB/s Signed-off-by: Jaegeuk Kim--- fs/f2fs/data.c| 11 +-- fs/f2fs/segment.h | 4 +--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7132b02..85ceb2b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1444,7 +1444,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; - long diff; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1469,14 +1468,14 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); - diff = nr_pages_to_write(sbi, DATA, wbc); - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + /* +* if some pages were truncated, we cannot guarantee its mapping->host +* to detect pending bios. +*/ + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; skip_write: diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5d016a1..890bb28d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -728,9 +728,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - desired = 4096; - else if (type == NODE) + if (type == NODE) desired = 3 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); -- 2.6.3
[PATCH 1/4] f2fs: propagate error given by f2fs_find_entry
If we get ENOMEM or EIO in f2fs_find_entry, we should stop right away. Otherwise, for example, we can get duplicate directory entry by ->chash and ->clevel. Signed-off-by: Jaegeuk Kim--- fs/f2fs/dir.c| 23 --- fs/f2fs/inline.c | 4 +++- fs/f2fs/namei.c | 5 + 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 24d1308..ae37543 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -185,8 +185,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, _slots, @@ -223,19 +228,22 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct fscrypt_name fname; int err; - *res_page = NULL; - err = fscrypt_setup_filename(dir, child, 1, ); - if (err) + if (err) { + *res_page = ERR_PTR(-ENOMEM); return NULL; + } if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; de = find_in_inline_dir(dir, , res_page); goto out; } - if (npages == 0) + if (npages == 0) { + *res_page = NULL; goto out; + } max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { @@ -247,8 +255,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, } for (level = 0; level < max_depth; level++) { + *res_page = NULL; de = find_in_level(dir, level, , res_page); - if (de) + if (de || IS_ERR(*res_page)) break; } out: diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 77c9c24..1eb3043 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -286,8 +286,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } namehash = f2fs_dentry_hash(); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 496f4e3..3f6119e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -232,6 +232,9 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, , NULL, dir->i_ino, S_IFDIR); if (err) @@ -242,6 +245,8 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { err = __f2fs_add_link(dir, , NULL, pino, S_IFDIR); } -- 2.6.3
[PATCH 3/4] f2fs: do not skip writing data pages
For data pages, let's try to flush as much as possible in background. On /dev/pmem0, 1. dd if=/dev/zero of=/mnt/test/testfile bs=1M count=2048 conv=fsync Before : 800 MB/s After : 1.1 GB/s 2. dd if=/dev/zero of=/mnt/test/testfile bs=1M count=2048 Before : 1.3 GB/s After : 2.2 GB/s Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c| 11 +-- fs/f2fs/segment.h | 4 +--- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7132b02..85ceb2b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1444,7 +1444,6 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; - long diff; /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1469,14 +1468,14 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); - diff = nr_pages_to_write(sbi, DATA, wbc); - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + /* +* if some pages were truncated, we cannot guarantee its mapping->host +* to detect pending bios. +*/ + f2fs_submit_merged_bio(sbi, DATA, WRITE); remove_dirty_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; skip_write: diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5d016a1..890bb28d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -728,9 +728,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - desired = 4096; - else if (type == NODE) + if (type == NODE) desired = 3 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); -- 2.6.3
[PATCH 1/4] f2fs: propagate error given by f2fs_find_entry
If we get ENOMEM or EIO in f2fs_find_entry, we should stop right away. Otherwise, for example, we can get duplicate directory entry by ->chash and ->clevel. Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c| 23 --- fs/f2fs/inline.c | 4 +++- fs/f2fs/namei.c | 5 + 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 24d1308..ae37543 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -185,8 +185,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, _slots, @@ -223,19 +228,22 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct fscrypt_name fname; int err; - *res_page = NULL; - err = fscrypt_setup_filename(dir, child, 1, ); - if (err) + if (err) { + *res_page = ERR_PTR(-ENOMEM); return NULL; + } if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; de = find_in_inline_dir(dir, , res_page); goto out; } - if (npages == 0) + if (npages == 0) { + *res_page = NULL; goto out; + } max_depth = F2FS_I(dir)->i_current_depth; if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { @@ -247,8 +255,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, } for (level = 0; level < max_depth; level++) { + *res_page = NULL; de = find_in_level(dir, level, , res_page); - if (de) + if (de || IS_ERR(*res_page)) break; } out: diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 77c9c24..1eb3043 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -286,8 +286,10 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } namehash = f2fs_dentry_hash(); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 496f4e3..3f6119e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -232,6 +232,9 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, , NULL, dir->i_ino, S_IFDIR); if (err) @@ -242,6 +245,8 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { err = __f2fs_add_link(dir, , NULL, pino, S_IFDIR); } -- 2.6.3
Re: [GIT PULL] xfs: updates for 4.7-rc1
On Thu, May 26, 2016 at 10:19:13AM -0700, Linus Torvalds wrote: > On Wed, May 25, 2016 at 11:13 PM, Dave Chinnerwrote: > > > > Just yell if this is not OK and I'll drop those branches for this > > merge and resend the pull request > > i'm ok with the late branches, it's not like xfs has been a problem spot. Still, I'll try to avoid them because it reduces testing time. > However: > > > are available in the git repository at: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git for-next > > Your pull request mentions the 'for-next' branch, but I think you > *meant* to send me the "xfs-for-linus-4.7-rc1" tag which points to the > same commit and has your summary in it. Mea culpa. I ran this command after creating the tag: $ git request-pull v4.6 git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git tags/xfs-for-linus-4.7-rc1 > ~/tmp/t.txt And I didn't check the output closely enough. I forgot to push the tag to the upstream repo before running request-pull. Git often makes it very easy to make mistakes whilst simultaneously making it hard to notice you've made a mistake. :( Cheers, Dave. -- Dave Chinner da...@fromorbit.com
Re: [GIT PULL] xfs: updates for 4.7-rc1
On Thu, May 26, 2016 at 10:19:13AM -0700, Linus Torvalds wrote: > On Wed, May 25, 2016 at 11:13 PM, Dave Chinner wrote: > > > > Just yell if this is not OK and I'll drop those branches for this > > merge and resend the pull request > > i'm ok with the late branches, it's not like xfs has been a problem spot. Still, I'll try to avoid them because it reduces testing time. > However: > > > are available in the git repository at: > > > > git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git for-next > > Your pull request mentions the 'for-next' branch, but I think you > *meant* to send me the "xfs-for-linus-4.7-rc1" tag which points to the > same commit and has your summary in it. Mea culpa. I ran this command after creating the tag: $ git request-pull v4.6 git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs.git tags/xfs-for-linus-4.7-rc1 > ~/tmp/t.txt And I didn't check the output closely enough. I forgot to push the tag to the upstream repo before running request-pull. Git often makes it very easy to make mistakes whilst simultaneously making it hard to notice you've made a mistake. :( Cheers, Dave. -- Dave Chinner da...@fromorbit.com
[PATCH 1/2] timer: Export destroy_hrtimer_on_stack()
hrtimer_init_on_stack() needs a matching call to destroy_hrtimer_on_stack(), so both need to be exported. Signed-off-by: Guenter Roeck--- kernel/time/hrtimer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8c7392c4fdbd..e99df0ff1d42 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -425,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, _debug_descr); } +EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } -- 2.5.0
[PATCH 2/2] net: pktgen: Call destroy_hrtimer_on_stack()
If CONFIG_DEBUG_OBJECTS_TIMERS=y, hrtimer_init_on_stack() requires a matching call to destroy_hrtimer_on_stack() to clean up timer debug objects. Signed-off-by: Guenter Roeck--- net/core/pktgen.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8604ae245960..8b02df0d354d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2245,10 +2245,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) hrtimer_set_expires(, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining()); - if (remaining <= 0) { - pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); - return; - } + if (remaining <= 0) + goto out; start_time = ktime_get(); if (remaining < 10) { @@ -2273,7 +2271,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); +out: pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); + destroy_hrtimer_on_stack(); } static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) -- 2.5.0
[PATCH 1/2] timer: Export destroy_hrtimer_on_stack()
hrtimer_init_on_stack() needs a matching call to destroy_hrtimer_on_stack(), so both need to be exported. Signed-off-by: Guenter Roeck --- kernel/time/hrtimer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8c7392c4fdbd..e99df0ff1d42 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -425,6 +425,7 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, _debug_descr); } +EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } -- 2.5.0
[PATCH 2/2] net: pktgen: Call destroy_hrtimer_on_stack()
If CONFIG_DEBUG_OBJECTS_TIMERS=y, hrtimer_init_on_stack() requires a matching call to destroy_hrtimer_on_stack() to clean up timer debug objects. Signed-off-by: Guenter Roeck --- net/core/pktgen.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8604ae245960..8b02df0d354d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2245,10 +2245,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) hrtimer_set_expires(, spin_until); remaining = ktime_to_ns(hrtimer_expires_remaining()); - if (remaining <= 0) { - pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); - return; - } + if (remaining <= 0) + goto out; start_time = ktime_get(); if (remaining < 10) { @@ -2273,7 +2271,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) } pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time)); +out: pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay); + destroy_hrtimer_on_stack(); } static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) -- 2.5.0
Re: How should I use kernel-defined i2c structs in this driver
On 05/26/2016 04:59 PM, Andrey Utkin wrote: Could anybody please give a hint - which kernel-defined i2c objects, and how many of them, I need to define and use to substitute these driver-defined functions i2c_read(), i2c_write() ? https://github.com/bluecherrydvr/linux/blob/release/tw5864/1.16/drivers/media/pci/tw5864/tw5864-config.c In a word, there's 4 chips with different addresses, to which this code communicates via main chip's dedicated registers. Do i need a single i2c_adapter or several? Do i need i2c_client entities? where should I put what is named "devid" here? Thanks in advance. It depends how those are connected at hardware level. Quickly looking I think "devid" is here to select proper I2C adapter. So I think there is 4 I2C adapters and each of those adapter has 1 slave device. Is that correct? If yes, then register 4 I2C adapters and register single client for each of those adapters. regards Antti -- http://palosaari.fi/
Re: How should I use kernel-defined i2c structs in this driver
On 05/26/2016 04:59 PM, Andrey Utkin wrote: Could anybody please give a hint - which kernel-defined i2c objects, and how many of them, I need to define and use to substitute these driver-defined functions i2c_read(), i2c_write() ? https://github.com/bluecherrydvr/linux/blob/release/tw5864/1.16/drivers/media/pci/tw5864/tw5864-config.c In a word, there's 4 chips with different addresses, to which this code communicates via main chip's dedicated registers. Do i need a single i2c_adapter or several? Do i need i2c_client entities? where should I put what is named "devid" here? Thanks in advance. It depends how those are connected at hardware level. Quickly looking I think "devid" is here to select proper I2C adapter. So I think there is 4 I2C adapters and each of those adapter has 1 slave device. Is that correct? If yes, then register 4 I2C adapters and register single client for each of those adapters. regards Antti -- http://palosaari.fi/
Re: [RFT v2] iommu/amd: use subsys_initcall() on amdv2 iommu
On Tue, Apr 19, 2016 at 10:02:52AM +0800, Wan Zongshun wrote: > > You have to take carefully to arrange the calling sequence for > iommuv1, iommuv2, kfd module, and drm like the following sequence : > v1 ->v2->kfd, drm. > > iommuv1 -- rootfs_initcall(fn) > IOMMUV2 -- device_initcall(fn) > kfd module -- late_initcall(fn) > drm -- late_initcall(fn) Thanks, it turns out this is not exactly enough, given as Joerg notes: -- AMD-KFD on the other hand needs to be loaded before the radeon driver (but this it not enforced by symbols), because otherwise the radeon driver will not initialize the AMD-KFD driver. --- We have a theoretical race still possible between the kfd module and the drm driver. I'll reply to Joerg's e-mail with more feedback. Luis
Re: [RFT v2] iommu/amd: use subsys_initcall() on amdv2 iommu
On Tue, Apr 19, 2016 at 10:02:52AM +0800, Wan Zongshun wrote: > > You have to take carefully to arrange the calling sequence for > iommuv1, iommuv2, kfd module, and drm like the following sequence : > v1 ->v2->kfd, drm. > > iommuv1 -- rootfs_initcall(fn) > IOMMUV2 -- device_initcall(fn) > kfd module -- late_initcall(fn) > drm -- late_initcall(fn) Thanks, it turns out this is not exactly enough, given as Joerg notes: -- AMD-KFD on the other hand needs to be loaded before the radeon driver (but this it not enforced by symbols), because otherwise the radeon driver will not initialize the AMD-KFD driver. --- We have a theoretical race still possible between the kfd module and the drm driver. I'll reply to Joerg's e-mail with more feedback. Luis
[PATCH][RT] netpoll: Always take poll_lock when doing polling
[ Alison, can you try this patch ] This uses netpoll_poll_lock()/unlock() to synchronize netpoll and napi poll operations. Without this method, the synchronization is done by looping on NAPI_STATE_SCHED 'bitset'. This method works fine on a non-rt kernel because a softirq can not be preempted, and the thread poll is called with local_bh_disable() which prevents softirqs from running and preempting it. But on rt, this code can be preempted. Thus, the code may be preempted out while holding the NAPI_STATE_SCHED 'bitset', opening a window for a livelock. For example: napi_schedule_prep() test_and_set_bit(NAPI_STATE_SCHED, >state) sk_busy_loop() do { rc = busy_poll() ret = napi_schedule_prep() return !test_and_set_bit(NAPI_STATE_SCHED, >state) if (!ret) return 0 } while (...) /* for ever */ This isn't a problem in non PREEMPT_RT because the napi_schedule_prep() can not be preempted. But because it can in PREEMPT_RT, we need to add some extra locking. The netpoll_poll_lock() works well here, but they need to be added around any call to busy_poll(). Using IS_ENABLED(CONFIG_PREEMPT_RT_FULL) will allow gcc to optimize out the extra calls to poll_lock. Tested-by: "Luis Claudio R. Goncalves"Reviewed-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- include/linux/netpoll.h |2 +- include/net/busy_poll.h | 14 +- 2 files changed, 14 insertions(+), 2 deletions(-) Index: linux-rt.git/include/linux/netpoll.h === --- linux-rt.git.orig/include/linux/netpoll.h 2016-05-26 18:31:09.183150389 -0400 +++ linux-rt.git/include/linux/netpoll.h2016-05-26 18:52:02.657014280 -0400 @@ -77,7 +77,7 @@ static inline void *netpoll_poll_lock(st { struct net_device *dev = napi->dev; - if (dev && dev->npinfo) { + if (dev && (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) || dev->npinfo)) { spin_lock(>poll_lock); napi->poll_owner = smp_processor_id(); return napi; Index: linux-rt.git/include/net/busy_poll.h === --- linux-rt.git.orig/include/net/busy_poll.h 2016-05-26 18:31:09.183150389 -0400 +++ linux-rt.git/include/net/busy_poll.h2016-05-26 19:10:12.134266713 -0400 @@ -25,6 +25,7 @@ #define _LINUX_NET_BUSY_POLL_H #include +#include #include #ifdef CONFIG_NET_RX_BUSY_POLL @@ -97,7 +98,18 @@ static inline bool sk_busy_loop(struct s goto out; do { - rc = ops->ndo_busy_poll(napi); + /* When RT is enabled, napi_schedule_prep() can be preempted +* with NAPI_STATE_SCHED set, causing the busy_poll() function +* to always return zero, and this loop may never exit. +* In that case, we must always take the netpoll_poll_lock. +*/ + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) { + void *have = netpoll_poll_lock(napi); + rc = ops->ndo_busy_poll(napi); + netpoll_poll_unlock(have); + } else { + rc = ops->ndo_busy_poll(napi); + } if (rc == LL_FLUSH_FAILED) break; /* permanent failure */
[PATCH][RT] netpoll: Always take poll_lock when doing polling
[ Alison, can you try this patch ] This uses netpoll_poll_lock()/unlock() to synchronize netpoll and napi poll operations. Without this method, the synchronization is done by looping on NAPI_STATE_SCHED 'bitset'. This method works fine on a non-rt kernel because a softirq can not be preempted, and the thread poll is called with local_bh_disable() which prevents softirqs from running and preempting it. But on rt, this code can be preempted. Thus, the code may be preempted out while holding the NAPI_STATE_SCHED 'bitset', opening a window for a livelock. For example: napi_schedule_prep() test_and_set_bit(NAPI_STATE_SCHED, >state) sk_busy_loop() do { rc = busy_poll() ret = napi_schedule_prep() return !test_and_set_bit(NAPI_STATE_SCHED, >state) if (!ret) return 0 } while (...) /* for ever */ This isn't a problem in non PREEMPT_RT because the napi_schedule_prep() can not be preempted. But because it can in PREEMPT_RT, we need to add some extra locking. The netpoll_poll_lock() works well here, but they need to be added around any call to busy_poll(). Using IS_ENABLED(CONFIG_PREEMPT_RT_FULL) will allow gcc to optimize out the extra calls to poll_lock. Tested-by: "Luis Claudio R. Goncalves" Reviewed-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt --- include/linux/netpoll.h |2 +- include/net/busy_poll.h | 14 +- 2 files changed, 14 insertions(+), 2 deletions(-) Index: linux-rt.git/include/linux/netpoll.h === --- linux-rt.git.orig/include/linux/netpoll.h 2016-05-26 18:31:09.183150389 -0400 +++ linux-rt.git/include/linux/netpoll.h2016-05-26 18:52:02.657014280 -0400 @@ -77,7 +77,7 @@ static inline void *netpoll_poll_lock(st { struct net_device *dev = napi->dev; - if (dev && dev->npinfo) { + if (dev && (IS_ENABLED(CONFIG_PREEMPT_RT_FULL) || dev->npinfo)) { spin_lock(>poll_lock); napi->poll_owner = smp_processor_id(); return napi; Index: linux-rt.git/include/net/busy_poll.h === --- linux-rt.git.orig/include/net/busy_poll.h 2016-05-26 18:31:09.183150389 -0400 +++ linux-rt.git/include/net/busy_poll.h2016-05-26 19:10:12.134266713 -0400 @@ -25,6 +25,7 @@ #define _LINUX_NET_BUSY_POLL_H #include +#include #include #ifdef CONFIG_NET_RX_BUSY_POLL @@ -97,7 +98,18 @@ static inline bool sk_busy_loop(struct s goto out; do { - rc = ops->ndo_busy_poll(napi); + /* When RT is enabled, napi_schedule_prep() can be preempted +* with NAPI_STATE_SCHED set, causing the busy_poll() function +* to always return zero, and this loop may never exit. +* In that case, we must always take the netpoll_poll_lock. +*/ + if (IS_ENABLED(CONFIG_PREEMPT_RT_FULL)) { + void *have = netpoll_poll_lock(napi); + rc = ops->ndo_busy_poll(napi); + netpoll_poll_unlock(have); + } else { + rc = ops->ndo_busy_poll(napi); + } if (rc == LL_FLUSH_FAILED) break; /* permanent failure */
[PATCH] Staging: comedi: ni_atmio: fixed spacing and comment style issues
Fixed coding style issues. Signed-off-by: Steven Laabs--- drivers/staging/comedi/drivers/ni_atmio.c | 180 +++--- 1 file changed, 91 insertions(+), 89 deletions(-) diff --git a/drivers/staging/comedi/drivers/ni_atmio.c b/drivers/staging/comedi/drivers/ni_atmio.c index 95435b8..f957790 100644 --- a/drivers/staging/comedi/drivers/ni_atmio.c +++ b/drivers/staging/comedi/drivers/ni_atmio.c @@ -1,93 +1,95 @@ /* -comedi/drivers/ni_atmio.c -Hardware driver for NI AT-MIO E series cards - -COMEDI - Linux Control and Measurement Device Interface -Copyright (C) 1997-2001 David A. Schleef - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. -*/ -/* -Driver: ni_atmio -Description: National Instruments AT-MIO-E series -Author: ds -Devices: [National Instruments] AT-MIO-16E-1 (ni_atmio), - AT-MIO-16E-2, AT-MIO-16E-10, AT-MIO-16DE-10, AT-MIO-64E-3, - AT-MIO-16XE-50, AT-MIO-16XE-10, AT-AI-16XE-10 -Status: works -Updated: Thu May 1 20:03:02 CDT 2003 - -The driver has 2.6 kernel isapnp support, and -will automatically probe for a supported board if the -I/O base is left unspecified with comedi_config. -However, many of -the isapnp id numbers are unknown. If your board is not -recognized, please send the output of 'cat /proc/isapnp' -(you may need to modprobe the isa-pnp module for -/proc/isapnp to exist) so the -id numbers for your board can be added to the driver. - -Otherwise, you can use the isapnptools package to configure -your board. Use isapnp to -configure the I/O base and IRQ for the board, and then pass -the same values as -parameters in comedi_config. A sample isapnp.conf file is included -in the etc/ directory of Comedilib. - -Comedilib includes a utility to autocalibrate these boards. The -boards seem to boot into a state where the all calibration DACs -are at one extreme of their range, thus the default calibration -is terrible. Calibration at boot is strongly encouraged. - -To use the extended digital I/O on some of the boards, enable the -8255 driver when configuring the Comedi source tree. - -External triggering is supported for some events. The channel index -(scan_begin_arg, etc.) maps to PFI0 - PFI9. - -Some of the more esoteric triggering possibilities of these boards -are not supported. -*/ -/* - The real guts of the driver is in ni_mio_common.c, which is included - both here and in ni_pcimio.c - - Interrupt support added by Truxton Fulton - - References for specifications: - - 340747b.pdf Register Level Programmer Manual (obsolete) - 340747c.pdf Register Level Programmer Manual (new) - DAQ-STC reference manual - - Other possibly relevant info: - - 320517c.pdf User manual (obsolete) - 320517f.pdf User manual (new) - 320889a.pdf delete - 320906c.pdf maximum signal ratings - 321066a.pdf about 16x - 321791a.pdf discontinuation of at-mio-16e-10 rev. c - 321808a.pdf about at-mio-16e-10 rev P - 321837a.pdf discontinuation of at-mio-16de-10 rev d - 321838a.pdf about at-mio-16de-10 rev N - - ISSUES: - - need to deal with external reference for DAC, and other DAC - properties in board properties + *comedi/drivers/ni_atmio.c + *Hardware driver for NI AT-MIO E series cards + * + *COMEDI - Linux Control and Measurement Device Interface + *Copyright (C) 1997-2001 David A. Schleef + * + *This program is free software; you can redistribute it and/or modify + *it under the terms of the GNU General Public License as published by + *the Free Software Foundation; either version 2 of the License, or + *(at your option) any later version. + * + *This program is distributed in the hope that it will be useful, + *but WITHOUT ANY WARRANTY; without even the implied warranty of + *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + *GNU General Public License for more details. + */ - deal with at-mio-16de-10 revision D to N changes, etc. +/* + * Driver: ni_atmio + * Description: National Instruments AT-MIO-E series + * Author: ds + * Devices: [National Instruments] AT-MIO-16E-1 (ni_atmio), + * AT-MIO-16E-2, AT-MIO-16E-10, AT-MIO-16DE-10, AT-MIO-64E-3, + * AT-MIO-16XE-50, AT-MIO-16XE-10, AT-AI-16XE-10 + * Status: works + * Updated: Thu May 1 20:03:02 CDT 2003 + * + * The driver has 2.6 kernel isapnp support,
[PATCH] Staging: comedi: ni_atmio: fixed spacing and comment style issues
Fixed coding style issues. Signed-off-by: Steven Laabs --- drivers/staging/comedi/drivers/ni_atmio.c | 180 +++--- 1 file changed, 91 insertions(+), 89 deletions(-) diff --git a/drivers/staging/comedi/drivers/ni_atmio.c b/drivers/staging/comedi/drivers/ni_atmio.c index 95435b8..f957790 100644 --- a/drivers/staging/comedi/drivers/ni_atmio.c +++ b/drivers/staging/comedi/drivers/ni_atmio.c @@ -1,93 +1,95 @@ /* -comedi/drivers/ni_atmio.c -Hardware driver for NI AT-MIO E series cards - -COMEDI - Linux Control and Measurement Device Interface -Copyright (C) 1997-2001 David A. Schleef - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. -*/ -/* -Driver: ni_atmio -Description: National Instruments AT-MIO-E series -Author: ds -Devices: [National Instruments] AT-MIO-16E-1 (ni_atmio), - AT-MIO-16E-2, AT-MIO-16E-10, AT-MIO-16DE-10, AT-MIO-64E-3, - AT-MIO-16XE-50, AT-MIO-16XE-10, AT-AI-16XE-10 -Status: works -Updated: Thu May 1 20:03:02 CDT 2003 - -The driver has 2.6 kernel isapnp support, and -will automatically probe for a supported board if the -I/O base is left unspecified with comedi_config. -However, many of -the isapnp id numbers are unknown. If your board is not -recognized, please send the output of 'cat /proc/isapnp' -(you may need to modprobe the isa-pnp module for -/proc/isapnp to exist) so the -id numbers for your board can be added to the driver. - -Otherwise, you can use the isapnptools package to configure -your board. Use isapnp to -configure the I/O base and IRQ for the board, and then pass -the same values as -parameters in comedi_config. A sample isapnp.conf file is included -in the etc/ directory of Comedilib. - -Comedilib includes a utility to autocalibrate these boards. The -boards seem to boot into a state where the all calibration DACs -are at one extreme of their range, thus the default calibration -is terrible. Calibration at boot is strongly encouraged. - -To use the extended digital I/O on some of the boards, enable the -8255 driver when configuring the Comedi source tree. - -External triggering is supported for some events. The channel index -(scan_begin_arg, etc.) maps to PFI0 - PFI9. - -Some of the more esoteric triggering possibilities of these boards -are not supported. -*/ -/* - The real guts of the driver is in ni_mio_common.c, which is included - both here and in ni_pcimio.c - - Interrupt support added by Truxton Fulton - - References for specifications: - - 340747b.pdf Register Level Programmer Manual (obsolete) - 340747c.pdf Register Level Programmer Manual (new) - DAQ-STC reference manual - - Other possibly relevant info: - - 320517c.pdf User manual (obsolete) - 320517f.pdf User manual (new) - 320889a.pdf delete - 320906c.pdf maximum signal ratings - 321066a.pdf about 16x - 321791a.pdf discontinuation of at-mio-16e-10 rev. c - 321808a.pdf about at-mio-16e-10 rev P - 321837a.pdf discontinuation of at-mio-16de-10 rev d - 321838a.pdf about at-mio-16de-10 rev N - - ISSUES: - - need to deal with external reference for DAC, and other DAC - properties in board properties + *comedi/drivers/ni_atmio.c + *Hardware driver for NI AT-MIO E series cards + * + *COMEDI - Linux Control and Measurement Device Interface + *Copyright (C) 1997-2001 David A. Schleef + * + *This program is free software; you can redistribute it and/or modify + *it under the terms of the GNU General Public License as published by + *the Free Software Foundation; either version 2 of the License, or + *(at your option) any later version. + * + *This program is distributed in the hope that it will be useful, + *but WITHOUT ANY WARRANTY; without even the implied warranty of + *MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + *GNU General Public License for more details. + */ - deal with at-mio-16de-10 revision D to N changes, etc. +/* + * Driver: ni_atmio + * Description: National Instruments AT-MIO-E series + * Author: ds + * Devices: [National Instruments] AT-MIO-16E-1 (ni_atmio), + * AT-MIO-16E-2, AT-MIO-16E-10, AT-MIO-16DE-10, AT-MIO-64E-3, + * AT-MIO-16XE-50, AT-MIO-16XE-10, AT-AI-16XE-10 + * Status: works + * Updated: Thu May 1 20:03:02 CDT 2003 + * + * The driver has 2.6 kernel isapnp support, and + * will automatically probe for a supported board if the + * I/O
Re: [PATCH] clk: rockchip: add a dummy clock for the watchdog pclk on rk3399
Am Mittwoch, 25. Mai 2016, 16:51:56 schrieb Xing Zheng: > Like rk3288, the pclk supplying the watchdog is controlled via the > SGRF register area. Additionally the SGRF isn't even writable in > every boot mode. > > But still the clock control is available and in the future someone > might want to use it. Therefore define a simple clock for the time > being so that the watchdog driver can read its rate. > > Signed-off-by: Xing Zhengapplied for 4.8 Thanks Heiko
Re: [PATCH] clk: rockchip: add a dummy clock for the watchdog pclk on rk3399
Am Mittwoch, 25. Mai 2016, 16:51:56 schrieb Xing Zheng: > Like rk3288, the pclk supplying the watchdog is controlled via the > SGRF register area. Additionally the SGRF isn't even writable in > every boot mode. > > But still the clock control is available and in the future someone > might want to use it. Therefore define a simple clock for the time > being so that the watchdog driver can read its rate. > > Signed-off-by: Xing Zheng applied for 4.8 Thanks Heiko
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/25/2016 5:37 PM, Minchan Kim wrote: On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote: On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote: Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". [1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE Signed-off-by: Yang ShiI didn't read code code in detail to see how page_ext memory space allocated in boot code and memory hotplug but to me, it's not good to check NULL whenever we calls lookup_page_ext. More dangerous thing is now page_ext is used by optionable feature(ie, not critical for system stability) but if we want to use page_ext as another important tool for the system in future, it could be a serious problem. Can we put some hooks of page_ext into memory-hotplug so guarantee that page_ext memory space is allocated with memmap space at the same time? IOW, once every PFN wakers find a page is valid, page_ext is valid, too so lookup_page_ext never returns NULL on valid page by design. I hope we consider this direction, too. Yang, Could you think about this? Thanks a lot for the suggestion. Sorry for the late reply, I was busy on preparing patches. I do agree this is a direction we should look into, but I haven't got time to think about it deeper. I hope Joonsoo could chime in too since he is the original author for page extension. Even, your patch was broken, I think. It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING because lookup_page_ext doesn't return NULL in that case. Actually, I think the #ifdef should be removed if lookup_page_ext() is possible to return NULL. It sounds not make sense returning NULL only when DEBUG_VM is enabled. It should return NULL no matter what debug config is selected. If Joonsoo agrees with me I'm going to come up with a patch to fix it. Regards, Yang Thanks. --- include/linux/page_idle.h | 43 --- mm/page_alloc.c | 6 ++ mm/page_owner.c | 27 +++ mm/page_poison.c | 8 +++- mm/vmstat.c | 2 ++ 5 files changed, 78 insertions(+), 8 deletions(-) diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index bf268fa..8f5d4ad 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -46,33 +46,62 @@ extern struct page_ext_operations page_idle_ops; static inline bool page_is_young(struct page *page) { - return test_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline void set_page_young(struct page *page) { - set_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + set_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool test_and_clear_page_young(struct page *page) { - return test_and_clear_bit(PAGE_EXT_YOUNG, - _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool page_is_idle(struct page *page) { - return test_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void set_page_idle(struct page *page) { - set_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + set_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void clear_page_idle(struct page *page) { - clear_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + clear_bit(PAGE_EXT_IDLE, _ext->flags); } #endif /* CONFIG_64BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f8f3bfc..d27e8b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_GUARD, _ext->flags); INIT_LIST_HEAD(>lru); @@ -673,6 +676,9 @@
Re: [PATCH] mm: check the return value of lookup_page_ext for all call sites
On 5/25/2016 5:37 PM, Minchan Kim wrote: On Tue, May 24, 2016 at 11:58:11AM +0900, Minchan Kim wrote: On Mon, May 23, 2016 at 10:16:08AM -0700, Yang Shi wrote: Per the discussion with Joonsoo Kim [1], we need check the return value of lookup_page_ext() for all call sites since it might return NULL in some cases, although it is unlikely, i.e. memory hotplug. Tested with ltp with "page_owner=0". [1] http://lkml.kernel.org/r/20160519002809.GA10245@js1304-P5Q-DELUXE Signed-off-by: Yang Shi I didn't read code code in detail to see how page_ext memory space allocated in boot code and memory hotplug but to me, it's not good to check NULL whenever we calls lookup_page_ext. More dangerous thing is now page_ext is used by optionable feature(ie, not critical for system stability) but if we want to use page_ext as another important tool for the system in future, it could be a serious problem. Can we put some hooks of page_ext into memory-hotplug so guarantee that page_ext memory space is allocated with memmap space at the same time? IOW, once every PFN wakers find a page is valid, page_ext is valid, too so lookup_page_ext never returns NULL on valid page by design. I hope we consider this direction, too. Yang, Could you think about this? Thanks a lot for the suggestion. Sorry for the late reply, I was busy on preparing patches. I do agree this is a direction we should look into, but I haven't got time to think about it deeper. I hope Joonsoo could chime in too since he is the original author for page extension. Even, your patch was broken, I think. It doesn't work with !CONFIG_DEBUG_VM && !CONFIG_PAGE_POISONING because lookup_page_ext doesn't return NULL in that case. Actually, I think the #ifdef should be removed if lookup_page_ext() is possible to return NULL. It sounds not make sense returning NULL only when DEBUG_VM is enabled. It should return NULL no matter what debug config is selected. If Joonsoo agrees with me I'm going to come up with a patch to fix it. Regards, Yang Thanks. --- include/linux/page_idle.h | 43 --- mm/page_alloc.c | 6 ++ mm/page_owner.c | 27 +++ mm/page_poison.c | 8 +++- mm/vmstat.c | 2 ++ 5 files changed, 78 insertions(+), 8 deletions(-) diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index bf268fa..8f5d4ad 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -46,33 +46,62 @@ extern struct page_ext_operations page_idle_ops; static inline bool page_is_young(struct page *page) { - return test_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline void set_page_young(struct page *page) { - set_bit(PAGE_EXT_YOUNG, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + set_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool test_and_clear_page_young(struct page *page) { - return test_and_clear_bit(PAGE_EXT_YOUNG, - _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_and_clear_bit(PAGE_EXT_YOUNG, _ext->flags); } static inline bool page_is_idle(struct page *page) { - return test_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return false; + + return test_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void set_page_idle(struct page *page) { - set_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + set_bit(PAGE_EXT_IDLE, _ext->flags); } static inline void clear_page_idle(struct page *page) { - clear_bit(PAGE_EXT_IDLE, _page_ext(page)->flags); + struct page_ext *page_ext; + page_ext = lookup_page_ext(page); + if (unlikely(!page_ext) + return; + + clear_bit(PAGE_EXT_IDLE, _ext->flags); } #endif /* CONFIG_64BIT */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f8f3bfc..d27e8b9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_GUARD, _ext->flags); INIT_LIST_HEAD(>lru); @@ -673,6 +676,9 @@ static inline void
[PATCH V4] brcmfmac: print errors if creating interface fails
This is helpful for debugging. Without this all I was getting from "iw" command on failed creating of P2P interface was: > command failed: Too many open files in system (-23) Signed-off-by: Rafał Miłecki--- V2: s/in/if/ in commit message V3: Add one more error message as suggested by Arend V4: Also update brcmf_cfg80211_add_iface & print error for AP --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 8 ++-- drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 3d09d23..e7975a3 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -669,14 +669,18 @@ static struct wireless_dev *brcmf_cfg80211_add_iface(struct wiphy *wiphy, return ERR_PTR(-EOPNOTSUPP); case NL80211_IFTYPE_AP: wdev = brcmf_ap_add_vif(wiphy, name, flags, params); - if (!IS_ERR(wdev)) + if (IS_ERR(wdev)) + brcmf_err("Failed to create AP interface %s: %d\n", name, PTR_ERR(wdev)); + else brcmf_cfg80211_update_proto_addr_mode(wdev); return wdev; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: wdev = brcmf_p2p_add_vif(wiphy, name, name_assign_type, type, flags, params); - if (!IS_ERR(wdev)) + if (IS_ERR(wdev)) + brcmf_err("Failed to create P2P interface %s: %d\n", name, PTR_ERR(wdev)); + else brcmf_cfg80211_update_proto_addr_mode(wdev); return wdev; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c index 1652a48..bc26aec 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c @@ -2031,7 +2031,7 @@ static int brcmf_p2p_request_p2p_if(struct brcmf_p2p_info *p2p, err = brcmf_fil_iovar_data_set(ifp, "p2p_ifadd", _request, sizeof(if_request)); if (err) - return err; + brcmf_err("p2p_ifadd failed %d\n", err); return err; } @@ -2185,6 +2185,7 @@ struct wireless_dev *brcmf_p2p_add_vif(struct wiphy *wiphy, const char *name, err = brcmf_p2p_request_p2p_if(>p2p, ifp, cfg->p2p.int_addr, iftype); if (err) { + brcmf_err("Failed to request P2P virtual interface %s\n", name); brcmf_cfg80211_arm_vif_event(cfg, NULL); goto fail; } -- 1.8.4.5
[PATCH V4] brcmfmac: print errors if creating interface fails
This is helpful for debugging. Without this all I was getting from "iw" command on failed creating of P2P interface was: > command failed: Too many open files in system (-23) Signed-off-by: Rafał Miłecki --- V2: s/in/if/ in commit message V3: Add one more error message as suggested by Arend V4: Also update brcmf_cfg80211_add_iface & print error for AP --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 8 ++-- drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c | 3 ++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 3d09d23..e7975a3 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -669,14 +669,18 @@ static struct wireless_dev *brcmf_cfg80211_add_iface(struct wiphy *wiphy, return ERR_PTR(-EOPNOTSUPP); case NL80211_IFTYPE_AP: wdev = brcmf_ap_add_vif(wiphy, name, flags, params); - if (!IS_ERR(wdev)) + if (IS_ERR(wdev)) + brcmf_err("Failed to create AP interface %s: %d\n", name, PTR_ERR(wdev)); + else brcmf_cfg80211_update_proto_addr_mode(wdev); return wdev; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_P2P_DEVICE: wdev = brcmf_p2p_add_vif(wiphy, name, name_assign_type, type, flags, params); - if (!IS_ERR(wdev)) + if (IS_ERR(wdev)) + brcmf_err("Failed to create P2P interface %s: %d\n", name, PTR_ERR(wdev)); + else brcmf_cfg80211_update_proto_addr_mode(wdev); return wdev; case NL80211_IFTYPE_UNSPECIFIED: diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c index 1652a48..bc26aec 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/p2p.c @@ -2031,7 +2031,7 @@ static int brcmf_p2p_request_p2p_if(struct brcmf_p2p_info *p2p, err = brcmf_fil_iovar_data_set(ifp, "p2p_ifadd", _request, sizeof(if_request)); if (err) - return err; + brcmf_err("p2p_ifadd failed %d\n", err); return err; } @@ -2185,6 +2185,7 @@ struct wireless_dev *brcmf_p2p_add_vif(struct wiphy *wiphy, const char *name, err = brcmf_p2p_request_p2p_if(>p2p, ifp, cfg->p2p.int_addr, iftype); if (err) { + brcmf_err("Failed to request P2P virtual interface %s\n", name); brcmf_cfg80211_arm_vif_event(cfg, NULL); goto fail; } -- 1.8.4.5
Re: [PATCH] seccomp: plug syscall-dodging ptrace hole
On Thu, May 26, 2016 at 02:04:50PM -0700, Kees Cook wrote: > One problem with seccomp was that ptrace could be used to change a > syscall after seccomp filtering had completed. This was a well documented > limitation, and it was recommended to block ptrace when defining a filter > to avoid this problem. This can be quite a limitation for containers or > other places where ptrace is desired even under seccomp filters. > > Since seccomp filtering has been split into pre-trace and trace phases > (phase1 and phase2 respectively), it's possible to re-run phase1 seccomp > after ptrace. This makes that change, and updates the test suite for > both SECCOMP_RET_TRACE and PTRACE_SYSCALL manipulation. Looks good to me. As far as I can tell, there are no codepaths that allow manipulation of syscall arguments via ptrace register modification without going through tracehook_report_syscall_entry() or seccomp_phase2(), and the checks look good, too. > Signed-off-by: Kees Cook> --- > include/linux/seccomp.h | 6 + > include/linux/tracehook.h | 8 +- > kernel/seccomp.c | 42 ++ > tools/testing/selftests/seccomp/seccomp_bpf.c | 176 > -- > 4 files changed, 220 insertions(+), 12 deletions(-) > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > index 2296e6b2f690..e2b72394c200 100644 > --- a/include/linux/seccomp.h > +++ b/include/linux/seccomp.h > @@ -85,6 +85,7 @@ static inline int seccomp_mode(struct seccomp *s) > #ifdef CONFIG_SECCOMP_FILTER > extern void put_seccomp_filter(struct task_struct *tsk); > extern void get_seccomp_filter(struct task_struct *tsk); > +extern int seccomp_phase1_recheck(void); > #else /* CONFIG_SECCOMP_FILTER */ > static inline void put_seccomp_filter(struct task_struct *tsk) > { > @@ -94,6 +95,11 @@ static inline void get_seccomp_filter(struct task_struct > *tsk) > { > return; > } > + > +static inline int seccomp_phase1_recheck(void) > +{ > + return 0; > +} > #endif /* CONFIG_SECCOMP_FILTER */ > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h > index 26c152122a42..69b584d88508 100644 > --- a/include/linux/tracehook.h > +++ b/include/linux/tracehook.h > @@ -48,6 +48,7 @@ > > #include > #include > +#include > #include > #include > #include > @@ -100,7 +101,12 @@ static inline int ptrace_report_syscall(struct pt_regs > *regs) > static inline __must_check int tracehook_report_syscall_entry( > struct pt_regs *regs) > { > - return ptrace_report_syscall(regs); > + int skip; > + > + skip = ptrace_report_syscall(regs); > + if (skip) > + return skip; > + return seccomp_phase1_recheck(); > } > > /** > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index 7002796f14a4..6eaa3a1c5edb 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -665,6 +665,46 @@ u32 seccomp_phase1(struct seccomp_data *sd) > } > > /** > + * seccomp_phase1_recheck() - recheck phase1 in the context of ptrace > + * > + * This re-runs phase 1 seccomp checks in the case where ptrace may have > + * just changed things out from under us. > + * > + * Returns 0 if the syscall should be processed or -1 to skip the syscall. > + */ > +int seccomp_phase1_recheck(void) > +{ > + u32 action; > + > + /* If we're not under seccomp, continue normally. */ > + if (!test_thread_flag(TIF_SECCOMP)) > + return 0; > + > + /* Pass NULL struct seccomp_data to force reload after ptrace. */ > + action = seccomp_phase1(NULL); > + switch (action) { > + case SECCOMP_PHASE1_OK: > + /* Passes seccomp, continue normally. */ > + break; > + case SECCOMP_PHASE1_SKIP: > + /* Skip the syscall. */ > + return -1; > + default: > + if ((action & SECCOMP_RET_ACTION) != SECCOMP_RET_TRACE) { > + /* Impossible return value: kill the process. */ > + do_exit(SIGSYS); > + } > + /* > + * We've hit a trace request, but ptrace already put us > + * into this state, so just continue. > + */ > + break; > + } > + > + return 0; > +} > + > +/** > * seccomp_phase2() - finish slow path seccomp work for the current syscall > * @phase1_result: The return value from seccomp_phase1() > * > @@ -701,6 +741,8 @@ int seccomp_phase2(u32 phase1_result) > do_exit(SIGSYS); > if (syscall_get_nr(current, regs) < 0) > return -1; /* Explicit request to skip. */ > + if (seccomp_phase1_recheck() < 0) > + return -1; > > return 0; > } [...] signature.asc Description: Digital signature
Re: [PATCH] seccomp: plug syscall-dodging ptrace hole
On Thu, May 26, 2016 at 02:04:50PM -0700, Kees Cook wrote: > One problem with seccomp was that ptrace could be used to change a > syscall after seccomp filtering had completed. This was a well documented > limitation, and it was recommended to block ptrace when defining a filter > to avoid this problem. This can be quite a limitation for containers or > other places where ptrace is desired even under seccomp filters. > > Since seccomp filtering has been split into pre-trace and trace phases > (phase1 and phase2 respectively), it's possible to re-run phase1 seccomp > after ptrace. This makes that change, and updates the test suite for > both SECCOMP_RET_TRACE and PTRACE_SYSCALL manipulation. Looks good to me. As far as I can tell, there are no codepaths that allow manipulation of syscall arguments via ptrace register modification without going through tracehook_report_syscall_entry() or seccomp_phase2(), and the checks look good, too. > Signed-off-by: Kees Cook > --- > include/linux/seccomp.h | 6 + > include/linux/tracehook.h | 8 +- > kernel/seccomp.c | 42 ++ > tools/testing/selftests/seccomp/seccomp_bpf.c | 176 > -- > 4 files changed, 220 insertions(+), 12 deletions(-) > > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > index 2296e6b2f690..e2b72394c200 100644 > --- a/include/linux/seccomp.h > +++ b/include/linux/seccomp.h > @@ -85,6 +85,7 @@ static inline int seccomp_mode(struct seccomp *s) > #ifdef CONFIG_SECCOMP_FILTER > extern void put_seccomp_filter(struct task_struct *tsk); > extern void get_seccomp_filter(struct task_struct *tsk); > +extern int seccomp_phase1_recheck(void); > #else /* CONFIG_SECCOMP_FILTER */ > static inline void put_seccomp_filter(struct task_struct *tsk) > { > @@ -94,6 +95,11 @@ static inline void get_seccomp_filter(struct task_struct > *tsk) > { > return; > } > + > +static inline int seccomp_phase1_recheck(void) > +{ > + return 0; > +} > #endif /* CONFIG_SECCOMP_FILTER */ > > #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) > diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h > index 26c152122a42..69b584d88508 100644 > --- a/include/linux/tracehook.h > +++ b/include/linux/tracehook.h > @@ -48,6 +48,7 @@ > > #include > #include > +#include > #include > #include > #include > @@ -100,7 +101,12 @@ static inline int ptrace_report_syscall(struct pt_regs > *regs) > static inline __must_check int tracehook_report_syscall_entry( > struct pt_regs *regs) > { > - return ptrace_report_syscall(regs); > + int skip; > + > + skip = ptrace_report_syscall(regs); > + if (skip) > + return skip; > + return seccomp_phase1_recheck(); > } > > /** > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index 7002796f14a4..6eaa3a1c5edb 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -665,6 +665,46 @@ u32 seccomp_phase1(struct seccomp_data *sd) > } > > /** > + * seccomp_phase1_recheck() - recheck phase1 in the context of ptrace > + * > + * This re-runs phase 1 seccomp checks in the case where ptrace may have > + * just changed things out from under us. > + * > + * Returns 0 if the syscall should be processed or -1 to skip the syscall. > + */ > +int seccomp_phase1_recheck(void) > +{ > + u32 action; > + > + /* If we're not under seccomp, continue normally. */ > + if (!test_thread_flag(TIF_SECCOMP)) > + return 0; > + > + /* Pass NULL struct seccomp_data to force reload after ptrace. */ > + action = seccomp_phase1(NULL); > + switch (action) { > + case SECCOMP_PHASE1_OK: > + /* Passes seccomp, continue normally. */ > + break; > + case SECCOMP_PHASE1_SKIP: > + /* Skip the syscall. */ > + return -1; > + default: > + if ((action & SECCOMP_RET_ACTION) != SECCOMP_RET_TRACE) { > + /* Impossible return value: kill the process. */ > + do_exit(SIGSYS); > + } > + /* > + * We've hit a trace request, but ptrace already put us > + * into this state, so just continue. > + */ > + break; > + } > + > + return 0; > +} > + > +/** > * seccomp_phase2() - finish slow path seccomp work for the current syscall > * @phase1_result: The return value from seccomp_phase1() > * > @@ -701,6 +741,8 @@ int seccomp_phase2(u32 phase1_result) > do_exit(SIGSYS); > if (syscall_get_nr(current, regs) < 0) > return -1; /* Explicit request to skip. */ > + if (seccomp_phase1_recheck() < 0) > + return -1; > > return 0; > } [...] signature.asc Description: Digital signature
mmotm 2016-05-26-15-51 uploaded
The mm-of-the-moment snapshot 2016-05-26-15-51 has been uploaded to http://www.ozlabs.org/~akpm/mmotm/ mmotm-readme.txt says README for mm-of-the-moment: http://www.ozlabs.org/~akpm/mmotm/ This is a snapshot of my -mm patch queue. Uploaded at random hopefully more than once a week. You will need quilt to apply these patches to the latest Linus release (4.x or 4.x-rcY). The series file is in broken-out.tar.gz and is duplicated in http://ozlabs.org/~akpm/mmotm/series The file broken-out.tar.gz contains two datestamp files: .DATE and .DATE--mm-dd-hh-mm-ss. Both contain the string -mm-dd-hh-mm-ss, followed by the base kernel version against which this patch series is to be applied. This tree is partially included in linux-next. To see which patches are included in linux-next, consult the `series' file. Only the patches within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in linux-next. A git tree which contains the memory management portion of this tree is maintained at git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git by Michal Hocko. It contains the patches which are between the "#NEXT_PATCHES_START mm" and "#NEXT_PATCHES_END" markers, from the series file, http://www.ozlabs.org/~akpm/mmotm/series. A full copy of the full kernel tree with the linux-next and mmotm patches already applied is available through git within an hour of the mmotm release. Individual mmotm releases are tagged. The master branch always points to the latest release, so it's constantly rebasing. http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/ To develop on top of mmotm git: $ git remote add mmotm git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git $ git remote update mmotm $ git checkout -b topic mmotm/master $ git send-email mmotm/master.. [...] To rebase a branch with older patches to a new mmotm release: $ git remote update mmotm $ git rebase --onto mmotm/master topic The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second) contains daily snapshots of the -mm tree. It is updated more frequently than mmotm, and is untested. A git copy of this tree is available at http://git.cmpxchg.org/cgit.cgi/linux-mmots.git/ and use of this tree is similar to http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/, described above. This mmotm tree contains the following patches against 4.6: (patches marked "*" will be included in linux-next) origin.patch * seqlock-fix-raw_read_seqcount_latch.patch * mm-make-config_deferred_struct_page_init-depends-on-flatmem-explicitly.patch * mm-kasan-remove-unused-reserved-field-from-struct-kasan_alloc_meta.patch * mm-slub-remove-unused-virt_to_obj.patch * ocfs2-fix-improper-handling-of-return-errno.patch * memcg-fix-mem_cgroup_out_of_memory-return-value.patch * mm-oom_reaper-do-not-mmput-synchronously-from-the-oom-reaper-context-fix.patch * dma-debug-avoid-spinlock-recursion-when-disabling-dma-debug.patch * update-mm-zsmalloc-dont-fail-if-cant-create-debugfs-info.patch * drivers-pinctrl-intel-pinctrl-baytrailc-fix-build-with-gcc-44.patch i-need-old-gcc.patch arch-alpha-kernel-systblss-remove-debug-check.patch * direct-io-fix-direct-write-stale-data-exposure-from-concurrent-buffered-read.patch * mm-oom-do-not-reap-task-if-there-are-live-threads-in-threadgroup.patch * maintainers-add-kexec_corec-and-kexec_filec.patch * maintainers-kdump-maintainers-update.patch * mm-use-early_pfn_to_nid-in-page_ext_init.patch * mm-use-early_pfn_to_nid-in-register_page_bootmem_info_node.patch * oom_reaper-close-race-with-exiting-task.patch * oom_reaper-close-race-with-exiting-task-checkpatch-fixes.patch * mm-thp-avoid-false-positive-vm_bug_on_page-in-page_move_anon_rmap.patch * mm-cma-silence-warnings-due-to-max-usage.patch * mm-cma-silence-warnings-due-to-max-usage-checkpatch-fixes.patch * mm-memcontrol-fix-the-margin-computation-in-mem_cgroup_margin.patch * mm-memcontrol-move-comments-for-get_mctgt_type-to-proper-position.patch * mm-disable-deferred_struct_page_init-on-no_bootmem.patch * mm-fix-overflow-in-vm_map_ram.patch * kdump-fix-dmesg-gdbmacro-to-work-with-record-based-printk.patch * memcg-add-rcu-locking-around-css_for_each_descendant_pre-in-memcg_offline_kmem.patch * arm-arch-arm-include-asm-pageh-needs-personalityh.patch * fs-ext4-fsyncc-generic_file_fsync-call-based-on-barrier-flag.patch * ocfs2-fix-a-redundant-re-initialization.patch * ocfs2-o2hb-add-negotiate-timer.patch * ocfs2-o2hb-add-nego_timeout-message.patch * ocfs2-o2hb-add-negotiate_approve-message.patch * ocfs2-o2hb-add-some-user-debug-log.patch * ocfs2-o2hb-dont-negotiate-if-last-hb-fail.patch * ocfs2-o2hb-fix-hb-hung-time.patch * block-restore-proc-partitions-to-not-display-non-partitionable-removable-devices.patch mm.patch * mm-memcontrol-remove-the-useless-parameter-for-mc_handle_swap_pte.patch * mm-init-fix-zone-boundary-creation.patch * mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites.patch *
mmotm 2016-05-26-15-51 uploaded
The mm-of-the-moment snapshot 2016-05-26-15-51 has been uploaded to http://www.ozlabs.org/~akpm/mmotm/ mmotm-readme.txt says README for mm-of-the-moment: http://www.ozlabs.org/~akpm/mmotm/ This is a snapshot of my -mm patch queue. Uploaded at random hopefully more than once a week. You will need quilt to apply these patches to the latest Linus release (4.x or 4.x-rcY). The series file is in broken-out.tar.gz and is duplicated in http://ozlabs.org/~akpm/mmotm/series The file broken-out.tar.gz contains two datestamp files: .DATE and .DATE--mm-dd-hh-mm-ss. Both contain the string -mm-dd-hh-mm-ss, followed by the base kernel version against which this patch series is to be applied. This tree is partially included in linux-next. To see which patches are included in linux-next, consult the `series' file. Only the patches within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in linux-next. A git tree which contains the memory management portion of this tree is maintained at git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git by Michal Hocko. It contains the patches which are between the "#NEXT_PATCHES_START mm" and "#NEXT_PATCHES_END" markers, from the series file, http://www.ozlabs.org/~akpm/mmotm/series. A full copy of the full kernel tree with the linux-next and mmotm patches already applied is available through git within an hour of the mmotm release. Individual mmotm releases are tagged. The master branch always points to the latest release, so it's constantly rebasing. http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/ To develop on top of mmotm git: $ git remote add mmotm git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git $ git remote update mmotm $ git checkout -b topic mmotm/master $ git send-email mmotm/master.. [...] To rebase a branch with older patches to a new mmotm release: $ git remote update mmotm $ git rebase --onto mmotm/master topic The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second) contains daily snapshots of the -mm tree. It is updated more frequently than mmotm, and is untested. A git copy of this tree is available at http://git.cmpxchg.org/cgit.cgi/linux-mmots.git/ and use of this tree is similar to http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/, described above. This mmotm tree contains the following patches against 4.6: (patches marked "*" will be included in linux-next) origin.patch * seqlock-fix-raw_read_seqcount_latch.patch * mm-make-config_deferred_struct_page_init-depends-on-flatmem-explicitly.patch * mm-kasan-remove-unused-reserved-field-from-struct-kasan_alloc_meta.patch * mm-slub-remove-unused-virt_to_obj.patch * ocfs2-fix-improper-handling-of-return-errno.patch * memcg-fix-mem_cgroup_out_of_memory-return-value.patch * mm-oom_reaper-do-not-mmput-synchronously-from-the-oom-reaper-context-fix.patch * dma-debug-avoid-spinlock-recursion-when-disabling-dma-debug.patch * update-mm-zsmalloc-dont-fail-if-cant-create-debugfs-info.patch * drivers-pinctrl-intel-pinctrl-baytrailc-fix-build-with-gcc-44.patch i-need-old-gcc.patch arch-alpha-kernel-systblss-remove-debug-check.patch * direct-io-fix-direct-write-stale-data-exposure-from-concurrent-buffered-read.patch * mm-oom-do-not-reap-task-if-there-are-live-threads-in-threadgroup.patch * maintainers-add-kexec_corec-and-kexec_filec.patch * maintainers-kdump-maintainers-update.patch * mm-use-early_pfn_to_nid-in-page_ext_init.patch * mm-use-early_pfn_to_nid-in-register_page_bootmem_info_node.patch * oom_reaper-close-race-with-exiting-task.patch * oom_reaper-close-race-with-exiting-task-checkpatch-fixes.patch * mm-thp-avoid-false-positive-vm_bug_on_page-in-page_move_anon_rmap.patch * mm-cma-silence-warnings-due-to-max-usage.patch * mm-cma-silence-warnings-due-to-max-usage-checkpatch-fixes.patch * mm-memcontrol-fix-the-margin-computation-in-mem_cgroup_margin.patch * mm-memcontrol-move-comments-for-get_mctgt_type-to-proper-position.patch * mm-disable-deferred_struct_page_init-on-no_bootmem.patch * mm-fix-overflow-in-vm_map_ram.patch * kdump-fix-dmesg-gdbmacro-to-work-with-record-based-printk.patch * memcg-add-rcu-locking-around-css_for_each_descendant_pre-in-memcg_offline_kmem.patch * arm-arch-arm-include-asm-pageh-needs-personalityh.patch * fs-ext4-fsyncc-generic_file_fsync-call-based-on-barrier-flag.patch * ocfs2-fix-a-redundant-re-initialization.patch * ocfs2-o2hb-add-negotiate-timer.patch * ocfs2-o2hb-add-nego_timeout-message.patch * ocfs2-o2hb-add-negotiate_approve-message.patch * ocfs2-o2hb-add-some-user-debug-log.patch * ocfs2-o2hb-dont-negotiate-if-last-hb-fail.patch * ocfs2-o2hb-fix-hb-hung-time.patch * block-restore-proc-partitions-to-not-display-non-partitionable-removable-devices.patch mm.patch * mm-memcontrol-remove-the-useless-parameter-for-mc_handle_swap_pte.patch * mm-init-fix-zone-boundary-creation.patch * mm-check-the-return-value-of-lookup_page_ext-for-all-call-sites.patch *
Re: can't boot with reiserfs on linux-4.6.0+
On Thu, May 26, 2016 at 3:46 PM, Hillf Dantonwrote: >> > See if this fixes your reproducer. >> > >> > diff --git a/fs/xattr.c b/fs/xattr.c >> > index b11945e..49b8eab 100644 >> > --- a/fs/xattr.c >> > +++ b/fs/xattr.c >> > @@ -667,6 +667,9 @@ xattr_resolve_name(const struct xattr_handler >> > **handlers, const char **name) >> > { >> > const struct xattr_handler *handler; >> > >> > + if (!handlers) >> > + return NULL; >> > + >> > if (!*name) >> > return NULL; >> > >> >> Tried, but doesn't work. >> > See if this fixes your reproducer. > > --- linux-4.6/fs/xattr.cMon May 16 06:43:13 2016 > +++ b/fs/xattr.cThu May 26 15:36:14 2016 > @@ -667,8 +667,8 @@ xattr_resolve_name(const struct xattr_ha > { > const struct xattr_handler *handler; > > - if (!*name) > - return NULL; > + if (!handlers || !*name) > + return ERR_PTR(-EINVAL); > > for_each_xattr_handler(handlers, handler) { > const char *n; > -- Hillf, That worked and it's already in Linus's latest tree. Thanks for sharing. Jeff
Re: can't boot with reiserfs on linux-4.6.0+
On Thu, May 26, 2016 at 3:46 PM, Hillf Danton wrote: >> > See if this fixes your reproducer. >> > >> > diff --git a/fs/xattr.c b/fs/xattr.c >> > index b11945e..49b8eab 100644 >> > --- a/fs/xattr.c >> > +++ b/fs/xattr.c >> > @@ -667,6 +667,9 @@ xattr_resolve_name(const struct xattr_handler >> > **handlers, const char **name) >> > { >> > const struct xattr_handler *handler; >> > >> > + if (!handlers) >> > + return NULL; >> > + >> > if (!*name) >> > return NULL; >> > >> >> Tried, but doesn't work. >> > See if this fixes your reproducer. > > --- linux-4.6/fs/xattr.cMon May 16 06:43:13 2016 > +++ b/fs/xattr.cThu May 26 15:36:14 2016 > @@ -667,8 +667,8 @@ xattr_resolve_name(const struct xattr_ha > { > const struct xattr_handler *handler; > > - if (!*name) > - return NULL; > + if (!handlers || !*name) > + return ERR_PTR(-EINVAL); > > for_each_xattr_handler(handlers, handler) { > const char *n; > -- Hillf, That worked and it's already in Linus's latest tree. Thanks for sharing. Jeff
Re: [PATCH v5 0/1] ARM64: ACPI: Update documentation for latest specification version
On 05/17/2016 10:30 AM, Al Stone wrote: > On 05/16/2016 05:44 PM, Alexey Klimov wrote: >> On Mon, May 2, 2016 at 09:19 PM, Al Stone wrote: >>> On 04/25/2016 03:21 PM, Al Stone wrote: The ACPI 6.1 specification was recently released at the end of January 2016, but the arm64 kernel documentation for the use of ACPI was written for the 5.1 version of the spec. There were significant additions to the spec that had not yet been mentioned -- for example, the 6.0 mechanisms added to make it easier to define processors and low power idle states, as well as the 6.1 addition allowing regular interrupts (not just from GPIO) be used to signal ACPI general purpose events. This patch reflects going back through and examining the specs in detail and updating content appropriately. Whilst there, a few odds and ends of typos were caught as well. This brings the documentation up to date with ACPI 6.1 for arm64. Changes for v5: -- Miscellaneous typos and corrections (Lorenzo Pieralisi) -- Add linux-acpi@ ML to the distribution list (Alexey Klimov) -- Corrections to CPPC information (Alexey Klimov) -- ACK from Lorenzo Pieralisi -- Updated bibliographic info (Al Stone) Changes for v4: -- Clarify that IORT can sometimes be optional (Jon Masters). -- Remove "Use as needed" descriptions of ACPI objects; they provide no substantive information and doing so simplifies maintenance of this document over time. These have been replaced with a simpler notice that states that unless otherwise noted, do what the ACPI specification says is needed. -- Corrected the _OSI object usage recommendation; it described kernel behavior that does not exist (Al Stone). Changes for v3: -- Clarify use of _LPI/_RDI (Vikas Sajjan) -- Whitespace cleanup as pointed out by checkpatch Changes for v2: -- Clean up white space (Harb Abdulhahmid) -- Clarification on _CCA usage (Harb Abdulhamid) -- IORT moved to required from recommended (Hanjun Guo) -- Clarify IORT description (Hanjun Guo) Al Stone (1): ARM64: ACPI: Update documentation for latest specification version Documentation/arm64/acpi_object_usage.txt | 343 -- Documentation/arm64/arm-acpi.txt | 40 ++-- 2 files changed, 213 insertions(+), 170 deletions(-) >>> >>> Ping? If there are no further comments, can this be pulled in through >>> either the documentation or arm64 tree? >>> >>> Thanks. >> >> Hi Al, >> sorry for delay. >> >> CPPC and PCC corrections look fine. Thanks. >> >> >> This comment is not to block your patch (maybe some to-do): >> I greped sources and your patch and I don't see description of _PSD object. >> This P-state dependancy object is optional but it's presense and correct data >> are extremely useful for CPPC and can potentially descrease number of >> performance >> changing requests. >> >> ACPI spec in section about CPPC tells that it may use _PSD (page 503 if I >> remember >> correctly) to specify domain belongings of CPUs. >> >> You may consider to add description of _PSD object later. >> >> Best regards, >> Alexey. >> > > Hrm. Thanks, Alexey. I'll take a look. _PSD may be in one of > the gray areas where we expect people to read the spec and follow > it properly, but it may make sense to be very explicit about what > they need to do to use it properly. Perhaps this would make a good > FWTS test, too. > Yet another ping... Just in case it is not clear, Alexey's comment and my reply here are things that *might* need to be done in the future. This version of the patch I believe is sufficient for now, based on the comments received. Lorenzo has ACKd; Hanjun has reviewed. Do I need Will and/or Catalin to ACK? Any others? -- ciao, al --- Al Stone Software Engineer Red Hat, Inc. a...@redhat.com ---
Re: [PATCH v5 0/1] ARM64: ACPI: Update documentation for latest specification version
On 05/17/2016 10:30 AM, Al Stone wrote: > On 05/16/2016 05:44 PM, Alexey Klimov wrote: >> On Mon, May 2, 2016 at 09:19 PM, Al Stone wrote: >>> On 04/25/2016 03:21 PM, Al Stone wrote: The ACPI 6.1 specification was recently released at the end of January 2016, but the arm64 kernel documentation for the use of ACPI was written for the 5.1 version of the spec. There were significant additions to the spec that had not yet been mentioned -- for example, the 6.0 mechanisms added to make it easier to define processors and low power idle states, as well as the 6.1 addition allowing regular interrupts (not just from GPIO) be used to signal ACPI general purpose events. This patch reflects going back through and examining the specs in detail and updating content appropriately. Whilst there, a few odds and ends of typos were caught as well. This brings the documentation up to date with ACPI 6.1 for arm64. Changes for v5: -- Miscellaneous typos and corrections (Lorenzo Pieralisi) -- Add linux-acpi@ ML to the distribution list (Alexey Klimov) -- Corrections to CPPC information (Alexey Klimov) -- ACK from Lorenzo Pieralisi -- Updated bibliographic info (Al Stone) Changes for v4: -- Clarify that IORT can sometimes be optional (Jon Masters). -- Remove "Use as needed" descriptions of ACPI objects; they provide no substantive information and doing so simplifies maintenance of this document over time. These have been replaced with a simpler notice that states that unless otherwise noted, do what the ACPI specification says is needed. -- Corrected the _OSI object usage recommendation; it described kernel behavior that does not exist (Al Stone). Changes for v3: -- Clarify use of _LPI/_RDI (Vikas Sajjan) -- Whitespace cleanup as pointed out by checkpatch Changes for v2: -- Clean up white space (Harb Abdulhahmid) -- Clarification on _CCA usage (Harb Abdulhamid) -- IORT moved to required from recommended (Hanjun Guo) -- Clarify IORT description (Hanjun Guo) Al Stone (1): ARM64: ACPI: Update documentation for latest specification version Documentation/arm64/acpi_object_usage.txt | 343 -- Documentation/arm64/arm-acpi.txt | 40 ++-- 2 files changed, 213 insertions(+), 170 deletions(-) >>> >>> Ping? If there are no further comments, can this be pulled in through >>> either the documentation or arm64 tree? >>> >>> Thanks. >> >> Hi Al, >> sorry for delay. >> >> CPPC and PCC corrections look fine. Thanks. >> >> >> This comment is not to block your patch (maybe some to-do): >> I greped sources and your patch and I don't see description of _PSD object. >> This P-state dependancy object is optional but it's presense and correct data >> are extremely useful for CPPC and can potentially descrease number of >> performance >> changing requests. >> >> ACPI spec in section about CPPC tells that it may use _PSD (page 503 if I >> remember >> correctly) to specify domain belongings of CPUs. >> >> You may consider to add description of _PSD object later. >> >> Best regards, >> Alexey. >> > > Hrm. Thanks, Alexey. I'll take a look. _PSD may be in one of > the gray areas where we expect people to read the spec and follow > it properly, but it may make sense to be very explicit about what > they need to do to use it properly. Perhaps this would make a good > FWTS test, too. > Yet another ping... Just in case it is not clear, Alexey's comment and my reply here are things that *might* need to be done in the future. This version of the patch I believe is sufficient for now, based on the comments received. Lorenzo has ACKd; Hanjun has reviewed. Do I need Will and/or Catalin to ACK? Any others? -- ciao, al --- Al Stone Software Engineer Red Hat, Inc. a...@redhat.com ---
Re: [PATCH] iommu/vt-d: reduce extra first level entry in iommu->domains
On Thu, May 26, 2016 at 11:11:51AM +0100, Robin Murphy wrote: >On 25/05/16 22:43, Wei Yang wrote: >>On Wed, May 25, 2016 at 11:17:49AM +0100, Robin Murphy wrote: >>>On 25/05/16 00:06, Wei Yang wrote: Hi, Joerg Not sure whether you think this calculation is correct. If I missed something for this " + 1" in your formula, I am glad to hear your explanation. So that I could learn something from you :-) >>> >>>I'm not familiar enough with this aspect of the driver to confirm whether the >>>change is appropriate or not, but it does seem worth noting that using >>>DIV_ROUND_UP would be an even neater approach. >>> >> >>Hi, Robin, >> >>Thanks for your comment. >> >>Yes, I agree DIV_ROUND_UP would make the code more easy to read. >> >>I have thought about using DIV_ROUND_UP, while from the definition >>DIV_ROUND_UP use operation "/", and ALIGN use bit operation. So the change in >>my patch chooses the second one and tries to keep the efficiency. > >The efficiency of what, though? > >It's an unsigned division by a constant power of two, which GCC implements >with a shift instruction regardless of optimisation - and at -O1 and above >the machine code generated for either form of expression is completely >identical (try it and see!). > Thanks. Looks my knowledge of the compiler is an ancient one :-) I haven't thought about to compare the generated code. This is really a good test before making the decision. Next time I would try this before choosing one. >On the other hand, the small amount of time and cognitive effort it took to >parse "ALIGN(x, 256) >> 8" as "divide by 256, rounding up" compared to simply >seeing "DIV_ROUND_UP(x, 256)" and knowing instantly what's intended, >certainly makes it less efficient to _maintain_; thus it's exactly the kind >of thing to which Dijkstra's famous quotation applies. > >Does that count towards learning something? ;) > Really~ I am really happy to see your comments which help me to be more mature on the solution. I owe you a favor :-) If you would come to Shanghai, I would like to take you around~ >Robin. > Have a good day~ On Sat, May 21, 2016 at 02:41:51AM +, Wei Yang wrote: >In commit <8bf478163e69> ("iommu/vt-d: Split up iommu->domains array"), it >it splits iommu->domains in two levels. Each first level contains 256 >entries of second level. In case of the ndomains is exact a multiple of >256, it would have one more extra first level entry for current >implementation. > >This patch refines this calculation to reduce the extra first level entry. > >Signed-off-by: Wei Yang>--- >drivers/iommu/intel-iommu.c |4 ++-- >1 file changed, 2 insertions(+), 2 deletions(-) > >diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c >index e3061d3..2204ca4 100644 >--- a/drivers/iommu/intel-iommu.c >+++ b/drivers/iommu/intel-iommu.c >@@ -1634,7 +1634,7 @@ static int iommu_init_domains(struct intel_iommu >*iommu) > return -ENOMEM; > } > >- size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **); >+ size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); > iommu->domains = kzalloc(size, GFP_KERNEL); > > if (iommu->domains) { >@@ -1699,7 +1699,7 @@ static void disable_dmar_iommu(struct intel_iommu >*iommu) >static void free_dmar_iommu(struct intel_iommu *iommu) >{ > if ((iommu->domains) && (iommu->domain_ids)) { >- int elems = (cap_ndoms(iommu->cap) >> 8) + 1; >+ int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; > int i; > > for (i = 0; i < elems; i++) >-- >1.7.9.5 >> -- Wei Yang Help you, Help me
Re: [PATCH] iommu/vt-d: reduce extra first level entry in iommu->domains
On Thu, May 26, 2016 at 11:11:51AM +0100, Robin Murphy wrote: >On 25/05/16 22:43, Wei Yang wrote: >>On Wed, May 25, 2016 at 11:17:49AM +0100, Robin Murphy wrote: >>>On 25/05/16 00:06, Wei Yang wrote: Hi, Joerg Not sure whether you think this calculation is correct. If I missed something for this " + 1" in your formula, I am glad to hear your explanation. So that I could learn something from you :-) >>> >>>I'm not familiar enough with this aspect of the driver to confirm whether the >>>change is appropriate or not, but it does seem worth noting that using >>>DIV_ROUND_UP would be an even neater approach. >>> >> >>Hi, Robin, >> >>Thanks for your comment. >> >>Yes, I agree DIV_ROUND_UP would make the code more easy to read. >> >>I have thought about using DIV_ROUND_UP, while from the definition >>DIV_ROUND_UP use operation "/", and ALIGN use bit operation. So the change in >>my patch chooses the second one and tries to keep the efficiency. > >The efficiency of what, though? > >It's an unsigned division by a constant power of two, which GCC implements >with a shift instruction regardless of optimisation - and at -O1 and above >the machine code generated for either form of expression is completely >identical (try it and see!). > Thanks. Looks my knowledge of the compiler is an ancient one :-) I haven't thought about to compare the generated code. This is really a good test before making the decision. Next time I would try this before choosing one. >On the other hand, the small amount of time and cognitive effort it took to >parse "ALIGN(x, 256) >> 8" as "divide by 256, rounding up" compared to simply >seeing "DIV_ROUND_UP(x, 256)" and knowing instantly what's intended, >certainly makes it less efficient to _maintain_; thus it's exactly the kind >of thing to which Dijkstra's famous quotation applies. > >Does that count towards learning something? ;) > Really~ I am really happy to see your comments which help me to be more mature on the solution. I owe you a favor :-) If you would come to Shanghai, I would like to take you around~ >Robin. > Have a good day~ On Sat, May 21, 2016 at 02:41:51AM +, Wei Yang wrote: >In commit <8bf478163e69> ("iommu/vt-d: Split up iommu->domains array"), it >it splits iommu->domains in two levels. Each first level contains 256 >entries of second level. In case of the ndomains is exact a multiple of >256, it would have one more extra first level entry for current >implementation. > >This patch refines this calculation to reduce the extra first level entry. > >Signed-off-by: Wei Yang >--- >drivers/iommu/intel-iommu.c |4 ++-- >1 file changed, 2 insertions(+), 2 deletions(-) > >diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c >index e3061d3..2204ca4 100644 >--- a/drivers/iommu/intel-iommu.c >+++ b/drivers/iommu/intel-iommu.c >@@ -1634,7 +1634,7 @@ static int iommu_init_domains(struct intel_iommu >*iommu) > return -ENOMEM; > } > >- size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **); >+ size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **); > iommu->domains = kzalloc(size, GFP_KERNEL); > > if (iommu->domains) { >@@ -1699,7 +1699,7 @@ static void disable_dmar_iommu(struct intel_iommu >*iommu) >static void free_dmar_iommu(struct intel_iommu *iommu) >{ > if ((iommu->domains) && (iommu->domain_ids)) { >- int elems = (cap_ndoms(iommu->cap) >> 8) + 1; >+ int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8; > int i; > > for (i = 0; i < elems; i++) >-- >1.7.9.5 >> -- Wei Yang Help you, Help me
Re: [PATCH 01/23] all: syscall wrappers: add documentation
On Thu, May 26, 2016 at 11:48:19PM +0300, Yury Norov wrote: > On Wed, May 25, 2016 at 02:28:21PM -0700, David Miller wrote: > > From: Arnd Bergmann> > Date: Wed, 25 May 2016 23:01:06 +0200 > > > > > On Wednesday, May 25, 2016 1:50:39 PM CEST David Miller wrote: > > >> From: Arnd Bergmann > > >> Date: Wed, 25 May 2016 22:47:33 +0200 > > >> > > >> > If we use the normal calling conventions, we could remove these > > >> > overrides > > >> > along with the respective special-case handling in glibc. None of them > > >> > look particularly performance-sensitive, but I could be wrong there. > > >> > > >> You could set the lowest bit in the system call entry pointer to indicate > > >> the upper-half clears should be elided. > > > > > > Right, but that would introduce an extra conditional branch in the syscall > > > hotpath, and likely eliminate the gains from passing the loff_t arguments > > > in a single register instead of a pair. > > > > Ok, then, how much are you really gaining from avoiding a 'shift' and > > an 'or' to build the full 64-bit value? 3 cycles? Maybe 4? > > 4 cycles in kernel and ~same cost in glibc to create a pair. It would take a single instruction per argument in the kernel to do shift+or and maybe 1-2 more instructions to move the remaining arguments in place (we do this for a few wrappers in arch/arm64/kernel/entry32.S). And the glibc counterpart. > And 8 'mov's that exist for every syscall, even yield(). > > > And the executing the wrappers, those have a non-trivial cost too. > > The cost is pretty trivial though. See kernel/compat_wrapper.o: > COMPAT_SYSCALL_WRAP2(creat, const char __user *, pathname, umode_t, mode); > 0: a9bf7bfdstp x29, x30, [sp,#-16]! > 4: 910003fdmov x29, sp > 8: 2a0003e0mov w0, w0 > c: 9400bl 0 > 10: a8c17bfdldp x29, x30, [sp],#16 > 14: d65f03c0ret I would say the above could be more expensive than 8 movs (16 bytes to write, read, a branch and a ret). You can also add the I-cache locality, having wrappers for each syscalls instead of a single place for zeroing the upper half (where no other wrapper is necessary). Can we trick the compiler into doing a tail call optimisation. This could have simply been: COMPAT_SYSCALL_WRAP2(creat, ...): mov w0, w0 b > > Cost wise, this seems like it all cancels out in the end, but what > > do I know? > > I think you know something, and I also think Heiko and other s390 guys > know something as well. So I'd like to listen their arguments here. > > For me spark64 way is looking reasonable only because it's really simple > and takes less coding. I'll try it on some branch and share here what > happened. The kernel code will definitely look simpler ;). It would be good to see if there actually is any performance impact. Even with 16 more cycles on syscall entry, would they be lost in the noise? You don't need a full implementation, just some dummy mov x0, x0 on the entry path. -- Catalin
Re: [PATCH 01/23] all: syscall wrappers: add documentation
On Thu, May 26, 2016 at 11:48:19PM +0300, Yury Norov wrote: > On Wed, May 25, 2016 at 02:28:21PM -0700, David Miller wrote: > > From: Arnd Bergmann > > Date: Wed, 25 May 2016 23:01:06 +0200 > > > > > On Wednesday, May 25, 2016 1:50:39 PM CEST David Miller wrote: > > >> From: Arnd Bergmann > > >> Date: Wed, 25 May 2016 22:47:33 +0200 > > >> > > >> > If we use the normal calling conventions, we could remove these > > >> > overrides > > >> > along with the respective special-case handling in glibc. None of them > > >> > look particularly performance-sensitive, but I could be wrong there. > > >> > > >> You could set the lowest bit in the system call entry pointer to indicate > > >> the upper-half clears should be elided. > > > > > > Right, but that would introduce an extra conditional branch in the syscall > > > hotpath, and likely eliminate the gains from passing the loff_t arguments > > > in a single register instead of a pair. > > > > Ok, then, how much are you really gaining from avoiding a 'shift' and > > an 'or' to build the full 64-bit value? 3 cycles? Maybe 4? > > 4 cycles in kernel and ~same cost in glibc to create a pair. It would take a single instruction per argument in the kernel to do shift+or and maybe 1-2 more instructions to move the remaining arguments in place (we do this for a few wrappers in arch/arm64/kernel/entry32.S). And the glibc counterpart. > And 8 'mov's that exist for every syscall, even yield(). > > > And the executing the wrappers, those have a non-trivial cost too. > > The cost is pretty trivial though. See kernel/compat_wrapper.o: > COMPAT_SYSCALL_WRAP2(creat, const char __user *, pathname, umode_t, mode); > 0: a9bf7bfdstp x29, x30, [sp,#-16]! > 4: 910003fdmov x29, sp > 8: 2a0003e0mov w0, w0 > c: 9400bl 0 > 10: a8c17bfdldp x29, x30, [sp],#16 > 14: d65f03c0ret I would say the above could be more expensive than 8 movs (16 bytes to write, read, a branch and a ret). You can also add the I-cache locality, having wrappers for each syscalls instead of a single place for zeroing the upper half (where no other wrapper is necessary). Can we trick the compiler into doing a tail call optimisation. This could have simply been: COMPAT_SYSCALL_WRAP2(creat, ...): mov w0, w0 b > > Cost wise, this seems like it all cancels out in the end, but what > > do I know? > > I think you know something, and I also think Heiko and other s390 guys > know something as well. So I'd like to listen their arguments here. > > For me spark64 way is looking reasonable only because it's really simple > and takes less coding. I'll try it on some branch and share here what > happened. The kernel code will definitely look simpler ;). It would be good to see if there actually is any performance impact. Even with 16 more cycles on syscall entry, would they be lost in the noise? You don't need a full implementation, just some dummy mov x0, x0 on the entry path. -- Catalin
[PATCH] Use the correct size to set block max sectors
The block sector size should be in unit of 512 bytes, not in bytes. Signed-off-by: Long Li--- drivers/scsi/sd.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 428c03e..4bce17e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2862,9 +2862,11 @@ static int sd_revalidate_disk(struct gendisk *disk) if (sdkp->opt_xfer_blocks && sdkp->opt_xfer_blocks <= dev_max && sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS && - sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) - rw_max = q->limits.io_opt = + sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) { + q->limits.io_opt = sdkp->opt_xfer_blocks * sdp->sector_size; + rw_max = (q->limits.io_opt >> 9); + } else rw_max = BLK_DEF_MAX_SECTORS; -- 1.8.5.6
[PATCH] Use the correct size to set block max sectors
The block sector size should be in unit of 512 bytes, not in bytes. Signed-off-by: Long Li --- drivers/scsi/sd.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 428c03e..4bce17e 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2862,9 +2862,11 @@ static int sd_revalidate_disk(struct gendisk *disk) if (sdkp->opt_xfer_blocks && sdkp->opt_xfer_blocks <= dev_max && sdkp->opt_xfer_blocks <= SD_DEF_XFER_BLOCKS && - sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) - rw_max = q->limits.io_opt = + sdkp->opt_xfer_blocks * sdp->sector_size >= PAGE_SIZE) { + q->limits.io_opt = sdkp->opt_xfer_blocks * sdp->sector_size; + rw_max = (q->limits.io_opt >> 9); + } else rw_max = BLK_DEF_MAX_SECTORS; -- 1.8.5.6
Regression in i915 in kernel 4.6.0-git - bisected to f21a21983ef13a031
The latest mainline kernel (commit 3f59de0) shows a regression. The symptom is that as soon as the kernel is started, the display is blanked, and it is never turned on again. This problem was bisected to commit f21a21983ef13a031250c4c3f6018e29a549d0f1 ("drm/i915: Splitting intel_dp_detect"). The adapter in question is the integrated graphics controller on a Toshiba Tecra A50 laptop. The output of "lspci -nnv" for that controller is as follows: 00:02.0 VGA compatible controller [0300]: Intel Corporation 4th Gen Core Processor Integrated Graphics Controller [8086:0416] (rev 06) (prog-if 00 [VGA controller]) Subsystem: Toshiba America Info Systems Device [1179:0002] Flags: bus master, fast devsel, latency 0, IRQ 27 Memory at e000 (64-bit, non-prefetchable) [size=4M] Memory at d000 (64-bit, prefetchable) [size=256M] I/O ports at 4000 [size=64] Expansion ROM at [disabled] Capabilities: [90] MSI: Enable+ Count=1/1 Maskable- 64bit- Capabilities: [d0] Power Management version 2 Capabilities: [a4] PCI Advanced Features Kernel driver in use: i915 Kernel modules: i915 If the kernel is booted with "nomodeset", the display is normal. I will be happy to test any patches, or to answer any further questions. Thanks, Larry
Regression in i915 in kernel 4.6.0-git - bisected to f21a21983ef13a031
The latest mainline kernel (commit 3f59de0) shows a regression. The symptom is that as soon as the kernel is started, the display is blanked, and it is never turned on again. This problem was bisected to commit f21a21983ef13a031250c4c3f6018e29a549d0f1 ("drm/i915: Splitting intel_dp_detect"). The adapter in question is the integrated graphics controller on a Toshiba Tecra A50 laptop. The output of "lspci -nnv" for that controller is as follows: 00:02.0 VGA compatible controller [0300]: Intel Corporation 4th Gen Core Processor Integrated Graphics Controller [8086:0416] (rev 06) (prog-if 00 [VGA controller]) Subsystem: Toshiba America Info Systems Device [1179:0002] Flags: bus master, fast devsel, latency 0, IRQ 27 Memory at e000 (64-bit, non-prefetchable) [size=4M] Memory at d000 (64-bit, prefetchable) [size=256M] I/O ports at 4000 [size=64] Expansion ROM at [disabled] Capabilities: [90] MSI: Enable+ Count=1/1 Maskable- 64bit- Capabilities: [d0] Power Management version 2 Capabilities: [a4] PCI Advanced Features Kernel driver in use: i915 Kernel modules: i915 If the kernel is booted with "nomodeset", the display is normal. I will be happy to test any patches, or to answer any further questions. Thanks, Larry
Re: [PATCH 3/4] dell-wmi: Add information about other WMI event codes
On 22/05/2016 13:36, Pali Rohár wrote: > ACPI DSDT tables have defined other WMI codes, but does not contain any > description when those codes are emitted. Some other codes can be found in > logs on internet. In this patch are all which I saw, but lot of them are > not tested properly (e.g. for duplicate events with AT keyboard). Now we > have all WMI event codes at one place and in future after proper testing > those codes can be correctly enabled or disabled... > > Signed-off-by: Pali Rohár> --- > drivers/platform/x86/dell-wmi.c | 32 > 1 file changed, 32 insertions(+) > > diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c > index 363d927..7aac1dc 100644 > --- a/drivers/platform/x86/dell-wmi.c > +++ b/drivers/platform/x86/dell-wmi.c > @@ -110,6 +110,9 @@ static const struct key_entry dell_wmi_legacy_keymap[] > __initconst = { > /* BIOS error detected */ > { KE_IGNORE, 0xe00d, { KEY_RESERVED } }, > > + /* Unknown, defined in ACPI DSDT */ > + /* { KE_IGNORE, 0xe00e, { KEY_RESERVED } }, */ > + I'm interested in knowing what's the meaning of this 0xe00e. This event is sent multiple times when I suspend/resume my laptop and it's definitely not a keypress. Anyway, I've been using this patch set and didn't notice any issue, so Tested-by: Gabriele Mazzotta > /* Wifi Catcher */ > { KE_KEY,0xe011, { KEY_PROG2 } }, > > @@ -118,21 +121,45 @@ static const struct key_entry dell_wmi_legacy_keymap[] > __initconst = { > > { KE_IGNORE, 0xe020, { KEY_MUTE } }, > > + /* Unknown, defined in ACPI DSDT */ > + /* { KE_IGNORE, 0xe023, { KEY_RESERVED } }, */ > + > + /* Untested, Dell Instant Launch key on Inspiron 7520 */ > + /* { KE_IGNORE, 0xe024, { KEY_RESERVED } }, */ > + > /* Dell Instant Launch key */ > { KE_KEY,0xe025, { KEY_PROG4 } }, > > /* Audio panel key */ > { KE_IGNORE, 0xe026, { KEY_RESERVED } }, > > + /* Untested, Multimedia key on Dell Vostro 3560 */ > + /* { KE_IGNORE, 0xe028, { KEY_RESERVED } }, */ > + > /* Dell Instant Launch key */ > { KE_KEY,0xe029, { KEY_PROG4 } }, > > + /* Untested, Windows Mobility Center button on Inspiron 7520 */ > + /* { KE_IGNORE, 0xe02a, { KEY_RESERVED } }, */ > + > + /* Unknown, defined in ACPI DSDT */ > + /* { KE_IGNORE, 0xe02b, { KEY_RESERVED } }, */ > + > + /* Untested, Dell Audio With Preset Switch button on Inspiron 7520 */ > + /* { KE_IGNORE, 0xe02c, { KEY_RESERVED } }, */ > + > { KE_IGNORE, 0xe02e, { KEY_VOLUMEDOWN } }, > { KE_IGNORE, 0xe030, { KEY_VOLUMEUP } }, > { KE_IGNORE, 0xe033, { KEY_KBDILLUMUP } }, > { KE_IGNORE, 0xe034, { KEY_KBDILLUMDOWN } }, > { KE_IGNORE, 0xe03a, { KEY_CAPSLOCK } }, > > + /* NIC Link is Up */ > + { KE_IGNORE, 0xe043, { KEY_RESERVED } }, > + > + /* NIC Link is Down */ > + { KE_IGNORE, 0xe044, { KEY_RESERVED } }, > + > /* >* This entry is very suspicious! >* Originally Matthew Garrett created this dell-wmi driver specially for > @@ -145,7 +172,12 @@ static const struct key_entry dell_wmi_legacy_keymap[] > __initconst = { >*/ > { KE_IGNORE, 0xe045, { KEY_NUMLOCK } }, > > + /* Scroll lock and also going to tablet mode on portable devices */ > { KE_IGNORE, 0xe046, { KEY_SCROLLLOCK } }, > + > + /* Untested, going from tablet mode on portable devices */ > + /* { KE_IGNORE, 0xe047, { KEY_RESERVED } }, */ > + > { KE_IGNORE, 0xe0f7, { KEY_MUTE } }, > { KE_IGNORE, 0xe0f8, { KEY_VOLUMEDOWN } }, > { KE_IGNORE, 0xe0f9, { KEY_VOLUMEUP } }, >
Re: [PATCH 3/4] dell-wmi: Add information about other WMI event codes
On 22/05/2016 13:36, Pali Rohár wrote: > ACPI DSDT tables have defined other WMI codes, but does not contain any > description when those codes are emitted. Some other codes can be found in > logs on internet. In this patch are all which I saw, but lot of them are > not tested properly (e.g. for duplicate events with AT keyboard). Now we > have all WMI event codes at one place and in future after proper testing > those codes can be correctly enabled or disabled... > > Signed-off-by: Pali Rohár > --- > drivers/platform/x86/dell-wmi.c | 32 > 1 file changed, 32 insertions(+) > > diff --git a/drivers/platform/x86/dell-wmi.c b/drivers/platform/x86/dell-wmi.c > index 363d927..7aac1dc 100644 > --- a/drivers/platform/x86/dell-wmi.c > +++ b/drivers/platform/x86/dell-wmi.c > @@ -110,6 +110,9 @@ static const struct key_entry dell_wmi_legacy_keymap[] > __initconst = { > /* BIOS error detected */ > { KE_IGNORE, 0xe00d, { KEY_RESERVED } }, > > + /* Unknown, defined in ACPI DSDT */ > + /* { KE_IGNORE, 0xe00e, { KEY_RESERVED } }, */ > + I'm interested in knowing what's the meaning of this 0xe00e. This event is sent multiple times when I suspend/resume my laptop and it's definitely not a keypress. Anyway, I've been using this patch set and didn't notice any issue, so Tested-by: Gabriele Mazzotta > /* Wifi Catcher */ > { KE_KEY,0xe011, { KEY_PROG2 } }, > > @@ -118,21 +121,45 @@ static const struct key_entry dell_wmi_legacy_keymap[] > __initconst = { > > { KE_IGNORE, 0xe020, { KEY_MUTE } }, > > + /* Unknown, defined in ACPI DSDT */ > + /* { KE_IGNORE, 0xe023, { KEY_RESERVED } }, */ > + > + /* Untested, Dell Instant Launch key on Inspiron 7520 */ > + /* { KE_IGNORE, 0xe024, { KEY_RESERVED } }, */ > + > /* Dell Instant Launch key */ > { KE_KEY,0xe025, { KEY_PROG4 } }, > > /* Audio panel key */ > { KE_IGNORE, 0xe026, { KEY_RESERVED } }, > > + /* Untested, Multimedia key on Dell Vostro 3560 */ > + /* { KE_IGNORE, 0xe028, { KEY_RESERVED } }, */ > + > /* Dell Instant Launch key */ > { KE_KEY,0xe029, { KEY_PROG4 } }, > > + /* Untested, Windows Mobility Center button on Inspiron 7520 */ > + /* { KE_IGNORE, 0xe02a, { KEY_RESERVED } }, */ > + > + /* Unknown, defined in ACPI DSDT */ > + /* { KE_IGNORE, 0xe02b, { KEY_RESERVED } }, */ > + > + /* Untested, Dell Audio With Preset Switch button on Inspiron 7520 */ > + /* { KE_IGNORE, 0xe02c, { KEY_RESERVED } }, */ > + > { KE_IGNORE, 0xe02e, { KEY_VOLUMEDOWN } }, > { KE_IGNORE, 0xe030, { KEY_VOLUMEUP } }, > { KE_IGNORE, 0xe033, { KEY_KBDILLUMUP } }, > { KE_IGNORE, 0xe034, { KEY_KBDILLUMDOWN } }, > { KE_IGNORE, 0xe03a, { KEY_CAPSLOCK } }, > > + /* NIC Link is Up */ > + { KE_IGNORE, 0xe043, { KEY_RESERVED } }, > + > + /* NIC Link is Down */ > + { KE_IGNORE, 0xe044, { KEY_RESERVED } }, > + > /* >* This entry is very suspicious! >* Originally Matthew Garrett created this dell-wmi driver specially for > @@ -145,7 +172,12 @@ static const struct key_entry dell_wmi_legacy_keymap[] > __initconst = { >*/ > { KE_IGNORE, 0xe045, { KEY_NUMLOCK } }, > > + /* Scroll lock and also going to tablet mode on portable devices */ > { KE_IGNORE, 0xe046, { KEY_SCROLLLOCK } }, > + > + /* Untested, going from tablet mode on portable devices */ > + /* { KE_IGNORE, 0xe047, { KEY_RESERVED } }, */ > + > { KE_IGNORE, 0xe0f7, { KEY_MUTE } }, > { KE_IGNORE, 0xe0f8, { KEY_VOLUMEDOWN } }, > { KE_IGNORE, 0xe0f9, { KEY_VOLUMEUP } }, >
RE: [PATCH V3 1/2] i2c: qup: Fix broken dma when CONFIG_DEBUG_SG is enabled
Hi, >> sg_set_buf expects that the buf parameter passed in should be from >> lowmem and a valid pageframe. This is not true for pages from >> dma_alloc_coherent which can be carveouts, hence the check fails. > >OK, given you mean dma_pool_alloc here, the check fails for the >pageframe because of the pool? Is my understanding correct? > Yes right. Since those are carveouts, there is no valid pageframe, so the check fails. >> Change allocation of sg buffers from dma_coherent memory to kzalloc >> to fix the issue. > >But why can you drop the coherency? > The coherency is not dropped here. dma_map/unmap used makes the buffer coherent before passing it to dmaengine. Previously it was not required. I can add this in description if its not clear. >> @@ -1268,6 +1260,8 @@ static int qup_i2c_xfer_v2(struct i2c_adapter *adap, >> } >> } >> >> +idx = 0; >> + > >This looks like an unrelated change. Ha, wrong. This should have been in a separate patch. This was to fix a initialization issue in dma mode. Sorry, should not have been here, will move it to out. Regards, Sricharan
RE: [PATCH V3 1/2] i2c: qup: Fix broken dma when CONFIG_DEBUG_SG is enabled
Hi, >> sg_set_buf expects that the buf parameter passed in should be from >> lowmem and a valid pageframe. This is not true for pages from >> dma_alloc_coherent which can be carveouts, hence the check fails. > >OK, given you mean dma_pool_alloc here, the check fails for the >pageframe because of the pool? Is my understanding correct? > Yes right. Since those are carveouts, there is no valid pageframe, so the check fails. >> Change allocation of sg buffers from dma_coherent memory to kzalloc >> to fix the issue. > >But why can you drop the coherency? > The coherency is not dropped here. dma_map/unmap used makes the buffer coherent before passing it to dmaengine. Previously it was not required. I can add this in description if its not clear. >> @@ -1268,6 +1260,8 @@ static int qup_i2c_xfer_v2(struct i2c_adapter *adap, >> } >> } >> >> +idx = 0; >> + > >This looks like an unrelated change. Ha, wrong. This should have been in a separate patch. This was to fix a initialization issue in dma mode. Sorry, should not have been here, will move it to out. Regards, Sricharan
Re: [PATCH v2 08/13] ACPICA: Hardware: Add optimized access bit width support
On 05/26/2016 12:55 PM, Boris Ostrovsky wrote: > On 05/26/2016 12:26 PM, Jan Beulich wrote: > Boris Ostrovsky05/25/16 9:17 PM >>> >>> On 05/05/2016 12:58 AM, Lv Zheng wrote: +static u8 +acpi_hw_get_access_bit_width(struct acpi_generic_address *reg, u8 max_bit_width) +{ +u64 address; + +if (!reg->access_width) { +/* + * Detect old register descriptors where only the bit_width field + * makes senses. The target address is copied to handle possible + * alignment issues. + */ +ACPI_MOVE_64_TO_64(, >address); +if (!reg->bit_offset && reg->bit_width && +ACPI_IS_POWER_OF_TWO(reg->bit_width) && +ACPI_IS_ALIGNED(reg->bit_width, 8) && +ACPI_IS_ALIGNED(address, reg->bit_width)) { +return (reg->bit_width); +} else { +if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { +return (32); >>> This (together with "... Add access_width/bit_offset support in >>> acpi_hw_write") breaks Xen guests using older QEMU which doesn't support >>> 4-byte IO accesses. >>> >>> Why not return "reg->bit_width?:max_bit_width" ? This will preserve >>> original behavior. >> Did you figure out why we get here in the first place, instead of taking the >> first "return"? I.e. isn't the issue the apparently wrong use of the second >> ACPI_IS_ALIGNED() above? Afaict it ought to be >> ACPI_IS_ALIGNED(address, reg->bit_width / 8)... > We are trying to access address 0x...b004 (PM1a control) so yes, fixing > alignment check would probably resolve the problem that we are seeing now. > > However, for compatibility purposes we may consider not doing any checks > and simply return bit_width if access_width is not available. Interestingly enough I bisected what I thought was a completely different problem to this patch as well. Something is messed up with 32-bit dom0 booting (so no QEMU) on one machine in my pool. First error that I see is pcieport :00:02.0: can't find IRQ for PCI INT A; probably buggy MP table I'll look at this tomorrow, hopefully. -boris
Re: [PATCH v2 08/13] ACPICA: Hardware: Add optimized access bit width support
On 05/26/2016 12:55 PM, Boris Ostrovsky wrote: > On 05/26/2016 12:26 PM, Jan Beulich wrote: > Boris Ostrovsky 05/25/16 9:17 PM >>> >>> On 05/05/2016 12:58 AM, Lv Zheng wrote: +static u8 +acpi_hw_get_access_bit_width(struct acpi_generic_address *reg, u8 max_bit_width) +{ +u64 address; + +if (!reg->access_width) { +/* + * Detect old register descriptors where only the bit_width field + * makes senses. The target address is copied to handle possible + * alignment issues. + */ +ACPI_MOVE_64_TO_64(, >address); +if (!reg->bit_offset && reg->bit_width && +ACPI_IS_POWER_OF_TWO(reg->bit_width) && +ACPI_IS_ALIGNED(reg->bit_width, 8) && +ACPI_IS_ALIGNED(address, reg->bit_width)) { +return (reg->bit_width); +} else { +if (reg->space_id == ACPI_ADR_SPACE_SYSTEM_IO) { +return (32); >>> This (together with "... Add access_width/bit_offset support in >>> acpi_hw_write") breaks Xen guests using older QEMU which doesn't support >>> 4-byte IO accesses. >>> >>> Why not return "reg->bit_width?:max_bit_width" ? This will preserve >>> original behavior. >> Did you figure out why we get here in the first place, instead of taking the >> first "return"? I.e. isn't the issue the apparently wrong use of the second >> ACPI_IS_ALIGNED() above? Afaict it ought to be >> ACPI_IS_ALIGNED(address, reg->bit_width / 8)... > We are trying to access address 0x...b004 (PM1a control) so yes, fixing > alignment check would probably resolve the problem that we are seeing now. > > However, for compatibility purposes we may consider not doing any checks > and simply return bit_width if access_width is not available. Interestingly enough I bisected what I thought was a completely different problem to this patch as well. Something is messed up with 32-bit dom0 booting (so no QEMU) on one machine in my pool. First error that I see is pcieport :00:02.0: can't find IRQ for PCI INT A; probably buggy MP table I'll look at this tomorrow, hopefully. -boris
[PATCH v6r2 11/12] zsmalloc: page migration support
Follow up Sergey's review >From 2deede28c91910a9d3493feae30bed507e72f213 Mon Sep 17 00:00:00 2001 From: Minchan KimDate: Thu, 5 May 2016 00:01:03 +0900 Subject: [PATCH v6r2] zsmalloc: page migration support This patch introduces run-time migration feature for zspage. For migration, VM uses page.lru field so it would be better to not use page.next field which is unified with page.lru for own purpose. For that, firstly, we can get first object offset of the page via runtime calculation instead of using page.index so we can use page.index as link for page chaining instead of page.next. In case of huge object, it stores handle to page.index instead of next link of page chaining because huge object doesn't need to next link for page chaining. So get_next_page need to identify huge object to return NULL. For it, this patch uses PG_owner_priv_1 flag of the page flag. For migration, it supports three functions * zs_page_isolate It isolates a zspage which includes a subpage VM want to migrate from class so anyone cannot allocate new object from the zspage. We could try to isolate a zspage by the number of subpage so subsequent isolation trial of other subpage of the zpsage shouldn't fail. For that, we introduce zspage.isolated count. With that, zs_page_isolate can know whether zspage is already isolated or not for migration so if it is isolated for migration, subsequent isolation trial can be successful without trying further isolation. * zs_page_migrate First of all, it holds write-side zspage->lock to prevent migrate other subpage in zspage. Then, lock all objects in the page VM want to migrate. The reason we should lock all objects in the page is due to race between zs_map_object and zs_page_migrate. zs_map_object zs_page_migrate pin_tag(handle) obj = handle_to_obj(handle) obj_to_location(obj, , _idx); write_lock(>lock) if (!trypin_tag(handle)) goto unpin_object zspage = get_zspage(page); read_lock(>lock); If zs_page_migrate doesn't do trypin_tag, zs_map_object's page can be stale by migration so it goes crash. If it locks all of objects successfully, it copies content from old page to new one, finally, create new zspage chain with new page. And if it's last isolated subpage in the zspage, put the zspage back to class. * zs_page_putback It returns isolated zspage to right fullness_group list if it fails to migrate a page. If it find a zspage is ZS_EMPTY, it queues zspage freeing to workqueue. See below about async zspage freeing. This patch introduces asynchronous zspage free. The reason to need it is we need page_lock to clear PG_movable but unfortunately, zs_free path should be atomic so the apporach is try to grab page_lock. If it got page_lock of all of pages successfully, it can free zspage immediately. Otherwise, it queues free request and free zspage via workqueue in process context. If zs_free finds the zspage is isolated when it try to free zspage, it delays the freeing until zs_page_putback finds it so it will free free the zspage finally. In this patch, we expand fullness_list from ZS_EMPTY to ZS_FULL. First of all, it will use ZS_EMPTY list for delay freeing. And with adding ZS_FULL list, it makes to identify whether zspage is isolated or not via list_empty(>list) test. Cc: Sergey Senozhatsky Signed-off-by: Minchan Kim --- include/uapi/linux/magic.h | 1 + mm/zsmalloc.c | 793 ++--- 2 files changed, 672 insertions(+), 122 deletions(-) diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index d829ce63529d..e398beac67b8 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -81,5 +81,6 @@ /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC0x15013346 #define BALLOON_KVM_MAGIC 0x13661366 +#define ZSMALLOC_MAGIC 0x58295829 #endif /* __LINUX_MAGIC_H__ */ diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c6fb543cfb98..a80100db16d6 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -17,14 +17,14 @@ * * Usage of struct page fields: * page->private: points to zspage - * page->index: offset of the first object starting in this page. - * For the first page, this is always 0, so we use this field - * to store handle for huge object. - * page->next: links together all component pages of a zspage + * page->freelist(index): links together all component pages of a zspage + * For the huge page, this is always 0, so we use this field + * to store handle. * * Usage of struct page flags: * PG_private: identifies the first component page * PG_private2: identifies the last component page + * PG_owner_priv_1: indentifies the huge component page *
[PATCH v6r2 11/12] zsmalloc: page migration support
Follow up Sergey's review >From 2deede28c91910a9d3493feae30bed507e72f213 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 5 May 2016 00:01:03 +0900 Subject: [PATCH v6r2] zsmalloc: page migration support This patch introduces run-time migration feature for zspage. For migration, VM uses page.lru field so it would be better to not use page.next field which is unified with page.lru for own purpose. For that, firstly, we can get first object offset of the page via runtime calculation instead of using page.index so we can use page.index as link for page chaining instead of page.next. In case of huge object, it stores handle to page.index instead of next link of page chaining because huge object doesn't need to next link for page chaining. So get_next_page need to identify huge object to return NULL. For it, this patch uses PG_owner_priv_1 flag of the page flag. For migration, it supports three functions * zs_page_isolate It isolates a zspage which includes a subpage VM want to migrate from class so anyone cannot allocate new object from the zspage. We could try to isolate a zspage by the number of subpage so subsequent isolation trial of other subpage of the zpsage shouldn't fail. For that, we introduce zspage.isolated count. With that, zs_page_isolate can know whether zspage is already isolated or not for migration so if it is isolated for migration, subsequent isolation trial can be successful without trying further isolation. * zs_page_migrate First of all, it holds write-side zspage->lock to prevent migrate other subpage in zspage. Then, lock all objects in the page VM want to migrate. The reason we should lock all objects in the page is due to race between zs_map_object and zs_page_migrate. zs_map_object zs_page_migrate pin_tag(handle) obj = handle_to_obj(handle) obj_to_location(obj, , _idx); write_lock(>lock) if (!trypin_tag(handle)) goto unpin_object zspage = get_zspage(page); read_lock(>lock); If zs_page_migrate doesn't do trypin_tag, zs_map_object's page can be stale by migration so it goes crash. If it locks all of objects successfully, it copies content from old page to new one, finally, create new zspage chain with new page. And if it's last isolated subpage in the zspage, put the zspage back to class. * zs_page_putback It returns isolated zspage to right fullness_group list if it fails to migrate a page. If it find a zspage is ZS_EMPTY, it queues zspage freeing to workqueue. See below about async zspage freeing. This patch introduces asynchronous zspage free. The reason to need it is we need page_lock to clear PG_movable but unfortunately, zs_free path should be atomic so the apporach is try to grab page_lock. If it got page_lock of all of pages successfully, it can free zspage immediately. Otherwise, it queues free request and free zspage via workqueue in process context. If zs_free finds the zspage is isolated when it try to free zspage, it delays the freeing until zs_page_putback finds it so it will free free the zspage finally. In this patch, we expand fullness_list from ZS_EMPTY to ZS_FULL. First of all, it will use ZS_EMPTY list for delay freeing. And with adding ZS_FULL list, it makes to identify whether zspage is isolated or not via list_empty(>list) test. Cc: Sergey Senozhatsky Signed-off-by: Minchan Kim --- include/uapi/linux/magic.h | 1 + mm/zsmalloc.c | 793 ++--- 2 files changed, 672 insertions(+), 122 deletions(-) diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index d829ce63529d..e398beac67b8 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -81,5 +81,6 @@ /* Since UDF 2.01 is ISO 13346 based... */ #define UDF_SUPER_MAGIC0x15013346 #define BALLOON_KVM_MAGIC 0x13661366 +#define ZSMALLOC_MAGIC 0x58295829 #endif /* __LINUX_MAGIC_H__ */ diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c6fb543cfb98..a80100db16d6 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -17,14 +17,14 @@ * * Usage of struct page fields: * page->private: points to zspage - * page->index: offset of the first object starting in this page. - * For the first page, this is always 0, so we use this field - * to store handle for huge object. - * page->next: links together all component pages of a zspage + * page->freelist(index): links together all component pages of a zspage + * For the huge page, this is always 0, so we use this field + * to store handle. * * Usage of struct page flags: * PG_private: identifies the first component page * PG_private2: identifies the last component page + * PG_owner_priv_1: indentifies the huge component page * */ @@ -49,6 +49,11 @@ #include #include #include +#include
Re: [PATCH] clk: rockchip: fix cclk error handing
Am Donnerstag, 26. Mai 2016, 21:49:08 schrieb Xing Zheng: > It maybe due to a copy-paste error the error handing should be > cclk not clk. > > Reported-by: Lin Huang> Signed-off-by: Xing Zheng applied to my clk-fixes branch after adapting the commit message a bit. Thanks for catching that Heiko
Re: [PATCH] clk: rockchip: fix cclk error handing
Am Donnerstag, 26. Mai 2016, 21:49:08 schrieb Xing Zheng: > It maybe due to a copy-paste error the error handing should be > cclk not clk. > > Reported-by: Lin Huang > Signed-off-by: Xing Zheng applied to my clk-fixes branch after adapting the commit message a bit. Thanks for catching that Heiko
Re: [GIT PULL] Ceph updates for 4.7-rc1
On Thu, 26 May 2016, Linus Torvalds wrote: > On Thu, May 26, 2016 at 11:31 AM, Linus Torvalds >wrote: > > > > Pulled and then immediately unpulled again. > > .. and having thought it over, I ended up re-pulling again, so now > it's going through my build test. > > Consider this discussion a strong encouragement to *not* do this in > the future - sending me pull requests at the end of the merge window > without them having been in linux-next is a no-no, unless those pull > requests are small and trivial (or have fixes that I'd pull even > outside the merge window, of course). Thank you! We'll be sure we include things in -next well beforehand next time around, especially if it's a big diff like this one. One point of clarification, though: in the past I've squashed down fixes discovered during testing if the branch hasn't hit a stable tree yet (e.g., your tree). AIUI this is(was?) standard procedure for things in -next. Do you want us to avoid squashing if we are creeping up on pull request time, or are you primarily interested in, say, seeing that what has been in -next for a while is substantially the same as what you pull, and has perhaps been there unmodified for at least a few days? Or would you rather see fixup patches if we identify issues in the last few days of testing? Thanks- sage
Re: [GIT PULL] Ceph updates for 4.7-rc1
On Thu, 26 May 2016, Linus Torvalds wrote: > On Thu, May 26, 2016 at 11:31 AM, Linus Torvalds > wrote: > > > > Pulled and then immediately unpulled again. > > .. and having thought it over, I ended up re-pulling again, so now > it's going through my build test. > > Consider this discussion a strong encouragement to *not* do this in > the future - sending me pull requests at the end of the merge window > without them having been in linux-next is a no-no, unless those pull > requests are small and trivial (or have fixes that I'd pull even > outside the merge window, of course). Thank you! We'll be sure we include things in -next well beforehand next time around, especially if it's a big diff like this one. One point of clarification, though: in the past I've squashed down fixes discovered during testing if the branch hasn't hit a stable tree yet (e.g., your tree). AIUI this is(was?) standard procedure for things in -next. Do you want us to avoid squashing if we are creeping up on pull request time, or are you primarily interested in, say, seeing that what has been in -next for a while is substantially the same as what you pull, and has perhaps been there unmodified for at least a few days? Or would you rather see fixup patches if we identify issues in the last few days of testing? Thanks- sage
Re: [PATCH v3 02/12] of: add J-Core cpu bindings
On 05/25/2016 06:04 PM, Rich Felker wrote: > On Wed, May 25, 2016 at 11:22:15AM +0100, Mark Rutland wrote: >> * What state should the CPU be in when it branches to the provided >> address? >> - Must the MMU be off? > > Current models are nommu. As far as I know, we're the first nommu SMP implementation in Linux. >> At some point, you are likely to want CPU hotplug and/or cpuidle. There are hundreds of todo items. At the moment we're just trying to get basic board support in for more or less the design we demoed a year ago at https://lwn.net/Articles/647636/ The roadmap page we've posted is a summary of a summary, it doesn't even mention the DSP designs (plural) we've talked about at some length internally. There's a DMA engine we're not using yet. We've got various other things like ethernet support that the cheap $50 introductory board we're targeting as an entry point hasn't got connectors for. And we didn't do anything for the vga or sound connectors on that board because "boot to a shell prompt on serial prompt" was our first target. (It runs out of initramfs, but since you need the sdcard to load linux from, and thus the bootloader had to have an sdcard driver, it seemed a shame NOT to have an sdcard driver in linux too.) Other than that? Intentionally minimalist first pass. >> We >> didn't provision the arm64 spin-table for either of these and never >> extended it, but you may want to put in place some discoverability now >> to allow future OSs to use that new support while allowing current OSs >> to retain functional (e.g. not requiring a new enable-method string). >> >>> +- >>> +Cache controller node >>> +- >>> + >>> +Required properties: >>> + >>> +- compatible: Must be "jcore,cache". >>> + >>> +- reg: A memory range for the cache controller registers. >> >> There is a well-defined memory map for the cache controller? The icache and dcache are 8k each, have 32 byte cache lines, and they cache (addr>>5)&255 so reading from + or - 8192 bytes will evict that cache line due to aliasing. The icache and dcache each have an enable and flush bit in the processor's control register. (So 4 bits total controlling the cache, I think? I'd have to dig up Niishi-san's slides to check.) Each processor has its own set of control registers. Back when SMP was first implemented there was a lot of talk among the engineers about separating the register stuff out, because right now processor stuff and SOC I/O devices are kinda mashed together. But "release early, release often" won out over endlessly polishing the same stuff in-house before showing it to anybody else. We WANT other people to suggest fixes, but we also want basic support for what's already been working for a year and a half to go into the vanilla kernel at some point. >> If so, please refer to documentation for it here (either in this >> section, or the top of this document if common with other elements >> described herein). During my most recent trip to Japan I sat down with the engineer who wrote the dcache and dram controller and passed on his explanation of them to somebody on the j-core mailing list, about halfway through this message: http://lists.j-core.org/pipermail/j-core/2016-April/38.html I've been meaning to cut that out and put it on a web page on j-core.org, but have been busy with other things. That post also points at comments in the VHDL source that implement the features. That is, alas, the level of documentation we're talking about at the moment. Better is on the todo list. In the meantime you can RTFS if you understand vhdl, or ask the engineers on the j-core list or #j-core freenode channel. The instruction set is based on an existing architecture and the other SOC features in the initial release are as minimal as we could get them and still be useful. (We've got a lot more peripherals implemented than this release includes.) We thought getting working code into the kernel should be a high priority, but apparently everything has to be done before anything can be done? > The current version "jcore,cache" has a single 32-bit control register > per cpu that can be used to enable/disable/flush icache and/or dcache. > There is no finer-grained control. If/when we do larger caches in the > future where it makes sense, there will be a new binding for it. (For > example it may make sense to do one that matches the original SH > memory-mapped cache interface.) The first dache-only Linux support I did last year worked by reading an aliased cache line from sram to evict individual cache lines, and it turned out to have no detectable speed advantage over just blanking the whole thing. Then doing the same "flush by aliasing" trick with icache would have required an 8k jump table and we just went "no" and implemented the flush bits instead, so almost all that code went away. When we implement L2 cache for j-core, we can start caring about granularity again, but
Re: [PATCH v3 02/12] of: add J-Core cpu bindings
On 05/25/2016 06:04 PM, Rich Felker wrote: > On Wed, May 25, 2016 at 11:22:15AM +0100, Mark Rutland wrote: >> * What state should the CPU be in when it branches to the provided >> address? >> - Must the MMU be off? > > Current models are nommu. As far as I know, we're the first nommu SMP implementation in Linux. >> At some point, you are likely to want CPU hotplug and/or cpuidle. There are hundreds of todo items. At the moment we're just trying to get basic board support in for more or less the design we demoed a year ago at https://lwn.net/Articles/647636/ The roadmap page we've posted is a summary of a summary, it doesn't even mention the DSP designs (plural) we've talked about at some length internally. There's a DMA engine we're not using yet. We've got various other things like ethernet support that the cheap $50 introductory board we're targeting as an entry point hasn't got connectors for. And we didn't do anything for the vga or sound connectors on that board because "boot to a shell prompt on serial prompt" was our first target. (It runs out of initramfs, but since you need the sdcard to load linux from, and thus the bootloader had to have an sdcard driver, it seemed a shame NOT to have an sdcard driver in linux too.) Other than that? Intentionally minimalist first pass. >> We >> didn't provision the arm64 spin-table for either of these and never >> extended it, but you may want to put in place some discoverability now >> to allow future OSs to use that new support while allowing current OSs >> to retain functional (e.g. not requiring a new enable-method string). >> >>> +- >>> +Cache controller node >>> +- >>> + >>> +Required properties: >>> + >>> +- compatible: Must be "jcore,cache". >>> + >>> +- reg: A memory range for the cache controller registers. >> >> There is a well-defined memory map for the cache controller? The icache and dcache are 8k each, have 32 byte cache lines, and they cache (addr>>5)&255 so reading from + or - 8192 bytes will evict that cache line due to aliasing. The icache and dcache each have an enable and flush bit in the processor's control register. (So 4 bits total controlling the cache, I think? I'd have to dig up Niishi-san's slides to check.) Each processor has its own set of control registers. Back when SMP was first implemented there was a lot of talk among the engineers about separating the register stuff out, because right now processor stuff and SOC I/O devices are kinda mashed together. But "release early, release often" won out over endlessly polishing the same stuff in-house before showing it to anybody else. We WANT other people to suggest fixes, but we also want basic support for what's already been working for a year and a half to go into the vanilla kernel at some point. >> If so, please refer to documentation for it here (either in this >> section, or the top of this document if common with other elements >> described herein). During my most recent trip to Japan I sat down with the engineer who wrote the dcache and dram controller and passed on his explanation of them to somebody on the j-core mailing list, about halfway through this message: http://lists.j-core.org/pipermail/j-core/2016-April/38.html I've been meaning to cut that out and put it on a web page on j-core.org, but have been busy with other things. That post also points at comments in the VHDL source that implement the features. That is, alas, the level of documentation we're talking about at the moment. Better is on the todo list. In the meantime you can RTFS if you understand vhdl, or ask the engineers on the j-core list or #j-core freenode channel. The instruction set is based on an existing architecture and the other SOC features in the initial release are as minimal as we could get them and still be useful. (We've got a lot more peripherals implemented than this release includes.) We thought getting working code into the kernel should be a high priority, but apparently everything has to be done before anything can be done? > The current version "jcore,cache" has a single 32-bit control register > per cpu that can be used to enable/disable/flush icache and/or dcache. > There is no finer-grained control. If/when we do larger caches in the > future where it makes sense, there will be a new binding for it. (For > example it may make sense to do one that matches the original SH > memory-mapped cache interface.) The first dache-only Linux support I did last year worked by reading an aliased cache line from sram to evict individual cache lines, and it turned out to have no detectable speed advantage over just blanking the whole thing. Then doing the same "flush by aliasing" trick with icache would have required an 8k jump table and we just went "no" and implemented the flush bits instead, so almost all that code went away. When we implement L2 cache for j-core, we can start caring about granularity again, but
RE: [PATCH 2/7] crypto : async implementation for sha1-mb
> From: Dey, Megha > Sent: Thursday, May 19, 2016 5:43 PM > To: herb...@gondor.apana.org.au > Cc: tim.c.c...@linux.intel.com; da...@davemloft.net; linux- > cry...@vger.kernel.org; linux-kernel@vger.kernel.org; Dey, Megha >; Yu, Fenghua ; Megha > Dey > Subject: [PATCH 2/7] crypto : async implementation for sha1-mb > > From: Megha Dey > > Herbert wants the sha1-mb algorithm to have an async implementation: > https://lkml.org/lkml/2016/4/5/286. > Currently, sha1-mb uses an async interface for the outer algorithm and a sync > interface for the inner algorithm. This patch introduces a async interface for > even the inner algorithm. > > Signed-off-by: Megha Dey > Signed-off-by: Tim Chen > --- > arch/x86/crypto/sha-mb/sha1_mb.c | 190 ++-- > --- > crypto/ahash.c | 6 -- > crypto/mcryptd.c | 117 +--- > include/crypto/hash.h| 6 ++ > include/crypto/internal/hash.h | 8 +- > include/crypto/mcryptd.h | 8 +- > 6 files changed, 184 insertions(+), 151 deletions(-) > Hi, Herbert, Any comment/feedback on this patch set? Is there any plan to push it to upstream? Thanks. -Fenghua
RE: [PATCH 2/7] crypto : async implementation for sha1-mb
> From: Dey, Megha > Sent: Thursday, May 19, 2016 5:43 PM > To: herb...@gondor.apana.org.au > Cc: tim.c.c...@linux.intel.com; da...@davemloft.net; linux- > cry...@vger.kernel.org; linux-kernel@vger.kernel.org; Dey, Megha > ; Yu, Fenghua ; Megha > Dey > Subject: [PATCH 2/7] crypto : async implementation for sha1-mb > > From: Megha Dey > > Herbert wants the sha1-mb algorithm to have an async implementation: > https://lkml.org/lkml/2016/4/5/286. > Currently, sha1-mb uses an async interface for the outer algorithm and a sync > interface for the inner algorithm. This patch introduces a async interface for > even the inner algorithm. > > Signed-off-by: Megha Dey > Signed-off-by: Tim Chen > --- > arch/x86/crypto/sha-mb/sha1_mb.c | 190 ++-- > --- > crypto/ahash.c | 6 -- > crypto/mcryptd.c | 117 +--- > include/crypto/hash.h| 6 ++ > include/crypto/internal/hash.h | 8 +- > include/crypto/mcryptd.h | 8 +- > 6 files changed, 184 insertions(+), 151 deletions(-) > Hi, Herbert, Any comment/feedback on this patch set? Is there any plan to push it to upstream? Thanks. -Fenghua
Re: [RFC 1/5] usb: chipidea: Add support for Tegra20/30/114/124
On 05/26/2016 03:17 PM, Stephen Warren wrote: On 05/26/2016 09:40 AM, Thierry Reding wrote: From: Thierry RedingAll of these Tegra SoC generations have a ChipIdea UDC IP block that can be used for device mode communication with a host. Implement rudimentary support that doesn't allow switching between host and device modes. Are you sure this is correct for Tegra20? I ask because for the /host/ mode driver, there's a "has_hostpc" flag which is set to false for Tegra20 and true for all other SoCs. In the U-Boot device mode driver (if not in the kernel driver; I didn't check), there's a concept of "has hostpc" too. I might expect that flag to be set the same way for both drivers. That said, I /think/ the host and device HW are unrelated, so it's possible has_hostpc might be set differently for them. Unfortunately, we haven't enabled the device mode driver for any Tegra20 system in U-Boot so I can't tell whether we should enable has_hostpc for Tegra20's device mode driver. Still, if this code works then I guess it's likely correct... On the other hand, it looks like the kernel device mode driver might auto-detect this; in core.c:hw_device_init(), I see: reg = hw_read(ci, CAP_HCCPARAMS, HCCPARAMS_LEN) >> __ffs(HCCPARAMS_LEN); ci->hw_bank.lpm = reg; ... and in host.c:host_start(), I see: ehci->has_hostpc = ci->hw_bank.lpm;
Re: [RFC 1/5] usb: chipidea: Add support for Tegra20/30/114/124
On 05/26/2016 03:17 PM, Stephen Warren wrote: On 05/26/2016 09:40 AM, Thierry Reding wrote: From: Thierry Reding All of these Tegra SoC generations have a ChipIdea UDC IP block that can be used for device mode communication with a host. Implement rudimentary support that doesn't allow switching between host and device modes. Are you sure this is correct for Tegra20? I ask because for the /host/ mode driver, there's a "has_hostpc" flag which is set to false for Tegra20 and true for all other SoCs. In the U-Boot device mode driver (if not in the kernel driver; I didn't check), there's a concept of "has hostpc" too. I might expect that flag to be set the same way for both drivers. That said, I /think/ the host and device HW are unrelated, so it's possible has_hostpc might be set differently for them. Unfortunately, we haven't enabled the device mode driver for any Tegra20 system in U-Boot so I can't tell whether we should enable has_hostpc for Tegra20's device mode driver. Still, if this code works then I guess it's likely correct... On the other hand, it looks like the kernel device mode driver might auto-detect this; in core.c:hw_device_init(), I see: reg = hw_read(ci, CAP_HCCPARAMS, HCCPARAMS_LEN) >> __ffs(HCCPARAMS_LEN); ci->hw_bank.lpm = reg; ... and in host.c:host_start(), I see: ehci->has_hostpc = ci->hw_bank.lpm;
[PATCH 1/5] lib/mpi: mpi_read_from_buffer(): return error code
mpi_read_from_buffer() reads a MPI from a buffer into a newly allocated MPI instance. It expects the buffer's leading two bytes to contain the number of bits, followed by the actual payload. On failure, it returns NULL and updates the in/out argument ret_nread somewhat inconsistently: - If the given buffer is too short to contain the leading two bytes encoding the number of bits or their value is unsupported, then ret_nread will be cleared. - If the allocation of the resulting MPI instance fails, ret_nread is left as is. The only user of mpi_read_from_buffer(), digsig_verify_rsa(), simply checks for a return value of NULL and returns -ENOMEM if that happens. While this is all of cosmetic nature only, there is another error condition which currently isn't detectable by the caller of mpi_read_from_buffer(): if the given buffer is too small to hold the number of bits as encoded in its first two bytes, the return value will be non-NULL and *ret_nread > 0. In preparation of communicating this condition to the caller, let mpi_read_from_buffer() return error values by means of the ERR_PTR() mechanism. Make the sole caller of mpi_read_from_buffer(), digsig_verify_rsa(), check the return value for IS_ERR() rather than == NULL. If IS_ERR() is true, return the associated error value rather than the fixed -ENOMEM. Signed-off-by: Nicolai Stange--- lib/digsig.c | 12 lib/mpi/mpicoder.c | 6 +++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/lib/digsig.c b/lib/digsig.c index 07be6c1..a121cbc 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -104,16 +104,18 @@ static int digsig_verify_rsa(struct key *key, datap = pkh->mpi; endp = ukp->data + ukp->datalen; - err = -ENOMEM; - for (i = 0; i < pkh->nmpi; i++) { unsigned int remaining = endp - datap; pkey[i] = mpi_read_from_buffer(datap, ); - if (!pkey[i]) + if (IS_ERR(pkey[i])) { + err = PTR_ERR(pkey[i]); goto err; + } datap += remaining; } + err = -ENOMEM; + mblen = mpi_get_nbits(pkey[0]); mlen = DIV_ROUND_UP(mblen, 8); @@ -126,8 +128,10 @@ static int digsig_verify_rsa(struct key *key, nret = siglen; in = mpi_read_from_buffer(sig, ); - if (!in) + if (IS_ERR(in)) { + err = PTR_ERR(in); goto err; + } res = mpi_alloc(mpi_get_nlimbs(in) * 2); if (!res) diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index 747606f..275c71e 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -88,12 +88,12 @@ MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) MPI val = NULL; if (*ret_nread < 2) - goto leave; + return ERR_PTR(-EINVAL); nbits = buffer[0] << 8 | buffer[1]; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); - goto leave; + return ERR_PTR(-EINVAL); } buffer += 2; nread = 2; @@ -102,7 +102,7 @@ MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); if (!val) - return NULL; + return ERR_PTR(-ENOMEM); i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; i %= BYTES_PER_MPI_LIMB; val->nbits = nbits; -- 2.8.2
[PATCH 1/5] lib/mpi: mpi_read_from_buffer(): return error code
mpi_read_from_buffer() reads a MPI from a buffer into a newly allocated MPI instance. It expects the buffer's leading two bytes to contain the number of bits, followed by the actual payload. On failure, it returns NULL and updates the in/out argument ret_nread somewhat inconsistently: - If the given buffer is too short to contain the leading two bytes encoding the number of bits or their value is unsupported, then ret_nread will be cleared. - If the allocation of the resulting MPI instance fails, ret_nread is left as is. The only user of mpi_read_from_buffer(), digsig_verify_rsa(), simply checks for a return value of NULL and returns -ENOMEM if that happens. While this is all of cosmetic nature only, there is another error condition which currently isn't detectable by the caller of mpi_read_from_buffer(): if the given buffer is too small to hold the number of bits as encoded in its first two bytes, the return value will be non-NULL and *ret_nread > 0. In preparation of communicating this condition to the caller, let mpi_read_from_buffer() return error values by means of the ERR_PTR() mechanism. Make the sole caller of mpi_read_from_buffer(), digsig_verify_rsa(), check the return value for IS_ERR() rather than == NULL. If IS_ERR() is true, return the associated error value rather than the fixed -ENOMEM. Signed-off-by: Nicolai Stange --- lib/digsig.c | 12 lib/mpi/mpicoder.c | 6 +++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/lib/digsig.c b/lib/digsig.c index 07be6c1..a121cbc 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -104,16 +104,18 @@ static int digsig_verify_rsa(struct key *key, datap = pkh->mpi; endp = ukp->data + ukp->datalen; - err = -ENOMEM; - for (i = 0; i < pkh->nmpi; i++) { unsigned int remaining = endp - datap; pkey[i] = mpi_read_from_buffer(datap, ); - if (!pkey[i]) + if (IS_ERR(pkey[i])) { + err = PTR_ERR(pkey[i]); goto err; + } datap += remaining; } + err = -ENOMEM; + mblen = mpi_get_nbits(pkey[0]); mlen = DIV_ROUND_UP(mblen, 8); @@ -126,8 +128,10 @@ static int digsig_verify_rsa(struct key *key, nret = siglen; in = mpi_read_from_buffer(sig, ); - if (!in) + if (IS_ERR(in)) { + err = PTR_ERR(in); goto err; + } res = mpi_alloc(mpi_get_nlimbs(in) * 2); if (!res) diff --git a/lib/mpi/mpicoder.c b/lib/mpi/mpicoder.c index 747606f..275c71e 100644 --- a/lib/mpi/mpicoder.c +++ b/lib/mpi/mpicoder.c @@ -88,12 +88,12 @@ MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) MPI val = NULL; if (*ret_nread < 2) - goto leave; + return ERR_PTR(-EINVAL); nbits = buffer[0] << 8 | buffer[1]; if (nbits > MAX_EXTERN_MPI_BITS) { pr_info("MPI: mpi too large (%u bits)\n", nbits); - goto leave; + return ERR_PTR(-EINVAL); } buffer += 2; nread = 2; @@ -102,7 +102,7 @@ MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) nlimbs = DIV_ROUND_UP(nbytes, BYTES_PER_MPI_LIMB); val = mpi_alloc(nlimbs); if (!val) - return NULL; + return ERR_PTR(-ENOMEM); i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; i %= BYTES_PER_MPI_LIMB; val->nbits = nbits; -- 2.8.2