On Mon, May 18, 2015 at 12:58:40PM +0200, Denys Vlasenko wrote:
> With this .config: http://busybox.net/~vda/kernel_config,
> after uninlining these functions have sizes and callsite counts
> as follows:

Most of this is probably good, thanks. But I'm curious about one:

> cfi_udelay(): 74 bytes, 26 callsites

^^ This is pretty dead-simple. If it's generating bad code, we might
look at fixing it up instead. Almost all of its call sites are with
constant input, so it *should* just become:

        udelay(1);
        cond_resched();

in most cases. For the non-constant cases, we might still do an
out-of-line implementation. Or maybe we just say it's all not worth it,
and we just stick with what you have. But I'd like to consider
alternatives to out-lining this one.

Thanks,
Brian

> cfi_send_gen_cmd(): 153 bytes, 95 callsites
> cfi_build_cmd(): 274 bytes, 123 callsites
> cfi_build_cmd_addr(): 49 bytes, 15 callsites
> cfi_merge_status(): 230 bytes, 3 callsites
> 
> Reduction in code size is about 50,000:
> 
>     text     data      bss       dec     hex filename
> 85842882 22294584 20627456 128764922 7accbfa vmlinux.before
> 85789648 22294616 20627456 128711720 7abfc28 vmlinux
> 
> Signed-off-by: Denys Vlasenko <dvlas...@redhat.com>
> CC: Dan Carpenter <dan.carpen...@oracle.com>
> CC: Jingoo Han <jg1....@samsung.com>
> CC: Brian Norris <computersforpe...@gmail.com>
> CC: Aaron Sierra <asie...@xes-inc.com>
> CC: Artem Bityutskiy <artem.bityuts...@linux.intel.com>
> CC: David Woodhouse <david.woodho...@intel.com>
> CC: linux-...@lists.infradead.org
> CC: linux-kernel@vger.kernel.org
> ---
>  drivers/mtd/chips/cfi_util.c | 188 
> +++++++++++++++++++++++++++++++++++++++++++
>  include/linux/mtd/cfi.h      | 188 
> ++-----------------------------------------
>  2 files changed, 196 insertions(+), 180 deletions(-)
> 
> diff --git a/drivers/mtd/chips/cfi_util.c b/drivers/mtd/chips/cfi_util.c
> index 09c79bd..6f16552 100644
> --- a/drivers/mtd/chips/cfi_util.c
> +++ b/drivers/mtd/chips/cfi_util.c
> @@ -23,6 +23,194 @@
>  #include <linux/mtd/map.h>
>  #include <linux/mtd/cfi.h>
>  
> +void cfi_udelay(int us)
> +{
> +     if (us >= 1000) {
> +             msleep((us+999)/1000);
> +     } else {
> +             udelay(us);
> +             cond_resched();
> +     }
> +}
> +EXPORT_SYMBOL(cfi_udelay);
> +
> +/*
> + * Returns the command address according to the given geometry.
> + */
> +uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
> +                             struct map_info *map, struct cfi_private *cfi)
> +{
> +     unsigned bankwidth = map_bankwidth(map);
> +     unsigned interleave = cfi_interleave(cfi);
> +     unsigned type = cfi->device_type;
> +     uint32_t addr;
> +
> +     addr = (cmd_ofs * type) * interleave;
> +
> +     /* Modify the unlock address if we are in compatibility mode.
> +      * For 16bit devices on 8 bit busses
> +      * and 32bit devices on 16 bit busses
> +      * set the low bit of the alternating bit sequence of the address.
> +      */
> +     if (((type * interleave) > bankwidth) && ((cmd_ofs & 0xff) == 0xaa))
> +             addr |= (type >> 1)*interleave;
> +
> +     return  addr;
> +}
> +EXPORT_SYMBOL(cfi_build_cmd_addr);
> +
> +/*
> + * Transforms the CFI command for the given geometry (bus width & 
> interleave).
> + * It looks too long to be inline, but in the common case it should almost 
> all
> + * get optimised away.
> + */
> +map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private 
> *cfi)
> +{
> +     map_word val = { {0} };
> +     int wordwidth, words_per_bus, chip_mode, chips_per_word;
> +     unsigned long onecmd;
> +     int i;
> +
> +     /* We do it this way to give the compiler a fighting chance
> +        of optimising away all the crap for 'bankwidth' larger than
> +        an unsigned long, in the common case where that support is
> +        disabled */
> +     if (map_bankwidth_is_large(map)) {
> +             wordwidth = sizeof(unsigned long);
> +             words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. 
> normally 1
> +     } else {
> +             wordwidth = map_bankwidth(map);
> +             words_per_bus = 1;
> +     }
> +
> +     chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
> +     chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
> +
> +     /* First, determine what the bit-pattern should be for a single
> +        device, according to chip mode and endianness... */
> +     switch (chip_mode) {
> +     default: BUG();
> +     case 1:
> +             onecmd = cmd;
> +             break;
> +     case 2:
> +             onecmd = cpu_to_cfi16(map, cmd);
> +             break;
> +     case 4:
> +             onecmd = cpu_to_cfi32(map, cmd);
> +             break;
> +     }
> +
> +     /* Now replicate it across the size of an unsigned long, or
> +        just to the bus width as appropriate */
> +     switch (chips_per_word) {
> +     default: BUG();
> +#if BITS_PER_LONG >= 64
> +     case 8:
> +             onecmd |= (onecmd << (chip_mode * 32));
> +#endif
> +     case 4:
> +             onecmd |= (onecmd << (chip_mode * 16));
> +     case 2:
> +             onecmd |= (onecmd << (chip_mode * 8));
> +     case 1:
> +             ;
> +     }
> +
> +     /* And finally, for the multi-word case, replicate it
> +        in all words in the structure */
> +     for (i=0; i < words_per_bus; i++) {
> +             val.x[i] = onecmd;
> +     }
> +
> +     return val;
> +}
> +EXPORT_SYMBOL(cfi_build_cmd);
> +
> +unsigned long cfi_merge_status(map_word val, struct map_info *map,
> +                                        struct cfi_private *cfi)
> +{
> +     int wordwidth, words_per_bus, chip_mode, chips_per_word;
> +     unsigned long onestat, res = 0;
> +     int i;
> +
> +     /* We do it this way to give the compiler a fighting chance
> +        of optimising away all the crap for 'bankwidth' larger than
> +        an unsigned long, in the common case where that support is
> +        disabled */
> +     if (map_bankwidth_is_large(map)) {
> +             wordwidth = sizeof(unsigned long);
> +             words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. 
> normally 1
> +     } else {
> +             wordwidth = map_bankwidth(map);
> +             words_per_bus = 1;
> +     }
> +
> +     chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
> +     chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
> +
> +     onestat = val.x[0];
> +     /* Or all status words together */
> +     for (i=1; i < words_per_bus; i++) {
> +             onestat |= val.x[i];
> +     }
> +
> +     res = onestat;
> +     switch(chips_per_word) {
> +     default: BUG();
> +#if BITS_PER_LONG >= 64
> +     case 8:
> +             res |= (onestat >> (chip_mode * 32));
> +#endif
> +     case 4:
> +             res |= (onestat >> (chip_mode * 16));
> +     case 2:
> +             res |= (onestat >> (chip_mode * 8));
> +     case 1:
> +             ;
> +     }
> +
> +     /* Last, determine what the bit-pattern should be for a single
> +        device, according to chip mode and endianness... */
> +     switch (chip_mode) {
> +     case 1:
> +             break;
> +     case 2:
> +             res = cfi16_to_cpu(map, res);
> +             break;
> +     case 4:
> +             res = cfi32_to_cpu(map, res);
> +             break;
> +     default: BUG();
> +     }
> +     return res;
> +}
> +EXPORT_SYMBOL(cfi_merge_status);
> +
> +/*
> + * Sends a CFI command to a bank of flash for the given geometry.
> + *
> + * Returns the offset in flash where the command was written.
> + * If prev_val is non-null, it will be set to the value at the command 
> address,
> + * before the command was written.
> + */
> +uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t base,
> +                             struct map_info *map, struct cfi_private *cfi,
> +                             int type, map_word *prev_val)
> +{
> +     map_word val;
> +     uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, map, cfi);
> +     val = cfi_build_cmd(cmd, map, cfi);
> +
> +     if (prev_val)
> +             *prev_val = map_read(map, addr);
> +
> +     map_write(map, val, addr);
> +
> +     return addr - base;
> +}
> +EXPORT_SYMBOL(cfi_send_gen_cmd);
> +
>  int __xipram cfi_qry_present(struct map_info *map, __u32 base,
>                            struct cfi_private *cfi)
>  {
> diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
> index 299d7d3..9b57a9b 100644
> --- a/include/linux/mtd/cfi.h
> +++ b/include/linux/mtd/cfi.h
> @@ -296,183 +296,19 @@ struct cfi_private {
>       struct flchip chips[0];  /* per-chip data structure for each chip */
>  };
>  
> -/*
> - * Returns the command address according to the given geometry.
> - */
> -static inline uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
> -                             struct map_info *map, struct cfi_private *cfi)
> -{
> -     unsigned bankwidth = map_bankwidth(map);
> -     unsigned interleave = cfi_interleave(cfi);
> -     unsigned type = cfi->device_type;
> -     uint32_t addr;
> -     
> -     addr = (cmd_ofs * type) * interleave;
> -
> -     /* Modify the unlock address if we are in compatibility mode.
> -      * For 16bit devices on 8 bit busses
> -      * and 32bit devices on 16 bit busses
> -      * set the low bit of the alternating bit sequence of the address.
> -      */
> -     if (((type * interleave) > bankwidth) && ((cmd_ofs & 0xff) == 0xaa))
> -             addr |= (type >> 1)*interleave;
> -
> -     return  addr;
> -}
> -
> -/*
> - * Transforms the CFI command for the given geometry (bus width & 
> interleave).
> - * It looks too long to be inline, but in the common case it should almost 
> all
> - * get optimised away.
> - */
> -static inline map_word cfi_build_cmd(u_long cmd, struct map_info *map, 
> struct cfi_private *cfi)
> -{
> -     map_word val = { {0} };
> -     int wordwidth, words_per_bus, chip_mode, chips_per_word;
> -     unsigned long onecmd;
> -     int i;
> -
> -     /* We do it this way to give the compiler a fighting chance
> -        of optimising away all the crap for 'bankwidth' larger than
> -        an unsigned long, in the common case where that support is
> -        disabled */
> -     if (map_bankwidth_is_large(map)) {
> -             wordwidth = sizeof(unsigned long);
> -             words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. 
> normally 1
> -     } else {
> -             wordwidth = map_bankwidth(map);
> -             words_per_bus = 1;
> -     }
> -
> -     chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
> -     chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
> -
> -     /* First, determine what the bit-pattern should be for a single
> -        device, according to chip mode and endianness... */
> -     switch (chip_mode) {
> -     default: BUG();
> -     case 1:
> -             onecmd = cmd;
> -             break;
> -     case 2:
> -             onecmd = cpu_to_cfi16(map, cmd);
> -             break;
> -     case 4:
> -             onecmd = cpu_to_cfi32(map, cmd);
> -             break;
> -     }
> -
> -     /* Now replicate it across the size of an unsigned long, or
> -        just to the bus width as appropriate */
> -     switch (chips_per_word) {
> -     default: BUG();
> -#if BITS_PER_LONG >= 64
> -     case 8:
> -             onecmd |= (onecmd << (chip_mode * 32));
> -#endif
> -     case 4:
> -             onecmd |= (onecmd << (chip_mode * 16));
> -     case 2:
> -             onecmd |= (onecmd << (chip_mode * 8));
> -     case 1:
> -             ;
> -     }
> +uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs,
> +                             struct map_info *map, struct cfi_private *cfi);
>  
> -     /* And finally, for the multi-word case, replicate it
> -        in all words in the structure */
> -     for (i=0; i < words_per_bus; i++) {
> -             val.x[i] = onecmd;
> -     }
> -
> -     return val;
> -}
> +map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private 
> *cfi);
>  #define CMD(x)  cfi_build_cmd((x), map, cfi)
>  
> -
> -static inline unsigned long cfi_merge_status(map_word val, struct map_info 
> *map,
> -                                        struct cfi_private *cfi)
> -{
> -     int wordwidth, words_per_bus, chip_mode, chips_per_word;
> -     unsigned long onestat, res = 0;
> -     int i;
> -
> -     /* We do it this way to give the compiler a fighting chance
> -        of optimising away all the crap for 'bankwidth' larger than
> -        an unsigned long, in the common case where that support is
> -        disabled */
> -     if (map_bankwidth_is_large(map)) {
> -             wordwidth = sizeof(unsigned long);
> -             words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. 
> normally 1
> -     } else {
> -             wordwidth = map_bankwidth(map);
> -             words_per_bus = 1;
> -     }
> -
> -     chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
> -     chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
> -
> -     onestat = val.x[0];
> -     /* Or all status words together */
> -     for (i=1; i < words_per_bus; i++) {
> -             onestat |= val.x[i];
> -     }
> -
> -     res = onestat;
> -     switch(chips_per_word) {
> -     default: BUG();
> -#if BITS_PER_LONG >= 64
> -     case 8:
> -             res |= (onestat >> (chip_mode * 32));
> -#endif
> -     case 4:
> -             res |= (onestat >> (chip_mode * 16));
> -     case 2:
> -             res |= (onestat >> (chip_mode * 8));
> -     case 1:
> -             ;
> -     }
> -
> -     /* Last, determine what the bit-pattern should be for a single
> -        device, according to chip mode and endianness... */
> -     switch (chip_mode) {
> -     case 1:
> -             break;
> -     case 2:
> -             res = cfi16_to_cpu(map, res);
> -             break;
> -     case 4:
> -             res = cfi32_to_cpu(map, res);
> -             break;
> -     default: BUG();
> -     }
> -     return res;
> -}
> -
> +unsigned long cfi_merge_status(map_word val, struct map_info *map,
> +                                        struct cfi_private *cfi);
>  #define MERGESTATUS(x) cfi_merge_status((x), map, cfi)
>  
> -
> -/*
> - * Sends a CFI command to a bank of flash for the given geometry.
> - *
> - * Returns the offset in flash where the command was written.
> - * If prev_val is non-null, it will be set to the value at the command 
> address,
> - * before the command was written.
> - */
> -static inline uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, 
> uint32_t base,
> +uint32_t cfi_send_gen_cmd(u_char cmd, uint32_t cmd_addr, uint32_t base,
>                               struct map_info *map, struct cfi_private *cfi,
> -                             int type, map_word *prev_val)
> -{
> -     map_word val;
> -     uint32_t addr = base + cfi_build_cmd_addr(cmd_addr, map, cfi);
> -     val = cfi_build_cmd(cmd, map, cfi);
> -
> -     if (prev_val)
> -             *prev_val = map_read(map, addr);
> -
> -     map_write(map, val, addr);
> -
> -     return addr - base;
> -}
> +                             int type, map_word *prev_val);
>  
>  static inline uint8_t cfi_read_query(struct map_info *map, uint32_t addr)
>  {
> @@ -506,15 +342,7 @@ static inline uint16_t cfi_read_query16(struct map_info 
> *map, uint32_t addr)
>       }
>  }
>  
> -static inline void cfi_udelay(int us)
> -{
> -     if (us >= 1000) {
> -             msleep((us+999)/1000);
> -     } else {
> -             udelay(us);
> -             cond_resched();
> -     }
> -}
> +void cfi_udelay(int us);
>  
>  int __xipram cfi_qry_present(struct map_info *map, __u32 base,
>                            struct cfi_private *cfi);
> -- 
> 1.8.1.4
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to