Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers, second version

2012-05-07 Thread Borislav Petkov
breaking thread because it grew too big.

On Fri, May 04, 2012 at 07:48:42AM -0300, Mauro Carvalho Chehab wrote:

[ … ]

 + memset(pos, 0, sizeof(pos));
 + row = 0;
 + chn = 0;
 + debugf4(%s: initializing %d %s\n, __func__, tot_dimms,
 + per_rank ? ranks : dimms);
 + for (i = 0; i  tot_dimms; i++) {
 + chan = csi[row].channels[chn];
 + dimm = EDAC_DIMM_PTR(layer, mci-dimms, n_layers,
 +pos[0], pos[1], pos[2]);
 + dimm-mci = mci;
 +
 + debugf2(%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n, __func__,
 + i, per_rank ? rank : dimm, (dimm - mci-dimms),
 + pos[0], pos[1], pos[2], row, chn);
 +
 + /* Copy DIMM location */
 + for (j = 0; j  n_layers; j++)
 + dimm-location[j] = pos[j];
 +
 + /* Link it to the csrows old API data */
 + chan-dimm = dimm;
 + dimm-csrow = row;
 + dimm-cschannel = chn;
 +
 + /* Increment csrow location */
 + for (j = n_layers - 1; j = 0; j--)
 + if (layers[j].is_virt_csrow)
 + break;

This looks fishy: the for-loop above iterates over layers[j] to break on
the first -is_virt_csrow.

 + row++;

And row here gets incremented unconditionally, independent from the loop
above. And you're not using any results from the loop: j gets reset
below in the next loop.

What's going on?

 + if (row == tot_csrows) {
 + row = 0;
 + chn++;
 + }
 +
 + /* Increment dimm location */
 + for (j = n_layers - 1; j = 0; j--) {
 + pos[j]++;
 + if (pos[j]  layers[j].size)
 + break;
 + pos[j] = 0;
   }
   }
  
 @@ -263,6 +373,46 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, 
 unsigned nr_csrows,
*/
   return mci;
  }
 +EXPORT_SYMBOL_GPL(new_edac_mc_alloc);
 +
 +/**
 + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure
 + * @mc_num:  Memory controller number
 + * @n_layers:Number of layers at the MC hierarchy
 + * layers:   Describes each layer as seen by the Memory Controller
 + * @size_pvt:Size of private storage needed
 + *
 + *
 + * FIXME: drivers handle multi-rank memories in different ways: some
 + * drivers map multi-ranked DIMMs as one DIMM while others
 + * as several DIMMs.
 + *
 + * Everything is kmalloc'ed as one big chunk - more efficient.
 + * It can only be used if all structures have the same lifetime - otherwise
 + * you have to allocate and initialize your own structures.
 + *
 + * Use edac_mc_free() to free mc structures allocated by this function.
 + *
 + * Returns:
 + *   On failure: NULL
 + *   On success: struct mem_ctl_info pointer
 + */
 +

[ … ]

 +void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 +   struct mem_ctl_info *mci,
 +   const unsigned long page_frame_number,
 +   const unsigned long offset_in_page,
 +   const unsigned long syndrome,
 +   const int layer0,
 +   const int layer1,
 +   const int layer2,
 +   const char *msg,
 +   const char *other_detail,
 +   const void *mcelog)
  {
 - int len = EDAC_MC_LABEL_LEN * 4;
 - char labels[len + 1];
 - char *pos = labels;
 - int chan;
 - int chars;
 - char *label = NULL;
 + /* FIXME: too much for stack: move it to some pre-alocated area */

I'm assuming all those new FIXMEs are going to be addressed soonish :)

Rest looks ok,
thanks.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers, second version

2012-05-07 Thread Mauro Carvalho Chehab
Em 07-05-2012 10:32, Borislav Petkov escreveu:
 breaking thread because it grew too big.
 
 On Fri, May 04, 2012 at 07:48:42AM -0300, Mauro Carvalho Chehab wrote:
 
 [ … ]
 
 +memset(pos, 0, sizeof(pos));
 +row = 0;
 +chn = 0;
 +debugf4(%s: initializing %d %s\n, __func__, tot_dimms,
 +per_rank ? ranks : dimms);
 +for (i = 0; i  tot_dimms; i++) {
 +chan = csi[row].channels[chn];
 +dimm = EDAC_DIMM_PTR(layer, mci-dimms, n_layers,
 +   pos[0], pos[1], pos[2]);
 +dimm-mci = mci;
 +
 +debugf2(%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n, __func__,
 +i, per_rank ? rank : dimm, (dimm - mci-dimms),
 +pos[0], pos[1], pos[2], row, chn);
 +
 +/* Copy DIMM location */
 +for (j = 0; j  n_layers; j++)
 +dimm-location[j] = pos[j];
 +
 +/* Link it to the csrows old API data */
 +chan-dimm = dimm;
 +dimm-csrow = row;
 +dimm-cschannel = chn;
 +
 +/* Increment csrow location */
 +for (j = n_layers - 1; j = 0; j--)
 +if (layers[j].is_virt_csrow)
 +break;
 
 This looks fishy: the for-loop above iterates over layers[j] to break on
 the first -is_virt_csrow.

That loop is dead code. It was related to the rev_order handling.

I'll drop it.

 
 +row++;
 
 And row here gets incremented unconditionally, independent from the loop
 above. And you're not using any results from the loop: j gets reset
 below in the next loop.
 
 What's going on?
 
 +if (row == tot_csrows) {
 +row = 0;
 +chn++;
 +}
 +
 +/* Increment dimm location */
 +for (j = n_layers - 1; j = 0; j--) {
 +pos[j]++;
 +if (pos[j]  layers[j].size)
 +break;
 +pos[j] = 0;
  }
  }
  
 @@ -263,6 +373,46 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, 
 unsigned nr_csrows,
   */
  return mci;
  }
 +EXPORT_SYMBOL_GPL(new_edac_mc_alloc);
 +
 +/**
 + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info 
 structure
 + * @mc_num: Memory controller number
 + * @n_layers:   Number of layers at the MC hierarchy
 + * layers:  Describes each layer as seen by the Memory Controller
 + * @size_pvt:   Size of private storage needed
 + *
 + *
 + * FIXME: drivers handle multi-rank memories in different ways: some
 + * drivers map multi-ranked DIMMs as one DIMM while others
 + * as several DIMMs.
 + *
 + * Everything is kmalloc'ed as one big chunk - more efficient.
 + * It can only be used if all structures have the same lifetime - otherwise
 + * you have to allocate and initialize your own structures.
 + *
 + * Use edac_mc_free() to free mc structures allocated by this function.
 + *
 + * Returns:
 + *  On failure: NULL
 + *  On success: struct mem_ctl_info pointer
 + */
 +
 
 [ … ]
 
 +void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 +  struct mem_ctl_info *mci,
 +  const unsigned long page_frame_number,
 +  const unsigned long offset_in_page,
 +  const unsigned long syndrome,
 +  const int layer0,
 +  const int layer1,
 +  const int layer2,
 +  const char *msg,
 +  const char *other_detail,
 +  const void *mcelog)
  {
 -int len = EDAC_MC_LABEL_LEN * 4;
 -char labels[len + 1];
 -char *pos = labels;
 -int chan;
 -int chars;
 -char *label = NULL;
 +/* FIXME: too much for stack: move it to some pre-alocated area */
 
 I'm assuming all those new FIXMEs are going to be addressed soonish :)

That's the idea ;) Some are fixed just after the drivers conversion.

This stack FIXME I'll solve together with some other cleanups I'm intending 
to write after merging the existing chansesets (as there are still a lot
of patches on this series). The stack usage here is no too much to cause 
any problems, but still I don't want to have large stuff like that at stack.

Regards,
Mauro

Patch without the dead code enclosed.

--

edac: Change internal representation to work with layers

From: Mauro Carvalho Chehab mche...@redhat.com

Change the EDAC internal representation to work with non-csrow
based memory controllers.

There are lots of those memory controllers nowadays, and more
are coming. So, the EDAC internal representation needs to be
changed, in order to work with those memory controllers, while
preserving backward compatibility with the old ones.

The edac core was written with the idea that memory controllers
are able to directly access csrows.

This is not true for FB-DIMM and RAMBUS memory 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-05-04 Thread Borislav Petkov
On Thu, May 03, 2012 at 11:16:54AM -0300, Mauro Carvalho Chehab wrote:
  +  bool enable_filter,
  +  unsigned pos[EDAC_MAX_LAYERS])
  
  Passing the whole array as an argument instead of only a pointer to it?
 
 This is C, and not C++ or Pascal. Only the pointer is passed here. The size
 of the array is used for type check only.

Right, and you can see where he still has trouble. And by he I mean me :).

[ … ]

  +void edac_mc_handle_error(const enum hw_event_mc_err_type type,
  +struct mem_ctl_info *mci,
  +const unsigned long page_frame_number,
  +const unsigned long offset_in_page,
  +const unsigned long syndrome,
  +const int layer0,
  +const int layer1,
  +const int layer2,
  
  Instead of passing each layer as an arg, you can prepare the array pos[]
  in each edac_mc_hanlde_*() and pass around a pointer to it - you need it
  anyway in the edac_mc_inc*() functions.
 
 Yes, but the changes at the drivers will be more complex, without any reason:
 before each call to this function, they would need to create and fill a 
 temporary
 array.
 
 As there are only 3 layers, in the worse case, this way is simpler and more
 efficient. We can review it, if we ever need more than 3 layers.

I see, the edac_mc_handle_error is the main interface for all edac drivers, ok.

[ … ]

  +  bool enable_filter = false;
  
  What does this enable_filter thing mean:
  
  if (pos[i] = 0)
  enable_filter = true;
  
  This absolutely needs explanation and better naming!
 
 Renamed it to enable_per_layer_report.

Or detailed_dimm_report or ...

 The code that implement it seems self-explained: 
 
 ..
   if (enable_filter  dimm-nr_pages) {
   if (p != label) {
   strcpy(p, OTHER_LABEL);
   p += strlen(OTHER_LABEL);
   }
   strcpy(p, dimm-label);
   p += strlen(p);
   *p = '\0';
 
 ..
 
   if (!enable_filter) {
   strcpy(label, any memory);
   } else {
   debugf4(%s: csrow/channel to increment: (%d,%d)\n,
   __func__, row, chan);
   if (p == label)
   strcpy(label, unknown memory);
   if (type == HW_EVENT_ERR_CORRECTED) {
   if (row = 0) {
   mci-csrows[row].ce_count++;
   if (chan = 0)
   
 mci-csrows[row].channels[chan].ce_count++;
   }
   } else
   if (row = 0)
   mci-csrows[row].ue_count++;
   }
 
 Theis flag indicates if is there any useful information about the affected
 DIMM(s) provided by the EDAC driver. If this is provided, the DIMM location 
 labels are
 filtered and reported, and the per-layer error counters are incremented.
 
 As it was discussed on previous reviews, with FB-DIMM MCs, and/or when mirror 
 mode/lockstep mode is enabled, the memory controller points errors to 2 DIMMs 
 (or 4 DIMMs, when both mirror mode and lockstep mode are enabled) on most 
 memory
 controllers, under some conditions. The edac_mc_handle_fbd_ue() function call 
 were
 created due to that.
 
 When comparing with the old code, enable_filter = false would be equivalent 
 to call
 edac_mc_handle_ce_no_info/edac_mc_handle_ue_no_info.
 
 I'm adding a comment about it.

Much better, thanks.

Btw, I have to admit, this is a pretty strange way of handling the case
where layers are { -1, -1, -1 }, i.e. edac_mc_handle_error is called
with the no info hint.

I'm wondering whether it wouldn't be more readable if you could do

edac_mc_handle_error(HW_EVENT_ERR_INFO_INVALID | ..)

or similar and define such a flag which simply states that. But you'll
have to change enum hw_event_mc_err_type to a bitfield to allow more
than one set bit.

Hmm.


[ … ]

  The SCRUB_SW_SRC piece can be another function.
 
 It is now part of the edac_ce_error().

Hm, I can't find this function on your experimental branch on
infradead but it is mentioned in the inlined patch below, what's going
on? Which patch should I be looking at now?

[ … ]

 The following patch addresses the pointed issues. I've updated them
 on my experimental branch at infradead:
   git://git.infradead.org/users/mchehab/edac.git experimental

Ok, I checked this one out but can't find the edac_ce_error() function
as already stated above, pls check.

 They'll also be soon available at:
   git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac.git 
 hw_events_v18

Will review the patch below now and reply in another mail.

Thanks.

 
 Regards,
 Mauro
 
 -
 
 edac: Change internal representation to work with layers
 
 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-05-04 Thread Mauro Carvalho Chehab
Em 04-05-2012 06:52, Borislav Petkov escreveu:
 On Thu, May 03, 2012 at 11:16:54AM -0300, Mauro Carvalho Chehab wrote:
 +  bool enable_filter,
 +  unsigned pos[EDAC_MAX_LAYERS])

 Passing the whole array as an argument instead of only a pointer to it?

 This is C, and not C++ or Pascal. Only the pointer is passed here. The size
 of the array is used for type check only.
 
 Right, and you can see where he still has trouble. And by he I mean me :).

:)
 
 [ … ]
 
 +void edac_mc_handle_error(const enum hw_event_mc_err_type type,
 +struct mem_ctl_info *mci,
 +const unsigned long page_frame_number,
 +const unsigned long offset_in_page,
 +const unsigned long syndrome,
 +const int layer0,
 +const int layer1,
 +const int layer2,

 Instead of passing each layer as an arg, you can prepare the array pos[]
 in each edac_mc_hanlde_*() and pass around a pointer to it - you need it
 anyway in the edac_mc_inc*() functions.

 Yes, but the changes at the drivers will be more complex, without any reason:
 before each call to this function, they would need to create and fill a 
 temporary
 array.

 As there are only 3 layers, in the worse case, this way is simpler and more
 efficient. We can review it, if we ever need more than 3 layers.
 
 I see, the edac_mc_handle_error is the main interface for all edac drivers, 
 ok.
 
 [ … ]
 
 +  bool enable_filter = false;

 What does this enable_filter thing mean:

 if (pos[i] = 0)
 enable_filter = true;

 This absolutely needs explanation and better naming!

 Renamed it to enable_per_layer_report.
 
 Or detailed_dimm_report or ...

Detail is used on another context; enable_per_layer_report won't generate
any doubts for anyone reading the code.

 The code that implement it seems self-explained: 

 ..
  if (enable_filter  dimm-nr_pages) {
  if (p != label) {
  strcpy(p, OTHER_LABEL);
  p += strlen(OTHER_LABEL);
  }
  strcpy(p, dimm-label);
  p += strlen(p);
  *p = '\0';

 ..

  if (!enable_filter) {
  strcpy(label, any memory);
  } else {
  debugf4(%s: csrow/channel to increment: (%d,%d)\n,
  __func__, row, chan);
  if (p == label)
  strcpy(label, unknown memory);
  if (type == HW_EVENT_ERR_CORRECTED) {
  if (row = 0) {
  mci-csrows[row].ce_count++;
  if (chan = 0)
  
 mci-csrows[row].channels[chan].ce_count++;
  }
  } else
  if (row = 0)
  mci-csrows[row].ue_count++;
  }

 Theis flag indicates if is there any useful information about the affected
 DIMM(s) provided by the EDAC driver. If this is provided, the DIMM location 
 labels are
 filtered and reported, and the per-layer error counters are incremented.

 As it was discussed on previous reviews, with FB-DIMM MCs, and/or when 
 mirror 
 mode/lockstep mode is enabled, the memory controller points errors to 2 
 DIMMs 
 (or 4 DIMMs, when both mirror mode and lockstep mode are enabled) on most 
 memory
 controllers, under some conditions. The edac_mc_handle_fbd_ue() function 
 call were
 created due to that.

 When comparing with the old code, enable_filter = false would be 
 equivalent to call
 edac_mc_handle_ce_no_info/edac_mc_handle_ue_no_info.

 I'm adding a comment about it.
 
 Much better, thanks.
 
 Btw, I have to admit, this is a pretty strange way of handling the case
 where layers are { -1, -1, -1 }, i.e. edac_mc_handle_error is called
 with the no info hint.

Well, negative values are used on Linux to indicate error conditions, so this 
is not
that strange. Also, it allows partial no info, as, on some cases, a channel or
a csrow may not be known. So, this allows code simplification at the drivers. 

For example, look at this hunk on the amd64_edac conversion patch:

@@ -1585,16 +1609,10 @@ static void f1x_map_sysaddr_to_csrow(struct 
mem_ctl_info *mci, u64 sys_addr,
if (dct_ganging_enabled(pvt))
chan = get_channel_from_ecc_syndrome(mci, syndrome);
 
-   if (chan = 0)
-   edac_mc_handle_ce(mci, page, offset, syndrome, csrow, chan,
- EDAC_MOD_STR);
-   else
-   /*
-* Channel unknown, report all channels on this CSROW as failed.
-*/
-   for (chan = 0; chan  mci-csrows[csrow].nr_channels; chan++)
-   edac_mc_handle_ce(mci, page, offset, syndrome,
- csrow, chan, EDAC_MOD_STR);
+

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers, second version

2012-05-04 Thread Borislav Petkov
On Thu, May 03, 2012 at 11:16:54AM -0300, Mauro Carvalho Chehab wrote:
 edac: Change internal representation to work with layers
 
 From: Mauro Carvalho Chehab mche...@redhat.com
 
 Change the EDAC internal representation to work with non-csrow
 based memory controllers.
 
 There are lots of those memory controllers nowadays, and more
 are coming. So, the EDAC internal representation needs to be
 changed, in order to work with those memory controllers, while
 preserving backward compatibility with the old ones.
 
 The edac core was written with the idea that memory controllers
 are able to directly access csrows.
 
 This is not true for FB-DIMM and RAMBUS memory controllers.
 
 Also, some recent advanced memory controllers don't present a per-csrows
 view. Instead, they view memories as DIMMs, instead of ranks.
 
 So, change the allocation and error report routines to allow
 them to work with all types of architectures.
 
 This will allow the removal of several hacks with FB-DIMM and RAMBUS
 memory controllers.
 
 Also, several tests were done on different platforms using different
 x86 drivers.
 
 TODO: a multi-rank DIMMs are currently represented by multiple DIMM
 entries in struct dimm_info. That means that changing a label for one
 rank won't change the same label for the other ranks at the same DIMM.
 This bug is present since the beginning of the EDAC, so it is not a big
 deal. However, on several drivers, it is possible to fix this issue, but
 it should be a per-driver fix, as the csrow = DIMM arrangement may not
 be equal for all. So, don't try to fix it here yet.
 
 I tried to make this patch as short as possible, preceding it with
 several other patches that simplified the logic here. Yet, as the
 internal API changes, all drivers need changes. The changes are
 generally bigger in the drivers for FB-DIMMs.
 
 Cc: Aristeu Rozanski aroza...@redhat.com
 Cc: Doug Thompson nor...@yahoo.com
 Cc: Borislav Petkov borislav.pet...@amd.com
 Cc: Mark Gross mark.gr...@intel.com
 Cc: Jason Uhlenkott juhle...@akamai.com
 Cc: Tim Small t...@buttersideup.com
 Cc: Ranganathan Desikan r...@jetztechnologies.com
 Cc: Arvind R. arvin...@gmail.com
 Cc: Olof Johansson o...@lixom.net
 Cc: Egor Martovetsky e...@pasemi.com
 Cc: Chris Metcalf cmetc...@tilera.com
 Cc: Michal Marek mma...@suse.cz
 Cc: Jiri Kosina jkos...@suse.cz
 Cc: Joe Perches j...@perches.com
 Cc: Dmitry Eremin-Solenikov dbarysh...@gmail.com
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Cc: Hitoshi Mitake h.mit...@gmail.com
 Cc: Andrew Morton a...@linux-foundation.org
 Cc: Niklas Söderlund niklas.soderl...@ericsson.com
 Cc: Shaohui Xie shaohui@freescale.com
 Cc: Josh Boyer jwbo...@gmail.com
 Cc: linuxppc-dev@lists.ozlabs.org
 Signed-off-by: Mauro Carvalho Chehab mche...@redhat.com
 
 ---
 v18: Addresses the pointed issues on v17 review
 
 diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
 index e48ab31..1286c5e 100644
 --- a/drivers/edac/edac_core.h
 +++ b/drivers/edac/edac_core.h
 @@ -447,8 +447,12 @@ static inline void pci_write_bits32(struct pci_dev 
 *pdev, int offset,
  
  #endif   /* CONFIG_PCI */
  
 -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned 
 nr_csrows,
 -   unsigned nr_chans, int edac_index);
 +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
 +unsigned nr_chans, int edac_index);
 +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 +unsigned n_layers,
 +struct edac_mc_layer *layers,
 +unsigned sz_pvt);
  extern int edac_mc_add_mc(struct mem_ctl_info *mci);
  extern void edac_mc_free(struct mem_ctl_info *mci);
  extern struct mem_ctl_info *edac_mc_find(int idx);
 @@ -467,24 +471,78 @@ extern int edac_mc_find_csrow_by_page(struct 
 mem_ctl_info *mci,
   * reporting logic and function interface - reduces conditional
   * statement clutter and extra function arguments.
   */
 -extern void edac_mc_handle_ce(struct mem_ctl_info *mci,
 -   unsigned long page_frame_number,
 -   unsigned long offset_in_page,
 -   unsigned long syndrome, int row, int channel,
 -   const char *msg);
 -extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci,
 -   const char *msg);
 -extern void edac_mc_handle_ue(struct mem_ctl_info *mci,
 -   unsigned long page_frame_number,
 -   unsigned long offset_in_page, int row,
 -   const char *msg);
 -extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci,
 -   const char *msg);
 -extern void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, unsigned int 
 csrow,
 -   unsigned int channel0, 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers, second version

2012-05-04 Thread Mauro Carvalho Chehab
Em 04-05-2012 07:16, Borislav Petkov escreveu:
 On Thu, May 03, 2012 at 11:16:54AM -0300, Mauro Carvalho Chehab wrote:
 edac: Change internal representation to work with layers
...
  /**
 - * edac_mc_alloc: Allocate a struct mem_ctl_info structure
 - * @size_pvt:   size of private storage needed
 - * @nr_csrows:  Number of CWROWS needed for this MC
 - * @nr_chans:   Number of channels for the MC
 + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info 
 structure
 + * @mc_num: Memory controller number
 + * @n_layers:   Number of MC hierarchy layers
 + * layers:  Describes each layer as seen by the Memory Controller
 + * @size_pvt:   size of private storage needed
 + *
 + *
 + * Non-csrow based drivers (like FB-DIMM and RAMBUS ones) will likely report
 + * such DIMMS properly, but the CSROWS-based ones will likely do the wrong
 
  DIMMs   csrow-based
 
 + * thing, as two chip select values are used for dual-rank memories (and 4, 
 for
 + * quad-rank ones). I suspect that this issue could be solved inside the 
 EDAC
 + * core for SDRAM memories, but it requires further study at JEDEC JESD 21C.
 
 The paragraph above is still in, let me repeat my last note:
 
 This last paragraph sounds innacurately, especially the likely
 adverbs make it even more confusing. The gist of what you're saying is
 already present in the commit message anyway, so drop it here pls.

There are two similar comments one for edac_mc_alloc and another for 
new_edac_mc_alloc.
It seems that I fixed just one of them.

 
 + *
 + * In summary, solving this issue is not easy, as it requires a lot of 
 testing.
 
 As before:
 
 Also superfluous and has nothing to do with edac_mc_alloc().
 
 Pls remove it.

Dropped both paragraphs.

 
   * Everything is kmalloc'ed as one big chunk - more efficient.
   * Only can be used if all structures have the same lifetime - otherwise
 @@ -168,22 +196,49 @@ void *edac_align_ptr(void **p, unsigned size, int 
 n_elems)
   *
   * Use edac_mc_free() to free mc structures allocated by this function.
   *
 + * NOTE: drivers handle multi-rank memories in different ways: in some
 + * drivers, one multi-rank memory stick is mapped as one entry, while, in
 + * others, a single multi-rank memory stick would be mapped into several
 + * entries. Currently, this function will allocate multiple struct dimm_info
 + * on such scenarios, as grouping the multiple ranks require drivers change.
 + *
   * Returns:
   *  NULL allocation failed
   *  struct mem_ctl_info pointer
 
 Ok, this patch still doesn't contain all the changes I requested for
 although you said you did them. Is this not the latest version? I'll
 wait for you to sort it out before I continue reviewing...
 
 Thanks.
 

Patch enclosed (and updated at Infradead).

Regards,
Mauro

-

edac: Change internal representation to work with layers

Change the EDAC internal representation to work with non-csrow
based memory controllers.

There are lots of those memory controllers nowadays, and more
are coming. So, the EDAC internal representation needs to be
changed, in order to work with those memory controllers, while
preserving backward compatibility with the old ones.

The edac core was written with the idea that memory controllers
are able to directly access csrows.

This is not true for FB-DIMM and RAMBUS memory controllers.

Also, some recent advanced memory controllers don't present a per-csrows
view. Instead, they view memories as DIMMs, instead of ranks.

So, change the allocation and error report routines to allow
them to work with all types of architectures.

This will allow the removal of several hacks with FB-DIMM and RAMBUS
memory controllers.

Also, several tests were done on different platforms using different
x86 drivers.

TODO: a multi-rank DIMMs are currently represented by multiple DIMM
entries in struct dimm_info. That means that changing a label for one
rank won't change the same label for the other ranks at the same DIMM.
This bug is present since the beginning of the EDAC, so it is not a big
deal. However, on several drivers, it is possible to fix this issue, but
it should be a per-driver fix, as the csrow = DIMM arrangement may not
be equal for all. So, don't try to fix it here yet.

I tried to make this patch as short as possible, preceding it with
several other patches that simplified the logic here. Yet, as the
internal API changes, all drivers need changes. The changes are
generally bigger in the drivers for FB-DIMMs.

Cc: Aristeu Rozanski aroza...@redhat.com
Cc: Doug Thompson nor...@yahoo.com
Cc: Borislav Petkov borislav.pet...@amd.com
Cc: Mark Gross mark.gr...@intel.com
Cc: Jason Uhlenkott juhle...@akamai.com
Cc: Tim Small t...@buttersideup.com
Cc: Ranganathan Desikan r...@jetztechnologies.com
Cc: Arvind R. arvin...@gmail.com
Cc: Olof Johansson o...@lixom.net
Cc: Egor Martovetsky e...@pasemi.com
Cc: Chris Metcalf cmetc...@tilera.com
Cc: Michal Marek 

[PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-05-02 Thread Borislav Petkov
Starting a new thread because the old one grew too big and
is out of my screen. Patch below is git-formatted from
git://git.infradead.org/users/mchehab/edac.git

 commit 447b7929e633027ffe131f2f8f246bba5690cee7
 Author: Mauro Carvalho Chehab mche...@redhat.com
 Date:   Wed Apr 18 15:20:50 2012 -0300
 
 edac: Change internal representation to work with layers
 
 Change the EDAC internal representation to work with non-csrow
 based memory controllers.
 
 There are lots of those memory controllers nowadays, and more
 are coming. So, the EDAC internal representation needs to be
 changed, in order to work with those memory controllers, while
 preserving backward compatibility with the old ones.
 
 The edac core was written with the idea that memory controllers
 are able to directly access csrows.
 
 This is not true for FB-DIMM and RAMBUS memory controllers.
 
 Also, some recent advanced memory controllers don't present a per-csrows
 view. Instead, they view memories as DIMMs, instead of ranks.
 
 So, change the allocation and error report routines to allow
 them to work with all types of architectures.
 
 This will allow the removal of several hacks with FB-DIMM and RAMBUS
 memory controllers.
 
 Also, several tests were done on different platforms using different
 x86 drivers.
 
 TODO: a multi-rank DIMMs are currently represented by multiple DIMM
 entries in struct dimm_info. That means that changing a label for one
 rank won't change the same label for the other ranks at the same DIMM.
 This bug is present since the beginning of the EDAC, so it is not a big
 deal. However, on several drivers, it is possible to fix this issue, but
 it should be a per-driver fix, as the csrow = DIMM arrangement may not
 be equal for all. So, don't try to fix it here yet.
 
 I tried to make this patch as short as possible, preceding it with
 several other patches that simplified the logic here. Yet, as the
 internal API changes, all drivers need changes. The changes are
 generally bigger in the drivers for FB-DIMMs.

snip already reviewed stuff

   /* Figure out the offsets of the various items from the start of an mc
* structure.  We want the alignment of each item to be at least as
 @@ -191,12 +253,28 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, 
 unsigned nr_csrows,
* hardcode everything into a single struct.
*/
   mci = edac_align_ptr(ptr, sizeof(*mci), 1);
 - csi = edac_align_ptr(ptr, sizeof(*csi), nr_csrows);
 - chi = edac_align_ptr(ptr, sizeof(*chi), nr_csrows * nr_chans);
 - dimm = edac_align_ptr(ptr, sizeof(*dimm), nr_csrows * nr_chans);
 + layer = edac_align_ptr(ptr, sizeof(*layer), n_layers);
 + csi = edac_align_ptr(ptr, sizeof(*csi), tot_csrows);
 + chi = edac_align_ptr(ptr, sizeof(*chi), tot_csrows * tot_channels);
 + dimm = edac_align_ptr(ptr, sizeof(*dimm), tot_dimms);
 + count = 1;
 + for (i = 0; i  n_layers; i++) {
 + count *= layers[i].size;
 + debugf4(%s: errcount layer %d size %d\n, __func__, i, count);
 + ce_per_layer[i] = edac_align_ptr(ptr, sizeof(u32), count);
 + ue_per_layer[i] = edac_align_ptr(ptr, sizeof(u32), count);
 + tot_errcount += 2 * count;
 + }
 +
 + debugf4(%s: allocating %d error counters\n, __func__, tot_errcount);
   pvt = edac_align_ptr(ptr, sz_pvt, 1);
   size = ((unsigned long)pvt) + sz_pvt;
  
 + debugf1(%s(): allocating %u bytes for mci data (%d %s, %d 
 csrows/channels)\n,
 + __func__, size,
 + tot_dimms,
 + per_rank ? ranks : dimms,
 + tot_csrows * tot_channels);
   mci = kzalloc(size, GFP_KERNEL);
   if (mci == NULL)
   return NULL;
 @@ -204,42 +282,101 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, 
 unsigned nr_csrows,
   /* Adjust pointers so they point within the memory we just allocated
* rather than an imaginary chunk of memory located at address 0.
*/
 + layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned 
 long)layer));
   csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi));
   chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi));
   dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm));
 + for (i = 0; i  n_layers; i++) {
 + mci-ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned 
 long)ce_per_layer[i]));
 + mci-ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned 
 long)ue_per_layer[i]));
 + }
   pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL;
  
   /* setup index and various internal pointers */
   mci-mc_idx = edac_index;
   mci-csrows = csi;
   mci-dimms  = dimm;
 + mci-tot_dimms = tot_dimms;
   mci-pvt_info = pvt;
 - mci-nr_csrows 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Borislav Petkov
On Sun, Apr 29, 2012 at 11:16:53AM -0300, Mauro Carvalho Chehab wrote:
  Hey, are you looking at compiled code or at source code? Because I'm
  looking at source code, and it is a pretty safe bet the majority of the
  people here do that too.
 
 What I said is that, from source code POV, a code where the loop variables are
 initialized just before the loop is easier to read it when the initialization
 of those vars are on another part of the code.
 
 That's basically why the for syntax starts with a var initialization clause.
 
 The tot_dimms  friends are loop vars: their value is calculated within the 
 loop.
 
 At the object code, this won't bring any difference.
 
  
  it, either by using registers for those vars or by moving the 
  initialization
  to the top of the function.
 
  This function is too complex, so it is better to initialize those vars
  just before the loops that are calculating those totals.
  
  Simply initialize those variables at declaration time and that's it.
  Initializing them before the loop doesn't make the function less complex
  - splitting it and sanitizing it does.
 
 Initializing loop-calculated vars just before the loop makes the code easier
 to read, and may avoid issues that might happen during code lifecycle.

This is getting ridiculous: the variable declaration and initialization
are on the same screen as the loop (unless one uses a screen which can
only show less than 40ish lines).

So the argument about making the code easier to read is bogus.

This function is already cluttered with a lot of crap, and is very large
so adding more lines which can simply be stashed away at declaration
time is better readability.

Besides, every modern editor can jump to the declaration of a local
variable so that the user can see to what it is initialized to.

 +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 +unsigned n_layers,
 +struct edac_mc_layer *layers,
 +bool rev_order,
 +unsigned sz_pvt)
  {
   void *ptr = NULL;
   struct mem_ctl_info *mci;
 - struct csrow_info *csi, *csrow;
 + struct edac_mc_layer *layer;
 + struct csrow_info *csi, *csr;
   struct rank_info *chi, *chp, *chan;
   struct dimm_info *dimm;
 + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
   void *pvt;
 - unsigned size;
 - int row, chn;
 + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
 + unsigned tot_csrows, tot_channels, tot_errcount = 0;
 + int i, j;
   int err;
 + int row, chn;
 + bool per_rank = false;
 +
 + BUG_ON(n_layers  EDAC_MAX_LAYERS || n_layers == 0);
 + /*
 +  * Calculate the total amount of dimms and csrows/cschannels while
 +  * in the old API emulation mode
 +  */
 + tot_dimms = 1;
 + tot_channels = 1;
 + tot_csrows = 1;
 + for (i = 0; i  n_layers; i++) {
 + tot_dimms *= layers[i].size;
 + if (layers[i].is_virt_csrow)
 + tot_csrows *= layers[i].size;
 + else
 + tot_channels *= layers[i].size;
 +
 + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
 + per_rank = true;


-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Borislav Petkov
On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
  [   10.486440] EDAC MC: DCT0 chip selects:
  [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
  [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
  [   10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB
  [   10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB
  [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM 
  Bank Address Mapping): 0x0088
  [   10.486455] EDAC MC: DCT1 chip selects:
  [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
  [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
  [   10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB
  [   10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB
  [   10.486467] EDAC amd64: using x8 syndromes.
  [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 
  0x00083100
  [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; 
  all DIMMs support ECC: yes
  [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
  [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 
  64b
  [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs 
  present: L0: yes L1: yes L2: no L3: no
  [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 
  bits - need more decoding
  [   10.486488] EDAC amd64: MCT channel count: 2
  [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): 
  allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
  [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 
  (0:0:0): row 0, chan 0
  [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 
  (0:1:0): row 0, chan 1
  [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 
  (1:0:0): row 1, chan 0
  [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 
  (1:1:0): row 1, chan 1
  [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 
  (2:0:0): row 2, chan 0
  [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 
  (2:1:0): row 2, chan 1
  [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 
  (3:0:0): row 3, chan 0
  [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 
  (3:1:0): row 3, chan 1
  [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 
  (4:0:0): row 4, chan 0
  [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 
  (4:1:0): row 4, chan 1
  [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 
  (5:0:0): row 5, chan 0
  [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 
  (5:1:0): row 5, chan 1
  [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 
  (6:0:0): row 6, chan 0
  [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 
  (6:1:0): row 6, chan 1
  [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 
  (7:0:0): row 7, chan 0
  [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 
  (7:1:0): row 7, chan 1
  
  DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
  
  Now your change is showing 16 ranks. Still b0rked.
  
 No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.
 
 As I said before when you've pointed this bug (likel at v3 review), 
 edac_mc_alloc
 doesn't know how many ranks are filled, as the driver logic first calls it to 
 allocate for the max amount of ranks, and then fills the rank with their info 
 (or let them untouched with 0 pages, if they're empty).

Basically you're saying you're generating dimm_info structs for all
_possible_ dimms and the loop where this debug message comes from goes
and marrily initializes them all although some of them are empty:

+   for (i = 0; i  tot_dimms; i++) {
+   chan = csi[row].channels[chn];
+   dimm = EDAC_DIMM_PTR(lay, mci-dimms, n_layers,
+  pos[0], pos[1], pos[2]);
+   dimm-mci = mci;
+
+   debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, 
__func__,
+   i, (dimm - mci-dimms),
+   pos[0], pos[1], pos[2], row, chn);
+
+   /* Copy DIMM location */
+   for (j = 0; j  n_layers; j++)
+   dimm-location[j] = pos[j];
...

definitely superfluous.

Oh well, looking at edac_mc_alloc, it used to allocate structs for all
csrows on the controller even though some of them were empty...

Ok, then please remove this debug call because it is misleading. Having

[   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci 
data (16 ranks, 16 csrows/channels)

is enough.

You probably want to say how many channels/csrows there are, though:

[   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci 
data (16 ranks, 8 csrows, 2 channels)

or 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Mauro Carvalho Chehab
Em 30-04-2012 05:15, Borislav Petkov escreveu:
 On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
 [   10.486440] EDAC MC: DCT0 chip selects:
 [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
 [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
 [   10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB
 [   10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB
 [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM 
 Bank Address Mapping): 0x0088
 [   10.486455] EDAC MC: DCT1 chip selects:
 [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
 [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
 [   10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB
 [   10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB
 [   10.486467] EDAC amd64: using x8 syndromes.
 [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 
 0x00083100
 [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; 
 all DIMMs support ECC: yes
 [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
 [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 
 64b
 [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs 
 present: L0: yes L1: yes L2: no L3: no
 [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 
 bits - need more decoding
 [   10.486488] EDAC amd64: MCT channel count: 2
 [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): 
 allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
 [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 
 (0:0:0): row 0, chan 0
 [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 
 (0:1:0): row 0, chan 1
 [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 
 (1:0:0): row 1, chan 0
 [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 
 (1:1:0): row 1, chan 1
 [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 
 (2:0:0): row 2, chan 0
 [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 
 (2:1:0): row 2, chan 1
 [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 
 (3:0:0): row 3, chan 0
 [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 
 (3:1:0): row 3, chan 1
 [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 
 (4:0:0): row 4, chan 0
 [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 
 (4:1:0): row 4, chan 1
 [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 
 (5:0:0): row 5, chan 0
 [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 
 (5:1:0): row 5, chan 1
 [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 
 (6:0:0): row 6, chan 0
 [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 
 (6:1:0): row 6, chan 1
 [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 
 (7:0:0): row 7, chan 0
 [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 
 (7:1:0): row 7, chan 1

 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.

 Now your change is showing 16 ranks. Still b0rked.

 No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.

 As I said before when you've pointed this bug (likel at v3 review), 
 edac_mc_alloc
 doesn't know how many ranks are filled, as the driver logic first calls it 
 to 
 allocate for the max amount of ranks, and then fills the rank with their 
 info 
 (or let them untouched with 0 pages, if they're empty).
 
 Basically you're saying you're generating dimm_info structs for all
 _possible_ dimms and the loop where this debug message comes from goes
 and marrily initializes them all although some of them are empty:
 
 +   for (i = 0; i  tot_dimms; i++) {
 +   chan = csi[row].channels[chn];
 +   dimm = EDAC_DIMM_PTR(lay, mci-dimms, n_layers,
 +  pos[0], pos[1], pos[2]);
 +   dimm-mci = mci;
 +
 +   debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, 
 __func__,
 +   i, (dimm - mci-dimms),
 +   pos[0], pos[1], pos[2], row, chn);
 +
 +   /* Copy DIMM location */
 +   for (j = 0; j  n_layers; j++)
 +   dimm-location[j] = pos[j];
 ...
 
 definitely superfluous.
 
 Oh well, looking at edac_mc_alloc, it used to allocate structs for all
 csrows on the controller even though some of them were empty...
 
 Ok, then please remove this debug call because it is misleading. Having
 
 [   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci 
 data (16 ranks, 16 csrows/channels)
 
 is enough.
 
 You probably want to say how many channels/csrows there are, though:
 
 [   10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci 
 data (16 ranks, 8 csrows, 2 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Borislav Petkov
On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:
 It seems you have a very short memory.

Oh, puh-lease, let's don't start with the insults now. You're not a
saint yourself. And maybe the fact that I'm having hard time grasping
your code is maybe because it is a load of crap and you seem to generate
a lot of senseless drivel when explaining what it does. And don't get me
started on the patch bombs.

So, let's stay constructive here before I, as the last and only one
person reviewing this stinking pile stops messing with it (I got other
stuff to do, you know) and NACK it completely.

 For example, this is the mapping used by the second memory controller of the 
 SB machine
 I'm using on my tests:
 
 [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
 ...
 [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 
 bytes for mci data (12 dimms, 12 csrows/channels)
 [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms
 [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): 
 row 0, chan 0
 [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): 
 row 0, chan 1
 [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): 
 row 0, chan 2
 [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): 
 row 0, chan 3
 [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): 
 row 1, chan 0
 [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): 
 row 1, chan 1
 [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): 
 row 1, chan 2
 [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): 
 row 1, chan 3
 [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): 
 row 2, chan 0
 [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): 
 row 2, chan 1
 [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): 
 row 2, chan 2
 [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): 
 row 2, chan 3
 
 With the above info, it is clear that the DIMM located at mc#1, channel#3 
 slot#2 is
 called dimm11 at the new API, and corresponds to csrow 2, channel 3 for a 
 legacy
 EDAC API call.

Are all those DIMM slots above populated? What happens if they're not,
are you issuing the same dimm0-dimm11 lines for slots which aren't even
populated?

I have a much better idea: Generally, this debug info should come from
the specific driver that allocates the dimm descriptors, not from the
EDAC core. This way, you know in the driver which slots are populated
and those which are not should be omitted.

This way it says initializing 12 dimms and the user thinks there are
12 DIMMs on his system where this might not be true.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Mauro Carvalho Chehab
Em 30-04-2012 04:59, Borislav Petkov escreveu:
 On Sun, Apr 29, 2012 at 11:16:53AM -0300, Mauro Carvalho Chehab wrote:
 Hey, are you looking at compiled code or at source code? Because I'm
 looking at source code, and it is a pretty safe bet the majority of the
 people here do that too.

 What I said is that, from source code POV, a code where the loop variables 
 are
 initialized just before the loop is easier to read it when the initialization
 of those vars are on another part of the code.

 That's basically why the for syntax starts with a var initialization 
 clause.

 The tot_dimms  friends are loop vars: their value is calculated within the 
 loop.

 At the object code, this won't bring any difference.


 it, either by using registers for those vars or by moving the 
 initialization
 to the top of the function.

 This function is too complex, so it is better to initialize those vars
 just before the loops that are calculating those totals.

 Simply initialize those variables at declaration time and that's it.
 Initializing them before the loop doesn't make the function less complex
 - splitting it and sanitizing it does.

 Initializing loop-calculated vars just before the loop makes the code easier
 to read, and may avoid issues that might happen during code lifecycle.
 
 This is getting ridiculous:

With this I fully agree: you're nacking patches because it is not the way you
write your code, not because the code there is doing anything wrong.

If you point anything wrong on the way I wrote, then I'll fix. Otherwise, why
should I do a change that will obfuscate the code?

 the variable declaration and initialization
 are on the same screen as the loop (unless one uses a screen which can
 only show less than 40ish lines).
 
 So the argument about making the code easier to read is bogus.
 
 This function is already cluttered with a lot of crap, and is very large
 so adding more lines which can simply be stashed away at declaration
 time is better readability.
 
 Besides, every modern editor can jump to the declaration of a local
 variable so that the user can see to what it is initialized to.

The editor used by te developer is not relevant. This is not a reason
to obfuscate the code.

 +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 +unsigned n_layers,
 +struct edac_mc_layer *layers,
 +bool rev_order,
 +unsigned sz_pvt)
  {
   void *ptr = NULL;
   struct mem_ctl_info *mci;
 - struct csrow_info *csi, *csrow;
 + struct edac_mc_layer *layer;
 + struct csrow_info *csi, *csr;
   struct rank_info *chi, *chp, *chan;
   struct dimm_info *dimm;
 + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
   void *pvt;
 - unsigned size;
 - int row, chn;
 + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
 + unsigned tot_csrows, tot_channels, tot_errcount = 0;
 + int i, j;
   int err;
 + int row, chn;
 + bool per_rank = false;
 +
 + BUG_ON(n_layers  EDAC_MAX_LAYERS || n_layers == 0);
 + /*
 +  * Calculate the total amount of dimms and csrows/cschannels while
 +  * in the old API emulation mode
 +  */
 + tot_dimms = 1;
 + tot_channels = 1;
 + tot_csrows = 1;
 + for (i = 0; i  n_layers; i++) {
 + tot_dimms *= layers[i].size;
 + if (layers[i].is_virt_csrow)
 + tot_csrows *= layers[i].size;
 + else
 + tot_channels *= layers[i].size;
 +
 + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
 + per_rank = true;

Regards,
Mauro

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Mauro Carvalho Chehab
Em 30-04-2012 05:15, Borislav Petkov escreveu:
 On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote:
 [   10.486440] EDAC MC: DCT0 chip selects:
 [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
 [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
 [   10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB
 [   10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB
 [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM 
 Bank Address Mapping): 0x0088
 [   10.486455] EDAC MC: DCT1 chip selects:
 [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
 [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
 [   10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB
 [   10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB
 [   10.486467] EDAC amd64: using x8 syndromes.
 [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 
 0x00083100
 [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; 
 all DIMMs support ECC: yes
 [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
 [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 
 64b
 [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs 
 present: L0: yes L1: yes L2: no L3: no
 [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 
 bits - need more decoding
 [   10.486488] EDAC amd64: MCT channel count: 2
 [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): 
 allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels)
 [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 
 (0:0:0): row 0, chan 0
 [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 
 (0:1:0): row 0, chan 1
 [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 
 (1:0:0): row 1, chan 0
 [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 
 (1:1:0): row 1, chan 1
 [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 
 (2:0:0): row 2, chan 0
 [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 
 (2:1:0): row 2, chan 1
 [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 
 (3:0:0): row 3, chan 0
 [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 
 (3:1:0): row 3, chan 1
 [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 
 (4:0:0): row 4, chan 0
 [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 
 (4:1:0): row 4, chan 1
 [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 
 (5:0:0): row 5, chan 0
 [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 
 (5:1:0): row 5, chan 1
 [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 
 (6:0:0): row 6, chan 0
 [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 
 (6:1:0): row 6, chan 1
 [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 
 (7:0:0): row 7, chan 0
 [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 
 (7:1:0): row 7, chan 1

 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.

 Now your change is showing 16 ranks. Still b0rked.

 No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.

 As I said before when you've pointed this bug (likel at v3 review), 
 edac_mc_alloc
 doesn't know how many ranks are filled, as the driver logic first calls it 
 to 
 allocate for the max amount of ranks, and then fills the rank with their 
 info 
 (or let them untouched with 0 pages, if they're empty).
 
 Basically you're saying you're generating dimm_info structs for all
 _possible_ dimms and the loop where this debug message comes from goes
 and marrily initializes them all although some of them are empty:
 
 +   for (i = 0; i  tot_dimms; i++) {
 +   chan = csi[row].channels[chn];
 +   dimm = EDAC_DIMM_PTR(lay, mci-dimms, n_layers,
 +  pos[0], pos[1], pos[2]);
 +   dimm-mci = mci;
 +
 +   debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, 
 __func__,
 +   i, (dimm - mci-dimms),
 +   pos[0], pos[1], pos[2], row, chn);
 +
 +   /* Copy DIMM location */
 +   for (j = 0; j  n_layers; j++)
 +   dimm-location[j] = pos[j];
 ...
 
 definitely superfluous.

This is the way the EDAC core works: everything is allocated, on one shot, when 
this
function is called, and, on most drivers, before the code that probes how many 
DIMMS/ranks
got initialized. That happens because the edac_mc_alloc() arguments provide the 
total
amount of ranks/dimms, but doesn't say anything about what is used there.

Changing from this model to another model that would dynamically initialize the 
per-dimm/rank
data is possible, but that would require another set of patches that will touch 
on all

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Mauro Carvalho Chehab
Em 30-04-2012 08:11, Borislav Petkov escreveu:
 On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:

 For example, this is the mapping used by the second memory controller of the 
 SB machine
 I'm using on my tests:

 [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
 ...
 [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 
 bytes for mci data (12 dimms, 12 csrows/channels)
 [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 
 dimms
 [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): 
 row 0, chan 0
 [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): 
 row 0, chan 1
 [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): 
 row 0, chan 2
 [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): 
 row 0, chan 3
 [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): 
 row 1, chan 0
 [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): 
 row 1, chan 1
 [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): 
 row 1, chan 2
 [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): 
 row 1, chan 3
 [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): 
 row 2, chan 0
 [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): 
 row 2, chan 1
 [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): 
 row 2, chan 2
 [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): 
 row 2, chan 3

 With the above info, it is clear that the DIMM located at mc#1, channel#3 
 slot#2 is
 called dimm11 at the new API, and corresponds to csrow 2, channel 3 for 
 a legacy
 EDAC API call.
 
 Are all those DIMM slots above populated? What happens if they're not,
 are you issuing the same dimm0-dimm11 lines for slots which aren't even
 populated?
 
 I have a much better idea: Generally, this debug info should come from
 the specific driver that allocates the dimm descriptors, not from the
 EDAC core. This way, you know in the driver which slots are populated
 and those which are not should be omitted.

The drivers don't allocate the dimm descriptors. They're allocated by the
core.

 This way it says initializing 12 dimms and the user thinks there are
 12 DIMMs on his system where this might not be true.


I'm OK to remove the initializing 12 dimms message. It doesn't add anything
new.

With regards do the other messages, if the debug messages are not clear, 
then let's fix them, instead of removing. What if we print, instead,
on a message like:

row 1, chan 1 will represent dimm5 (1:2:0) if not empty

Regards,
Mauro
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Borislav Petkov
On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote:
 Em 30-04-2012 08:11, Borislav Petkov escreveu:
  On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:
 
  For example, this is the mapping used by the second memory controller of 
  the SB machine
  I'm using on my tests:
 
  [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2)
  ...
  [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 
  bytes for mci data (12 dimms, 12 csrows/channels)
  [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 
  dimms
  [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): 
  row 0, chan 0
  [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): 
  row 0, chan 1
  [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): 
  row 0, chan 2
  [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): 
  row 0, chan 3
  [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): 
  row 1, chan 0
  [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): 
  row 1, chan 1
  [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): 
  row 1, chan 2
  [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): 
  row 1, chan 3
  [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): 
  row 2, chan 0
  [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): 
  row 2, chan 1
  [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 
  (3:1:0): row 2, chan 2
  [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 
  (3:2:0): row 2, chan 3
 
  With the above info, it is clear that the DIMM located at mc#1, channel#3 
  slot#2 is
  called dimm11 at the new API, and corresponds to csrow 2, channel 3 
  for a legacy
  EDAC API call.
  
  Are all those DIMM slots above populated? What happens if they're not,
  are you issuing the same dimm0-dimm11 lines for slots which aren't even
  populated?
  
  I have a much better idea: Generally, this debug info should come from
  the specific driver that allocates the dimm descriptors, not from the
  EDAC core. This way, you know in the driver which slots are populated
  and those which are not should be omitted.
 
 The drivers don't allocate the dimm descriptors. They're allocated by the
 core.

I know that. The drivers call into EDAC core using edac_mc_alloc, this
is what I meant above.

  This way it says initializing 12 dimms and the user thinks there are
  12 DIMMs on his system where this might not be true.
 
 
 I'm OK to remove the initializing 12 dimms message. It doesn't add anything
 new.
 
 With regards do the other messages, if the debug messages are not clear, 
 then let's fix them, instead of removing. What if we print, instead,
 on a message like:
 
   row 1, chan 1 will represent dimm5 (1:2:0) if not empty

How about the following instead: the specific driver calls
edac_mc_alloc(), it gets the allocated dimm array in mci-dimms
_without_ dumping each dimm%d line. Then, each driver figures out which
subset of that dimms array actually has populated slots and prints only
the populated rank/slot/...

This information is much more valuable than saying how many _possible_
slots the edac core has allocated.

Then, each driver can decide whether it makes sense to dump that info or
not.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Borislav Petkov
On Mon, Apr 30, 2012 at 08:23:42AM -0300, Mauro Carvalho Chehab wrote:
 With this I fully agree: you're nacking patches because it is not the way you

Where? Have I written Nacked-by somewhere?

 write your code, not because the code there is doing anything wrong.
 
 If you point anything wrong on the way I wrote, then I'll fix. Otherwise, why
 should I do a change that will obfuscate the code?

What obfuscation are you talking about? Having the initialization of
variables along with their declaration is not it.

Now let's look what you're doing:

  + unsigned tot_csrows, tot_channels, tot_errcount = 0;
  + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];

just to reassign 1 to some of them

  + tot_dimms = 1;
  + tot_channels = 1;
  + tot_csrows = 1;

a couple of lines below.

Now this is misleading.

Now let's look at what I'm proposing

unsigned tot_dimms = 1;
unsigned tot_csrows = 1;
unsigned tot_channels = 1;

How is this an obfuscation? It is basic code layout practices.

 The editor used by te developer is not relevant. This is not a reason
 to obfuscate the code.
 
  +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
  +unsigned n_layers,
  +struct edac_mc_layer *layers,
  +bool rev_order,
  +unsigned sz_pvt)
   {
void *ptr = NULL;
struct mem_ctl_info *mci;
  - struct csrow_info *csi, *csrow;
  + struct edac_mc_layer *layer;
  + struct csrow_info *csi, *csr;
struct rank_info *chi, *chp, *chan;
struct dimm_info *dimm;
  + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
void *pvt;
  - unsigned size;
  - int row, chn;
  + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
  + unsigned tot_csrows, tot_channels, tot_errcount = 0;
  + int i, j;
int err;
  + int row, chn;
  + bool per_rank = false;
  +
  + BUG_ON(n_layers  EDAC_MAX_LAYERS || n_layers == 0);
  + /*
  +  * Calculate the total amount of dimms and csrows/cschannels while
  +  * in the old API emulation mode
  +  */
  + tot_dimms = 1;
  + tot_channels = 1;
  + tot_csrows = 1;
  + for (i = 0; i  n_layers; i++) {
  + tot_dimms *= layers[i].size;
  + if (layers[i].is_virt_csrow)
  + tot_csrows *= layers[i].size;
  + else
  + tot_channels *= layers[i].size;
  +
  + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT)
  + per_rank = true;

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Mauro Carvalho Chehab
Em 30-04-2012 09:38, Borislav Petkov escreveu:
 On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote:
 Em 30-04-2012 08:11, Borislav Petkov escreveu:
 On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:

 This way it says initializing 12 dimms and the user thinks there are
 12 DIMMs on his system where this might not be true.


 I'm OK to remove the initializing 12 dimms message. It doesn't add anything
 new.

 With regards do the other messages, if the debug messages are not clear, 
 then let's fix them, instead of removing. What if we print, instead,
 on a message like:

  row 1, chan 1 will represent dimm5 (1:2:0) if not empty
 
 How about the following instead: the specific driver calls
 edac_mc_alloc(), it gets the allocated dimm array in mci-dimms
 _without_ dumping each dimm%d line. Then, each driver figures out which
 subset of that dimms array actually has populated slots and prints only
 the populated rank/slot/...
 
 This information is much more valuable than saying how many _possible_
 slots the edac core has allocated.
 
 Then, each driver can decide whether it makes sense to dump that info or
 not.

No, that would add extra complexity at the drivers level just due to debug
messages. I think that the better is to move this printk to the debug-specific
routine that is called only when the dimm is filled (edac_mc_dump_dimm).

With this cange, the message will be printed only for the filled dimms.

This is a cleanup patch, so I'll write it, together with the change that
will get rid of the loop that uses KERN_CONT. It will use a function added
by a latter patch at edac_mc_sysfs so it can't be merged on this patch
anyway.

Regards,
Mauro
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-30 Thread Mauro Carvalho Chehab
Em 30-04-2012 10:00, Mauro Carvalho Chehab escreveu:
 Em 30-04-2012 09:38, Borislav Petkov escreveu:
 On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote:
 Em 30-04-2012 08:11, Borislav Petkov escreveu:
 On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote:
 
 This way it says initializing 12 dimms and the user thinks there are
 12 DIMMs on his system where this might not be true.


 I'm OK to remove the initializing 12 dimms message. It doesn't add 
 anything
 new.

 With regards do the other messages, if the debug messages are not clear, 
 then let's fix them, instead of removing. What if we print, instead,
 on a message like:

 row 1, chan 1 will represent dimm5 (1:2:0) if not empty

 How about the following instead: the specific driver calls
 edac_mc_alloc(), it gets the allocated dimm array in mci-dimms
 _without_ dumping each dimm%d line. Then, each driver figures out which
 subset of that dimms array actually has populated slots and prints only
 the populated rank/slot/...

 This information is much more valuable than saying how many _possible_
 slots the edac core has allocated.

 Then, each driver can decide whether it makes sense to dump that info or
 not.
 
 No, that would add extra complexity at the drivers level just due to debug
 messages. I think that the better is to move this printk to the debug-specific
 routine that is called only when the dimm is filled (edac_mc_dump_dimm).
 
 With this cange, the message will be printed only for the filled dimms.
 
 This is a cleanup patch, so I'll write it, together with the change that
 will get rid of the loop that uses KERN_CONT. It will use a function added
 by a latter patch at edac_mc_sysfs so it can't be merged on this patch
 anyway.

The following patch dos the debug cleanup. I'll add at the end of my tree.

Regards,
Mauro.


From: Mauro Carvalho Chehab mche...@redhat.com
Date: Mon, 30 Apr 2012 10:24:43 -0300
Subject: [PATCH] edac_mc: Cleanup per-dimm_info debug messages

The edac_mc_alloc() routine allocates one dimm_info device for all
possible memories, including the non-filled ones. The debug messages
there are somewhat confusing. So, cleans them, by moving the code
that prints the memory location to edac_mc, and using it on both
edac_mc_sysfs and edac_mc.

After this patch, a dimm-based memory controller will print the debug
info as:

[  728.430828] EDAC DEBUG: edac_mc_dump_dimm:   dimm2: channel 0 slot 2 mapped 
as virtual row 0, chan 2
[  728.430834] EDAC DEBUG: edac_mc_dump_dimm:   dimm-label = 
'mc#0channel#0slot#2'
[  728.430839] EDAC DEBUG: edac_mc_dump_dimm:   dimm-nr_pages = 0x0
[  728.430846] EDAC DEBUG: edac_mc_dump_dimm:   dimm-grain = 0
[  728.430850] EDAC DEBUG: edac_mc_dump_dimm:   dimm-nr_pages = 0x0

(a rank-based memory controller would print, instead, rank2
 on the above debug info)

Signed-off-by: Mauro Carvalho Chehab mche...@redhat.com

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index d8278b3..1bc2843 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -40,6 +40,25 @@
 static DEFINE_MUTEX(mem_ctls_mutex);
 static LIST_HEAD(mc_devices);
 
+unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf,
+int len)
+{
+   struct mem_ctl_info *mci = dimm-mci;
+   int i, n, count = 0;
+   char *p = buf;
+
+   for (i = 0; i  mci-n_layers; i++) {
+   n = snprintf(p, len, %s %d ,
+ edac_layer_name[mci-layers[i].type],
+ dimm-location[i]);
+   p += n;
+   len -= n;
+   count += n;
+   }
+
+   return count;
+}
+
 #ifdef CONFIG_EDAC_DEBUG
 
 static void edac_mc_dump_channel(struct rank_info *chan)
@@ -50,20 +69,18 @@ static void edac_mc_dump_channel(struct rank_info *chan)
debugf4(\tchannel-dimm = %p\n, chan-dimm);
 }
 
-static void edac_mc_dump_dimm(struct dimm_info *dimm)
+static void edac_mc_dump_dimm(struct dimm_info *dimm, int number)
 {
-   int i;
+   char location[80];
+
+   edac_dimm_info_location(dimm, location, sizeof(location));
 
debugf4(\tdimm = %p\n, dimm);
+   debugf4(\t%s%i: %smapped as virtual row %d, chan %d\n,
+   dimm-mci-mem_is_per_rank ? rank : dimm,
+   number, location, dimm-csrow, dimm-cschannel);
debugf4(\tdimm-label = '%s'\n, dimm-label);
debugf4(\tdimm-nr_pages = 0x%x\n, dimm-nr_pages);
-   debugf4(\tdimm location );
-   for (i = 0; i  dimm-mci-n_layers; i++) {
-   printk(KERN_CONT %d, dimm-location[i]);
-   if (i  dimm-mci-n_layers - 1)
-   printk(KERN_CONT .);
-   }
-   printk(KERN_CONT \n);
debugf4(\tdimm-grain = %d\n, dimm-grain);
debugf4(\tdimm-nr_pages = 0x%x\n, dimm-nr_pages);
 }
@@ -337,8 +354,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned edac_index,
memset(pos, 0, sizeof(pos));
row = 0;

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-29 Thread Mauro Carvalho Chehab
Em 28-04-2012 06:05, Borislav Petkov escreveu:
 On Fri, Apr 27, 2012 at 12:36:12PM -0300, Mauro Carvalho Chehab wrote:
 The fix for it were in another patch[1], as calling them as rank is
 needed also at the sysfs API.
 
 No, this doesn't fix it either:
 
 [   10.486440] EDAC MC: DCT0 chip selects:
 [   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
 [   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
 [   10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB
 [   10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB
 [   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank 
 Address Mapping): 0x0088
 [   10.486455] EDAC MC: DCT1 chip selects:
 [   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
 [   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
 [   10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB
 [   10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB
 [   10.486467] EDAC amd64: using x8 syndromes.
 [   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 
 0x00083100
 [   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all 
 DIMMs support ECC: yes
 [   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
 [   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 
 64b
 [   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs 
 present: L0: yes L1: yes L2: no L3: no
 [   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 
 bits - need more decoding
 [   10.486488] EDAC amd64: MCT channel count: 2
 [   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 
 3692 bytes for mci data (16 ranks, 16 csrows/channels)
 [   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 
 (0:0:0): row 0, chan 0
 [   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 
 (0:1:0): row 0, chan 1
 [   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 
 (1:0:0): row 1, chan 0
 [   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 
 (1:1:0): row 1, chan 1
 [   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 
 (2:0:0): row 2, chan 0
 [   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 
 (2:1:0): row 2, chan 1
 [   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 
 (3:0:0): row 3, chan 0
 [   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 
 (3:1:0): row 3, chan 1
 [   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 
 (4:0:0): row 4, chan 0
 [   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 
 (4:1:0): row 4, chan 1
 [   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 
 (5:0:0): row 5, chan 0
 [   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 
 (5:1:0): row 5, chan 1
 [   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 
 (6:0:0): row 6, chan 0
 [   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 
 (6:1:0): row 6, chan 1
 [   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 
 (7:0:0): row 7, chan 0
 [   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 
 (7:1:0): row 7, chan 1
 
 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.
 
 Now your change is showing 16 ranks. Still b0rked.
 
No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK.

As I said before when you've pointed this bug (likel at v3 review), 
edac_mc_alloc
doesn't know how many ranks are filled, as the driver logic first calls it to 
allocate for the max amount of ranks, and then fills the rank with their info 
(or let them untouched with 0 pages, if they're empty).

Regards,
Mauro
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-29 Thread Mauro Carvalho Chehab
Em 28-04-2012 14:07, Joe Perches escreveu:
 On Sat, 2012-04-28 at 11:16 +0200, Borislav Petkov wrote:
 On Fri, Apr 27, 2012 at 02:52:35PM -0300, Mauro Carvalho Chehab wrote:
 All those local variables should be sorted in a reverse christmas tree
 order:

u32 this_is_the_longest_array_name[LENGTH];
void *shorter_named_variable;
unsigned long size;
int i;

...

 Why? There's nothing at the CodingStyle saying about how the vars should
 be ordered. If you want to enforce some particular order, please do it
 yourself, but apply it consistently among the entire subsystem.

 First of all, this way it is more readable.
 
 Not in my opinion, and blindly using reverse christmas tree
 can separate variables that should be declared together.

I agree with Joe. The order won't make the code easier or harder to
read, nor it would improve code performance.

 Second of all, maybe we should hold it down in CodingStyle.

Different developers have different opinions about how to order includes, 
functions, vars, etc. 

So, this is not at CodingStyle because there's no consensus about it, and 
because this is not relevant for code understanding.

A reviewer should not reject a patch just because he doesn't like the
order that the developer used.

Regards,
Mauro
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-29 Thread Mauro Carvalho Chehab
Em 28-04-2012 06:16, Borislav Petkov escreveu:
 On Fri, Apr 27, 2012 at 02:52:35PM -0300, Mauro Carvalho Chehab wrote:
 

 Also, is it valid to have n_layers == 0? The memcpy call below
 will do nothing.

 Changed to:
  BUG_ON(n_layers  EDAC_MAX_LAYERS || n_layers == 0);
 
 Really? Look below.

Weird, not sure what happened here... it seems I sent the version before
this change.

The patch I've made has the correct BUG_ON at:

http://git.infradead.org/users/mchehab/edac.git/commitdiff/447b7929e633027ffe131f2f8f246bba5690cee7

 

 +  /*
 +   * Calculate the total amount of dimms and csrows/cschannels while
 +   * in the old API emulation mode
 +   */
 +  tot_dimms = 1;
 +  tot_cschannels = 1;
 +  tot_csrows = 1;

 Those initializations can be done above at variable declaration time.

 Yes, but the compiled code will be the same anyway, as gcc will optimize
 
 Hey, are you looking at compiled code or at source code? Because I'm
 looking at source code, and it is a pretty safe bet the majority of the
 people here do that too.

What I said is that, from source code POV, a code where the loop variables are
initialized just before the loop is easier to read it when the initialization
of those vars are on another part of the code.

That's basically why the for syntax starts with a var initialization clause.

The tot_dimms  friends are loop vars: their value is calculated within the 
loop.

At the object code, this won't bring any difference.

 
 it, either by using registers for those vars or by moving the initialization
 to the top of the function.

 This function is too complex, so it is better to initialize those vars
 just before the loops that are calculating those totals.
 
 Simply initialize those variables at declaration time and that's it.
 Initializing them before the loop doesn't make the function less complex
 - splitting it and sanitizing it does.

Initializing loop-calculated vars just before the loop makes the code easier
to read, and may avoid issues that might happen during code lifecycle.
 

 +  for (i = 0; i  n_layers; i++) {
 +  tot_dimms *= layers[i].size;
 +  if (layers[i].is_virt_csrow)
 +  tot_csrows *= layers[i].size;
 +  else
 +  tot_cschannels *= layers[i].size;
 +  }
 
 [..]
 
 -struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
 -unsigned nr_chans, int edac_index)
 +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 +   unsigned n_layers,
 +   struct edac_mc_layer *layers,
 +   bool rev_order,
 +   unsigned sz_pvt)
  {
  void *ptr = NULL;
  struct mem_ctl_info *mci;
 -struct csrow_info *csi, *csrow;
 +struct edac_mc_layer *layer;
 +struct csrow_info *csi, *csr;
  struct rank_info *chi, *chp, *chan;
  struct dimm_info *dimm;
 +u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
  void *pvt;
 -unsigned size;
 -int row, chn;
 +unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
 +unsigned tot_csrows, tot_channels, tot_errcount = 0;
 +int i, j;
  int err;
 +int row, chn;
 +bool per_rank = false;
 +
 +BUG_ON(n_layers  EDAC_MAX_LAYERS);
 
   ^^
 

Let me re-send it, with the right BUG_ON there.

commit 447b7929e633027ffe131f2f8f246bba5690cee7
Author: Mauro Carvalho Chehab mche...@redhat.com
Date:   Wed Apr 18 15:20:50 2012 -0300

edac: Change internal representation to work with layers

Change the EDAC internal representation to work with non-csrow
based memory controllers.

There are lots of those memory controllers nowadays, and more
are coming. So, the EDAC internal representation needs to be
changed, in order to work with those memory controllers, while
preserving backward compatibility with the old ones.

The edac core was written with the idea that memory controllers
are able to directly access csrows.

This is not true for FB-DIMM and RAMBUS memory controllers.

Also, some recent advanced memory controllers don't present a per-csrows
view. Instead, they view memories as DIMMs, instead of ranks.

So, change the allocation and error report routines to allow
them to work with all types of architectures.

This will allow the removal of several hacks with FB-DIMM and RAMBUS
memory controllers.

Also, several tests were done on different platforms using different
x86 drivers.

TODO: a multi-rank DIMMs are currently represented by multiple DIMM
entries in struct dimm_info. That means that changing a label for one
rank won't change the same label for the other ranks at the same DIMM.
This bug is present since the beginning of the EDAC, so it is not a big
deal. However, on several drivers, it is possible to fix this 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-29 Thread Mauro Carvalho Chehab
Em 28-04-2012 05:52, Borislav Petkov escreveu:
 On Fri, Apr 27, 2012 at 01:07:38PM -0300, Mauro Carvalho Chehab wrote:
 Yes. This is a common issue at the EDAC core: on several places, it calls the
 edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, 
 while
 the debug macros already handles that. I suspect that, in the past, the 
 __func__
 were not at the macros, but some patch added it there, and forgot to fix the
 occurrences of its call.
 
 The patch that added it is d357cbb445208 and you reviewed it.

And you wrote the patch that caused it.

 
 This is something that needs to be reviewed at the entire EDAC core (and 
 likely
 at the drivers).
 
 Looks like a job for a newbie to get her/his feet wet with kernel work.

 
 I opted to not touch on this at the existing debug logic, as I think that the
 better is to address all those issues on one separate patch, after fixing the
 EDAC core bugs.
 
 No,
 
 you simply need to remove the __func__ argument in your newly added debug 
 call:
 
 debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, 
 __func__,
 i, (dimm - mci-dimms),
 pos[0], pos[1], pos[2], row, chn);
 
 And while you're at it, remove the rest of the __func__ arguments from
 your newly added debugfX calls.

A single patch fixing this everywhere at drivers/edac is better and clearer 
than adding 
an unrelated fix on this patch. This is already complex enough to add more 
unrelated
things there.

Also, a simple perl/coccinelle script can replace all such __func__ occurrences 
on one shot.

Regards,
Mauro

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-29 Thread Joe Perches
On Sun, 2012-04-29 at 12:11 -0300, Mauro Carvalho Chehab wrote:
 Em 29-04-2012 11:25, Mauro Carvalho Chehab escreveu:
  Em 28-04-2012 05:52, Borislav Petkov escreveu:
  On Fri, Apr 27, 2012 at 01:07:38PM -0300, Mauro Carvalho Chehab wrote:
  Yes. This is a common issue at the EDAC core: on several places, it calls 
  the
  edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, 
  while
  the debug macros already handles that. I suspect that, in the past, the 
  __func__
  were not at the macros, but some patch added it there, and forgot to fix 
  the
  occurrences of its call.
  The patch that added it is d357cbb445208 and you reviewed it.
  And you wrote the patch that caused it.

And Boris should have also written the follow-on patches that
removed most/all of the debugfX and __func__ uses.

  A single patch fixing this everywhere at drivers/edac is better and clearer 
  than adding 
  an unrelated fix on this patch. This is already complex enough to add more 
  unrelated
  things there.
  
  Also, a simple perl/coccinelle script can replace all such __func__ 
  occurrences 
  on one shot.

You make it sound simple, but it'd be a pretty complicated
cocci script.  Some of the changes would have to be inspected
or changed by hand in any case.

[]

 Most of the issues can be solved with the above script-based patch. 
 
 There are still 171 places (12 places at the core, the rest are on the 
 drivers)
 that will require a more sophisticated patch or that requires a manual fix.
[]
 From: Mauro Carvalho Chehab mche...@redhat.com
 Date: Sun, 29 Apr 2012 11:59:14 -0300
 Subject: [PATCH] edac: Don't add __func__ or __FILE__ for debugf[0-9] msgs

Thanks Mauro, you shouldn't have had to do this.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-28 Thread Borislav Petkov
On Fri, Apr 27, 2012 at 01:07:38PM -0300, Mauro Carvalho Chehab wrote:
 Yes. This is a common issue at the EDAC core: on several places, it calls the
 edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, while
 the debug macros already handles that. I suspect that, in the past, the 
 __func__
 were not at the macros, but some patch added it there, and forgot to fix the
 occurrences of its call.

The patch that added it is d357cbb445208 and you reviewed it.

 This is something that needs to be reviewed at the entire EDAC core (and 
 likely
 at the drivers).

Looks like a job for a newbie to get her/his feet wet with kernel work.

 I opted to not touch on this at the existing debug logic, as I think that the
 better is to address all those issues on one separate patch, after fixing the
 EDAC core bugs.

No,

you simply need to remove the __func__ argument in your newly added debug call:

debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, 
__func__,
i, (dimm - mci-dimms),
pos[0], pos[1], pos[2], row, chn);

And while you're at it, remove the rest of the __func__ arguments from
your newly added debugfX calls.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-28 Thread Borislav Petkov
On Fri, Apr 27, 2012 at 04:24:28PM -0300, Mauro Carvalho Chehab wrote:
 Em 27-04-2012 15:11, Luck, Tony escreveu:
  +for (i = 0; i  dimm-mci-n_layers; i++) {
  +printk(KERN_CONT %d, dimm-location[i]);
  +if (i  dimm-mci-n_layers - 1)
  +printk(KERN_CONT .);
  +}
  +printk(KERN_CONT \n);
 
  This looks hacky but I don't have a good suggestion what to do instead
  here. Maybe snprintf into a complete string which you can issue with
  debugf4()...
 
  This is not hacky. There are several places at the Kernel doing loops like
  that. Look, for example, at lib/hexdump.c (without KERN_CONT, as this
  macro was added later - probably to avoid checkpatch.pl complains).
  
  There is some benefit to one printk == one output line ... it means
  that console output will not be (as) jumbled if multiple cpus are
  printk'ing at the same time.
 
 Ok, but this message only appears when all the conditions below are met:
   - the driver is compiled with EDAC_DEBUG;
   - the edac_core is modprobed with edac_debug_level=4;
   - during the driver modprobe, when the EDAC driver is being registered.

That means nothing.

 Even on several-core machines, those messages won't mangle, in practice.
 
 Let's not over-design a simple debug message.

No, let's design a simple debug message correctly, regardless of when it
appears.

 
 Regards,
 Mauro
 

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-28 Thread Borislav Petkov
On Fri, Apr 27, 2012 at 12:36:12PM -0300, Mauro Carvalho Chehab wrote:
 The fix for it were in another patch[1], as calling them as rank is
 needed also at the sysfs API.

No, this doesn't fix it either:

[   10.486440] EDAC MC: DCT0 chip selects:
[   10.486443] EDAC amd64: MC: 0:  2048MB 1:  2048MB
[   10.486445] EDAC amd64: MC: 2:  2048MB 3:  2048MB
[   10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB
[   10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB
[   10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank 
Address Mapping): 0x0088
[   10.486455] EDAC MC: DCT1 chip selects:
[   10.486458] EDAC amd64: MC: 0:  2048MB 1:  2048MB
[   10.486460] EDAC amd64: MC: 2:  2048MB 3:  2048MB
[   10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB
[   10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB
[   10.486467] EDAC amd64: using x8 syndromes.
[   10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 
0x00083100
[   10.486472] EDAC DEBUG: amd64_dump_dramcfg_low:   DIMM type: buffered; all 
DIMMs support ECC: yes
[   10.486475] EDAC DEBUG: amd64_dump_dramcfg_low:   PAR/ERR parity: enabled
[   10.486478] EDAC DEBUG: amd64_dump_dramcfg_low:   DCT 128bit mode width: 64b
[   10.486481] EDAC DEBUG: amd64_dump_dramcfg_low:   x4 logical DIMMs present: 
L0: yes L1: yes L2: no L3: no
[   10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits 
- need more decoding
[   10.486488] EDAC amd64: MCT channel count: 2
[   10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 
3692 bytes for mci data (16 ranks, 16 csrows/channels)
[   10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 
(0:0:0): row 0, chan 0
[   10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 
(0:1:0): row 0, chan 1
[   10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 
(1:0:0): row 1, chan 0
[   10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 
(1:1:0): row 1, chan 1
[   10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 
(2:0:0): row 2, chan 0
[   10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 
(2:1:0): row 2, chan 1
[   10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 
(3:0:0): row 3, chan 0
[   10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 
(3:1:0): row 3, chan 1
[   10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 
(4:0:0): row 4, chan 0
[   10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 
(4:1:0): row 4, chan 1
[   10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 
(5:0:0): row 5, chan 0
[   10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 
(5:1:0): row 5, chan 1
[   10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 
(6:0:0): row 6, chan 0
[   10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 
(6:1:0): row 6, chan 1
[   10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 
(7:0:0): row 7, chan 0
[   10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 
(7:1:0): row 7, chan 1

DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total.

Now your change is showing 16 ranks. Still b0rked.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-28 Thread Borislav Petkov
On Fri, Apr 27, 2012 at 02:52:35PM -0300, Mauro Carvalho Chehab wrote:

[..]

  +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
  + unsigned n_layers,
  + struct edac_mc_layer *layers,
  + bool rev_order,
  + unsigned sz_pvt)
  
  strange function argument vertical alignment
  
   {
 void *ptr = NULL;
 struct mem_ctl_info *mci;
  -  struct csrow_info *csi, *csrow;
  +  struct edac_mc_layer *lay;
  
  As before, call this layers pls.
  
  +  struct csrow_info *csi, *csr;
 struct rank_info *chi, *chp, *chan;
 struct dimm_info *dimm;
  +  u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
 void *pvt;
  -  unsigned size;
  -  int row, chn;
  +  unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
  +  unsigned tot_csrows, tot_cschannels;
  
  No need to call this tot_cschannels - tot_channels should be enough.
  
  +  int i, j;
 int err;
  +  int row, chn;
  
  All those local variables should be sorted in a reverse christmas tree
  order:
  
  u32 this_is_the_longest_array_name[LENGTH];
  void *shorter_named_variable;
  unsigned long size;
  int i;
  
  ...
 
 Why? There's nothing at the CodingStyle saying about how the vars should
 be ordered. If you want to enforce some particular order, please do it
 yourself, but apply it consistently among the entire subsystem.

First of all, this way it is more readable. Second of all, maybe we
should hold it down in CodingStyle. Third of all, you touch this code so you
could fix it up to be more readable while you're at it.

  +
  +  BUG_ON(n_layers  EDAC_MAX_LAYERS);
  
  
  Push this BUG_ON up into edac_mc_alloc as the first thing this function
  does.
 
 It is already the first thing at the function.

Ah, that happens later in the patchset where you rename it back to
edac_mc_alloc, ok.

  Also, is it valid to have n_layers == 0? The memcpy call below
  will do nothing.
 
 Changed to:
   BUG_ON(n_layers  EDAC_MAX_LAYERS || n_layers == 0);

Really? Look below.

 
  +  /*
  +   * Calculate the total amount of dimms and csrows/cschannels while
  +   * in the old API emulation mode
  +   */
  +  tot_dimms = 1;
  +  tot_cschannels = 1;
  +  tot_csrows = 1;
  
  Those initializations can be done above at variable declaration time.
 
 Yes, but the compiled code will be the same anyway, as gcc will optimize

Hey, are you looking at compiled code or at source code? Because I'm
looking at source code, and it is a pretty safe bet the majority of the
people here do that too.

 it, either by using registers for those vars or by moving the initialization
 to the top of the function.
 
 This function is too complex, so it is better to initialize those vars
 just before the loops that are calculating those totals.

Simply initialize those variables at declaration time and that's it.
Initializing them before the loop doesn't make the function less complex
- splitting it and sanitizing it does.

  
  +  for (i = 0; i  n_layers; i++) {
  +  tot_dimms *= layers[i].size;
  +  if (layers[i].is_virt_csrow)
  +  tot_csrows *= layers[i].size;
  +  else
  +  tot_cschannels *= layers[i].size;
  +  }

[..]

 -struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
 - unsigned nr_chans, int edac_index)
 +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
 +unsigned n_layers,
 +struct edac_mc_layer *layers,
 +bool rev_order,
 +unsigned sz_pvt)
  {
   void *ptr = NULL;
   struct mem_ctl_info *mci;
 - struct csrow_info *csi, *csrow;
 + struct edac_mc_layer *layer;
 + struct csrow_info *csi, *csr;
   struct rank_info *chi, *chp, *chan;
   struct dimm_info *dimm;
 + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS];
   void *pvt;
 - unsigned size;
 - int row, chn;
 + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS];
 + unsigned tot_csrows, tot_channels, tot_errcount = 0;
 + int i, j;
   int err;
 + int row, chn;
 + bool per_rank = false;
 +
 + BUG_ON(n_layers  EDAC_MAX_LAYERS);

^^

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-28 Thread Joe Perches
On Sat, 2012-04-28 at 11:16 +0200, Borislav Petkov wrote:
 On Fri, Apr 27, 2012 at 02:52:35PM -0300, Mauro Carvalho Chehab wrote:
   All those local variables should be sorted in a reverse christmas tree
   order:
   
 u32 this_is_the_longest_array_name[LENGTH];
 void *shorter_named_variable;
 unsigned long size;
 int i;
   
 ...
  
  Why? There's nothing at the CodingStyle saying about how the vars should
  be ordered. If you want to enforce some particular order, please do it
  yourself, but apply it consistently among the entire subsystem.
 
 First of all, this way it is more readable.

Not in my opinion, and blindly using reverse christmas tree
can separate variables that should be declared together.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-28 Thread Joe Perches
On Sat, 2012-04-28 at 10:52 +0200, Borislav Petkov wrote:
 On Fri, Apr 27, 2012 at 01:07:38PM -0300, Mauro Carvalho Chehab wrote:
  Yes. This is a common issue at the EDAC core: on several places, it calls 
  the
  edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, 
  while
  the debug macros already handles that. I suspect that, in the past, the 
  __func__
  were not at the macros, but some patch added it there, and forgot to fix the
  occurrences of its call.
 
 The patch that added it is d357cbb445208 and you reviewed it.
 
  This is something that needs to be reviewed at the entire EDAC core (and 
  likely
  at the drivers).
 
 Looks like a job for a newbie to get her/his feet wet with kernel work.

Looks to me more like a lazy maintainer/developer who doesn't
want to bother with a few minutes work.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-27 Thread Borislav Petkov
Btw,

this patch gives

[8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 
(0:0:0): row 0, chan 0
[8.287594] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: dimm1 
(0:1:0): row 0, chan 1
[8.296784] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: dimm2 
(1:0:0): row 1, chan 0
[8.305968] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: dimm3 
(1:1:0): row 1, chan 1
[8.315144] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: dimm4 
(2:0:0): row 2, chan 0
[8.324326] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: dimm5 
(2:1:0): row 2, chan 1
[8.333502] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: dimm6 
(3:0:0): row 3, chan 0
[8.342684] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: dimm7 
(3:1:0): row 3, chan 1
[8.351860] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: dimm8 
(4:0:0): row 4, chan 0
[8.361049] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: dimm9 
(4:1:0): row 4, chan 1
[8.370227] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: dimm10 
(5:0:0): row 5, chan 0
[8.379582] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: dimm11 
(5:1:0): row 5, chan 1
[8.388941] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: dimm12 
(6:0:0): row 6, chan 0
[8.398315] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: dimm13 
(6:1:0): row 6, chan 1
[8.407680] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: dimm14 
(7:0:0): row 7, chan 0
[8.417047] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: dimm15 
(7:1:0): row 7, chan 1

and the memory controller has the following chip selects

[8.137662] EDAC MC: DCT0 chip selects:
[8.150291] EDAC amd64: MC: 0:  2048MB 1:  2048MB
[8.155349] EDAC amd64: MC: 2:  2048MB 3:  2048MB
[8.160408] EDAC amd64: MC: 4: 0MB 5: 0MB
[8.165475] EDAC amd64: MC: 6: 0MB 7: 0MB
[8.180499] EDAC MC: DCT1 chip selects:
[8.184693] EDAC amd64: MC: 0:  2048MB 1:  2048MB
[8.189753] EDAC amd64: MC: 2:  2048MB 3:  2048MB
[8.194812] EDAC amd64: MC: 4: 0MB 5: 0MB
[8.199875] EDAC amd64: MC: 6: 0MB 7: 0MB

Those are 4 dual-ranked DIMMs on this node, DCT0 is one channel and DCT1
is another and I have 4 ranks per channel. Having dimm0-dimm15 is very
misleading and has nothing to do with the reality. So, if this is to use
your nomenclature with layers, I'll have dimm0-dimm7 where each dimm is
a rank.

Or, the most correct thing to do would be to have dimm0-dimm3, each
dual-ranked.

So either tot_dimms is computed wrongly or there's a more serious error
somewhere.

I've reviewed almost the half patch, will review the rest when/if we
sort out the above issue first.

Thanks.

On Tue, Apr 24, 2012 at 03:15:41PM -0300, Mauro Carvalho Chehab wrote:
 Change the EDAC internal representation to work with non-csrow
 based memory controllers.
 
 There are lots of those memory controllers nowadays, and more
 are coming. So, the EDAC internal representation needs to be
 changed, in order to work with those memory controllers, while
 preserving backward compatibility with the old ones.
 
 The edac core were written with the idea that memory controllers

was

 are able to directly access csrows, and that the channels are
 used inside a csrows select.

This sounds funny, simply remove that second part about the channels.

 This is not true for FB-DIMM and RAMBUS memory controllers.
 
 Also, some recent advanced memory controllers don't present a per-csrows
 view. Instead, they view memories as DIMM's, instead of ranks, accessed

DIMMs instead of ranks.

Remove the rest.

 via csrow/channel.
 
 So, change the allocation and error report routines to allow
 them to work with all types of architectures.
 
 This will allow the removal of several hacks on FB-DIMM and RAMBUS

   with

 memory controllers on the next patches.

. Remove the rest.

 
 Also, several tests were done on different platforms using different
 x86 drivers.
 
 TODO: a multi-rank DIMM's are currently represented by multiple DIMM

Multi-rank DIMMs

 entries at struct dimm_info. That means that changing a label for one

  in

 rank won't change the same label for the other ranks at the same dimm.

   of the same DIMM.

 Such bug is there since the beginning of the EDAC, so it is not a big

  This bug is present ..

 deal. However, on several drivers, it is possible to fix this issue, but

remove on

 it should be a per-driver fix, as the csrow = DIMM arrangement may not
 be equal for all. So, don't try to fix it here yet.
 
 PS.: I tried to make this patch as short as possible, preceding it with

Remove PS.

 several other patches that simplified the logic here. Yet, as the
 internal API changes, all drivers need changes. The 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-27 Thread Joe Perches
On Fri, 2012-04-27 at 15:33 +0200, Borislav Petkov wrote:
 this patch gives
 
 [8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 
 (0:0:0): row 0, chan 0

One too many __func__'s in some combination of the
pr_fmt and/or dbg call and/or the actual call site?

  diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
[]
  @@ -447,8 +447,13 @@ static inline void pci_write_bits32(struct pci_dev 
  *pdev, int offset,
   
   #endif /* CONFIG_PCI */
   
  -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned 
  nr_csrows,
  - unsigned nr_chans, int edac_index);
  +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
  +  unsigned nr_chans, int edac_index);
 
 Why not extern?

Using extern function prototypes in .h files
isn't generally necessary nor is extern the
more common kernel style.

  +static inline void edac_mc_handle_ce(struct mem_ctl_info *mci,
unsigned long page_frame_number,
unsigned long offset_in_page,
unsigned long syndrome, int row, int channel,
  - const char *msg);
 
 Strange alignment, pls do
 
 static inline void edac_mc_handle_ce(struct...,
unsigned...,
...,
...);
 

or

static inline
void edac_mc_handle_ce(struct ..., etc)

or

static inline void
edac_mc_handle_ce(...)


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-27 Thread Borislav Petkov
On Fri, Apr 27, 2012 at 10:11:35AM -0400, Joe Perches wrote:
   -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned 
   nr_csrows,
   -   unsigned nr_chans, int edac_index);
   +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
   +unsigned nr_chans, int edac_index);
  
  Why not extern?
 
 Using extern function prototypes in .h files
 isn't generally necessary nor is extern the
 more common kernel style.

Searching for it, there's this discussion, for example:
http://gcc.gnu.org/ml/gcc/2009-04/msg00812.html

Maybe we should put a small note in Documentation/CodingStyle what the
kernel preference is and hold people to it.

-- 
Regards/Gruss,
Boris.

Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-27 Thread Mauro Carvalho Chehab
Em 27-04-2012 10:33, Borislav Petkov escreveu:
 Btw,
 
 this patch gives
 
 [8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 
 (0:0:0): row 0, chan 0
 [8.287594] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: dimm1 
 (0:1:0): row 0, chan 1
 [8.296784] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: dimm2 
 (1:0:0): row 1, chan 0
 [8.305968] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: dimm3 
 (1:1:0): row 1, chan 1
 [8.315144] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: dimm4 
 (2:0:0): row 2, chan 0
 [8.324326] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: dimm5 
 (2:1:0): row 2, chan 1
 [8.333502] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: dimm6 
 (3:0:0): row 3, chan 0
 [8.342684] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: dimm7 
 (3:1:0): row 3, chan 1
 [8.351860] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: dimm8 
 (4:0:0): row 4, chan 0
 [8.361049] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: dimm9 
 (4:1:0): row 4, chan 1
 [8.370227] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: dimm10 
 (5:0:0): row 5, chan 0
 [8.379582] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: dimm11 
 (5:1:0): row 5, chan 1
 [8.388941] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: dimm12 
 (6:0:0): row 6, chan 0
 [8.398315] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: dimm13 
 (6:1:0): row 6, chan 1
 [8.407680] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: dimm14 
 (7:0:0): row 7, chan 0
 [8.417047] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: dimm15 
 (7:1:0): row 7, chan 1
 
 and the memory controller has the following chip selects
 
 [8.137662] EDAC MC: DCT0 chip selects:
 [8.150291] EDAC amd64: MC: 0:  2048MB 1:  2048MB
 [8.155349] EDAC amd64: MC: 2:  2048MB 3:  2048MB
 [8.160408] EDAC amd64: MC: 4: 0MB 5: 0MB
 [8.165475] EDAC amd64: MC: 6: 0MB 7: 0MB
 [8.180499] EDAC MC: DCT1 chip selects:
 [8.184693] EDAC amd64: MC: 0:  2048MB 1:  2048MB
 [8.189753] EDAC amd64: MC: 2:  2048MB 3:  2048MB
 [8.194812] EDAC amd64: MC: 4: 0MB 5: 0MB
 [8.199875] EDAC amd64: MC: 6: 0MB 7: 0MB
 
 Those are 4 dual-ranked DIMMs on this node, DCT0 is one channel and DCT1
 is another and I have 4 ranks per channel. Having dimm0-dimm15 is very
 misleading and has nothing to do with the reality. So, if this is to use
 your nomenclature with layers, I'll have dimm0-dimm7 where each dimm is
 a rank.
 
 Or, the most correct thing to do would be to have dimm0-dimm3, each
 dual-ranked.
 
 So either tot_dimms is computed wrongly or there's a more serious error
 somewhere.
 
 I've reviewed almost the half patch, will review the rest when/if we
 sort out the above issue first.
 
 Thanks.


The fix for it were in another patch[1], as calling them as rank is needed
also at the sysfs API.

[1] 
http://lists-archives.com/linux-kernel/27623222-edac-add-a-new-per-dimm-api-and-make-the-old-per-virtual-rank-api-obsolete.html

I can just merge the fix on this patch, with the enclosed diff.

Regards,
Mauro

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 4d4d8b7..e0d9481 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -86,7 +86,7 @@ static void edac_mc_dump_mci(struct mem_ctl_info *mci)
debugf4(\tmci-edac_check = %p\n, mci-edac_check);
debugf3(\tmci-nr_csrows = %d, csrows = %p\n,
mci-nr_csrows, mci-csrows);
-   debugf3(\tmci-nr_dimms = %d, dimns = %p\n,
+   debugf3(\tmci-nr_dimms = %d, dimms = %p\n,
mci-tot_dimms, mci-dimms);
debugf3(\tdev = %p\n, mci-dev);
debugf3(\tmod_name:ctl_name = %s:%s\n, mci-mod_name, mci-ctl_name);
@@ -183,10 +183,6 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems)
  * @size_pvt:  size of private storage needed
  *
  *
- * FIXME: drivers handle multi-rank memories on different ways: on some
- * drivers, one multi-rank memory is mapped as one DIMM, while, on others,
- * a single multi-rank DIMM would be mapped into several dimms.
- *
  * Non-csrow based drivers (like FB-DIMM and RAMBUS ones) will likely report
  * such DIMMS properly, but the CSROWS-based ones will likely do the wrong
  * thing, as two chip select values are used for dual-rank memories (and 4, for
@@ -201,6 +197,12 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems)
  *
  * Use edac_mc_free() to free mc structures allocated by this function.
  *
+ * NOTE: drivers handle multi-rank memories on different ways: on some
+ * drivers, one multi-rank memory is mapped as one entry, while, on others,
+ * a single multi-rank DIMM would be mapped into several entries. Currently,
+ * this function will allocate multiple struct dimm_info on such scenarios,
+ * as grouping the multiple ranks require drivers change.
+ *
  * Returns:
  * NULL allocation failed
  * struct 

Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-27 Thread Mauro Carvalho Chehab
Em 27-04-2012 11:11, Joe Perches escreveu:
 On Fri, 2012-04-27 at 15:33 +0200, Borislav Petkov wrote:
 this patch gives

 [8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 
 (0:0:0): row 0, chan 0
 
 One too many __func__'s in some combination of the
 pr_fmt and/or dbg call and/or the actual call site?

Yes. This is a common issue at the EDAC core: on several places, it calls the
edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, while
the debug macros already handles that. I suspect that, in the past, the __func__
were not at the macros, but some patch added it there, and forgot to fix the
occurrences of its call.

This is something that needs to be reviewed at the entire EDAC core (and likely
at the drivers).

I opted to not touch on this at the existing debug logic, as I think that the
better is to address all those issues on one separate patch, after fixing the
EDAC core bugs.
 
 diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
 []
 @@ -447,8 +447,13 @@ static inline void pci_write_bits32(struct pci_dev 
 *pdev, int offset,
  
  #endif /* CONFIG_PCI */
  
 -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned 
 nr_csrows,
 - unsigned nr_chans, int edac_index);
 +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
 +  unsigned nr_chans, int edac_index);

 Why not extern?
 
 Using extern function prototypes in .h files
 isn't generally necessary nor is extern the
 more common kernel style.

Yes. I never add extern on the code I write.

While CodingStyle doesn't explicitly say anything about that, its spirit
seem to indicate to that the right thing is avoid using it, like, for 
example:
Chapter 4: Naming

C is a Spartan language, and so should your naming be.

(also on other places, like avoiding to use {} for single-statement if's).

So, useless clauses like extern doesn't seem to be the best choice.

Regards,
Mauro
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


RE: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-27 Thread Luck, Tony
 +   for (i = 0; i  dimm-mci-n_layers; i++) {
 +   printk(KERN_CONT %d, dimm-location[i]);
 +   if (i  dimm-mci-n_layers - 1)
 +   printk(KERN_CONT .);
 +   }
 +   printk(KERN_CONT \n);
 
 This looks hacky but I don't have a good suggestion what to do instead
 here. Maybe snprintf into a complete string which you can issue with
 debugf4()...

 This is not hacky. There are several places at the Kernel doing loops like
 that. Look, for example, at lib/hexdump.c (without KERN_CONT, as this
 macro was added later - probably to avoid checkpatch.pl complains).

There is some benefit to one printk == one output line ... it means
that console output will not be (as) jumbled if multiple cpus are
printk'ing at the same time.

-Tony
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-27 Thread Mauro Carvalho Chehab
Em 27-04-2012 15:11, Luck, Tony escreveu:
 +  for (i = 0; i  dimm-mci-n_layers; i++) {
 +  printk(KERN_CONT %d, dimm-location[i]);
 +  if (i  dimm-mci-n_layers - 1)
 +  printk(KERN_CONT .);
 +  }
 +  printk(KERN_CONT \n);

 This looks hacky but I don't have a good suggestion what to do instead
 here. Maybe snprintf into a complete string which you can issue with
 debugf4()...

 This is not hacky. There are several places at the Kernel doing loops like
 that. Look, for example, at lib/hexdump.c (without KERN_CONT, as this
 macro was added later - probably to avoid checkpatch.pl complains).
 
 There is some benefit to one printk == one output line ... it means
 that console output will not be (as) jumbled if multiple cpus are
 printk'ing at the same time.

Ok, but this message only appears when all the conditions below are met:
- the driver is compiled with EDAC_DEBUG;
- the edac_core is modprobed with edac_debug_level=4;
- during the driver modprobe, when the EDAC driver is being registered.

Even on several-core machines, those messages won't mangle, in practice.

Let's not over-design a simple debug message.

Regards,
Mauro
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH EDACv16 1/2] edac: Change internal representation to work with layers

2012-04-24 Thread Mauro Carvalho Chehab
Change the EDAC internal representation to work with non-csrow
based memory controllers.

There are lots of those memory controllers nowadays, and more
are coming. So, the EDAC internal representation needs to be
changed, in order to work with those memory controllers, while
preserving backward compatibility with the old ones.

The edac core were written with the idea that memory controllers
are able to directly access csrows, and that the channels are
used inside a csrows select.

This is not true for FB-DIMM and RAMBUS memory controllers.

Also, some recent advanced memory controllers don't present a per-csrows
view. Instead, they view memories as DIMM's, instead of ranks, accessed
via csrow/channel.

So, change the allocation and error report routines to allow
them to work with all types of architectures.

This will allow the removal of several hacks on FB-DIMM and RAMBUS
memory controllers on the next patches.

Also, several tests were done on different platforms using different
x86 drivers.

TODO: a multi-rank DIMM's are currently represented by multiple DIMM
entries at struct dimm_info. That means that changing a label for one
rank won't change the same label for the other ranks at the same dimm.
Such bug is there since the beginning of the EDAC, so it is not a big
deal. However, on several drivers, it is possible to fix this issue, but
it should be a per-driver fix, as the csrow = DIMM arrangement may not
be equal for all. So, don't try to fix it here yet.

PS.: I tried to make this patch as short as possible, preceding it with
several other patches that simplified the logic here. Yet, as the
internal API changes, all drivers need changes. The changes are
generally bigger on the drivers for FB-DIMM's.

FIXME: while the FB-DIMMs are not converted to use the new
design, uncorrected errors will show just one channel. In
the past, all changes were on a big patch with about 150K.
As it needed to be split, in order to be accepted by the
EDAC ML at vger, we've opted to have this small drawback.
As an advantage, it is now easier to review the patch series.

Cc: Aristeu Rozanski aroza...@redhat.com
Cc: Doug Thompson nor...@yahoo.com
Cc: Borislav Petkov borislav.pet...@amd.com
Cc: Mark Gross mark.gr...@intel.com
Cc: Jason Uhlenkott juhle...@akamai.com
Cc: Tim Small t...@buttersideup.com
Cc: Ranganathan Desikan r...@jetztechnologies.com
Cc: Arvind R. arvin...@gmail.com
Cc: Olof Johansson o...@lixom.net
Cc: Egor Martovetsky e...@pasemi.com
Cc: Chris Metcalf cmetc...@tilera.com
Cc: Michal Marek mma...@suse.cz
Cc: Jiri Kosina jkos...@suse.cz
Cc: Joe Perches j...@perches.com
Cc: Dmitry Eremin-Solenikov dbarysh...@gmail.com
Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
Cc: Hitoshi Mitake h.mit...@gmail.com
Cc: Andrew Morton a...@linux-foundation.org
Cc: Niklas Söderlund niklas.soderl...@ericsson.com
Cc: Shaohui Xie shaohui@freescale.com
Cc: Josh Boyer jwbo...@gmail.com
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Mauro Carvalho Chehab mche...@redhat.com
---

v16: Only context changes

 drivers/edac/edac_core.h |   92 ++-
 drivers/edac/edac_mc.c   |  682 --
 include/linux/edac.h |   40 ++-
 3 files changed, 526 insertions(+), 288 deletions(-)

diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index e48ab31..7201bb1 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -447,8 +447,13 @@ static inline void pci_write_bits32(struct pci_dev *pdev, 
int offset,
 
 #endif /* CONFIG_PCI */
 
-extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
- unsigned nr_chans, int edac_index);
+struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows,
+  unsigned nr_chans, int edac_index);
+struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index,
+  unsigned n_layers,
+  struct edac_mc_layer *layers,
+  bool rev_order,
+  unsigned sz_pvt);
 extern int edac_mc_add_mc(struct mem_ctl_info *mci);
 extern void edac_mc_free(struct mem_ctl_info *mci);
 extern struct mem_ctl_info *edac_mc_find(int idx);
@@ -467,24 +472,80 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info 
*mci,
  * reporting logic and function interface - reduces conditional
  * statement clutter and extra function arguments.
  */
-extern void edac_mc_handle_ce(struct mem_ctl_info *mci,
+
+void edac_mc_handle_error(const enum hw_event_mc_err_type type,
+ struct mem_ctl_info *mci,
+ const unsigned long page_frame_number,
+ const unsigned long offset_in_page,
+ const unsigned long syndrome,
+ const int layer0,
+ const int layer1,
+ const int