Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers, second version
breaking thread because it grew too big. On Fri, May 04, 2012 at 07:48:42AM -0300, Mauro Carvalho Chehab wrote: [ … ] + memset(pos, 0, sizeof(pos)); + row = 0; + chn = 0; + debugf4(%s: initializing %d %s\n, __func__, tot_dimms, + per_rank ? ranks : dimms); + for (i = 0; i tot_dimms; i++) { + chan = csi[row].channels[chn]; + dimm = EDAC_DIMM_PTR(layer, mci-dimms, n_layers, +pos[0], pos[1], pos[2]); + dimm-mci = mci; + + debugf2(%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n, __func__, + i, per_rank ? rank : dimm, (dimm - mci-dimms), + pos[0], pos[1], pos[2], row, chn); + + /* Copy DIMM location */ + for (j = 0; j n_layers; j++) + dimm-location[j] = pos[j]; + + /* Link it to the csrows old API data */ + chan-dimm = dimm; + dimm-csrow = row; + dimm-cschannel = chn; + + /* Increment csrow location */ + for (j = n_layers - 1; j = 0; j--) + if (layers[j].is_virt_csrow) + break; This looks fishy: the for-loop above iterates over layers[j] to break on the first -is_virt_csrow. + row++; And row here gets incremented unconditionally, independent from the loop above. And you're not using any results from the loop: j gets reset below in the next loop. What's going on? + if (row == tot_csrows) { + row = 0; + chn++; + } + + /* Increment dimm location */ + for (j = n_layers - 1; j = 0; j--) { + pos[j]++; + if (pos[j] layers[j].size) + break; + pos[j] = 0; } } @@ -263,6 +373,46 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, */ return mci; } +EXPORT_SYMBOL_GPL(new_edac_mc_alloc); + +/** + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure + * @mc_num: Memory controller number + * @n_layers:Number of layers at the MC hierarchy + * layers: Describes each layer as seen by the Memory Controller + * @size_pvt:Size of private storage needed + * + * + * FIXME: drivers handle multi-rank memories in different ways: some + * drivers map multi-ranked DIMMs as one DIMM while others + * as several DIMMs. + * + * Everything is kmalloc'ed as one big chunk - more efficient. + * It can only be used if all structures have the same lifetime - otherwise + * you have to allocate and initialize your own structures. + * + * Use edac_mc_free() to free mc structures allocated by this function. + * + * Returns: + * On failure: NULL + * On success: struct mem_ctl_info pointer + */ + [ … ] +void edac_mc_handle_error(const enum hw_event_mc_err_type type, + struct mem_ctl_info *mci, + const unsigned long page_frame_number, + const unsigned long offset_in_page, + const unsigned long syndrome, + const int layer0, + const int layer1, + const int layer2, + const char *msg, + const char *other_detail, + const void *mcelog) { - int len = EDAC_MC_LABEL_LEN * 4; - char labels[len + 1]; - char *pos = labels; - int chan; - int chars; - char *label = NULL; + /* FIXME: too much for stack: move it to some pre-alocated area */ I'm assuming all those new FIXMEs are going to be addressed soonish :) Rest looks ok, thanks. -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers, second version
Em 07-05-2012 10:32, Borislav Petkov escreveu: breaking thread because it grew too big. On Fri, May 04, 2012 at 07:48:42AM -0300, Mauro Carvalho Chehab wrote: [ … ] +memset(pos, 0, sizeof(pos)); +row = 0; +chn = 0; +debugf4(%s: initializing %d %s\n, __func__, tot_dimms, +per_rank ? ranks : dimms); +for (i = 0; i tot_dimms; i++) { +chan = csi[row].channels[chn]; +dimm = EDAC_DIMM_PTR(layer, mci-dimms, n_layers, + pos[0], pos[1], pos[2]); +dimm-mci = mci; + +debugf2(%s: %d: %s%zd (%d:%d:%d): row %d, chan %d\n, __func__, +i, per_rank ? rank : dimm, (dimm - mci-dimms), +pos[0], pos[1], pos[2], row, chn); + +/* Copy DIMM location */ +for (j = 0; j n_layers; j++) +dimm-location[j] = pos[j]; + +/* Link it to the csrows old API data */ +chan-dimm = dimm; +dimm-csrow = row; +dimm-cschannel = chn; + +/* Increment csrow location */ +for (j = n_layers - 1; j = 0; j--) +if (layers[j].is_virt_csrow) +break; This looks fishy: the for-loop above iterates over layers[j] to break on the first -is_virt_csrow. That loop is dead code. It was related to the rev_order handling. I'll drop it. +row++; And row here gets incremented unconditionally, independent from the loop above. And you're not using any results from the loop: j gets reset below in the next loop. What's going on? +if (row == tot_csrows) { +row = 0; +chn++; +} + +/* Increment dimm location */ +for (j = n_layers - 1; j = 0; j--) { +pos[j]++; +if (pos[j] layers[j].size) +break; +pos[j] = 0; } } @@ -263,6 +373,46 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, */ return mci; } +EXPORT_SYMBOL_GPL(new_edac_mc_alloc); + +/** + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure + * @mc_num: Memory controller number + * @n_layers: Number of layers at the MC hierarchy + * layers: Describes each layer as seen by the Memory Controller + * @size_pvt: Size of private storage needed + * + * + * FIXME: drivers handle multi-rank memories in different ways: some + * drivers map multi-ranked DIMMs as one DIMM while others + * as several DIMMs. + * + * Everything is kmalloc'ed as one big chunk - more efficient. + * It can only be used if all structures have the same lifetime - otherwise + * you have to allocate and initialize your own structures. + * + * Use edac_mc_free() to free mc structures allocated by this function. + * + * Returns: + * On failure: NULL + * On success: struct mem_ctl_info pointer + */ + [ … ] +void edac_mc_handle_error(const enum hw_event_mc_err_type type, + struct mem_ctl_info *mci, + const unsigned long page_frame_number, + const unsigned long offset_in_page, + const unsigned long syndrome, + const int layer0, + const int layer1, + const int layer2, + const char *msg, + const char *other_detail, + const void *mcelog) { -int len = EDAC_MC_LABEL_LEN * 4; -char labels[len + 1]; -char *pos = labels; -int chan; -int chars; -char *label = NULL; +/* FIXME: too much for stack: move it to some pre-alocated area */ I'm assuming all those new FIXMEs are going to be addressed soonish :) That's the idea ;) Some are fixed just after the drivers conversion. This stack FIXME I'll solve together with some other cleanups I'm intending to write after merging the existing chansesets (as there are still a lot of patches on this series). The stack usage here is no too much to cause any problems, but still I don't want to have large stuff like that at stack. Regards, Mauro Patch without the dead code enclosed. -- edac: Change internal representation to work with layers From: Mauro Carvalho Chehab mche...@redhat.com Change the EDAC internal representation to work with non-csrow based memory controllers. There are lots of those memory controllers nowadays, and more are coming. So, the EDAC internal representation needs to be changed, in order to work with those memory controllers, while preserving backward compatibility with the old ones. The edac core was written with the idea that memory controllers are able to directly access csrows. This is not true for FB-DIMM and RAMBUS memory
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Thu, May 03, 2012 at 11:16:54AM -0300, Mauro Carvalho Chehab wrote: + bool enable_filter, + unsigned pos[EDAC_MAX_LAYERS]) Passing the whole array as an argument instead of only a pointer to it? This is C, and not C++ or Pascal. Only the pointer is passed here. The size of the array is used for type check only. Right, and you can see where he still has trouble. And by he I mean me :). [ … ] +void edac_mc_handle_error(const enum hw_event_mc_err_type type, +struct mem_ctl_info *mci, +const unsigned long page_frame_number, +const unsigned long offset_in_page, +const unsigned long syndrome, +const int layer0, +const int layer1, +const int layer2, Instead of passing each layer as an arg, you can prepare the array pos[] in each edac_mc_hanlde_*() and pass around a pointer to it - you need it anyway in the edac_mc_inc*() functions. Yes, but the changes at the drivers will be more complex, without any reason: before each call to this function, they would need to create and fill a temporary array. As there are only 3 layers, in the worse case, this way is simpler and more efficient. We can review it, if we ever need more than 3 layers. I see, the edac_mc_handle_error is the main interface for all edac drivers, ok. [ … ] + bool enable_filter = false; What does this enable_filter thing mean: if (pos[i] = 0) enable_filter = true; This absolutely needs explanation and better naming! Renamed it to enable_per_layer_report. Or detailed_dimm_report or ... The code that implement it seems self-explained: .. if (enable_filter dimm-nr_pages) { if (p != label) { strcpy(p, OTHER_LABEL); p += strlen(OTHER_LABEL); } strcpy(p, dimm-label); p += strlen(p); *p = '\0'; .. if (!enable_filter) { strcpy(label, any memory); } else { debugf4(%s: csrow/channel to increment: (%d,%d)\n, __func__, row, chan); if (p == label) strcpy(label, unknown memory); if (type == HW_EVENT_ERR_CORRECTED) { if (row = 0) { mci-csrows[row].ce_count++; if (chan = 0) mci-csrows[row].channels[chan].ce_count++; } } else if (row = 0) mci-csrows[row].ue_count++; } Theis flag indicates if is there any useful information about the affected DIMM(s) provided by the EDAC driver. If this is provided, the DIMM location labels are filtered and reported, and the per-layer error counters are incremented. As it was discussed on previous reviews, with FB-DIMM MCs, and/or when mirror mode/lockstep mode is enabled, the memory controller points errors to 2 DIMMs (or 4 DIMMs, when both mirror mode and lockstep mode are enabled) on most memory controllers, under some conditions. The edac_mc_handle_fbd_ue() function call were created due to that. When comparing with the old code, enable_filter = false would be equivalent to call edac_mc_handle_ce_no_info/edac_mc_handle_ue_no_info. I'm adding a comment about it. Much better, thanks. Btw, I have to admit, this is a pretty strange way of handling the case where layers are { -1, -1, -1 }, i.e. edac_mc_handle_error is called with the no info hint. I'm wondering whether it wouldn't be more readable if you could do edac_mc_handle_error(HW_EVENT_ERR_INFO_INVALID | ..) or similar and define such a flag which simply states that. But you'll have to change enum hw_event_mc_err_type to a bitfield to allow more than one set bit. Hmm. [ … ] The SCRUB_SW_SRC piece can be another function. It is now part of the edac_ce_error(). Hm, I can't find this function on your experimental branch on infradead but it is mentioned in the inlined patch below, what's going on? Which patch should I be looking at now? [ … ] The following patch addresses the pointed issues. I've updated them on my experimental branch at infradead: git://git.infradead.org/users/mchehab/edac.git experimental Ok, I checked this one out but can't find the edac_ce_error() function as already stated above, pls check. They'll also be soon available at: git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-edac.git hw_events_v18 Will review the patch below now and reply in another mail. Thanks. Regards, Mauro - edac: Change internal representation to work with layers
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 04-05-2012 06:52, Borislav Petkov escreveu: On Thu, May 03, 2012 at 11:16:54AM -0300, Mauro Carvalho Chehab wrote: + bool enable_filter, + unsigned pos[EDAC_MAX_LAYERS]) Passing the whole array as an argument instead of only a pointer to it? This is C, and not C++ or Pascal. Only the pointer is passed here. The size of the array is used for type check only. Right, and you can see where he still has trouble. And by he I mean me :). :) [ … ] +void edac_mc_handle_error(const enum hw_event_mc_err_type type, +struct mem_ctl_info *mci, +const unsigned long page_frame_number, +const unsigned long offset_in_page, +const unsigned long syndrome, +const int layer0, +const int layer1, +const int layer2, Instead of passing each layer as an arg, you can prepare the array pos[] in each edac_mc_hanlde_*() and pass around a pointer to it - you need it anyway in the edac_mc_inc*() functions. Yes, but the changes at the drivers will be more complex, without any reason: before each call to this function, they would need to create and fill a temporary array. As there are only 3 layers, in the worse case, this way is simpler and more efficient. We can review it, if we ever need more than 3 layers. I see, the edac_mc_handle_error is the main interface for all edac drivers, ok. [ … ] + bool enable_filter = false; What does this enable_filter thing mean: if (pos[i] = 0) enable_filter = true; This absolutely needs explanation and better naming! Renamed it to enable_per_layer_report. Or detailed_dimm_report or ... Detail is used on another context; enable_per_layer_report won't generate any doubts for anyone reading the code. The code that implement it seems self-explained: .. if (enable_filter dimm-nr_pages) { if (p != label) { strcpy(p, OTHER_LABEL); p += strlen(OTHER_LABEL); } strcpy(p, dimm-label); p += strlen(p); *p = '\0'; .. if (!enable_filter) { strcpy(label, any memory); } else { debugf4(%s: csrow/channel to increment: (%d,%d)\n, __func__, row, chan); if (p == label) strcpy(label, unknown memory); if (type == HW_EVENT_ERR_CORRECTED) { if (row = 0) { mci-csrows[row].ce_count++; if (chan = 0) mci-csrows[row].channels[chan].ce_count++; } } else if (row = 0) mci-csrows[row].ue_count++; } Theis flag indicates if is there any useful information about the affected DIMM(s) provided by the EDAC driver. If this is provided, the DIMM location labels are filtered and reported, and the per-layer error counters are incremented. As it was discussed on previous reviews, with FB-DIMM MCs, and/or when mirror mode/lockstep mode is enabled, the memory controller points errors to 2 DIMMs (or 4 DIMMs, when both mirror mode and lockstep mode are enabled) on most memory controllers, under some conditions. The edac_mc_handle_fbd_ue() function call were created due to that. When comparing with the old code, enable_filter = false would be equivalent to call edac_mc_handle_ce_no_info/edac_mc_handle_ue_no_info. I'm adding a comment about it. Much better, thanks. Btw, I have to admit, this is a pretty strange way of handling the case where layers are { -1, -1, -1 }, i.e. edac_mc_handle_error is called with the no info hint. Well, negative values are used on Linux to indicate error conditions, so this is not that strange. Also, it allows partial no info, as, on some cases, a channel or a csrow may not be known. So, this allows code simplification at the drivers. For example, look at this hunk on the amd64_edac conversion patch: @@ -1585,16 +1609,10 @@ static void f1x_map_sysaddr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr, if (dct_ganging_enabled(pvt)) chan = get_channel_from_ecc_syndrome(mci, syndrome); - if (chan = 0) - edac_mc_handle_ce(mci, page, offset, syndrome, csrow, chan, - EDAC_MOD_STR); - else - /* -* Channel unknown, report all channels on this CSROW as failed. -*/ - for (chan = 0; chan mci-csrows[csrow].nr_channels; chan++) - edac_mc_handle_ce(mci, page, offset, syndrome, - csrow, chan, EDAC_MOD_STR); +
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers, second version
On Thu, May 03, 2012 at 11:16:54AM -0300, Mauro Carvalho Chehab wrote: edac: Change internal representation to work with layers From: Mauro Carvalho Chehab mche...@redhat.com Change the EDAC internal representation to work with non-csrow based memory controllers. There are lots of those memory controllers nowadays, and more are coming. So, the EDAC internal representation needs to be changed, in order to work with those memory controllers, while preserving backward compatibility with the old ones. The edac core was written with the idea that memory controllers are able to directly access csrows. This is not true for FB-DIMM and RAMBUS memory controllers. Also, some recent advanced memory controllers don't present a per-csrows view. Instead, they view memories as DIMMs, instead of ranks. So, change the allocation and error report routines to allow them to work with all types of architectures. This will allow the removal of several hacks with FB-DIMM and RAMBUS memory controllers. Also, several tests were done on different platforms using different x86 drivers. TODO: a multi-rank DIMMs are currently represented by multiple DIMM entries in struct dimm_info. That means that changing a label for one rank won't change the same label for the other ranks at the same DIMM. This bug is present since the beginning of the EDAC, so it is not a big deal. However, on several drivers, it is possible to fix this issue, but it should be a per-driver fix, as the csrow = DIMM arrangement may not be equal for all. So, don't try to fix it here yet. I tried to make this patch as short as possible, preceding it with several other patches that simplified the logic here. Yet, as the internal API changes, all drivers need changes. The changes are generally bigger in the drivers for FB-DIMMs. Cc: Aristeu Rozanski aroza...@redhat.com Cc: Doug Thompson nor...@yahoo.com Cc: Borislav Petkov borislav.pet...@amd.com Cc: Mark Gross mark.gr...@intel.com Cc: Jason Uhlenkott juhle...@akamai.com Cc: Tim Small t...@buttersideup.com Cc: Ranganathan Desikan r...@jetztechnologies.com Cc: Arvind R. arvin...@gmail.com Cc: Olof Johansson o...@lixom.net Cc: Egor Martovetsky e...@pasemi.com Cc: Chris Metcalf cmetc...@tilera.com Cc: Michal Marek mma...@suse.cz Cc: Jiri Kosina jkos...@suse.cz Cc: Joe Perches j...@perches.com Cc: Dmitry Eremin-Solenikov dbarysh...@gmail.com Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Cc: Hitoshi Mitake h.mit...@gmail.com Cc: Andrew Morton a...@linux-foundation.org Cc: Niklas Söderlund niklas.soderl...@ericsson.com Cc: Shaohui Xie shaohui@freescale.com Cc: Josh Boyer jwbo...@gmail.com Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Mauro Carvalho Chehab mche...@redhat.com --- v18: Addresses the pointed issues on v17 review diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index e48ab31..1286c5e 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -447,8 +447,12 @@ static inline void pci_write_bits32(struct pci_dev *pdev, int offset, #endif /* CONFIG_PCI */ -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, - unsigned nr_chans, int edac_index); +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, +unsigned nr_chans, int edac_index); +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, +unsigned n_layers, +struct edac_mc_layer *layers, +unsigned sz_pvt); extern int edac_mc_add_mc(struct mem_ctl_info *mci); extern void edac_mc_free(struct mem_ctl_info *mci); extern struct mem_ctl_info *edac_mc_find(int idx); @@ -467,24 +471,78 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, * reporting logic and function interface - reduces conditional * statement clutter and extra function arguments. */ -extern void edac_mc_handle_ce(struct mem_ctl_info *mci, - unsigned long page_frame_number, - unsigned long offset_in_page, - unsigned long syndrome, int row, int channel, - const char *msg); -extern void edac_mc_handle_ce_no_info(struct mem_ctl_info *mci, - const char *msg); -extern void edac_mc_handle_ue(struct mem_ctl_info *mci, - unsigned long page_frame_number, - unsigned long offset_in_page, int row, - const char *msg); -extern void edac_mc_handle_ue_no_info(struct mem_ctl_info *mci, - const char *msg); -extern void edac_mc_handle_fbd_ue(struct mem_ctl_info *mci, unsigned int csrow, - unsigned int channel0,
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers, second version
Em 04-05-2012 07:16, Borislav Petkov escreveu: On Thu, May 03, 2012 at 11:16:54AM -0300, Mauro Carvalho Chehab wrote: edac: Change internal representation to work with layers ... /** - * edac_mc_alloc: Allocate a struct mem_ctl_info structure - * @size_pvt: size of private storage needed - * @nr_csrows: Number of CWROWS needed for this MC - * @nr_chans: Number of channels for the MC + * edac_mc_alloc: Allocate and partially fill a struct mem_ctl_info structure + * @mc_num: Memory controller number + * @n_layers: Number of MC hierarchy layers + * layers: Describes each layer as seen by the Memory Controller + * @size_pvt: size of private storage needed + * + * + * Non-csrow based drivers (like FB-DIMM and RAMBUS ones) will likely report + * such DIMMS properly, but the CSROWS-based ones will likely do the wrong DIMMs csrow-based + * thing, as two chip select values are used for dual-rank memories (and 4, for + * quad-rank ones). I suspect that this issue could be solved inside the EDAC + * core for SDRAM memories, but it requires further study at JEDEC JESD 21C. The paragraph above is still in, let me repeat my last note: This last paragraph sounds innacurately, especially the likely adverbs make it even more confusing. The gist of what you're saying is already present in the commit message anyway, so drop it here pls. There are two similar comments one for edac_mc_alloc and another for new_edac_mc_alloc. It seems that I fixed just one of them. + * + * In summary, solving this issue is not easy, as it requires a lot of testing. As before: Also superfluous and has nothing to do with edac_mc_alloc(). Pls remove it. Dropped both paragraphs. * Everything is kmalloc'ed as one big chunk - more efficient. * Only can be used if all structures have the same lifetime - otherwise @@ -168,22 +196,49 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems) * * Use edac_mc_free() to free mc structures allocated by this function. * + * NOTE: drivers handle multi-rank memories in different ways: in some + * drivers, one multi-rank memory stick is mapped as one entry, while, in + * others, a single multi-rank memory stick would be mapped into several + * entries. Currently, this function will allocate multiple struct dimm_info + * on such scenarios, as grouping the multiple ranks require drivers change. + * * Returns: * NULL allocation failed * struct mem_ctl_info pointer Ok, this patch still doesn't contain all the changes I requested for although you said you did them. Is this not the latest version? I'll wait for you to sort it out before I continue reviewing... Thanks. Patch enclosed (and updated at Infradead). Regards, Mauro - edac: Change internal representation to work with layers Change the EDAC internal representation to work with non-csrow based memory controllers. There are lots of those memory controllers nowadays, and more are coming. So, the EDAC internal representation needs to be changed, in order to work with those memory controllers, while preserving backward compatibility with the old ones. The edac core was written with the idea that memory controllers are able to directly access csrows. This is not true for FB-DIMM and RAMBUS memory controllers. Also, some recent advanced memory controllers don't present a per-csrows view. Instead, they view memories as DIMMs, instead of ranks. So, change the allocation and error report routines to allow them to work with all types of architectures. This will allow the removal of several hacks with FB-DIMM and RAMBUS memory controllers. Also, several tests were done on different platforms using different x86 drivers. TODO: a multi-rank DIMMs are currently represented by multiple DIMM entries in struct dimm_info. That means that changing a label for one rank won't change the same label for the other ranks at the same DIMM. This bug is present since the beginning of the EDAC, so it is not a big deal. However, on several drivers, it is possible to fix this issue, but it should be a per-driver fix, as the csrow = DIMM arrangement may not be equal for all. So, don't try to fix it here yet. I tried to make this patch as short as possible, preceding it with several other patches that simplified the logic here. Yet, as the internal API changes, all drivers need changes. The changes are generally bigger in the drivers for FB-DIMMs. Cc: Aristeu Rozanski aroza...@redhat.com Cc: Doug Thompson nor...@yahoo.com Cc: Borislav Petkov borislav.pet...@amd.com Cc: Mark Gross mark.gr...@intel.com Cc: Jason Uhlenkott juhle...@akamai.com Cc: Tim Small t...@buttersideup.com Cc: Ranganathan Desikan r...@jetztechnologies.com Cc: Arvind R. arvin...@gmail.com Cc: Olof Johansson o...@lixom.net Cc: Egor Martovetsky e...@pasemi.com Cc: Chris Metcalf cmetc...@tilera.com Cc: Michal Marek
[PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Starting a new thread because the old one grew too big and is out of my screen. Patch below is git-formatted from git://git.infradead.org/users/mchehab/edac.git commit 447b7929e633027ffe131f2f8f246bba5690cee7 Author: Mauro Carvalho Chehab mche...@redhat.com Date: Wed Apr 18 15:20:50 2012 -0300 edac: Change internal representation to work with layers Change the EDAC internal representation to work with non-csrow based memory controllers. There are lots of those memory controllers nowadays, and more are coming. So, the EDAC internal representation needs to be changed, in order to work with those memory controllers, while preserving backward compatibility with the old ones. The edac core was written with the idea that memory controllers are able to directly access csrows. This is not true for FB-DIMM and RAMBUS memory controllers. Also, some recent advanced memory controllers don't present a per-csrows view. Instead, they view memories as DIMMs, instead of ranks. So, change the allocation and error report routines to allow them to work with all types of architectures. This will allow the removal of several hacks with FB-DIMM and RAMBUS memory controllers. Also, several tests were done on different platforms using different x86 drivers. TODO: a multi-rank DIMMs are currently represented by multiple DIMM entries in struct dimm_info. That means that changing a label for one rank won't change the same label for the other ranks at the same DIMM. This bug is present since the beginning of the EDAC, so it is not a big deal. However, on several drivers, it is possible to fix this issue, but it should be a per-driver fix, as the csrow = DIMM arrangement may not be equal for all. So, don't try to fix it here yet. I tried to make this patch as short as possible, preceding it with several other patches that simplified the logic here. Yet, as the internal API changes, all drivers need changes. The changes are generally bigger in the drivers for FB-DIMMs. snip already reviewed stuff /* Figure out the offsets of the various items from the start of an mc * structure. We want the alignment of each item to be at least as @@ -191,12 +253,28 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, * hardcode everything into a single struct. */ mci = edac_align_ptr(ptr, sizeof(*mci), 1); - csi = edac_align_ptr(ptr, sizeof(*csi), nr_csrows); - chi = edac_align_ptr(ptr, sizeof(*chi), nr_csrows * nr_chans); - dimm = edac_align_ptr(ptr, sizeof(*dimm), nr_csrows * nr_chans); + layer = edac_align_ptr(ptr, sizeof(*layer), n_layers); + csi = edac_align_ptr(ptr, sizeof(*csi), tot_csrows); + chi = edac_align_ptr(ptr, sizeof(*chi), tot_csrows * tot_channels); + dimm = edac_align_ptr(ptr, sizeof(*dimm), tot_dimms); + count = 1; + for (i = 0; i n_layers; i++) { + count *= layers[i].size; + debugf4(%s: errcount layer %d size %d\n, __func__, i, count); + ce_per_layer[i] = edac_align_ptr(ptr, sizeof(u32), count); + ue_per_layer[i] = edac_align_ptr(ptr, sizeof(u32), count); + tot_errcount += 2 * count; + } + + debugf4(%s: allocating %d error counters\n, __func__, tot_errcount); pvt = edac_align_ptr(ptr, sz_pvt, 1); size = ((unsigned long)pvt) + sz_pvt; + debugf1(%s(): allocating %u bytes for mci data (%d %s, %d csrows/channels)\n, + __func__, size, + tot_dimms, + per_rank ? ranks : dimms, + tot_csrows * tot_channels); mci = kzalloc(size, GFP_KERNEL); if (mci == NULL) return NULL; @@ -204,42 +282,101 @@ struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, /* Adjust pointers so they point within the memory we just allocated * rather than an imaginary chunk of memory located at address 0. */ + layer = (struct edac_mc_layer *)(((char *)mci) + ((unsigned long)layer)); csi = (struct csrow_info *)(((char *)mci) + ((unsigned long)csi)); chi = (struct rank_info *)(((char *)mci) + ((unsigned long)chi)); dimm = (struct dimm_info *)(((char *)mci) + ((unsigned long)dimm)); + for (i = 0; i n_layers; i++) { + mci-ce_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ce_per_layer[i])); + mci-ue_per_layer[i] = (u32 *)((char *)mci + ((unsigned long)ue_per_layer[i])); + } pvt = sz_pvt ? (((char *)mci) + ((unsigned long)pvt)) : NULL; /* setup index and various internal pointers */ mci-mc_idx = edac_index; mci-csrows = csi; mci-dimms = dimm; + mci-tot_dimms = tot_dimms; mci-pvt_info = pvt; - mci-nr_csrows
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Sun, Apr 29, 2012 at 11:16:53AM -0300, Mauro Carvalho Chehab wrote: Hey, are you looking at compiled code or at source code? Because I'm looking at source code, and it is a pretty safe bet the majority of the people here do that too. What I said is that, from source code POV, a code where the loop variables are initialized just before the loop is easier to read it when the initialization of those vars are on another part of the code. That's basically why the for syntax starts with a var initialization clause. The tot_dimms friends are loop vars: their value is calculated within the loop. At the object code, this won't bring any difference. it, either by using registers for those vars or by moving the initialization to the top of the function. This function is too complex, so it is better to initialize those vars just before the loops that are calculating those totals. Simply initialize those variables at declaration time and that's it. Initializing them before the loop doesn't make the function less complex - splitting it and sanitizing it does. Initializing loop-calculated vars just before the loop makes the code easier to read, and may avoid issues that might happen during code lifecycle. This is getting ridiculous: the variable declaration and initialization are on the same screen as the loop (unless one uses a screen which can only show less than 40ish lines). So the argument about making the code easier to read is bogus. This function is already cluttered with a lot of crap, and is very large so adding more lines which can simply be stashed away at declaration time is better readability. Besides, every modern editor can jump to the declaration of a local variable so that the user can see to what it is initialized to. +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, +unsigned n_layers, +struct edac_mc_layer *layers, +bool rev_order, +unsigned sz_pvt) { void *ptr = NULL; struct mem_ctl_info *mci; - struct csrow_info *csi, *csrow; + struct edac_mc_layer *layer; + struct csrow_info *csi, *csr; struct rank_info *chi, *chp, *chan; struct dimm_info *dimm; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; void *pvt; - unsigned size; - int row, chn; + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS]; + unsigned tot_csrows, tot_channels, tot_errcount = 0; + int i, j; int err; + int row, chn; + bool per_rank = false; + + BUG_ON(n_layers EDAC_MAX_LAYERS || n_layers == 0); + /* + * Calculate the total amount of dimms and csrows/cschannels while + * in the old API emulation mode + */ + tot_dimms = 1; + tot_channels = 1; + tot_csrows = 1; + for (i = 0; i n_layers; i++) { + tot_dimms *= layers[i].size; + if (layers[i].is_virt_csrow) + tot_csrows *= layers[i].size; + else + tot_channels *= layers[i].size; + + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT) + per_rank = true; -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote: [ 10.486440] EDAC MC: DCT0 chip selects: [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x0088 [ 10.486455] EDAC MC: DCT1 chip selects: [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486467] EDAC amd64: using x8 syndromes. [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding [ 10.486488] EDAC amd64: MCT channel count: 2 [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. Now your change is showing 16 ranks. Still b0rked. No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK. As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc doesn't know how many ranks are filled, as the driver logic first calls it to allocate for the max amount of ranks, and then fills the rank with their info (or let them untouched with 0 pages, if they're empty). Basically you're saying you're generating dimm_info structs for all _possible_ dimms and the loop where this debug message comes from goes and marrily initializes them all although some of them are empty: + for (i = 0; i tot_dimms; i++) { + chan = csi[row].channels[chn]; + dimm = EDAC_DIMM_PTR(lay, mci-dimms, n_layers, + pos[0], pos[1], pos[2]); + dimm-mci = mci; + + debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, __func__, + i, (dimm - mci-dimms), + pos[0], pos[1], pos[2], row, chn); + + /* Copy DIMM location */ + for (j = 0; j n_layers; j++) + dimm-location[j] = pos[j]; ... definitely superfluous. Oh well, looking at edac_mc_alloc, it used to allocate structs for all csrows on the controller even though some of them were empty... Ok, then please remove this debug call because it is misleading. Having [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) is enough. You probably want to say how many channels/csrows there are, though: [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 8 csrows, 2 channels) or
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 30-04-2012 05:15, Borislav Petkov escreveu: On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote: [ 10.486440] EDAC MC: DCT0 chip selects: [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x0088 [ 10.486455] EDAC MC: DCT1 chip selects: [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486467] EDAC amd64: using x8 syndromes. [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding [ 10.486488] EDAC amd64: MCT channel count: 2 [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. Now your change is showing 16 ranks. Still b0rked. No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK. As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc doesn't know how many ranks are filled, as the driver logic first calls it to allocate for the max amount of ranks, and then fills the rank with their info (or let them untouched with 0 pages, if they're empty). Basically you're saying you're generating dimm_info structs for all _possible_ dimms and the loop where this debug message comes from goes and marrily initializes them all although some of them are empty: + for (i = 0; i tot_dimms; i++) { + chan = csi[row].channels[chn]; + dimm = EDAC_DIMM_PTR(lay, mci-dimms, n_layers, + pos[0], pos[1], pos[2]); + dimm-mci = mci; + + debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, __func__, + i, (dimm - mci-dimms), + pos[0], pos[1], pos[2], row, chn); + + /* Copy DIMM location */ + for (j = 0; j n_layers; j++) + dimm-location[j] = pos[j]; ... definitely superfluous. Oh well, looking at edac_mc_alloc, it used to allocate structs for all csrows on the controller even though some of them were empty... Ok, then please remove this debug call because it is misleading. Having [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) is enough. You probably want to say how many channels/csrows there are, though: [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: allocating 3692 bytes for mci data (16 ranks, 8 csrows, 2
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: It seems you have a very short memory. Oh, puh-lease, let's don't start with the insults now. You're not a saint yourself. And maybe the fact that I'm having hard time grasping your code is maybe because it is a load of crap and you seem to generate a lot of senseless drivel when explaining what it does. And don't get me started on the patch bombs. So, let's stay constructive here before I, as the last and only one person reviewing this stinking pile stops messing with it (I got other stuff to do, you know) and NACK it completely. For example, this is the mapping used by the second memory controller of the SB machine I'm using on my tests: [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2) ... [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels) [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2 [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3 [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0 [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1 [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2 [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3 [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0 [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1 [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2 [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3 With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is called dimm11 at the new API, and corresponds to csrow 2, channel 3 for a legacy EDAC API call. Are all those DIMM slots above populated? What happens if they're not, are you issuing the same dimm0-dimm11 lines for slots which aren't even populated? I have a much better idea: Generally, this debug info should come from the specific driver that allocates the dimm descriptors, not from the EDAC core. This way, you know in the driver which slots are populated and those which are not should be omitted. This way it says initializing 12 dimms and the user thinks there are 12 DIMMs on his system where this might not be true. -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 30-04-2012 04:59, Borislav Petkov escreveu: On Sun, Apr 29, 2012 at 11:16:53AM -0300, Mauro Carvalho Chehab wrote: Hey, are you looking at compiled code or at source code? Because I'm looking at source code, and it is a pretty safe bet the majority of the people here do that too. What I said is that, from source code POV, a code where the loop variables are initialized just before the loop is easier to read it when the initialization of those vars are on another part of the code. That's basically why the for syntax starts with a var initialization clause. The tot_dimms friends are loop vars: their value is calculated within the loop. At the object code, this won't bring any difference. it, either by using registers for those vars or by moving the initialization to the top of the function. This function is too complex, so it is better to initialize those vars just before the loops that are calculating those totals. Simply initialize those variables at declaration time and that's it. Initializing them before the loop doesn't make the function less complex - splitting it and sanitizing it does. Initializing loop-calculated vars just before the loop makes the code easier to read, and may avoid issues that might happen during code lifecycle. This is getting ridiculous: With this I fully agree: you're nacking patches because it is not the way you write your code, not because the code there is doing anything wrong. If you point anything wrong on the way I wrote, then I'll fix. Otherwise, why should I do a change that will obfuscate the code? the variable declaration and initialization are on the same screen as the loop (unless one uses a screen which can only show less than 40ish lines). So the argument about making the code easier to read is bogus. This function is already cluttered with a lot of crap, and is very large so adding more lines which can simply be stashed away at declaration time is better readability. Besides, every modern editor can jump to the declaration of a local variable so that the user can see to what it is initialized to. The editor used by te developer is not relevant. This is not a reason to obfuscate the code. +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, +unsigned n_layers, +struct edac_mc_layer *layers, +bool rev_order, +unsigned sz_pvt) { void *ptr = NULL; struct mem_ctl_info *mci; - struct csrow_info *csi, *csrow; + struct edac_mc_layer *layer; + struct csrow_info *csi, *csr; struct rank_info *chi, *chp, *chan; struct dimm_info *dimm; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; void *pvt; - unsigned size; - int row, chn; + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS]; + unsigned tot_csrows, tot_channels, tot_errcount = 0; + int i, j; int err; + int row, chn; + bool per_rank = false; + + BUG_ON(n_layers EDAC_MAX_LAYERS || n_layers == 0); + /* + * Calculate the total amount of dimms and csrows/cschannels while + * in the old API emulation mode + */ + tot_dimms = 1; + tot_channels = 1; + tot_csrows = 1; + for (i = 0; i n_layers; i++) { + tot_dimms *= layers[i].size; + if (layers[i].is_virt_csrow) + tot_csrows *= layers[i].size; + else + tot_channels *= layers[i].size; + + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT) + per_rank = true; Regards, Mauro ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 30-04-2012 05:15, Borislav Petkov escreveu: On Sun, Apr 29, 2012 at 10:49:44AM -0300, Mauro Carvalho Chehab wrote: [ 10.486440] EDAC MC: DCT0 chip selects: [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x0088 [ 10.486455] EDAC MC: DCT1 chip selects: [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486467] EDAC amd64: using x8 syndromes. [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding [ 10.486488] EDAC amd64: MCT channel count: 2 [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. Now your change is showing 16 ranks. Still b0rked. No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK. As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc doesn't know how many ranks are filled, as the driver logic first calls it to allocate for the max amount of ranks, and then fills the rank with their info (or let them untouched with 0 pages, if they're empty). Basically you're saying you're generating dimm_info structs for all _possible_ dimms and the loop where this debug message comes from goes and marrily initializes them all although some of them are empty: + for (i = 0; i tot_dimms; i++) { + chan = csi[row].channels[chn]; + dimm = EDAC_DIMM_PTR(lay, mci-dimms, n_layers, + pos[0], pos[1], pos[2]); + dimm-mci = mci; + + debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, __func__, + i, (dimm - mci-dimms), + pos[0], pos[1], pos[2], row, chn); + + /* Copy DIMM location */ + for (j = 0; j n_layers; j++) + dimm-location[j] = pos[j]; ... definitely superfluous. This is the way the EDAC core works: everything is allocated, on one shot, when this function is called, and, on most drivers, before the code that probes how many DIMMS/ranks got initialized. That happens because the edac_mc_alloc() arguments provide the total amount of ranks/dimms, but doesn't say anything about what is used there. Changing from this model to another model that would dynamically initialize the per-dimm/rank data is possible, but that would require another set of patches that will touch on all
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 30-04-2012 08:11, Borislav Petkov escreveu: On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: For example, this is the mapping used by the second memory controller of the SB machine I'm using on my tests: [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2) ... [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels) [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2 [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3 [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0 [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1 [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2 [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3 [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0 [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1 [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2 [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3 With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is called dimm11 at the new API, and corresponds to csrow 2, channel 3 for a legacy EDAC API call. Are all those DIMM slots above populated? What happens if they're not, are you issuing the same dimm0-dimm11 lines for slots which aren't even populated? I have a much better idea: Generally, this debug info should come from the specific driver that allocates the dimm descriptors, not from the EDAC core. This way, you know in the driver which slots are populated and those which are not should be omitted. The drivers don't allocate the dimm descriptors. They're allocated by the core. This way it says initializing 12 dimms and the user thinks there are 12 DIMMs on his system where this might not be true. I'm OK to remove the initializing 12 dimms message. It doesn't add anything new. With regards do the other messages, if the debug messages are not clear, then let's fix them, instead of removing. What if we print, instead, on a message like: row 1, chan 1 will represent dimm5 (1:2:0) if not empty Regards, Mauro ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote: Em 30-04-2012 08:11, Borislav Petkov escreveu: On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: For example, this is the mapping used by the second memory controller of the SB machine I'm using on my tests: [52803.640043] EDAC DEBUG: sbridge_probe: Registering MC#1 (2 of 2) ... [52803.640062] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc(): allocating 7196 bytes for mci data (12 dimms, 12 csrows/channels) [52803.640070] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: initializing 12 dimms [52803.640072] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 [52803.640074] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 [52803.640077] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 2: dimm2 (0:2:0): row 0, chan 2 [52803.640080] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 3: dimm3 (1:0:0): row 0, chan 3 [52803.640083] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 4: dimm4 (1:1:0): row 1, chan 0 [52803.640086] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 5: dimm5 (1:2:0): row 1, chan 1 [52803.640089] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 6: dimm6 (2:0:0): row 1, chan 2 [52803.640092] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 7: dimm7 (2:1:0): row 1, chan 3 [52803.640095] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 8: dimm8 (2:2:0): row 2, chan 0 [52803.640098] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 9: dimm9 (3:0:0): row 2, chan 1 [52803.640101] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 10: dimm10 (3:1:0): row 2, chan 2 [52803.640104] EDAC DEBUG: edac_mc_alloc: edac_mc_alloc: 11: dimm11 (3:2:0): row 2, chan 3 With the above info, it is clear that the DIMM located at mc#1, channel#3 slot#2 is called dimm11 at the new API, and corresponds to csrow 2, channel 3 for a legacy EDAC API call. Are all those DIMM slots above populated? What happens if they're not, are you issuing the same dimm0-dimm11 lines for slots which aren't even populated? I have a much better idea: Generally, this debug info should come from the specific driver that allocates the dimm descriptors, not from the EDAC core. This way, you know in the driver which slots are populated and those which are not should be omitted. The drivers don't allocate the dimm descriptors. They're allocated by the core. I know that. The drivers call into EDAC core using edac_mc_alloc, this is what I meant above. This way it says initializing 12 dimms and the user thinks there are 12 DIMMs on his system where this might not be true. I'm OK to remove the initializing 12 dimms message. It doesn't add anything new. With regards do the other messages, if the debug messages are not clear, then let's fix them, instead of removing. What if we print, instead, on a message like: row 1, chan 1 will represent dimm5 (1:2:0) if not empty How about the following instead: the specific driver calls edac_mc_alloc(), it gets the allocated dimm array in mci-dimms _without_ dumping each dimm%d line. Then, each driver figures out which subset of that dimms array actually has populated slots and prints only the populated rank/slot/... This information is much more valuable than saying how many _possible_ slots the edac core has allocated. Then, each driver can decide whether it makes sense to dump that info or not. -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Mon, Apr 30, 2012 at 08:23:42AM -0300, Mauro Carvalho Chehab wrote: With this I fully agree: you're nacking patches because it is not the way you Where? Have I written Nacked-by somewhere? write your code, not because the code there is doing anything wrong. If you point anything wrong on the way I wrote, then I'll fix. Otherwise, why should I do a change that will obfuscate the code? What obfuscation are you talking about? Having the initialization of variables along with their declaration is not it. Now let's look what you're doing: + unsigned tot_csrows, tot_channels, tot_errcount = 0; + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS]; just to reassign 1 to some of them + tot_dimms = 1; + tot_channels = 1; + tot_csrows = 1; a couple of lines below. Now this is misleading. Now let's look at what I'm proposing unsigned tot_dimms = 1; unsigned tot_csrows = 1; unsigned tot_channels = 1; How is this an obfuscation? It is basic code layout practices. The editor used by te developer is not relevant. This is not a reason to obfuscate the code. +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, +unsigned n_layers, +struct edac_mc_layer *layers, +bool rev_order, +unsigned sz_pvt) { void *ptr = NULL; struct mem_ctl_info *mci; - struct csrow_info *csi, *csrow; + struct edac_mc_layer *layer; + struct csrow_info *csi, *csr; struct rank_info *chi, *chp, *chan; struct dimm_info *dimm; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; void *pvt; - unsigned size; - int row, chn; + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS]; + unsigned tot_csrows, tot_channels, tot_errcount = 0; + int i, j; int err; + int row, chn; + bool per_rank = false; + + BUG_ON(n_layers EDAC_MAX_LAYERS || n_layers == 0); + /* + * Calculate the total amount of dimms and csrows/cschannels while + * in the old API emulation mode + */ + tot_dimms = 1; + tot_channels = 1; + tot_csrows = 1; + for (i = 0; i n_layers; i++) { + tot_dimms *= layers[i].size; + if (layers[i].is_virt_csrow) + tot_csrows *= layers[i].size; + else + tot_channels *= layers[i].size; + + if (layers[i].type == EDAC_MC_LAYER_CHIP_SELECT) + per_rank = true; -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 30-04-2012 09:38, Borislav Petkov escreveu: On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote: Em 30-04-2012 08:11, Borislav Petkov escreveu: On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: This way it says initializing 12 dimms and the user thinks there are 12 DIMMs on his system where this might not be true. I'm OK to remove the initializing 12 dimms message. It doesn't add anything new. With regards do the other messages, if the debug messages are not clear, then let's fix them, instead of removing. What if we print, instead, on a message like: row 1, chan 1 will represent dimm5 (1:2:0) if not empty How about the following instead: the specific driver calls edac_mc_alloc(), it gets the allocated dimm array in mci-dimms _without_ dumping each dimm%d line. Then, each driver figures out which subset of that dimms array actually has populated slots and prints only the populated rank/slot/... This information is much more valuable than saying how many _possible_ slots the edac core has allocated. Then, each driver can decide whether it makes sense to dump that info or not. No, that would add extra complexity at the drivers level just due to debug messages. I think that the better is to move this printk to the debug-specific routine that is called only when the dimm is filled (edac_mc_dump_dimm). With this cange, the message will be printed only for the filled dimms. This is a cleanup patch, so I'll write it, together with the change that will get rid of the loop that uses KERN_CONT. It will use a function added by a latter patch at edac_mc_sysfs so it can't be merged on this patch anyway. Regards, Mauro ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 30-04-2012 10:00, Mauro Carvalho Chehab escreveu: Em 30-04-2012 09:38, Borislav Petkov escreveu: On Mon, Apr 30, 2012 at 08:45:09AM -0300, Mauro Carvalho Chehab wrote: Em 30-04-2012 08:11, Borislav Petkov escreveu: On Mon, Apr 30, 2012 at 07:58:33AM -0300, Mauro Carvalho Chehab wrote: This way it says initializing 12 dimms and the user thinks there are 12 DIMMs on his system where this might not be true. I'm OK to remove the initializing 12 dimms message. It doesn't add anything new. With regards do the other messages, if the debug messages are not clear, then let's fix them, instead of removing. What if we print, instead, on a message like: row 1, chan 1 will represent dimm5 (1:2:0) if not empty How about the following instead: the specific driver calls edac_mc_alloc(), it gets the allocated dimm array in mci-dimms _without_ dumping each dimm%d line. Then, each driver figures out which subset of that dimms array actually has populated slots and prints only the populated rank/slot/... This information is much more valuable than saying how many _possible_ slots the edac core has allocated. Then, each driver can decide whether it makes sense to dump that info or not. No, that would add extra complexity at the drivers level just due to debug messages. I think that the better is to move this printk to the debug-specific routine that is called only when the dimm is filled (edac_mc_dump_dimm). With this cange, the message will be printed only for the filled dimms. This is a cleanup patch, so I'll write it, together with the change that will get rid of the loop that uses KERN_CONT. It will use a function added by a latter patch at edac_mc_sysfs so it can't be merged on this patch anyway. The following patch dos the debug cleanup. I'll add at the end of my tree. Regards, Mauro. From: Mauro Carvalho Chehab mche...@redhat.com Date: Mon, 30 Apr 2012 10:24:43 -0300 Subject: [PATCH] edac_mc: Cleanup per-dimm_info debug messages The edac_mc_alloc() routine allocates one dimm_info device for all possible memories, including the non-filled ones. The debug messages there are somewhat confusing. So, cleans them, by moving the code that prints the memory location to edac_mc, and using it on both edac_mc_sysfs and edac_mc. After this patch, a dimm-based memory controller will print the debug info as: [ 728.430828] EDAC DEBUG: edac_mc_dump_dimm: dimm2: channel 0 slot 2 mapped as virtual row 0, chan 2 [ 728.430834] EDAC DEBUG: edac_mc_dump_dimm: dimm-label = 'mc#0channel#0slot#2' [ 728.430839] EDAC DEBUG: edac_mc_dump_dimm: dimm-nr_pages = 0x0 [ 728.430846] EDAC DEBUG: edac_mc_dump_dimm: dimm-grain = 0 [ 728.430850] EDAC DEBUG: edac_mc_dump_dimm: dimm-nr_pages = 0x0 (a rank-based memory controller would print, instead, rank2 on the above debug info) Signed-off-by: Mauro Carvalho Chehab mche...@redhat.com diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index d8278b3..1bc2843 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -40,6 +40,25 @@ static DEFINE_MUTEX(mem_ctls_mutex); static LIST_HEAD(mc_devices); +unsigned edac_dimm_info_location(struct dimm_info *dimm, char *buf, +int len) +{ + struct mem_ctl_info *mci = dimm-mci; + int i, n, count = 0; + char *p = buf; + + for (i = 0; i mci-n_layers; i++) { + n = snprintf(p, len, %s %d , + edac_layer_name[mci-layers[i].type], + dimm-location[i]); + p += n; + len -= n; + count += n; + } + + return count; +} + #ifdef CONFIG_EDAC_DEBUG static void edac_mc_dump_channel(struct rank_info *chan) @@ -50,20 +69,18 @@ static void edac_mc_dump_channel(struct rank_info *chan) debugf4(\tchannel-dimm = %p\n, chan-dimm); } -static void edac_mc_dump_dimm(struct dimm_info *dimm) +static void edac_mc_dump_dimm(struct dimm_info *dimm, int number) { - int i; + char location[80]; + + edac_dimm_info_location(dimm, location, sizeof(location)); debugf4(\tdimm = %p\n, dimm); + debugf4(\t%s%i: %smapped as virtual row %d, chan %d\n, + dimm-mci-mem_is_per_rank ? rank : dimm, + number, location, dimm-csrow, dimm-cschannel); debugf4(\tdimm-label = '%s'\n, dimm-label); debugf4(\tdimm-nr_pages = 0x%x\n, dimm-nr_pages); - debugf4(\tdimm location ); - for (i = 0; i dimm-mci-n_layers; i++) { - printk(KERN_CONT %d, dimm-location[i]); - if (i dimm-mci-n_layers - 1) - printk(KERN_CONT .); - } - printk(KERN_CONT \n); debugf4(\tdimm-grain = %d\n, dimm-grain); debugf4(\tdimm-nr_pages = 0x%x\n, dimm-nr_pages); } @@ -337,8 +354,6 @@ struct mem_ctl_info *edac_mc_alloc(unsigned edac_index, memset(pos, 0, sizeof(pos)); row = 0;
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 28-04-2012 06:05, Borislav Petkov escreveu: On Fri, Apr 27, 2012 at 12:36:12PM -0300, Mauro Carvalho Chehab wrote: The fix for it were in another patch[1], as calling them as rank is needed also at the sysfs API. No, this doesn't fix it either: [ 10.486440] EDAC MC: DCT0 chip selects: [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x0088 [ 10.486455] EDAC MC: DCT1 chip selects: [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486467] EDAC amd64: using x8 syndromes. [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding [ 10.486488] EDAC amd64: MCT channel count: 2 [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. Now your change is showing 16 ranks. Still b0rked. No, DCT0+DCT1 have 16 ranks, 8 filled and 8 empty. So, it is OK. As I said before when you've pointed this bug (likel at v3 review), edac_mc_alloc doesn't know how many ranks are filled, as the driver logic first calls it to allocate for the max amount of ranks, and then fills the rank with their info (or let them untouched with 0 pages, if they're empty). Regards, Mauro ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 28-04-2012 14:07, Joe Perches escreveu: On Sat, 2012-04-28 at 11:16 +0200, Borislav Petkov wrote: On Fri, Apr 27, 2012 at 02:52:35PM -0300, Mauro Carvalho Chehab wrote: All those local variables should be sorted in a reverse christmas tree order: u32 this_is_the_longest_array_name[LENGTH]; void *shorter_named_variable; unsigned long size; int i; ... Why? There's nothing at the CodingStyle saying about how the vars should be ordered. If you want to enforce some particular order, please do it yourself, but apply it consistently among the entire subsystem. First of all, this way it is more readable. Not in my opinion, and blindly using reverse christmas tree can separate variables that should be declared together. I agree with Joe. The order won't make the code easier or harder to read, nor it would improve code performance. Second of all, maybe we should hold it down in CodingStyle. Different developers have different opinions about how to order includes, functions, vars, etc. So, this is not at CodingStyle because there's no consensus about it, and because this is not relevant for code understanding. A reviewer should not reject a patch just because he doesn't like the order that the developer used. Regards, Mauro ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 28-04-2012 06:16, Borislav Petkov escreveu: On Fri, Apr 27, 2012 at 02:52:35PM -0300, Mauro Carvalho Chehab wrote: Also, is it valid to have n_layers == 0? The memcpy call below will do nothing. Changed to: BUG_ON(n_layers EDAC_MAX_LAYERS || n_layers == 0); Really? Look below. Weird, not sure what happened here... it seems I sent the version before this change. The patch I've made has the correct BUG_ON at: http://git.infradead.org/users/mchehab/edac.git/commitdiff/447b7929e633027ffe131f2f8f246bba5690cee7 + /* + * Calculate the total amount of dimms and csrows/cschannels while + * in the old API emulation mode + */ + tot_dimms = 1; + tot_cschannels = 1; + tot_csrows = 1; Those initializations can be done above at variable declaration time. Yes, but the compiled code will be the same anyway, as gcc will optimize Hey, are you looking at compiled code or at source code? Because I'm looking at source code, and it is a pretty safe bet the majority of the people here do that too. What I said is that, from source code POV, a code where the loop variables are initialized just before the loop is easier to read it when the initialization of those vars are on another part of the code. That's basically why the for syntax starts with a var initialization clause. The tot_dimms friends are loop vars: their value is calculated within the loop. At the object code, this won't bring any difference. it, either by using registers for those vars or by moving the initialization to the top of the function. This function is too complex, so it is better to initialize those vars just before the loops that are calculating those totals. Simply initialize those variables at declaration time and that's it. Initializing them before the loop doesn't make the function less complex - splitting it and sanitizing it does. Initializing loop-calculated vars just before the loop makes the code easier to read, and may avoid issues that might happen during code lifecycle. + for (i = 0; i n_layers; i++) { + tot_dimms *= layers[i].size; + if (layers[i].is_virt_csrow) + tot_csrows *= layers[i].size; + else + tot_cschannels *= layers[i].size; + } [..] -struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, -unsigned nr_chans, int edac_index) +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, + unsigned n_layers, + struct edac_mc_layer *layers, + bool rev_order, + unsigned sz_pvt) { void *ptr = NULL; struct mem_ctl_info *mci; -struct csrow_info *csi, *csrow; +struct edac_mc_layer *layer; +struct csrow_info *csi, *csr; struct rank_info *chi, *chp, *chan; struct dimm_info *dimm; +u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; void *pvt; -unsigned size; -int row, chn; +unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS]; +unsigned tot_csrows, tot_channels, tot_errcount = 0; +int i, j; int err; +int row, chn; +bool per_rank = false; + +BUG_ON(n_layers EDAC_MAX_LAYERS); ^^ Let me re-send it, with the right BUG_ON there. commit 447b7929e633027ffe131f2f8f246bba5690cee7 Author: Mauro Carvalho Chehab mche...@redhat.com Date: Wed Apr 18 15:20:50 2012 -0300 edac: Change internal representation to work with layers Change the EDAC internal representation to work with non-csrow based memory controllers. There are lots of those memory controllers nowadays, and more are coming. So, the EDAC internal representation needs to be changed, in order to work with those memory controllers, while preserving backward compatibility with the old ones. The edac core was written with the idea that memory controllers are able to directly access csrows. This is not true for FB-DIMM and RAMBUS memory controllers. Also, some recent advanced memory controllers don't present a per-csrows view. Instead, they view memories as DIMMs, instead of ranks. So, change the allocation and error report routines to allow them to work with all types of architectures. This will allow the removal of several hacks with FB-DIMM and RAMBUS memory controllers. Also, several tests were done on different platforms using different x86 drivers. TODO: a multi-rank DIMMs are currently represented by multiple DIMM entries in struct dimm_info. That means that changing a label for one rank won't change the same label for the other ranks at the same DIMM. This bug is present since the beginning of the EDAC, so it is not a big deal. However, on several drivers, it is possible to fix this
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 28-04-2012 05:52, Borislav Petkov escreveu: On Fri, Apr 27, 2012 at 01:07:38PM -0300, Mauro Carvalho Chehab wrote: Yes. This is a common issue at the EDAC core: on several places, it calls the edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, while the debug macros already handles that. I suspect that, in the past, the __func__ were not at the macros, but some patch added it there, and forgot to fix the occurrences of its call. The patch that added it is d357cbb445208 and you reviewed it. And you wrote the patch that caused it. This is something that needs to be reviewed at the entire EDAC core (and likely at the drivers). Looks like a job for a newbie to get her/his feet wet with kernel work. I opted to not touch on this at the existing debug logic, as I think that the better is to address all those issues on one separate patch, after fixing the EDAC core bugs. No, you simply need to remove the __func__ argument in your newly added debug call: debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, __func__, i, (dimm - mci-dimms), pos[0], pos[1], pos[2], row, chn); And while you're at it, remove the rest of the __func__ arguments from your newly added debugfX calls. A single patch fixing this everywhere at drivers/edac is better and clearer than adding an unrelated fix on this patch. This is already complex enough to add more unrelated things there. Also, a simple perl/coccinelle script can replace all such __func__ occurrences on one shot. Regards, Mauro ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Sun, 2012-04-29 at 12:11 -0300, Mauro Carvalho Chehab wrote: Em 29-04-2012 11:25, Mauro Carvalho Chehab escreveu: Em 28-04-2012 05:52, Borislav Petkov escreveu: On Fri, Apr 27, 2012 at 01:07:38PM -0300, Mauro Carvalho Chehab wrote: Yes. This is a common issue at the EDAC core: on several places, it calls the edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, while the debug macros already handles that. I suspect that, in the past, the __func__ were not at the macros, but some patch added it there, and forgot to fix the occurrences of its call. The patch that added it is d357cbb445208 and you reviewed it. And you wrote the patch that caused it. And Boris should have also written the follow-on patches that removed most/all of the debugfX and __func__ uses. A single patch fixing this everywhere at drivers/edac is better and clearer than adding an unrelated fix on this patch. This is already complex enough to add more unrelated things there. Also, a simple perl/coccinelle script can replace all such __func__ occurrences on one shot. You make it sound simple, but it'd be a pretty complicated cocci script. Some of the changes would have to be inspected or changed by hand in any case. [] Most of the issues can be solved with the above script-based patch. There are still 171 places (12 places at the core, the rest are on the drivers) that will require a more sophisticated patch or that requires a manual fix. [] From: Mauro Carvalho Chehab mche...@redhat.com Date: Sun, 29 Apr 2012 11:59:14 -0300 Subject: [PATCH] edac: Don't add __func__ or __FILE__ for debugf[0-9] msgs Thanks Mauro, you shouldn't have had to do this. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Fri, Apr 27, 2012 at 01:07:38PM -0300, Mauro Carvalho Chehab wrote: Yes. This is a common issue at the EDAC core: on several places, it calls the edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, while the debug macros already handles that. I suspect that, in the past, the __func__ were not at the macros, but some patch added it there, and forgot to fix the occurrences of its call. The patch that added it is d357cbb445208 and you reviewed it. This is something that needs to be reviewed at the entire EDAC core (and likely at the drivers). Looks like a job for a newbie to get her/his feet wet with kernel work. I opted to not touch on this at the existing debug logic, as I think that the better is to address all those issues on one separate patch, after fixing the EDAC core bugs. No, you simply need to remove the __func__ argument in your newly added debug call: debugf2(%s: %d: dimm%zd (%d:%d:%d): row %d, chan %d\n, __func__, i, (dimm - mci-dimms), pos[0], pos[1], pos[2], row, chn); And while you're at it, remove the rest of the __func__ arguments from your newly added debugfX calls. -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Fri, Apr 27, 2012 at 04:24:28PM -0300, Mauro Carvalho Chehab wrote: Em 27-04-2012 15:11, Luck, Tony escreveu: +for (i = 0; i dimm-mci-n_layers; i++) { +printk(KERN_CONT %d, dimm-location[i]); +if (i dimm-mci-n_layers - 1) +printk(KERN_CONT .); +} +printk(KERN_CONT \n); This looks hacky but I don't have a good suggestion what to do instead here. Maybe snprintf into a complete string which you can issue with debugf4()... This is not hacky. There are several places at the Kernel doing loops like that. Look, for example, at lib/hexdump.c (without KERN_CONT, as this macro was added later - probably to avoid checkpatch.pl complains). There is some benefit to one printk == one output line ... it means that console output will not be (as) jumbled if multiple cpus are printk'ing at the same time. Ok, but this message only appears when all the conditions below are met: - the driver is compiled with EDAC_DEBUG; - the edac_core is modprobed with edac_debug_level=4; - during the driver modprobe, when the EDAC driver is being registered. That means nothing. Even on several-core machines, those messages won't mangle, in practice. Let's not over-design a simple debug message. No, let's design a simple debug message correctly, regardless of when it appears. Regards, Mauro -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Fri, Apr 27, 2012 at 12:36:12PM -0300, Mauro Carvalho Chehab wrote: The fix for it were in another patch[1], as calling them as rank is needed also at the sysfs API. No, this doesn't fix it either: [ 10.486440] EDAC MC: DCT0 chip selects: [ 10.486443] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486445] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486448] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486450] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486453] EDAC DEBUG: amd64_debug_display_dimm_sizes: F2x180 (DRAM Bank Address Mapping): 0x0088 [ 10.486455] EDAC MC: DCT1 chip selects: [ 10.486458] EDAC amd64: MC: 0: 2048MB 1: 2048MB [ 10.486460] EDAC amd64: MC: 2: 2048MB 3: 2048MB [ 10.486463] EDAC amd64: MC: 4: 0MB 5: 0MB [ 10.486465] EDAC amd64: MC: 6: 0MB 7: 0MB [ 10.486467] EDAC amd64: using x8 syndromes. [ 10.486469] EDAC DEBUG: amd64_dump_dramcfg_low: F2x190 (DRAM Cfg Low): 0x00083100 [ 10.486472] EDAC DEBUG: amd64_dump_dramcfg_low: DIMM type: buffered; all DIMMs support ECC: yes [ 10.486475] EDAC DEBUG: amd64_dump_dramcfg_low: PAR/ERR parity: enabled [ 10.486478] EDAC DEBUG: amd64_dump_dramcfg_low: DCT 128bit mode width: 64b [ 10.486481] EDAC DEBUG: amd64_dump_dramcfg_low: x4 logical DIMMs present: L0: yes L1: yes L2: no L3: no [ 10.486485] EDAC DEBUG: f1x_early_channel_count: Data width is not 128 bits - need more decoding [ 10.486488] EDAC amd64: MCT channel count: 2 [ 10.486493] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc(): allocating 3692 bytes for mci data (16 ranks, 16 csrows/channels) [ 10.486501] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: rank0 (0:0:0): row 0, chan 0 [ 10.486506] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: rank1 (0:1:0): row 0, chan 1 [ 10.486510] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: rank2 (1:0:0): row 1, chan 0 [ 10.486514] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: rank3 (1:1:0): row 1, chan 1 [ 10.486518] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: rank4 (2:0:0): row 2, chan 0 [ 10.486522] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: rank5 (2:1:0): row 2, chan 1 [ 10.486526] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: rank6 (3:0:0): row 3, chan 0 [ 10.486530] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: rank7 (3:1:0): row 3, chan 1 [ 10.486534] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: rank8 (4:0:0): row 4, chan 0 [ 10.486538] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: rank9 (4:1:0): row 4, chan 1 [ 10.486542] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: rank10 (5:0:0): row 5, chan 0 [ 10.486546] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: rank11 (5:1:0): row 5, chan 1 [ 10.486550] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: rank12 (6:0:0): row 6, chan 0 [ 10.486554] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: rank13 (6:1:0): row 6, chan 1 [ 10.486558] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: rank14 (7:0:0): row 7, chan 0 [ 10.486562] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: rank15 (7:1:0): row 7, chan 1 DCT0 has 4 ranks + DCT1 also 4 ranks = 8 ranks total. Now your change is showing 16 ranks. Still b0rked. -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Fri, Apr 27, 2012 at 02:52:35PM -0300, Mauro Carvalho Chehab wrote: [..] +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, + unsigned n_layers, + struct edac_mc_layer *layers, + bool rev_order, + unsigned sz_pvt) strange function argument vertical alignment { void *ptr = NULL; struct mem_ctl_info *mci; - struct csrow_info *csi, *csrow; + struct edac_mc_layer *lay; As before, call this layers pls. + struct csrow_info *csi, *csr; struct rank_info *chi, *chp, *chan; struct dimm_info *dimm; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; void *pvt; - unsigned size; - int row, chn; + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS]; + unsigned tot_csrows, tot_cschannels; No need to call this tot_cschannels - tot_channels should be enough. + int i, j; int err; + int row, chn; All those local variables should be sorted in a reverse christmas tree order: u32 this_is_the_longest_array_name[LENGTH]; void *shorter_named_variable; unsigned long size; int i; ... Why? There's nothing at the CodingStyle saying about how the vars should be ordered. If you want to enforce some particular order, please do it yourself, but apply it consistently among the entire subsystem. First of all, this way it is more readable. Second of all, maybe we should hold it down in CodingStyle. Third of all, you touch this code so you could fix it up to be more readable while you're at it. + + BUG_ON(n_layers EDAC_MAX_LAYERS); Push this BUG_ON up into edac_mc_alloc as the first thing this function does. It is already the first thing at the function. Ah, that happens later in the patchset where you rename it back to edac_mc_alloc, ok. Also, is it valid to have n_layers == 0? The memcpy call below will do nothing. Changed to: BUG_ON(n_layers EDAC_MAX_LAYERS || n_layers == 0); Really? Look below. + /* + * Calculate the total amount of dimms and csrows/cschannels while + * in the old API emulation mode + */ + tot_dimms = 1; + tot_cschannels = 1; + tot_csrows = 1; Those initializations can be done above at variable declaration time. Yes, but the compiled code will be the same anyway, as gcc will optimize Hey, are you looking at compiled code or at source code? Because I'm looking at source code, and it is a pretty safe bet the majority of the people here do that too. it, either by using registers for those vars or by moving the initialization to the top of the function. This function is too complex, so it is better to initialize those vars just before the loops that are calculating those totals. Simply initialize those variables at declaration time and that's it. Initializing them before the loop doesn't make the function less complex - splitting it and sanitizing it does. + for (i = 0; i n_layers; i++) { + tot_dimms *= layers[i].size; + if (layers[i].is_virt_csrow) + tot_csrows *= layers[i].size; + else + tot_cschannels *= layers[i].size; + } [..] -struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, - unsigned nr_chans, int edac_index) +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, +unsigned n_layers, +struct edac_mc_layer *layers, +bool rev_order, +unsigned sz_pvt) { void *ptr = NULL; struct mem_ctl_info *mci; - struct csrow_info *csi, *csrow; + struct edac_mc_layer *layer; + struct csrow_info *csi, *csr; struct rank_info *chi, *chp, *chan; struct dimm_info *dimm; + u32 *ce_per_layer[EDAC_MAX_LAYERS], *ue_per_layer[EDAC_MAX_LAYERS]; void *pvt; - unsigned size; - int row, chn; + unsigned size, tot_dimms, count, pos[EDAC_MAX_LAYERS]; + unsigned tot_csrows, tot_channels, tot_errcount = 0; + int i, j; int err; + int row, chn; + bool per_rank = false; + + BUG_ON(n_layers EDAC_MAX_LAYERS); ^^ -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Sat, 2012-04-28 at 11:16 +0200, Borislav Petkov wrote: On Fri, Apr 27, 2012 at 02:52:35PM -0300, Mauro Carvalho Chehab wrote: All those local variables should be sorted in a reverse christmas tree order: u32 this_is_the_longest_array_name[LENGTH]; void *shorter_named_variable; unsigned long size; int i; ... Why? There's nothing at the CodingStyle saying about how the vars should be ordered. If you want to enforce some particular order, please do it yourself, but apply it consistently among the entire subsystem. First of all, this way it is more readable. Not in my opinion, and blindly using reverse christmas tree can separate variables that should be declared together. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Sat, 2012-04-28 at 10:52 +0200, Borislav Petkov wrote: On Fri, Apr 27, 2012 at 01:07:38PM -0300, Mauro Carvalho Chehab wrote: Yes. This is a common issue at the EDAC core: on several places, it calls the edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, while the debug macros already handles that. I suspect that, in the past, the __func__ were not at the macros, but some patch added it there, and forgot to fix the occurrences of its call. The patch that added it is d357cbb445208 and you reviewed it. This is something that needs to be reviewed at the entire EDAC core (and likely at the drivers). Looks like a job for a newbie to get her/his feet wet with kernel work. Looks to me more like a lazy maintainer/developer who doesn't want to bother with a few minutes work. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Btw, this patch gives [8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 [8.287594] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 [8.296784] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: dimm2 (1:0:0): row 1, chan 0 [8.305968] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: dimm3 (1:1:0): row 1, chan 1 [8.315144] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: dimm4 (2:0:0): row 2, chan 0 [8.324326] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: dimm5 (2:1:0): row 2, chan 1 [8.333502] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: dimm6 (3:0:0): row 3, chan 0 [8.342684] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: dimm7 (3:1:0): row 3, chan 1 [8.351860] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: dimm8 (4:0:0): row 4, chan 0 [8.361049] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: dimm9 (4:1:0): row 4, chan 1 [8.370227] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: dimm10 (5:0:0): row 5, chan 0 [8.379582] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: dimm11 (5:1:0): row 5, chan 1 [8.388941] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: dimm12 (6:0:0): row 6, chan 0 [8.398315] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: dimm13 (6:1:0): row 6, chan 1 [8.407680] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: dimm14 (7:0:0): row 7, chan 0 [8.417047] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: dimm15 (7:1:0): row 7, chan 1 and the memory controller has the following chip selects [8.137662] EDAC MC: DCT0 chip selects: [8.150291] EDAC amd64: MC: 0: 2048MB 1: 2048MB [8.155349] EDAC amd64: MC: 2: 2048MB 3: 2048MB [8.160408] EDAC amd64: MC: 4: 0MB 5: 0MB [8.165475] EDAC amd64: MC: 6: 0MB 7: 0MB [8.180499] EDAC MC: DCT1 chip selects: [8.184693] EDAC amd64: MC: 0: 2048MB 1: 2048MB [8.189753] EDAC amd64: MC: 2: 2048MB 3: 2048MB [8.194812] EDAC amd64: MC: 4: 0MB 5: 0MB [8.199875] EDAC amd64: MC: 6: 0MB 7: 0MB Those are 4 dual-ranked DIMMs on this node, DCT0 is one channel and DCT1 is another and I have 4 ranks per channel. Having dimm0-dimm15 is very misleading and has nothing to do with the reality. So, if this is to use your nomenclature with layers, I'll have dimm0-dimm7 where each dimm is a rank. Or, the most correct thing to do would be to have dimm0-dimm3, each dual-ranked. So either tot_dimms is computed wrongly or there's a more serious error somewhere. I've reviewed almost the half patch, will review the rest when/if we sort out the above issue first. Thanks. On Tue, Apr 24, 2012 at 03:15:41PM -0300, Mauro Carvalho Chehab wrote: Change the EDAC internal representation to work with non-csrow based memory controllers. There are lots of those memory controllers nowadays, and more are coming. So, the EDAC internal representation needs to be changed, in order to work with those memory controllers, while preserving backward compatibility with the old ones. The edac core were written with the idea that memory controllers was are able to directly access csrows, and that the channels are used inside a csrows select. This sounds funny, simply remove that second part about the channels. This is not true for FB-DIMM and RAMBUS memory controllers. Also, some recent advanced memory controllers don't present a per-csrows view. Instead, they view memories as DIMM's, instead of ranks, accessed DIMMs instead of ranks. Remove the rest. via csrow/channel. So, change the allocation and error report routines to allow them to work with all types of architectures. This will allow the removal of several hacks on FB-DIMM and RAMBUS with memory controllers on the next patches. . Remove the rest. Also, several tests were done on different platforms using different x86 drivers. TODO: a multi-rank DIMM's are currently represented by multiple DIMM Multi-rank DIMMs entries at struct dimm_info. That means that changing a label for one in rank won't change the same label for the other ranks at the same dimm. of the same DIMM. Such bug is there since the beginning of the EDAC, so it is not a big This bug is present .. deal. However, on several drivers, it is possible to fix this issue, but remove on it should be a per-driver fix, as the csrow = DIMM arrangement may not be equal for all. So, don't try to fix it here yet. PS.: I tried to make this patch as short as possible, preceding it with Remove PS. several other patches that simplified the logic here. Yet, as the internal API changes, all drivers need changes. The
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Fri, 2012-04-27 at 15:33 +0200, Borislav Petkov wrote: this patch gives [8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 One too many __func__'s in some combination of the pr_fmt and/or dbg call and/or the actual call site? diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h [] @@ -447,8 +447,13 @@ static inline void pci_write_bits32(struct pci_dev *pdev, int offset, #endif /* CONFIG_PCI */ -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, - unsigned nr_chans, int edac_index); +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, + unsigned nr_chans, int edac_index); Why not extern? Using extern function prototypes in .h files isn't generally necessary nor is extern the more common kernel style. +static inline void edac_mc_handle_ce(struct mem_ctl_info *mci, unsigned long page_frame_number, unsigned long offset_in_page, unsigned long syndrome, int row, int channel, - const char *msg); Strange alignment, pls do static inline void edac_mc_handle_ce(struct..., unsigned..., ..., ...); or static inline void edac_mc_handle_ce(struct ..., etc) or static inline void edac_mc_handle_ce(...) ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
On Fri, Apr 27, 2012 at 10:11:35AM -0400, Joe Perches wrote: -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, - unsigned nr_chans, int edac_index); +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, +unsigned nr_chans, int edac_index); Why not extern? Using extern function prototypes in .h files isn't generally necessary nor is extern the more common kernel style. Searching for it, there's this discussion, for example: http://gcc.gnu.org/ml/gcc/2009-04/msg00812.html Maybe we should put a small note in Documentation/CodingStyle what the kernel preference is and hold people to it. -- Regards/Gruss, Boris. Advanced Micro Devices GmbH Einsteinring 24, 85609 Dornach GM: Alberto Bozzo Reg: Dornach, Landkreis Muenchen HRB Nr. 43632 WEEE Registernr: 129 19551 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 27-04-2012 10:33, Borislav Petkov escreveu: Btw, this patch gives [8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 [8.287594] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 1: dimm1 (0:1:0): row 0, chan 1 [8.296784] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 2: dimm2 (1:0:0): row 1, chan 0 [8.305968] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 3: dimm3 (1:1:0): row 1, chan 1 [8.315144] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 4: dimm4 (2:0:0): row 2, chan 0 [8.324326] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 5: dimm5 (2:1:0): row 2, chan 1 [8.333502] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 6: dimm6 (3:0:0): row 3, chan 0 [8.342684] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 7: dimm7 (3:1:0): row 3, chan 1 [8.351860] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 8: dimm8 (4:0:0): row 4, chan 0 [8.361049] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 9: dimm9 (4:1:0): row 4, chan 1 [8.370227] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 10: dimm10 (5:0:0): row 5, chan 0 [8.379582] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 11: dimm11 (5:1:0): row 5, chan 1 [8.388941] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 12: dimm12 (6:0:0): row 6, chan 0 [8.398315] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 13: dimm13 (6:1:0): row 6, chan 1 [8.407680] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 14: dimm14 (7:0:0): row 7, chan 0 [8.417047] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 15: dimm15 (7:1:0): row 7, chan 1 and the memory controller has the following chip selects [8.137662] EDAC MC: DCT0 chip selects: [8.150291] EDAC amd64: MC: 0: 2048MB 1: 2048MB [8.155349] EDAC amd64: MC: 2: 2048MB 3: 2048MB [8.160408] EDAC amd64: MC: 4: 0MB 5: 0MB [8.165475] EDAC amd64: MC: 6: 0MB 7: 0MB [8.180499] EDAC MC: DCT1 chip selects: [8.184693] EDAC amd64: MC: 0: 2048MB 1: 2048MB [8.189753] EDAC amd64: MC: 2: 2048MB 3: 2048MB [8.194812] EDAC amd64: MC: 4: 0MB 5: 0MB [8.199875] EDAC amd64: MC: 6: 0MB 7: 0MB Those are 4 dual-ranked DIMMs on this node, DCT0 is one channel and DCT1 is another and I have 4 ranks per channel. Having dimm0-dimm15 is very misleading and has nothing to do with the reality. So, if this is to use your nomenclature with layers, I'll have dimm0-dimm7 where each dimm is a rank. Or, the most correct thing to do would be to have dimm0-dimm3, each dual-ranked. So either tot_dimms is computed wrongly or there's a more serious error somewhere. I've reviewed almost the half patch, will review the rest when/if we sort out the above issue first. Thanks. The fix for it were in another patch[1], as calling them as rank is needed also at the sysfs API. [1] http://lists-archives.com/linux-kernel/27623222-edac-add-a-new-per-dimm-api-and-make-the-old-per-virtual-rank-api-obsolete.html I can just merge the fix on this patch, with the enclosed diff. Regards, Mauro diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index 4d4d8b7..e0d9481 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -86,7 +86,7 @@ static void edac_mc_dump_mci(struct mem_ctl_info *mci) debugf4(\tmci-edac_check = %p\n, mci-edac_check); debugf3(\tmci-nr_csrows = %d, csrows = %p\n, mci-nr_csrows, mci-csrows); - debugf3(\tmci-nr_dimms = %d, dimns = %p\n, + debugf3(\tmci-nr_dimms = %d, dimms = %p\n, mci-tot_dimms, mci-dimms); debugf3(\tdev = %p\n, mci-dev); debugf3(\tmod_name:ctl_name = %s:%s\n, mci-mod_name, mci-ctl_name); @@ -183,10 +183,6 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems) * @size_pvt: size of private storage needed * * - * FIXME: drivers handle multi-rank memories on different ways: on some - * drivers, one multi-rank memory is mapped as one DIMM, while, on others, - * a single multi-rank DIMM would be mapped into several dimms. - * * Non-csrow based drivers (like FB-DIMM and RAMBUS ones) will likely report * such DIMMS properly, but the CSROWS-based ones will likely do the wrong * thing, as two chip select values are used for dual-rank memories (and 4, for @@ -201,6 +197,12 @@ void *edac_align_ptr(void **p, unsigned size, int n_elems) * * Use edac_mc_free() to free mc structures allocated by this function. * + * NOTE: drivers handle multi-rank memories on different ways: on some + * drivers, one multi-rank memory is mapped as one entry, while, on others, + * a single multi-rank DIMM would be mapped into several entries. Currently, + * this function will allocate multiple struct dimm_info on such scenarios, + * as grouping the multiple ranks require drivers change. + * * Returns: * NULL allocation failed * struct
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 27-04-2012 11:11, Joe Perches escreveu: On Fri, 2012-04-27 at 15:33 +0200, Borislav Petkov wrote: this patch gives [8.278399] EDAC DEBUG: new_edac_mc_alloc: new_edac_mc_alloc: 0: dimm0 (0:0:0): row 0, chan 0 One too many __func__'s in some combination of the pr_fmt and/or dbg call and/or the actual call site? Yes. This is a common issue at the EDAC core: on several places, it calls the edac debug macros (DEBUGF0...DEBUGF4) passing a __func__ as an argument, while the debug macros already handles that. I suspect that, in the past, the __func__ were not at the macros, but some patch added it there, and forgot to fix the occurrences of its call. This is something that needs to be reviewed at the entire EDAC core (and likely at the drivers). I opted to not touch on this at the existing debug logic, as I think that the better is to address all those issues on one separate patch, after fixing the EDAC core bugs. diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h [] @@ -447,8 +447,13 @@ static inline void pci_write_bits32(struct pci_dev *pdev, int offset, #endif /* CONFIG_PCI */ -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, - unsigned nr_chans, int edac_index); +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, + unsigned nr_chans, int edac_index); Why not extern? Using extern function prototypes in .h files isn't generally necessary nor is extern the more common kernel style. Yes. I never add extern on the code I write. While CodingStyle doesn't explicitly say anything about that, its spirit seem to indicate to that the right thing is avoid using it, like, for example: Chapter 4: Naming C is a Spartan language, and so should your naming be. (also on other places, like avoiding to use {} for single-statement if's). So, useless clauses like extern doesn't seem to be the best choice. Regards, Mauro ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
+ for (i = 0; i dimm-mci-n_layers; i++) { + printk(KERN_CONT %d, dimm-location[i]); + if (i dimm-mci-n_layers - 1) + printk(KERN_CONT .); + } + printk(KERN_CONT \n); This looks hacky but I don't have a good suggestion what to do instead here. Maybe snprintf into a complete string which you can issue with debugf4()... This is not hacky. There are several places at the Kernel doing loops like that. Look, for example, at lib/hexdump.c (without KERN_CONT, as this macro was added later - probably to avoid checkpatch.pl complains). There is some benefit to one printk == one output line ... it means that console output will not be (as) jumbled if multiple cpus are printk'ing at the same time. -Tony ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Em 27-04-2012 15:11, Luck, Tony escreveu: + for (i = 0; i dimm-mci-n_layers; i++) { + printk(KERN_CONT %d, dimm-location[i]); + if (i dimm-mci-n_layers - 1) + printk(KERN_CONT .); + } + printk(KERN_CONT \n); This looks hacky but I don't have a good suggestion what to do instead here. Maybe snprintf into a complete string which you can issue with debugf4()... This is not hacky. There are several places at the Kernel doing loops like that. Look, for example, at lib/hexdump.c (without KERN_CONT, as this macro was added later - probably to avoid checkpatch.pl complains). There is some benefit to one printk == one output line ... it means that console output will not be (as) jumbled if multiple cpus are printk'ing at the same time. Ok, but this message only appears when all the conditions below are met: - the driver is compiled with EDAC_DEBUG; - the edac_core is modprobed with edac_debug_level=4; - during the driver modprobe, when the EDAC driver is being registered. Even on several-core machines, those messages won't mangle, in practice. Let's not over-design a simple debug message. Regards, Mauro ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH EDACv16 1/2] edac: Change internal representation to work with layers
Change the EDAC internal representation to work with non-csrow based memory controllers. There are lots of those memory controllers nowadays, and more are coming. So, the EDAC internal representation needs to be changed, in order to work with those memory controllers, while preserving backward compatibility with the old ones. The edac core were written with the idea that memory controllers are able to directly access csrows, and that the channels are used inside a csrows select. This is not true for FB-DIMM and RAMBUS memory controllers. Also, some recent advanced memory controllers don't present a per-csrows view. Instead, they view memories as DIMM's, instead of ranks, accessed via csrow/channel. So, change the allocation and error report routines to allow them to work with all types of architectures. This will allow the removal of several hacks on FB-DIMM and RAMBUS memory controllers on the next patches. Also, several tests were done on different platforms using different x86 drivers. TODO: a multi-rank DIMM's are currently represented by multiple DIMM entries at struct dimm_info. That means that changing a label for one rank won't change the same label for the other ranks at the same dimm. Such bug is there since the beginning of the EDAC, so it is not a big deal. However, on several drivers, it is possible to fix this issue, but it should be a per-driver fix, as the csrow = DIMM arrangement may not be equal for all. So, don't try to fix it here yet. PS.: I tried to make this patch as short as possible, preceding it with several other patches that simplified the logic here. Yet, as the internal API changes, all drivers need changes. The changes are generally bigger on the drivers for FB-DIMM's. FIXME: while the FB-DIMMs are not converted to use the new design, uncorrected errors will show just one channel. In the past, all changes were on a big patch with about 150K. As it needed to be split, in order to be accepted by the EDAC ML at vger, we've opted to have this small drawback. As an advantage, it is now easier to review the patch series. Cc: Aristeu Rozanski aroza...@redhat.com Cc: Doug Thompson nor...@yahoo.com Cc: Borislav Petkov borislav.pet...@amd.com Cc: Mark Gross mark.gr...@intel.com Cc: Jason Uhlenkott juhle...@akamai.com Cc: Tim Small t...@buttersideup.com Cc: Ranganathan Desikan r...@jetztechnologies.com Cc: Arvind R. arvin...@gmail.com Cc: Olof Johansson o...@lixom.net Cc: Egor Martovetsky e...@pasemi.com Cc: Chris Metcalf cmetc...@tilera.com Cc: Michal Marek mma...@suse.cz Cc: Jiri Kosina jkos...@suse.cz Cc: Joe Perches j...@perches.com Cc: Dmitry Eremin-Solenikov dbarysh...@gmail.com Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Cc: Hitoshi Mitake h.mit...@gmail.com Cc: Andrew Morton a...@linux-foundation.org Cc: Niklas Söderlund niklas.soderl...@ericsson.com Cc: Shaohui Xie shaohui@freescale.com Cc: Josh Boyer jwbo...@gmail.com Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Mauro Carvalho Chehab mche...@redhat.com --- v16: Only context changes drivers/edac/edac_core.h | 92 ++- drivers/edac/edac_mc.c | 682 -- include/linux/edac.h | 40 ++- 3 files changed, 526 insertions(+), 288 deletions(-) diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h index e48ab31..7201bb1 100644 --- a/drivers/edac/edac_core.h +++ b/drivers/edac/edac_core.h @@ -447,8 +447,13 @@ static inline void pci_write_bits32(struct pci_dev *pdev, int offset, #endif /* CONFIG_PCI */ -extern struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, - unsigned nr_chans, int edac_index); +struct mem_ctl_info *edac_mc_alloc(unsigned sz_pvt, unsigned nr_csrows, + unsigned nr_chans, int edac_index); +struct mem_ctl_info *new_edac_mc_alloc(unsigned edac_index, + unsigned n_layers, + struct edac_mc_layer *layers, + bool rev_order, + unsigned sz_pvt); extern int edac_mc_add_mc(struct mem_ctl_info *mci); extern void edac_mc_free(struct mem_ctl_info *mci); extern struct mem_ctl_info *edac_mc_find(int idx); @@ -467,24 +472,80 @@ extern int edac_mc_find_csrow_by_page(struct mem_ctl_info *mci, * reporting logic and function interface - reduces conditional * statement clutter and extra function arguments. */ -extern void edac_mc_handle_ce(struct mem_ctl_info *mci, + +void edac_mc_handle_error(const enum hw_event_mc_err_type type, + struct mem_ctl_info *mci, + const unsigned long page_frame_number, + const unsigned long offset_in_page, + const unsigned long syndrome, + const int layer0, + const int layer1, + const int