Re: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run
On 03/13/2017 11:48 AM, Hannes Reinecke wrote: This is assuming that we're always running on a scsi_disk, and that scsi_disk is the only one implementing 'eh_action'. Neither of which is necessarily true. Ah, OK. Thanks for explaining. -- Mauricio Faria de Oliveira IBM Linux Technology Center
Re: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run
On 03/13/2017 02:37 PM, Mauricio Faria de Oliveira wrote: > Hannes, > > On 03/01/2017 06:15 AM, Hannes Reinecke wrote: >> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c >> index f2cafae..cec439c 100644 >> --- a/drivers/scsi/scsi_error.c >> +++ b/drivers/scsi/scsi_error.c >> @@ -58,6 +58,7 @@ >> static int scsi_eh_try_stu(struct scsi_cmnd *scmd); >> static int scsi_try_to_abort_cmd(struct scsi_host_template *, >> struct scsi_cmnd *); >> +static int scsi_eh_reset(struct scsi_cmnd *scmd); >> >> /* called with shost->host_lock held */ >> void scsi_eh_wakeup(struct Scsi_Host *shost) >> @@ -249,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int >> eh_flag) >> if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) >> eh_flag &= ~SCSI_EH_CANCEL_CMD; >> scmd->eh_eflags |= eh_flag; >> +scsi_eh_reset(scmd); >> list_add_tail(>eh_entry, >eh_cmd_q); >> shost->host_failed++; >> scsi_eh_wakeup(shost); >> @@ -1107,7 +1109,19 @@ static int scsi_eh_action(struct scsi_cmnd >> *scmd, int rtn) >> if (!blk_rq_is_passthrough(scmd->request)) { >> struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); >> if (sdrv->eh_action) >> -rtn = sdrv->eh_action(scmd, rtn); >> +rtn = sdrv->eh_action(scmd, rtn, false); >> +} >> +return rtn; >> +} >> + >> +static int scsi_eh_reset(struct scsi_cmnd *scmd) >> +{ >> +int rtn = SUCCESS; >> + >> +if (!blk_rq_is_passthrough(scmd->request)) { >> +struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); >> +if (sdrv->eh_action) >> +rtn = sdrv->eh_action(scmd, rtn, true); >> } >> return rtn; >> } > > Apparently we can de-duplicate code in scsi_eh_reset()/scsi_eh_action(), > and change less of sd_eh_action() (i.e., no new parameter). > > Something like this. Thoughts? > > > - Deduplicate code in scsi_eh_reset() and scsi_eh_action() > - A call to scsi_eh_reset() with reset == false calls into eh_action() > > - A call to scsi_eh_reset() with reset == true returns early, > (as happens with/if sd_eh_action() is called in your patch) > > - A call to scsi_eh_reset() in scsi_eh_scmd_add() uses SUCCESS just > for consistency with scsi_eh_reset() in your patch, as the return > value is ignored in it. > > - Less changes to sd_eh_action() > - Removed one chunk in sd_eh_action() ('reset gate variable') > - No more parameter changes in sd_eh_action() (no 'reset' parameter) > > - Removed forward declaration of scsi_eh_reset(), as already suggested > > > > static int scsi_eh_reset(struct scsi_cmnd *scmd, int eh_disp, bool reset) > { > int rtn = eh_disp; > > if (!blk_rq_is_passthrough(scmd->request)) { > struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); > if (sdrv->eh_action) { > if (reset) { > struct scsi_disk *sdkp = > scsi_disk(scmd->request->rq_disk); > > /* New SCSI EH run, reset gate variable */ > sdkp->medium_access_reset = 0; > return rtn; > } > rtn = sdrv->eh_action(scmd, rtn); > } > } > return rtn; > } > Nope. This is assuming that we're always running on a scsi_disk, and that scsi_disk is the only one implementing 'eh_action'. Neither of which is necessarily true. Cheers, Hannes -- Dr. Hannes ReineckeTeamlead Storage & Networking h...@suse.de +49 911 74053 688 SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton HRB 21284 (AG Nürnberg)
Re: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run
Hannes, On 03/01/2017 06:15 AM, Hannes Reinecke wrote: diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f2cafae..cec439c 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -58,6 +58,7 @@ static int scsi_eh_try_stu(struct scsi_cmnd *scmd); static int scsi_try_to_abort_cmd(struct scsi_host_template *, struct scsi_cmnd *); +static int scsi_eh_reset(struct scsi_cmnd *scmd); /* called with shost->host_lock held */ void scsi_eh_wakeup(struct Scsi_Host *shost) @@ -249,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag) if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) eh_flag &= ~SCSI_EH_CANCEL_CMD; scmd->eh_eflags |= eh_flag; + scsi_eh_reset(scmd); list_add_tail(>eh_entry, >eh_cmd_q); shost->host_failed++; scsi_eh_wakeup(shost); @@ -1107,7 +1109,19 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn) if (!blk_rq_is_passthrough(scmd->request)) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_action) - rtn = sdrv->eh_action(scmd, rtn); + rtn = sdrv->eh_action(scmd, rtn, false); + } + return rtn; +} + +static int scsi_eh_reset(struct scsi_cmnd *scmd) +{ + int rtn = SUCCESS; + + if (!blk_rq_is_passthrough(scmd->request)) { + struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); + if (sdrv->eh_action) + rtn = sdrv->eh_action(scmd, rtn, true); } return rtn; } Apparently we can de-duplicate code in scsi_eh_reset()/scsi_eh_action(), and change less of sd_eh_action() (i.e., no new parameter). Something like this. Thoughts? - Deduplicate code in scsi_eh_reset() and scsi_eh_action() - A call to scsi_eh_reset() with reset == false calls into eh_action() - A call to scsi_eh_reset() with reset == true returns early, (as happens with/if sd_eh_action() is called in your patch) - A call to scsi_eh_reset() in scsi_eh_scmd_add() uses SUCCESS just for consistency with scsi_eh_reset() in your patch, as the return value is ignored in it. - Less changes to sd_eh_action() - Removed one chunk in sd_eh_action() ('reset gate variable') - No more parameter changes in sd_eh_action() (no 'reset' parameter) - Removed forward declaration of scsi_eh_reset(), as already suggested static int scsi_eh_reset(struct scsi_cmnd *scmd, int eh_disp, bool reset) { int rtn = eh_disp; if (!blk_rq_is_passthrough(scmd->request)) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_action) { if (reset) { struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk); /* New SCSI EH run, reset gate variable */ sdkp->medium_access_reset = 0; return rtn; } rtn = sdrv->eh_action(scmd, rtn); } } return rtn; } Plus, diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f2cafae..cec439c 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -249,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag) if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) eh_flag &= ~SCSI_EH_CANCEL_CMD; scmd->eh_eflags |= eh_flag; + scsi_eh_reset(scmd, SUCCESS, true); // return value is ignored (early exit due to reset) list_add_tail(>eh_entry, >eh_cmd_q); shost->host_failed++; scsi_eh_wakeup(shost); @@ -1107,7 +1109,19 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn) - if (!blk_rq_is_passthrough(scmd->request)) { - struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); - if (sdrv->eh_action) - rtn = sdrv->eh_action(scmd, rtn); - } - return rtn; + return scsi_eh_reset(scmd, rtn, false); } And the rest, without the 'reset' parameter. diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index c7839f6..c794686 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1689,X +1689,Y @@ static int sd_pr_clear(struct block_device *bdev, u64 key) * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer * * This function is called by the SCSI midlayer upon completion of an * error test command (currently TEST UNIT READY). The result of sending * the eh command is passed in eh_disp. We're looking for devices that * fail medium access commands but are OK with non access commands like * test unit ready (so wrongly see the device as having a successful - *
Re: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run
On 03/02/2017 09:16 PM, Benjamin Block wrote: > Hej Hannes, > > On Wed, Mar 01, 2017 at 10:15:15AM +0100, Hannes Reinecke wrote: >> The current medium access timeout counter will be increased for >> each command, so if there are enough failed commands we'll hit >> the medium access timeout for even a single failure. >> Fix this by making the timeout per EH run, ie the counter will >> only be increased once per device and EH run. >> >> Cc: Ewan Milne>> Cc: Lawrence Obermann >> Cc: Benjamin Block >> Cc: Steffen Maier >> Signed-off-by: Hannes Reinecke > > Steffen already suggested it, It would be nice to have a stable-tag > here. This already caused multiple real-world false-positive outages, I > think that qualifies for stable :) > >> --- >> drivers/scsi/scsi_error.c | 16 +++- >> drivers/scsi/sd.c | 21 + >> drivers/scsi/sd.h | 1 + >> include/scsi/scsi_driver.h | 2 +- >> 4 files changed, 34 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c >> index f2cafae..cec439c 100644 >> --- a/drivers/scsi/scsi_error.c >> +++ b/drivers/scsi/scsi_error.c >> @@ -58,6 +58,7 @@ >> static int scsi_eh_try_stu(struct scsi_cmnd *scmd); >> static int scsi_try_to_abort_cmd(struct scsi_host_template *, >> struct scsi_cmnd *); >> +static int scsi_eh_reset(struct scsi_cmnd *scmd); >> >> /* called with shost->host_lock held */ >> void scsi_eh_wakeup(struct Scsi_Host *shost) >> @@ -249,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag) >> if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) >> eh_flag &= ~SCSI_EH_CANCEL_CMD; >> scmd->eh_eflags |= eh_flag; >> +scsi_eh_reset(scmd); >> list_add_tail(>eh_entry, >eh_cmd_q); >> shost->host_failed++; >> scsi_eh_wakeup(shost); >> @@ -1107,7 +1109,19 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int >> rtn) >> if (!blk_rq_is_passthrough(scmd->request)) { >> struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); >> if (sdrv->eh_action) >> -rtn = sdrv->eh_action(scmd, rtn); >> +rtn = sdrv->eh_action(scmd, rtn, false); >> +} >> +return rtn; >> +} >> + >> +static int scsi_eh_reset(struct scsi_cmnd *scmd) >> +{ >> +int rtn = SUCCESS; >> + >> +if (!blk_rq_is_passthrough(scmd->request)) { >> +struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); >> +if (sdrv->eh_action) >> +rtn = sdrv->eh_action(scmd, rtn, true); >> } >> return rtn; >> } >> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c >> index c7839f6..c794686 100644 >> --- a/drivers/scsi/sd.c >> +++ b/drivers/scsi/sd.c >> @@ -115,7 +115,7 @@ >> static int sd_init_command(struct scsi_cmnd *SCpnt); >> static void sd_uninit_command(struct scsi_cmnd *SCpnt); >> static int sd_done(struct scsi_cmnd *); >> -static int sd_eh_action(struct scsi_cmnd *, int); >> +static int sd_eh_action(struct scsi_cmnd *, int, bool); >> static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); >> static void scsi_disk_release(struct device *cdev); >> static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *); >> @@ -1689,18 +1689,28 @@ static int sd_pr_clear(struct block_device *bdev, >> u64 key) >> * sd_eh_action - error handling callback >> * @scmd: sd-issued command that has failed >> * @eh_disp: The recovery disposition suggested by the midlayer >> + * @reset: Reset medium access counter >> * >> * This function is called by the SCSI midlayer upon completion of an >> * error test command (currently TEST UNIT READY). The result of sending >> * the eh command is passed in eh_disp. We're looking for devices that >> * fail medium access commands but are OK with non access commands like >> * test unit ready (so wrongly see the device as having a successful >> - * recovery) >> + * recovery). >> + * We have to be careful to count a medium access failure only once >> + * per SCSI EH run; there might be several timed out commands which >> + * will cause the 'max_medium_access_timeouts' counter to trigger >> + * after the first SCSI EH run already and set the device to offline. >> **/ >> -static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) >> +static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp, bool reset) >> { >> struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk); >> >> +if (reset) { >> +/* New SCSI EH run, reset gate variable */ >> +sdkp->medium_access_reset = 0; >> +return eh_disp; >> +} >> if (!scsi_device_online(scmd->device) || >> !scsi_medium_access_command(scmd) || >> host_byte(scmd->result) != DID_TIME_OUT || >> @@ -1714,7
Re: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run
Hej Hannes, On Wed, Mar 01, 2017 at 10:15:15AM +0100, Hannes Reinecke wrote: > The current medium access timeout counter will be increased for > each command, so if there are enough failed commands we'll hit > the medium access timeout for even a single failure. > Fix this by making the timeout per EH run, ie the counter will > only be increased once per device and EH run. > > Cc: Ewan Milne> Cc: Lawrence Obermann > Cc: Benjamin Block > Cc: Steffen Maier > Signed-off-by: Hannes Reinecke Steffen already suggested it, It would be nice to have a stable-tag here. This already caused multiple real-world false-positive outages, I think that qualifies for stable :) > --- > drivers/scsi/scsi_error.c | 16 +++- > drivers/scsi/sd.c | 21 + > drivers/scsi/sd.h | 1 + > include/scsi/scsi_driver.h | 2 +- > 4 files changed, 34 insertions(+), 6 deletions(-) > > diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c > index f2cafae..cec439c 100644 > --- a/drivers/scsi/scsi_error.c > +++ b/drivers/scsi/scsi_error.c > @@ -58,6 +58,7 @@ > static int scsi_eh_try_stu(struct scsi_cmnd *scmd); > static int scsi_try_to_abort_cmd(struct scsi_host_template *, >struct scsi_cmnd *); > +static int scsi_eh_reset(struct scsi_cmnd *scmd); > > /* called with shost->host_lock held */ > void scsi_eh_wakeup(struct Scsi_Host *shost) > @@ -249,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag) > if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) > eh_flag &= ~SCSI_EH_CANCEL_CMD; > scmd->eh_eflags |= eh_flag; > + scsi_eh_reset(scmd); > list_add_tail(>eh_entry, >eh_cmd_q); > shost->host_failed++; > scsi_eh_wakeup(shost); > @@ -1107,7 +1109,19 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int > rtn) > if (!blk_rq_is_passthrough(scmd->request)) { > struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); > if (sdrv->eh_action) > - rtn = sdrv->eh_action(scmd, rtn); > + rtn = sdrv->eh_action(scmd, rtn, false); > + } > + return rtn; > +} > + > +static int scsi_eh_reset(struct scsi_cmnd *scmd) > +{ > + int rtn = SUCCESS; > + > + if (!blk_rq_is_passthrough(scmd->request)) { > + struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); > + if (sdrv->eh_action) > + rtn = sdrv->eh_action(scmd, rtn, true); > } > return rtn; > } > diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c > index c7839f6..c794686 100644 > --- a/drivers/scsi/sd.c > +++ b/drivers/scsi/sd.c > @@ -115,7 +115,7 @@ > static int sd_init_command(struct scsi_cmnd *SCpnt); > static void sd_uninit_command(struct scsi_cmnd *SCpnt); > static int sd_done(struct scsi_cmnd *); > -static int sd_eh_action(struct scsi_cmnd *, int); > +static int sd_eh_action(struct scsi_cmnd *, int, bool); > static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); > static void scsi_disk_release(struct device *cdev); > static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *); > @@ -1689,18 +1689,28 @@ static int sd_pr_clear(struct block_device *bdev, u64 > key) > * sd_eh_action - error handling callback > * @scmd: sd-issued command that has failed > * @eh_disp: The recovery disposition suggested by the midlayer > + * @reset: Reset medium access counter > * > * This function is called by the SCSI midlayer upon completion of an > * error test command (currently TEST UNIT READY). The result of sending > * the eh command is passed in eh_disp. We're looking for devices that > * fail medium access commands but are OK with non access commands like > * test unit ready (so wrongly see the device as having a successful > - * recovery) > + * recovery). > + * We have to be careful to count a medium access failure only once > + * per SCSI EH run; there might be several timed out commands which > + * will cause the 'max_medium_access_timeouts' counter to trigger > + * after the first SCSI EH run already and set the device to offline. > **/ > -static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) > +static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp, bool reset) > { > struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk); > > + if (reset) { > + /* New SCSI EH run, reset gate variable */ > + sdkp->medium_access_reset = 0; > + return eh_disp; > + } > if (!scsi_device_online(scmd->device) || > !scsi_medium_access_command(scmd) || > host_byte(scmd->result) != DID_TIME_OUT || > @@ -1714,7 +1724,10 @@ static int sd_eh_action(struct scsi_cmnd *scmd, int > eh_disp) >* process of recovering or has it
Re: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run
On 03/02/2017 12:24 AM, Bart Van Assche wrote: On Wed, 2017-03-01 at 10:15 +0100, Hannes Reinecke wrote: The current medium access timeout counter will be increased for each command, so if there are enough failed commands we'll hit the medium access timeout for even a single failure. This sentence describes multiple failed commands as a single failure. That's confusing to me. Did you perhaps intend "for a single device failure"? diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f2cafae..cec439c 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -58,6 +58,7 @@ static int scsi_eh_try_stu(struct scsi_cmnd *scmd); static int scsi_try_to_abort_cmd(struct scsi_host_template *, struct scsi_cmnd *); +static int scsi_eh_reset(struct scsi_cmnd *scmd); /* called with shost->host_lock held */ void scsi_eh_wakeup(struct Scsi_Host *shost) @@ -249,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag) if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) eh_flag &= ~SCSI_EH_CANCEL_CMD; scmd->eh_eflags |= eh_flag; + scsi_eh_reset(scmd); list_add_tail(>eh_entry, >eh_cmd_q); shost->host_failed++; scsi_eh_wakeup(shost); @@ -1107,7 +1109,19 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn) if (!blk_rq_is_passthrough(scmd->request)) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_action) - rtn = sdrv->eh_action(scmd, rtn); + rtn = sdrv->eh_action(scmd, rtn, false); + } + return rtn; +} + +static int scsi_eh_reset(struct scsi_cmnd *scmd) +{ + int rtn = SUCCESS; + + if (!blk_rq_is_passthrough(scmd->request)) { + struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); + if (sdrv->eh_action) + rtn = sdrv->eh_action(scmd, rtn, true); } return rtn; } Can this function be moved up such that we don't need a new forward declaration? Why, but of course. I just put is here to be next to the original scsi_eh_action() function. @@ -1689,18 +1689,28 @@ static int sd_pr_clear(struct block_device *bdev, u64 key) * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer + * @reset: Reset medium access counter It seems to me that @reset does not trigger a reset of the medium access counter but rather of the variable that prevents the medium access error counter to be incremented? Correct. Will be fixing up the description. + * recovery). + * We have to be careful to count a medium access failure only once + * per SCSI EH run; there might be several timed out commands which Did you perhaps intend "once per device per SCSI EH run"? Yes. --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -106,6 +106,7 @@ struct scsi_disk { unsignedrc_basis: 2; unsignedzoned: 2; unsignedurswrz : 1; + unsignedmedium_access_reset : 1; The name of this new member is confusing to me. How about using the name "ignore_medium_access_errors" instead? And since this is a boolean, how about using true and false in assignments to this variable? Ok, will be doing so. Cheers, Hannes -- Dr. Hannes Reinecke zSeries & Storage h...@suse.de +49 911 74053 688 SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
Re: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run
On Wed, 2017-03-01 at 10:15 +0100, Hannes Reinecke wrote: > The current medium access timeout counter will be increased for > each command, so if there are enough failed commands we'll hit > the medium access timeout for even a single failure. This sentence describes multiple failed commands as a single failure. That's confusing to me. Did you perhaps intend "for a single device failure"? > diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c > index f2cafae..cec439c 100644 > --- a/drivers/scsi/scsi_error.c > +++ b/drivers/scsi/scsi_error.c > @@ -58,6 +58,7 @@ > static int scsi_eh_try_stu(struct scsi_cmnd *scmd); > static int scsi_try_to_abort_cmd(struct scsi_host_template *, >struct scsi_cmnd *); > +static int scsi_eh_reset(struct scsi_cmnd *scmd); > > /* called with shost->host_lock held */ > void scsi_eh_wakeup(struct Scsi_Host *shost) > @@ -249,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag) > if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) > eh_flag &= ~SCSI_EH_CANCEL_CMD; > scmd->eh_eflags |= eh_flag; > + scsi_eh_reset(scmd); > list_add_tail(>eh_entry, >eh_cmd_q); > shost->host_failed++; > scsi_eh_wakeup(shost); > @@ -1107,7 +1109,19 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int > rtn) > if (!blk_rq_is_passthrough(scmd->request)) { > struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); > if (sdrv->eh_action) > - rtn = sdrv->eh_action(scmd, rtn); > + rtn = sdrv->eh_action(scmd, rtn, false); > + } > + return rtn; > +} > + > +static int scsi_eh_reset(struct scsi_cmnd *scmd) > +{ > + int rtn = SUCCESS; > + > + if (!blk_rq_is_passthrough(scmd->request)) { > + struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); > + if (sdrv->eh_action) > + rtn = sdrv->eh_action(scmd, rtn, true); > } > return rtn; > } Can this function be moved up such that we don't need a new forward declaration? > @@ -1689,18 +1689,28 @@ static int sd_pr_clear(struct block_device *bdev, u64 > key) > * sd_eh_action - error handling callback > * @scmd: sd-issued command that has failed > * @eh_disp: The recovery disposition suggested by the midlayer > + * @reset: Reset medium access counter It seems to me that @reset does not trigger a reset of the medium access counter but rather of the variable that prevents the medium access error counter to be incremented? > + * recovery). > + * We have to be careful to count a medium access failure only once > + * per SCSI EH run; there might be several timed out commands which Did you perhaps intend "once per device per SCSI EH run"? > --- a/drivers/scsi/sd.h > +++ b/drivers/scsi/sd.h > @@ -106,6 +106,7 @@ struct scsi_disk { > unsignedrc_basis: 2; > unsignedzoned: 2; > unsignedurswrz : 1; > + unsignedmedium_access_reset : 1; The name of this new member is confusing to me. How about using the name "ignore_medium_access_errors" instead? And since this is a boolean, how about using true and false in assignments to this variable? Thanks, Bart.
Re: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run
Hannes, thanks for posting this. I just found time to review the description so far: Subject: [PATCHv3 1/6] scsi_error: count medium access timeout only once per EH run "... to prevent false positives" Might be nice for understanding but also makes the short log quite long. On 03/01/2017 10:15 AM, Hannes Reinecke wrote: The current medium access timeout counter will be increased for each command, so if there are enough failed commands we'll hit "_Enough_ failed commands" sounds a bit vague to me, especially since we seem to know exactly, see below. the medium access timeout for even a single failure. Fix this by making the timeout per EH run, ie the counter will only be increased once per device and EH run. I miss a description of the user-visible consequences of the bug we fix, so any user can understand when they need this. Just an example [FYI: users have been misinterpreting "medium access timeouts" for a time(out) duration value rather than an event count which it actually is]: " Otherwise, any abort due to SCSI command timeout on a SCSI device, having at least a *number* of max_medium_access_timeouts [scsi_disk sysfs attribute, defaults to 2] SCSI commands pending, causes a false positive which erroneously sets the SCSI device offline with the following error kernel message: sd H:C:T:L: [sdXY] Medium access timeout failure. Offlining disk! " Cc: Ewan MilneCc: Lawrence Obermann Cc: Benjamin Block Cc: Steffen Maier I would prefer . Signed-off-by: Hannes Reinecke Martin introduced this feature (18a4d0a22ed6c54b67af7718c305cd010f09ddf8), James did a fix (2451079bc2ae1334058be8babd44be03ecfa7041), and David did the most recent fix (2a863ba8f6f5d72e4905a91c6281d575809fed5b) Did Martin's original commit work without these false positives? If so, some of the other commits might have caused a regression, maybe 2451079bc2ae1334058be8babd44be03ecfa7041? If so, we would need a Fixes: tag. In general I would like to see this tagged for stable due to the already evident impact. --- drivers/scsi/scsi_error.c | 16 +++- drivers/scsi/sd.c | 21 + drivers/scsi/sd.h | 1 + include/scsi/scsi_driver.h | 2 +- 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c index f2cafae..cec439c 100644 --- a/drivers/scsi/scsi_error.c +++ b/drivers/scsi/scsi_error.c @@ -58,6 +58,7 @@ static int scsi_eh_try_stu(struct scsi_cmnd *scmd); static int scsi_try_to_abort_cmd(struct scsi_host_template *, struct scsi_cmnd *); +static int scsi_eh_reset(struct scsi_cmnd *scmd); /* called with shost->host_lock held */ void scsi_eh_wakeup(struct Scsi_Host *shost) @@ -249,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag) if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) eh_flag &= ~SCSI_EH_CANCEL_CMD; scmd->eh_eflags |= eh_flag; + scsi_eh_reset(scmd); list_add_tail(>eh_entry, >eh_cmd_q); shost->host_failed++; scsi_eh_wakeup(shost); @@ -1107,7 +1109,19 @@ static int scsi_eh_action(struct scsi_cmnd *scmd, int rtn) if (!blk_rq_is_passthrough(scmd->request)) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_action) - rtn = sdrv->eh_action(scmd, rtn); + rtn = sdrv->eh_action(scmd, rtn, false); + } + return rtn; +} + +static int scsi_eh_reset(struct scsi_cmnd *scmd) +{ + int rtn = SUCCESS; + + if (!blk_rq_is_passthrough(scmd->request)) { + struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); + if (sdrv->eh_action) + rtn = sdrv->eh_action(scmd, rtn, true); } return rtn; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index c7839f6..c794686 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -115,7 +115,7 @@ static int sd_init_command(struct scsi_cmnd *SCpnt); static void sd_uninit_command(struct scsi_cmnd *SCpnt); static int sd_done(struct scsi_cmnd *); -static int sd_eh_action(struct scsi_cmnd *, int); +static int sd_eh_action(struct scsi_cmnd *, int, bool); static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); static void scsi_disk_release(struct device *cdev); static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *); @@ -1689,18 +1689,28 @@ static int sd_pr_clear(struct block_device *bdev, u64 key) * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer + * @reset: Reset medium access counter * * This function is called by the SCSI midlayer upon