Amusingly, this is actually the theory that I had about this. I thought i had shared it with you but I may well misremember. Had no time to hack it up though.
I concur though. Sounds very plausible. On Tue Feb 24 2015 at 7:20:55 AM Pedro Martelletto <[email protected]> wrote: > Here are some thoughts on the interaction between softraid(4) and WAPBL, > and how to improve the current situation. All of the code references in > this mail concern the master branch with HEAD = 3ca4b444. > > WAPBL saw the introduction of the DIOCCACHESYNC ioctl, whose purpose, as > the name implies, is to flush an I/O controller's cache. > > WAPBL periodically issues the DIOCCACHESYNC ioctl to ensure that journal > metadata is safely on disk. This use of DIOCCACHESYNC happens at the > file system level, which means that a disk device handling multiple > partitions, each a fully independent file system, may be requested to > perform multiple and possibly concurrent DIOCCACHESYNC operations. > > For softraid(4), DIOCCACHESYNC gets translated to sr_raid_sync(). What > sr_raid_sync() implements is not quite a sync operation per se, but > rather a drain: it loops until all pending I/O operations have finished. > This rationale assumes that no further I/O requests can get posted to > softraid(4) after a sync has started, which is the case of the scenarios > for which sr_raid_sync() was originally envisaged (look at the code > paths that lead to sd_flush() other than DIOCCACHESYNC: activate, > deactive, close). > > However, that assumption does not hold true for a softraid(4) disk > backing multiple WAPBL file systems. In this situation, new I/O requests > may and very likely will be posted to softraid(4) when a sync is in > progress. This may have the effect of slowing down the sync operation, > as sr_raid_sync() loops waiting for the number of outstanding I/O > operations to reach zero, ultimately leading to a timeout after 15 > seconds. This worst-case scenario will certainly happen should one of > these parallel operations be another sync, as one sync will cause the > other to timeout. > > Oga, this is probably the slowdown you are seeing. I'm sorry that it > took so long for the problem to be diagnosed. > > Below is nothing but a sketch of a fix. I plan to work on it as I find > the time. > > -p. > > diff --git sys/dev/softraid.c sys/dev/softraid.c > index a9e9ff5..4a2657c 100644 > --- sys/dev/softraid.c > +++ sys/dev/softraid.c > @@ -2121,6 +2121,8 @@ sr_wu_free(struct sr_discipline *sd) > > DNPRINTF(SR_D_WU, "%s: sr_wu_free %p\n", DEVNAME(sd->sd_sc), sd); > > + /* XXX mutex? */ > + > while ((wu = TAILQ_FIRST(&sd->sd_wu_freeq)) != NULL) > TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link); > while ((wu = TAILQ_FIRST(&sd->sd_wu_pendq)) != NULL) > @@ -2144,6 +2146,7 @@ sr_wu_get(void *xsd) > wu = TAILQ_FIRST(&sd->sd_wu_freeq); > if (wu) { > TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link); > + TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link); > sd->sd_wu_pending++; > } > mtx_leave(&sd->sd_wu_mtx); > @@ -2266,6 +2269,8 @@ sr_wu_done_callback(void *xwu) > goto done; > } > > + /* XXX mutex? */ > + > /* Remove work unit from pending queue. */ > TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) > if (wup == wu) > @@ -4078,6 +4083,7 @@ sr_raid_start_stop(struct sr_workunit *wu) > int > sr_raid_sync(struct sr_workunit *wu) > { > + struct sr_workunit *wup; > struct sr_discipline *sd = wu->swu_dis; > int ios = 0, rv = 0, retries = 15; > > @@ -4099,35 +4105,36 @@ sr_raid_sync(struct sr_workunit *wu) > > /* diagnostic */ > if (sd->sd_sync != 0) { > - printf("sd->sd_sync != 0, called reentrant\n"); > + /* wait for ongoing syncs to finish */ > } > > sd->sd_sync = 1; > - for (;;) { > - if (sd->sd_wu_pending <= ios) { > - break; > - } > > - /* wait a bit for io to drain */ > - if (msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz) == > - EWOULDBLOCK) { > - if (retries-- > 0) { > - continue; > - } > + TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { > + KASSERT((wup->swu_flags & SR_WUF_SEEN) == 0); > + wup->swu_flags |= SR_WUF_SEEN; > + } > > - /* print timeout for now */ > - printf("%s: sr_raid_sync timeout\n", > - DEVNAME(sd->sd_sc)); > + while (sd->sd_wu_pending > ios && retries-- > 0) { > + msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz); > + TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) > + if (wup->swu_flags & SR_WUF_SEEN) > + continue; > + rv = 0; > + break; > + } > > - DNPRINTF(SR_D_DIS, "%s: sr_raid_sync timeout\n", > - DEVNAME(sd->sd_sc)); > - rv = 1; > - break; > - } > + if (rv != 0) > + TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) { > + wup->swu_flags &= ~SR_WUF_SEEN; > } > - sd->sd_sync = 0; > + > mtx_leave(&sd->sd_wu_mtx); > > + sd->sd_sync = 0; > + > + /* XXX should issue DIOCCACHESYNC on the underlying disk(s)! */ > + > wakeup(&sd->sd_sync); > > return (rv); > @@ -4183,6 +4190,8 @@ sr_schedule_wu(struct sr_workunit *wu) > panic("sr_schedule_wu: work unit not in progress (state > %i)\n", > wu->swu_state); > > + /* XXX mutex? */ > + > /* Walk queue backwards and fill in collider if we have one. */ > TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link) > { > if (wu->swu_blk_end < wup->swu_blk_start || > @@ -4223,9 +4232,6 @@ sr_raid_startwu(struct sr_workunit *wu) > wu->swu_state = SR_WU_INPROGRESS; > } > > - if (wu->swu_state != SR_WU_RESTART) > - TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link); > - > /* Start all of the individual I/Os. */ > if (wu->swu_cb_active == 1) > panic("%s: sr_startwu_callback", DEVNAME(sd->sd_sc)); > diff --git sys/dev/softraidvar.h sys/dev/softraidvar.h > index 462a4f2..d386fa0 100644 > --- sys/dev/softraidvar.h > +++ sys/dev/softraidvar.h > @@ -387,6 +387,7 @@ struct sr_workunit { > #define SR_WUF_WAKEUP (1<<4) /* Wakeup on I/O > completion. */ > #define SR_WUF_DISCIPLINE (1<<5) /* Discipline specific > I/O. */ > #define SR_WUF_FAKE (1<<6) /* Faked workunit. */ > +#define SR_WUF_SEEN (1<<7) /* Visited during sync. */ > > /* workunit io range */ > daddr_t swu_blk_start; > >
