Amusingly, this is actually the theory that I had about this. I thought i
had shared it with you but I may well misremember. Had no time to hack it
up though.

I concur though. Sounds very plausible.

On Tue Feb 24 2015 at 7:20:55 AM Pedro Martelletto <[email protected]>
wrote:

> Here are some thoughts on the interaction between softraid(4) and WAPBL,
> and how to improve the current situation. All of the code references in
> this mail concern the master branch with HEAD = 3ca4b444.
>
> WAPBL saw the introduction of the DIOCCACHESYNC ioctl, whose purpose, as
> the name implies, is to flush an I/O controller's cache.
>
> WAPBL periodically issues the DIOCCACHESYNC ioctl to ensure that journal
> metadata is safely on disk. This use of DIOCCACHESYNC happens at the
> file system level, which means that a disk device handling multiple
> partitions, each a fully independent file system, may be requested to
> perform multiple and possibly concurrent DIOCCACHESYNC operations.
>
> For softraid(4), DIOCCACHESYNC gets translated to sr_raid_sync(). What
> sr_raid_sync() implements is not quite a sync operation per se, but
> rather a drain: it loops until all pending I/O operations have finished.
> This rationale assumes that no further I/O requests can get posted to
> softraid(4) after a sync has started, which is the case of the scenarios
> for which sr_raid_sync() was originally envisaged (look at the code
> paths that lead to sd_flush() other than DIOCCACHESYNC: activate,
> deactive, close).
>
> However, that assumption does not hold true for a softraid(4) disk
> backing multiple WAPBL file systems. In this situation, new I/O requests
> may and very likely will be posted to softraid(4) when a sync is in
> progress. This may have the effect of slowing down the sync operation,
> as sr_raid_sync() loops waiting for the number of outstanding I/O
> operations to reach zero, ultimately leading to a timeout after 15
> seconds. This worst-case scenario will certainly happen should one of
> these parallel operations be another sync, as one sync will cause the
> other to timeout.
>
> Oga, this is probably the slowdown you are seeing. I'm sorry that it
> took so long for the problem to be diagnosed.
>
> Below is nothing but a sketch of a fix. I plan to work on it as I find
> the time.
>
> -p.
>
> diff --git sys/dev/softraid.c sys/dev/softraid.c
> index a9e9ff5..4a2657c 100644
> --- sys/dev/softraid.c
> +++ sys/dev/softraid.c
> @@ -2121,6 +2121,8 @@ sr_wu_free(struct sr_discipline *sd)
>
>         DNPRINTF(SR_D_WU, "%s: sr_wu_free %p\n", DEVNAME(sd->sd_sc), sd);
>
> +       /* XXX mutex? */
> +
>         while ((wu = TAILQ_FIRST(&sd->sd_wu_freeq)) != NULL)
>                 TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link);
>         while ((wu = TAILQ_FIRST(&sd->sd_wu_pendq)) != NULL)
> @@ -2144,6 +2146,7 @@ sr_wu_get(void *xsd)
>         wu = TAILQ_FIRST(&sd->sd_wu_freeq);
>         if (wu) {
>                 TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link);
> +               TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
>                 sd->sd_wu_pending++;
>         }
>         mtx_leave(&sd->sd_wu_mtx);
> @@ -2266,6 +2269,8 @@ sr_wu_done_callback(void *xwu)
>                         goto done;
>         }
>
> +       /* XXX mutex? */
> +
>         /* Remove work unit from pending queue. */
>         TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link)
>                 if (wup == wu)
> @@ -4078,6 +4083,7 @@ sr_raid_start_stop(struct sr_workunit *wu)
>  int
>  sr_raid_sync(struct sr_workunit *wu)
>  {
> +       struct sr_workunit      *wup;
>         struct sr_discipline    *sd = wu->swu_dis;
>         int                     ios = 0, rv = 0, retries = 15;
>
> @@ -4099,35 +4105,36 @@ sr_raid_sync(struct sr_workunit *wu)
>
>         /* diagnostic */
>         if (sd->sd_sync != 0) {
> -               printf("sd->sd_sync != 0, called reentrant\n");
> +               /* wait for ongoing syncs to finish */
>         }
>
>         sd->sd_sync = 1;
> -       for (;;) {
> -               if (sd->sd_wu_pending <= ios) {
> -                       break;
> -               }
>
> -               /* wait a bit for io to drain */
> -               if (msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz) ==
> -                   EWOULDBLOCK) {
> -                       if (retries-- > 0) {
> -                               continue;
> -                       }
> +       TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
> +               KASSERT((wup->swu_flags & SR_WUF_SEEN) == 0);
> +               wup->swu_flags |= SR_WUF_SEEN;
> +       }
>
> -                       /* print timeout for now */
> -                       printf("%s: sr_raid_sync timeout\n",
> -                           DEVNAME(sd->sd_sc));
> +       while (sd->sd_wu_pending > ios && retries-- > 0) {
> +               msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz);
> +               TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link)
> +                       if (wup->swu_flags & SR_WUF_SEEN)
> +                               continue;
> +               rv = 0;
> +               break;
> +       }
>
> -                       DNPRINTF(SR_D_DIS, "%s: sr_raid_sync timeout\n",
> -                           DEVNAME(sd->sd_sc));
> -                       rv = 1;
> -                       break;
> -               }
> +       if (rv != 0)
> +               TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
> +                       wup->swu_flags &= ~SR_WUF_SEEN;
>         }
> -       sd->sd_sync = 0;
> +
>         mtx_leave(&sd->sd_wu_mtx);
>
> +       sd->sd_sync = 0;
> +
> +       /* XXX should issue DIOCCACHESYNC on the underlying disk(s)! */
> +
>         wakeup(&sd->sd_sync);
>
>         return (rv);
> @@ -4183,6 +4190,8 @@ sr_schedule_wu(struct sr_workunit *wu)
>                 panic("sr_schedule_wu: work unit not in progress (state
> %i)\n",
>                     wu->swu_state);
>
> +       /* XXX mutex? */
> +
>         /* Walk queue backwards and fill in collider if we have one. */
>         TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link)
> {
>                 if (wu->swu_blk_end < wup->swu_blk_start ||
> @@ -4223,9 +4232,6 @@ sr_raid_startwu(struct sr_workunit *wu)
>                 wu->swu_state = SR_WU_INPROGRESS;
>         }
>
> -       if (wu->swu_state != SR_WU_RESTART)
> -               TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
> -
>         /* Start all of the individual I/Os. */
>         if (wu->swu_cb_active == 1)
>                 panic("%s: sr_startwu_callback", DEVNAME(sd->sd_sc));
> diff --git sys/dev/softraidvar.h sys/dev/softraidvar.h
> index 462a4f2..d386fa0 100644
> --- sys/dev/softraidvar.h
> +++ sys/dev/softraidvar.h
> @@ -387,6 +387,7 @@ struct sr_workunit {
>  #define SR_WUF_WAKEUP          (1<<4)          /* Wakeup on I/O
> completion. */
>  #define SR_WUF_DISCIPLINE      (1<<5)          /* Discipline specific
> I/O. */
>  #define SR_WUF_FAKE            (1<<6)          /* Faked workunit. */
> +#define SR_WUF_SEEN            (1<<7)          /* Visited during sync. */
>
>         /* workunit io range */
>         daddr_t                 swu_blk_start;
>
>

Reply via email to