On Mon, Mar 23, 2026 at 10:33:12AM +0000, John Garry wrote:
> On 23/03/2026 00:08, Benjamin Marzinski wrote:
> > > k += off, desc += off) {
> > > - u16 group_id = get_unaligned_be16(&desc[2]);
> > > -
> > > - spin_lock_irqsave(&port_group_lock, flags);
> > > - tmp_pg = alua_find_get_pg(pg->device_id_str, pg->device_id_len,
> > > - group_id);
> > > - spin_unlock_irqrestore(&port_group_lock, flags);
> > > - if (tmp_pg) {
> > > - if (spin_trylock_irqsave(&tmp_pg->lock, flags)) {
> > > - if ((tmp_pg == pg) ||
> > > - !(tmp_pg->flags & ALUA_PG_RUNNING)) {
> > > - struct alua_dh_data *h;
> > > -
> > > - tmp_pg->state = desc[0] & 0x0f;
> > > - tmp_pg->pref = desc[0] >> 7;
> > > - rcu_read_lock();
> > > - list_for_each_entry_rcu(h,
> > > - &tmp_pg->dh_list, node) {
> > > - if (!h->sdev)
> > > - continue;
> > > - h->sdev->access_state = desc[0];
> > > - }
> > > - rcu_read_unlock();
> > > - }
> > > - if (tmp_pg == pg)
> > > - tmp_pg->valid_states = desc[1];
> > > - spin_unlock_irqrestore(&tmp_pg->lock, flags);
> > > - }
> > > - kref_put(&tmp_pg->kref, release_port_group);
> > > + u16 group_id_desc = get_unaligned_be16(&desc[2]);
> > > +
> > > + spin_lock_irqsave(&h->lock, flags);
> > > + if (group_id_desc == group_id) {
> > > + h->group_id = group_id;
> > > + WRITE_ONCE(h->state, desc[0] & 0x0f);
> > > + h->pref = desc[0] >> 7;
> > > + WRITE_ONCE(sdev->access_state, desc[0]);
> > > + h->valid_states = desc[1];
> > instead of alua_rtpg() updating the access_state all of the devices in
> > all the port groups, and the state and pref of all the port groups. It
> > now just sets these for one device. It seems like it's wasting a lot of
> > information that it used to use. For instance, now when a scsi command
> > returns a unit attention that the ALUA state has changed, it won't get
> > updated on all the devices, just the one that got the unit attention.
>
> The fabric should then trigger this PG info update be re-scanned
> per-path/sdev (and not just a single sdev in the PG). From testing with a
> linux target, this is what happens - a UA is triggered per path when I
> changed the PG access state.
>
> >
> > > }
> > > + spin_unlock_irqrestore(&h->lock, flags);
> > > off = 8 + (desc[7] * 4);
> > > }
> > > skip_rtpg:
> > > - spin_lock_irqsave(&pg->lock, flags);
> > > + spin_lock_irqsave(&h->lock, flags);
> > > if (transitioning_sense)
> > > - pg->state = SCSI_ACCESS_STATE_TRANSITIONING;
> > > + h->state = SCSI_ACCESS_STATE_TRANSITIONING;
>
> ...
>
> > > -
> > > static void alua_rtpg_work(struct work_struct *work)
> > > {
> > > - struct alua_port_group *pg =
> > > - container_of(work, struct alua_port_group, rtpg_work.work);
> > > - struct scsi_device *sdev, *prev_sdev = NULL;
> > > + struct alua_dh_data *h =
> > > + container_of(work, struct alua_dh_data, rtpg_work.work);
> > > + struct scsi_device *sdev = h->sdev;
> > > LIST_HEAD(qdata_list);
> > > int err = SCSI_DH_OK;
> > > struct alua_queue_data *qdata, *tmp;
> > > - struct alua_dh_data *h;
> > > unsigned long flags;
> > > - spin_lock_irqsave(&pg->lock, flags);
> > > - sdev = pg->rtpg_sdev;
> > > - if (!sdev) {
> > > - WARN_ON(pg->flags & ALUA_PG_RUN_RTPG);
> > > - WARN_ON(pg->flags & ALUA_PG_RUN_STPG);
> > > - spin_unlock_irqrestore(&pg->lock, flags);
> > > - kref_put(&pg->kref, release_port_group);
> > > - return;
> > > - }
> > > - pg->flags |= ALUA_PG_RUNNING;
> > > - if (pg->flags & ALUA_PG_RUN_RTPG) {
> > > - int state = pg->state;
> > > + spin_lock_irqsave(&h->lock, flags);
> > > + h->flags |= ALUA_PG_RUNNING;
> > > + if (h->flags & ALUA_PG_RUN_RTPG) {
> > > + int state = h->state;
> > > - pg->flags &= ~ALUA_PG_RUN_RTPG;
> > > - spin_unlock_irqrestore(&pg->lock, flags);
> > > + h->flags &= ~ALUA_PG_RUN_RTPG;
> > > + spin_unlock_irqrestore(&h->lock, flags);
> > > if (state == SCSI_ACCESS_STATE_TRANSITIONING) {
> > > if (alua_tur(sdev) == SCSI_DH_RETRY) {
> > > - spin_lock_irqsave(&pg->lock, flags);
> > > - pg->flags &= ~ALUA_PG_RUNNING;
> > > - pg->flags |= ALUA_PG_RUN_RTPG;
> > > - if (!pg->interval)
> > > - pg->interval = ALUA_RTPG_RETRY_DELAY;
> > > - spin_unlock_irqrestore(&pg->lock, flags);
> > > - queue_delayed_work(kaluad_wq, &pg->rtpg_work,
> > > - pg->interval * HZ);
> > > + spin_lock_irqsave(&h->lock, flags);
> > > + h->flags &= ~ALUA_PG_RUNNING;
> > > + h->flags |= ALUA_PG_RUN_RTPG;
> > > + if (!h->interval)
> > > + h->interval = ALUA_RTPG_RETRY_DELAY;
> > > + spin_unlock_irqrestore(&h->lock, flags);
> > > + queue_delayed_work(kaluad_wq, &h->rtpg_work,
> > > + h->interval * HZ);
> > > return;
> > > }
> > > /* Send RTPG on failure or if TUR indicates
> > > SUCCESS */
> > > }
> > > - err = alua_rtpg(sdev, pg);
> > > - spin_lock_irqsave(&pg->lock, flags);
> > > + err = alua_rtpg(sdev);
> > > + spin_lock_irqsave(&h->lock, flags);
> > > - /* If RTPG failed on the current device, try using another */
> > > - if (err == SCSI_DH_RES_TEMP_UNAVAIL &&
> > > - (prev_sdev = alua_rtpg_select_sdev(pg)))
> > > - err = SCSI_DH_IMM_RETRY;
> > Previously, if the rtpg failed on a device, another device would be
> > tried, and the unusable device's alua state would get updated, along
> > with all the other device's states.
>
> Where specifically are you referring to here please?
The removed code above here calls alua_rtpg_select_sdev() to select a
new device to retry the rtpg on, and returns with SCSI_DH_IMM_RETRY, to
retrigger the rtpg on that device. If the rtpg completed on any device,
it would update the state on all the devices. But if we are depending
each device issuing its own rtp to update its state, what happens to
the devices that can't complete the rtpg? I assume the correct answer is
to give them some failed state.
-Ben
> > Now I don't see how a failed device
> > gets its state updated.
>
> AFAICS, I am only not omitted how we iterate through the devices per-PG, as
> now we just do this work for all paths/scsi devices.
>
> Thanks,
> John