> On Dec 27, 2018, at 6:04 AM, Jason Thorpe <thor...@me.com> wrote:
> 
> -- job_hold remains lockless, but job_rele can only be called **without** the 
> job_lock held, because it needs to acquire the lock in the unlikely case it 
> performs a cv_broadcast (see below).  Also need a job_rele_locked.

Correction -- all cases of job_rele can be called with the job_lock held.  (I 
misread the block of code in the overseer where the job_lock is dropped 
immediately before calling job_rele.)

Index: kern_threadpool.c
===================================================================
RCS file: /cvsroot/src/sys/kern/kern_threadpool.c,v
retrieving revision 1.12
diff -u -p -r1.12 kern_threadpool.c
--- kern_threadpool.c   27 Dec 2018 04:45:29 -0000      1.12
+++ kern_threadpool.c   27 Dec 2018 14:37:46 -0000
@@ -134,7 +134,7 @@ static void threadpool_percpu_destroy(st
 
 static threadpool_job_fn_t threadpool_job_dead;
 
-static int     threadpool_job_hold(struct threadpool_job *);
+static void    threadpool_job_hold(struct threadpool_job *);
 static void    threadpool_job_rele(struct threadpool_job *);
 
 static void    threadpool_overseer_thread(void *) __dead;
@@ -650,19 +650,16 @@ threadpool_job_destroy(struct threadpool
        (void)strlcpy(job->job_name, "deadjob", sizeof(job->job_name));
 }
 
-static int
+static void
 threadpool_job_hold(struct threadpool_job *job)
 {
        unsigned int refcnt;
 
        do {
                refcnt = job->job_refcnt;
-               if (refcnt == UINT_MAX)
-                       return EBUSY;
+               KASSERT(refcnt != UINT_MAX);
        } while (atomic_cas_uint(&job->job_refcnt, refcnt, (refcnt + 1))
            != refcnt);
-
-       return 0;
 }
 
 static void
@@ -670,16 +667,16 @@ threadpool_job_rele(struct threadpool_jo
 {
        unsigned int refcnt;
 
+       KASSERT(mutex_owned(job->job_lock));
+
        do {
                refcnt = job->job_refcnt;
                KASSERT(0 < refcnt);
                if (refcnt == 1) {
-                       mutex_enter(job->job_lock);
                        refcnt = atomic_dec_uint_nv(&job->job_refcnt);
                        KASSERT(refcnt != UINT_MAX);
                        if (refcnt == 0)
                                cv_broadcast(&job->job_cv);
-                       mutex_exit(job->job_lock);
                        return;
                }
        } while (atomic_cas_uint(&job->job_refcnt, refcnt, (refcnt - 1))
@@ -703,6 +700,16 @@ threadpool_job_done(struct threadpool_jo
        curlwp->l_name = job->job_thread->tpt_lwp_savedname;
        lwp_unlock(curlwp);
 
+       /*
+        * Inline the work of threadpool_job_rele(); the job is already
+        * locked, the most likely scenario (XXXJRT only scenario?) is
+        * that we're dropping the last reference (the one taken in
+        * threadpool_schedule_job()), and we always do the cv_broadcast()
+        * anyway.
+        */
+       KASSERT(0 < job->job_refcnt);
+       unsigned int refcnt __diagused = atomic_dec_uint_nv(&job->job_refcnt);
+       KASSERT(refcnt != UINT_MAX);
        cv_broadcast(&job->job_cv);
        job->job_thread = NULL;
 }
@@ -725,6 +732,8 @@ threadpool_schedule_job(struct threadpoo
                return;
        }
 
+       threadpool_job_hold(job);
+
        /* Otherwise, try to assign a thread to the job.  */
        mutex_spin_enter(&pool->tp_lock);
        if (__predict_false(TAILQ_EMPTY(&pool->tp_idle_threads))) {
@@ -740,7 +749,6 @@ threadpool_schedule_job(struct threadpoo
                    __func__, job->job_name, job->job_thread));
                TAILQ_REMOVE(&pool->tp_idle_threads, job->job_thread,
                    tpt_entry);
-               threadpool_job_hold(job);
                job->job_thread->tpt_job = job;
        }
 
@@ -786,6 +794,7 @@ threadpool_cancel_job_async(struct threa
                mutex_spin_enter(&pool->tp_lock);
                TAILQ_REMOVE(&pool->tp_jobs, job, job_entry);
                mutex_spin_exit(&pool->tp_lock);
+               threadpool_job_rele(job);
                return true;
        } else {
                /* Too late -- already running.  */
@@ -889,15 +898,13 @@ threadpool_overseer_thread(void *arg)
                }
 
                /* There are idle threads, so try giving one a job.  */
-               bool rele_job = true;
                struct threadpool_job *const job = TAILQ_FIRST(&pool->tp_jobs);
                TAILQ_REMOVE(&pool->tp_jobs, job, job_entry);
-               error = threadpool_job_hold(job);
-               if (error) {
-                       TAILQ_INSERT_HEAD(&pool->tp_jobs, job, job_entry);
-                       (void)kpause("pooljob", false, hz, &pool->tp_lock);
-                       continue;
-               }
+               /*
+                * Take an extra reference on the job temporarily so that
+                * it won't disappear on us while we have both locks dropped.
+                */
+               threadpool_job_hold(job);
                mutex_spin_exit(&pool->tp_lock);
 
                mutex_enter(job->job_lock);
@@ -930,14 +937,11 @@ threadpool_overseer_thread(void *arg)
                                thread->tpt_job = job;
                                job->job_thread = thread;
                                cv_broadcast(&thread->tpt_cv);
-                               /* Gave the thread our job reference.  */
-                               rele_job = false;
                        }
                        mutex_spin_exit(&pool->tp_lock);
                }
+               threadpool_job_rele(job);
                mutex_exit(job->job_lock);
-               if (__predict_false(rele_job))
-                       threadpool_job_rele(job);
 
                mutex_spin_enter(&pool->tp_lock);
        }
@@ -1007,8 +1011,11 @@ threadpool_thread(void *arg)
                KASSERTMSG((curlwp->l_name == lwp_name),
                    "someone forgot to call threadpool_job_done()!");
 
-               /* Job is done and its name is unreferenced.  Release it.  */
-               threadpool_job_rele(job);
+               /*
+                * We can compare pointers, but we can no longer dereference
+                * job after this because threadpool_job_done() drops the
+                * last reference on the job while the job is locked.
+                */
 
                mutex_spin_enter(&pool->tp_lock);
                KASSERT(thread->tpt_job == job);


-- thorpej

Reply via email to