Attached a patch against 4.13

master still compiling

On Nov 30 2018, megane wrote:

Hi,

Here's another version that crashes quickly with "very high
probability".

(cond-expand
(chicken-5 (import (chicken base))
           (import (chicken time))
           (import srfi-18))
(else (import chicken)
      (use srfi-18)))

(define m (make-mutex))

(print "@@ " (current-thread) " " "lock")
(mutex-lock! m)

(define t (current-milliseconds))
(define (get-tosleep)
 (/ (floor (* 1000 (- (+ t .030) (current-milliseconds)))) 1000))

(thread-start!
(make-thread (lambda ()
               ;; (thread-sleep! .01)
               (print "@@ " (current-thread) " " "lock")
               (let lp ()
                 (when (not (mutex-lock! m (get-tosleep)))
                   (thread-yield!)
                   (lp)))
               (print "@@ " (current-thread) " " "unlock")
               (mutex-unlock! m))))
(print "@@ " (current-thread) " " "sleep")
(thread-sleep! (get-tosleep))
(print "@@ " (current-thread) " " "unlock")
(mutex-unlock! m)
(thread-yield!)
(thread-sleep! .01)
(print "All ok!!")

--- typical output of a failing execution:

$ stdbuf -oL -eL ./t |& cat -n
    1   @@ #<thread: primordial> lock
    2   #<thread: primordial>: locking #<mutex>
    3   @@ #<thread: primordial> sleep
    4   #<thread: primordial> blocks for timeout 933.0
5 ==================== scheduling, current: #<thread: primordial>, ready: (#<thread: thread1>)
    6   timeout: #<thread: primordial> -> 933.0 (now: 904)
    7   switching to #<thread: thread1>
    8   @@ #<thread: thread1> lock
    9   #<thread: thread1>: locking #<mutex>
   10   #<thread: thread1> blocks for timeout 933.0
   11   #<thread: thread1> sleeping on mutex mutex0
12 ==================== scheduling, current: #<thread: thread1>, ready: ()
   13   timeout: #<thread: primordial> -> 933.0 (now: 904)
   14   timeout: #<thread: primordial> -> 933.0 (now: 934)
   15   timeout expired for #<thread: primordial>
   16   unblocking: #<thread: primordial>
   17   timeout: #<thread: thread1> -> 933.0 (now: 934)
   18   timeout expired for #<thread: thread1>
   19   unblocking: #<thread: thread1>
   20   switching to #<thread: primordial>
   21   @@ #<thread: primordial> unlock
   22   #<thread: primordial>: unlocking mutex0
   23
   24   Error: (mutex-unlock) Internal scheduler error: unknown thread state
   25   #<thread: thread1>
   26   ready
   27
   28           Call history:
   29
   30           t.scm:27: chicken.base#print
   31           t.scm:28: get-tosleep
   32           t.scm:15: chicken.time#current-milliseconds
   33           t.scm:15: scheme#floor
   34           t.scm:15: scheme#/
   35           t.scm:28: srfi-18#thread-sleep!
   36           t.scm:29: srfi-18#current-thread
   37           t.scm:29: chicken.base#print
   38           t.scm:30: srfi-18#mutex-unlock!         <--

(There's an extra debug message on line 15.
Add (dbg "timeout expired for " tto) in this true branch:

(if (>= now tmo1) ; timeout reached?

in ##sys#schedule)

--- The issue
mutex-unlock! makes the decision that a thread freed from
the mutex's waiting list cannot be in the 'ready state.

From the output above you see a case how a thread waiting on a mutex
can end up being in the 'ready state.

line  2: The mutex is locked by primordial thread (pt)
line  4: The pt goes to sleep until 933.0
line  7: As the pt goes to sleep thread1 is scheduled to run
line 10: thread1 tries to lock the mutex, but sets a timeout that
        happens to be at time 933.0

lines 12-14: Both threads asleep, time advances to 934
lines 15-16: pt gets put on the ready list
lines 17-19: thread1 gets put on the ready list
line 20: pt starts running
lines 21-22: pt executes mutex-unlock! while thread1 is ready to run

--- A fix

Just allow the 'ready state for threads in mutex-unlock!

In the patch I arbitrarily call ##sys#schedule after removing a thread
from the list, but I think doing nothing would work equally well.

Is this a correct fix?
Sorry, I can't help with that one..

Maybe it's possible there's threads on the waiting list, but the thread
that gets removed is not going to lock the mutex:

There are 3 threads in this scenario, A, B and C.

* A locks mutex
* A sleeps until t
* B tries to lock mutex until t
* C tries to lock mutex
* A and B are woken up at t
* A unlocks mutex, frees B
* B is scheduled to run as per the patch
* B finds out about the timeout, gives up and starts doing something else
* Now thread C is waiting on the mutex but no-one is going to free it!


From b6837b2c94feb5f8348965f538b5a45bf01a7506 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20F=2E=20Wittenberger?=
 <joerg.wittenber...@softeyes.net>
Date: Mon, 3 Dec 2018 21:06:26 +0100
Subject: [PATCH] Fix 1564 internal scheduler error.

---
 scheduler.scm | 80 ++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 41 insertions(+), 39 deletions(-)

diff --git a/scheduler.scm b/scheduler.scm
index 0b292f7f..a1a03293 100644
--- a/scheduler.scm
+++ b/scheduler.scm
@@ -34,7 +34,7 @@
 	;; This isn't hidden ATM to allow set!ing it as a hook/workaround
 	; ##sys#force-primordial
 	fdset-set fdset-test create-fdset stderr
-	##sys#clear-i/o-state-for-thread! ##sys#abandon-mutexes) 
+	##sys#thread-clear-blocking-state! ##sys#abandon-mutexes)
   (not inline ##sys#interrupt-hook ##sys#force-primordial)
   (unsafe)
   (foreign-declare #<<EOF
@@ -185,7 +185,7 @@ EOF
 		      (if (fp>= now tmo1) ; timeout reached?
 			  (begin
 			    (##sys#setislot tto 13 #t) ; mark as being unblocked by timeout
-			    (##sys#clear-i/o-state-for-thread! tto)
+			    (##sys#thread-clear-blocking-state! tto)
 			    (##sys#thread-basic-unblock! tto)
 			    (loop (cdr lst)) )
 			  (begin
@@ -335,17 +335,9 @@ EOF
 (define (##sys#thread-kill! t s)
   (dbg "killing: " t " -> " s ", recipients: " (##sys#slot t 12))
   (##sys#abandon-mutexes t)
-  (let ((blocked (##sys#slot t 11)))
-    (cond
-     ((##sys#structure? blocked 'condition-variable)
-      (##sys#setslot blocked 2 (##sys#delq t (##sys#slot blocked 2))))
-     ((##sys#structure? blocked 'thread)
-      (##sys#setslot blocked 12 (##sys#delq t (##sys#slot blocked 12))))) )
   (##sys#remove-from-timeout-list t)
-  (##sys#clear-i/o-state-for-thread! t)
+  (##sys#thread-clear-blocking-state! t)
   (##sys#setslot t 3 s)
-  (##sys#setislot t 4 #f)
-  (##sys#setislot t 11 #f)
   (##sys#setislot t 8 '())
   (let ((rs (##sys#slot t 12)))
     (unless (null? rs)
@@ -353,13 +345,15 @@ EOF
        (lambda (t2)
 	 (dbg "  checking: " t2 " (" (##sys#slot t2 3) ") -> " (##sys#slot t2 11))
 	 (when (eq? (##sys#slot t2 11) t)
-	   (##sys#thread-basic-unblock! t2) ) )
-       rs) ) )
-  (##sys#setislot t 12 '()) )
+	   (##sys#thread-unblock! t2) ) )
+       rs)
+      (##sys#setislot t 12 '()) ) ) )
 
 (define (##sys#thread-basic-unblock! t)
   (dbg "unblocking: " t)
-  (##sys#setislot t 11 #f)		; (FD . RWFLAGS) | #<MUTEX> | #<THREAD>
+  #;(if (##sys#slot t 11) ;; remove this case after testing
+      (##sys#error '##sys#thread-basic-unblock! "Internal scheduler error: unclean unblock"
+		   (##sys#slot t 11)))
   (##sys#setislot t 4 #f)
   (##sys#add-to-ready-queue t) )
 
@@ -489,39 +483,20 @@ EOF
 					  ;; is incorrect but will be ignored, just let it run
 					  (when (##sys#slot t 4) ; also blocked for timeout?
 					    (##sys#remove-from-timeout-list t))
-					  (##sys#thread-basic-unblock! t) 
+					  (##sys#thread-clear-blocking-state! t)
+					  (##sys#thread-basic-unblock! t)
 					  (loop2 (cdr threads) keep))
 					 ((not (eq? fd (car p)))
 					  (panic (sprintf "thread is registered for I/O on unknown file-descriptor: ~S (expected ~S)" (car p) fd)))
 					 ((fdset-test inf outf (cdr p))
 					  (when (##sys#slot t 4) ; also blocked for timeout?
 					    (##sys#remove-from-timeout-list t))
+					  (##sys#thread-clear-blocking-state! t)
 					  (##sys#thread-basic-unblock! t) 
 					  (loop2 (cdr threads) keep))
 					 (else (loop2 (cdr threads) (cons t keep)))))))
 			   (cons a (loop n (add1 pos) (cdr lst))) ) ) ) ) ) ] ))) )
 
-
-;;; Clear I/O state for unblocked thread
-
-(define (##sys#clear-i/o-state-for-thread! t)
-  (when (pair? (##sys#slot t 11))
-    (let ((fd (car (##sys#slot t 11))))
-      (set! ##sys#fd-list
-	(let loop ((lst ##sys#fd-list))
-	  (if (null? lst)
-	      '()
-	      (let* ((a (car lst))
-		     (fd2 (car a)) )
-		(if (eq? fd fd2)
-		    (let ((ts (##sys#delq t (cdr a)))) ; remove from fd-list entry
-		      (cond ((null? ts) (cdr lst))
-			    (else
-			     (##sys#setslot a 1 ts) ; fd-list entry is list with t removed
-			     lst) ) )
-		    (cons a (loop (cdr lst)))))))))))
-
-
 ;;; Get list of all threads that are ready or waiting for timeout or waiting for I/O:
 ;
 ; (contributed by Joerg Wittenberger)
@@ -565,6 +540,34 @@ EOF
   (set! ##sys#fd-list (##sys#slot vec 2))
   (set! ##sys#timeout-list (##sys#slot vec 3)) )
 
+;;; Clear blocking queues
+
+(define (##sys#thread-clear-blocking-state! t)
+  (let ((blocked (##sys#slot t 11)))		; (FD . RWFLAGS) | #<MUTEX> | #<THREAD>
+    (dbg "clear-blocking " t " from " blocked)
+    (cond
+     ((pair? blocked)
+      (let ((fd (car (##sys#slot t 11))))
+	(set! ##sys#fd-list
+	      (let loop ((lst ##sys#fd-list))
+		(if (null? lst)
+		    '()
+		    (let* ((a (car lst))
+			   (fd2 (car a)) )
+		      (if (eq? fd fd2)
+			  (let ((ts (##sys#delq t (cdr a)))) ; remove from fd-list entry
+			    (cond ((null? ts) (cdr lst))
+				  (else
+				   (##sys#setslot a 1 ts) ; fd-list entry is list with t removed
+				   lst) ) )
+			  (cons a (loop (cdr lst))))))))))
+     ((##sys#structure? blocked 'condition-variable)
+      (##sys#setslot blocked 2 (##sys#delq t (##sys#slot blocked 2))))
+     ((##sys#structure? blocked 'mutex)
+      (##sys#setslot blocked 3 (##sys#delq t (##sys#slot blocked 3))))
+     ((##sys#structure? blocked 'thread)
+      (##sys#setslot blocked 12 (##sys#delq t (##sys#slot blocked 12)))))
+    (##sys#setislot t 11 #f)))
 
 ;;; Unblock thread cleanly:
 
@@ -572,10 +575,9 @@ EOF
   (when (or (eq? 'blocked (##sys#slot t 3))
 	    (eq? 'sleeping (##sys#slot t 3)))
     (##sys#remove-from-timeout-list t)
-    (##sys#clear-i/o-state-for-thread! t)
+    (##sys#thread-clear-blocking-state! t)
     (##sys#thread-basic-unblock! t) ) )
 
-
 ;;; Kill all threads in fd-, io- and timeout-lists and assign one thread as the
 ;   new primordial one. Overrides "##sys#kill-other-threads" in library.scm.
 
-- 
2.11.0

_______________________________________________
Chicken-hackers mailing list
Chicken-hackers@nongnu.org
https://lists.nongnu.org/mailman/listinfo/chicken-hackers

Reply via email to