Attached a patch against 4.13
master still compiling
On Nov 30 2018, megane wrote:
Hi,
Here's another version that crashes quickly with "very high
probability".
(cond-expand
(chicken-5 (import (chicken base))
(import (chicken time))
(import srfi-18))
(else (import chicken)
(use srfi-18)))
(define m (make-mutex))
(print "@@ " (current-thread) " " "lock")
(mutex-lock! m)
(define t (current-milliseconds))
(define (get-tosleep)
(/ (floor (* 1000 (- (+ t .030) (current-milliseconds)))) 1000))
(thread-start!
(make-thread (lambda ()
;; (thread-sleep! .01)
(print "@@ " (current-thread) " " "lock")
(let lp ()
(when (not (mutex-lock! m (get-tosleep)))
(thread-yield!)
(lp)))
(print "@@ " (current-thread) " " "unlock")
(mutex-unlock! m))))
(print "@@ " (current-thread) " " "sleep")
(thread-sleep! (get-tosleep))
(print "@@ " (current-thread) " " "unlock")
(mutex-unlock! m)
(thread-yield!)
(thread-sleep! .01)
(print "All ok!!")
--- typical output of a failing execution:
$ stdbuf -oL -eL ./t |& cat -n
1 @@ #<thread: primordial> lock
2 #<thread: primordial>: locking #<mutex>
3 @@ #<thread: primordial> sleep
4 #<thread: primordial> blocks for timeout 933.0
5 ==================== scheduling, current: #<thread: primordial>,
ready: (#<thread: thread1>)
6 timeout: #<thread: primordial> -> 933.0 (now: 904)
7 switching to #<thread: thread1>
8 @@ #<thread: thread1> lock
9 #<thread: thread1>: locking #<mutex>
10 #<thread: thread1> blocks for timeout 933.0
11 #<thread: thread1> sleeping on mutex mutex0
12 ==================== scheduling, current: #<thread: thread1>,
ready: ()
13 timeout: #<thread: primordial> -> 933.0 (now: 904)
14 timeout: #<thread: primordial> -> 933.0 (now: 934)
15 timeout expired for #<thread: primordial>
16 unblocking: #<thread: primordial>
17 timeout: #<thread: thread1> -> 933.0 (now: 934)
18 timeout expired for #<thread: thread1>
19 unblocking: #<thread: thread1>
20 switching to #<thread: primordial>
21 @@ #<thread: primordial> unlock
22 #<thread: primordial>: unlocking mutex0
23
24 Error: (mutex-unlock) Internal scheduler error: unknown thread state
25 #<thread: thread1>
26 ready
27
28 Call history:
29
30 t.scm:27: chicken.base#print
31 t.scm:28: get-tosleep
32 t.scm:15: chicken.time#current-milliseconds
33 t.scm:15: scheme#floor
34 t.scm:15: scheme#/
35 t.scm:28: srfi-18#thread-sleep!
36 t.scm:29: srfi-18#current-thread
37 t.scm:29: chicken.base#print
38 t.scm:30: srfi-18#mutex-unlock! <--
(There's an extra debug message on line 15.
Add (dbg "timeout expired for " tto) in this true branch:
(if (>= now tmo1) ; timeout reached?
in ##sys#schedule)
--- The issue
mutex-unlock! makes the decision that a thread freed from
the mutex's waiting list cannot be in the 'ready state.
From the output above you see a case how a thread waiting on a mutex
can end up being in the 'ready state.
line 2: The mutex is locked by primordial thread (pt)
line 4: The pt goes to sleep until 933.0
line 7: As the pt goes to sleep thread1 is scheduled to run
line 10: thread1 tries to lock the mutex, but sets a timeout that
happens to be at time 933.0
lines 12-14: Both threads asleep, time advances to 934
lines 15-16: pt gets put on the ready list
lines 17-19: thread1 gets put on the ready list
line 20: pt starts running
lines 21-22: pt executes mutex-unlock! while thread1 is ready to run
--- A fix
Just allow the 'ready state for threads in mutex-unlock!
In the patch I arbitrarily call ##sys#schedule after removing a thread
from the list, but I think doing nothing would work equally well.
Is this a correct fix?
Sorry, I can't help with that one..
Maybe it's possible there's threads on the waiting list, but the thread
that gets removed is not going to lock the mutex:
There are 3 threads in this scenario, A, B and C.
* A locks mutex
* A sleeps until t
* B tries to lock mutex until t
* C tries to lock mutex
* A and B are woken up at t
* A unlocks mutex, frees B
* B is scheduled to run as per the patch
* B finds out about the timeout, gives up and starts doing something else
* Now thread C is waiting on the mutex but no-one is going to free it!
From b6837b2c94feb5f8348965f538b5a45bf01a7506 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20F=2E=20Wittenberger?=
<joerg.wittenber...@softeyes.net>
Date: Mon, 3 Dec 2018 21:06:26 +0100
Subject: [PATCH] Fix 1564 internal scheduler error.
---
scheduler.scm | 80 ++++++++++++++++++++++++++++++-----------------------------
1 file changed, 41 insertions(+), 39 deletions(-)
diff --git a/scheduler.scm b/scheduler.scm
index 0b292f7f..a1a03293 100644
--- a/scheduler.scm
+++ b/scheduler.scm
@@ -34,7 +34,7 @@
;; This isn't hidden ATM to allow set!ing it as a hook/workaround
; ##sys#force-primordial
fdset-set fdset-test create-fdset stderr
- ##sys#clear-i/o-state-for-thread! ##sys#abandon-mutexes)
+ ##sys#thread-clear-blocking-state! ##sys#abandon-mutexes)
(not inline ##sys#interrupt-hook ##sys#force-primordial)
(unsafe)
(foreign-declare #<<EOF
@@ -185,7 +185,7 @@ EOF
(if (fp>= now tmo1) ; timeout reached?
(begin
(##sys#setislot tto 13 #t) ; mark as being unblocked by timeout
- (##sys#clear-i/o-state-for-thread! tto)
+ (##sys#thread-clear-blocking-state! tto)
(##sys#thread-basic-unblock! tto)
(loop (cdr lst)) )
(begin
@@ -335,17 +335,9 @@ EOF
(define (##sys#thread-kill! t s)
(dbg "killing: " t " -> " s ", recipients: " (##sys#slot t 12))
(##sys#abandon-mutexes t)
- (let ((blocked (##sys#slot t 11)))
- (cond
- ((##sys#structure? blocked 'condition-variable)
- (##sys#setslot blocked 2 (##sys#delq t (##sys#slot blocked 2))))
- ((##sys#structure? blocked 'thread)
- (##sys#setslot blocked 12 (##sys#delq t (##sys#slot blocked 12))))) )
(##sys#remove-from-timeout-list t)
- (##sys#clear-i/o-state-for-thread! t)
+ (##sys#thread-clear-blocking-state! t)
(##sys#setslot t 3 s)
- (##sys#setislot t 4 #f)
- (##sys#setislot t 11 #f)
(##sys#setislot t 8 '())
(let ((rs (##sys#slot t 12)))
(unless (null? rs)
@@ -353,13 +345,15 @@ EOF
(lambda (t2)
(dbg " checking: " t2 " (" (##sys#slot t2 3) ") -> " (##sys#slot t2 11))
(when (eq? (##sys#slot t2 11) t)
- (##sys#thread-basic-unblock! t2) ) )
- rs) ) )
- (##sys#setislot t 12 '()) )
+ (##sys#thread-unblock! t2) ) )
+ rs)
+ (##sys#setislot t 12 '()) ) ) )
(define (##sys#thread-basic-unblock! t)
(dbg "unblocking: " t)
- (##sys#setislot t 11 #f) ; (FD . RWFLAGS) | #<MUTEX> | #<THREAD>
+ #;(if (##sys#slot t 11) ;; remove this case after testing
+ (##sys#error '##sys#thread-basic-unblock! "Internal scheduler error: unclean unblock"
+ (##sys#slot t 11)))
(##sys#setislot t 4 #f)
(##sys#add-to-ready-queue t) )
@@ -489,39 +483,20 @@ EOF
;; is incorrect but will be ignored, just let it run
(when (##sys#slot t 4) ; also blocked for timeout?
(##sys#remove-from-timeout-list t))
- (##sys#thread-basic-unblock! t)
+ (##sys#thread-clear-blocking-state! t)
+ (##sys#thread-basic-unblock! t)
(loop2 (cdr threads) keep))
((not (eq? fd (car p)))
(panic (sprintf "thread is registered for I/O on unknown file-descriptor: ~S (expected ~S)" (car p) fd)))
((fdset-test inf outf (cdr p))
(when (##sys#slot t 4) ; also blocked for timeout?
(##sys#remove-from-timeout-list t))
+ (##sys#thread-clear-blocking-state! t)
(##sys#thread-basic-unblock! t)
(loop2 (cdr threads) keep))
(else (loop2 (cdr threads) (cons t keep)))))))
(cons a (loop n (add1 pos) (cdr lst))) ) ) ) ) ) ] ))) )
-
-;;; Clear I/O state for unblocked thread
-
-(define (##sys#clear-i/o-state-for-thread! t)
- (when (pair? (##sys#slot t 11))
- (let ((fd (car (##sys#slot t 11))))
- (set! ##sys#fd-list
- (let loop ((lst ##sys#fd-list))
- (if (null? lst)
- '()
- (let* ((a (car lst))
- (fd2 (car a)) )
- (if (eq? fd fd2)
- (let ((ts (##sys#delq t (cdr a)))) ; remove from fd-list entry
- (cond ((null? ts) (cdr lst))
- (else
- (##sys#setslot a 1 ts) ; fd-list entry is list with t removed
- lst) ) )
- (cons a (loop (cdr lst)))))))))))
-
-
;;; Get list of all threads that are ready or waiting for timeout or waiting for I/O:
;
; (contributed by Joerg Wittenberger)
@@ -565,6 +540,34 @@ EOF
(set! ##sys#fd-list (##sys#slot vec 2))
(set! ##sys#timeout-list (##sys#slot vec 3)) )
+;;; Clear blocking queues
+
+(define (##sys#thread-clear-blocking-state! t)
+ (let ((blocked (##sys#slot t 11))) ; (FD . RWFLAGS) | #<MUTEX> | #<THREAD>
+ (dbg "clear-blocking " t " from " blocked)
+ (cond
+ ((pair? blocked)
+ (let ((fd (car (##sys#slot t 11))))
+ (set! ##sys#fd-list
+ (let loop ((lst ##sys#fd-list))
+ (if (null? lst)
+ '()
+ (let* ((a (car lst))
+ (fd2 (car a)) )
+ (if (eq? fd fd2)
+ (let ((ts (##sys#delq t (cdr a)))) ; remove from fd-list entry
+ (cond ((null? ts) (cdr lst))
+ (else
+ (##sys#setslot a 1 ts) ; fd-list entry is list with t removed
+ lst) ) )
+ (cons a (loop (cdr lst))))))))))
+ ((##sys#structure? blocked 'condition-variable)
+ (##sys#setslot blocked 2 (##sys#delq t (##sys#slot blocked 2))))
+ ((##sys#structure? blocked 'mutex)
+ (##sys#setslot blocked 3 (##sys#delq t (##sys#slot blocked 3))))
+ ((##sys#structure? blocked 'thread)
+ (##sys#setslot blocked 12 (##sys#delq t (##sys#slot blocked 12)))))
+ (##sys#setislot t 11 #f)))
;;; Unblock thread cleanly:
@@ -572,10 +575,9 @@ EOF
(when (or (eq? 'blocked (##sys#slot t 3))
(eq? 'sleeping (##sys#slot t 3)))
(##sys#remove-from-timeout-list t)
- (##sys#clear-i/o-state-for-thread! t)
+ (##sys#thread-clear-blocking-state! t)
(##sys#thread-basic-unblock! t) ) )
-
;;; Kill all threads in fd-, io- and timeout-lists and assign one thread as the
; new primordial one. Overrides "##sys#kill-other-threads" in library.scm.
--
2.11.0
_______________________________________________
Chicken-hackers mailing list
Chicken-hackers@nongnu.org
https://lists.nongnu.org/mailman/listinfo/chicken-hackers