Re: [PATCH?] Separate pthread patches, #2 take 3

2009-06-03 Thread Dave Korn
Christopher Faylor wrote:
> On Thu, Jun 04, 2009 at 04:03:03AM +0100, Dave Korn wrote:
>> but it's a horrible bit of code.  Declaring the memory location as input 
>> only,
>> then clobbering all of memory and potentially confusing the optimisers with
>> type aliasing casts?  It makes me very uneasy.
> 
> Ok.  I'm convinced.  Please check in whatever you think is best.
> 
> cgf

  I will wait and see what advice the gcc list has to offer on the topic
first.  It may yet cast a new light on things.  (Or it may just confirm my
suspicions that even in trusted and well-tested library code, there has been a
fair deal of ad-hoc-ery and copypastaing of inline asms that people didn't
really grok.)  I'll also see if I can dig up any recent PRs or fixes that
might have a bearing on why we get better code from HEAD than 4.3.

cheers,
  DaveK


Re: [PATCH?] Separate pthread patches, #2 take 3

2009-06-03 Thread Christopher Faylor
On Thu, Jun 04, 2009 at 04:03:03AM +0100, Dave Korn wrote:
>but it's a horrible bit of code.  Declaring the memory location as input only,
>then clobbering all of memory and potentially confusing the optimisers with
>type aliasing casts?  It makes me very uneasy.

Ok.  I'm convinced.  Please check in whatever you think is best.

cgf


Re: [PATCH?] Separate pthread patches, #2 take 3

2009-06-03 Thread Dave Korn
Christopher Faylor wrote:

> I thought that, given your last message to cygwin-developers, you were
> going to go off and figure out the best of four implementations.  Is this
> the result of that investigation?

  Once I discarded the ones that weren't quite right because they included a
setz/sete instruction (bsd and boehm), I was left with the ones in the
attached testcase.  The only version I hadn't tested yet is the linux-derived 
one:

struct __xchg_dummy { unsigned long a[100]; };
#define __xg(x) ((struct __xchg_dummy *)(x))
#define LOCK_PREFIX " lock "
static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
  unsigned long newv)
{
unsigned long prev;
__asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
 : "=a"(prev)
 : "q"(newv), "m"(*__xg(ptr)), "0"(old)
 : "memory");
return prev;
}
extern __inline__ long
ilockcmpexch (volatile long *t, long v, long c)
{
  return __cmpxchg (t, c, v);
}

  This actually produces the same assembly as my version:

L14:
movl__ZN13pthread_mutex7mutexesE, %eax   # mutexes.head, D.2029
movl%eax, 36(%ebx)   # D.2029, .next
/APP
 # 75 "mxfull.cpp" 1
 lock cmpxchgl %ebx,__ZN13pthread_mutex7mutexesE # this,
 # 0 "" 2
/NO_APP
cmpl%eax, 36(%ebx)   # prev, .next
jne L14  #,


but it's a horrible bit of code.  Declaring the memory location as input only,
then clobbering all of memory and potentially confusing the optimisers with
type aliasing casts?  It makes me very uneasy.

cheers,
  DaveK



// g++ -c mxfull.cpp -o mxfull.o --save-temps -O2 -fverbose-asm

typedef long LONG;
typedef void *HANDLE;
typedef void *PVOID;
typedef char *LPCSTR;

typedef class pthread_mutex *pthread_mutex_t;
typedef class pthread_mutexattr *pthread_mutexattr_t;
typedef class pthread *pthread_t;

struct SECURITY_ATTRIBUTES;
typedef struct SECURITY_ATTRIBUTES *LPSECURITY_ATTRIBUTES;
extern struct SECURITY_ATTRIBUTES sec_none_nih;

HANDLE __attribute__((__stdcall__)) 
CreateSemaphoreA(LPSECURITY_ATTRIBUTES,LONG,LONG,LPCSTR);

class verifyable_object
{
public:
  long magic;

  verifyable_object (long);
  virtual ~verifyable_object ();
};

#if 0
// My version
 extern __inline__ long
 ilockcmpexch (volatile long *t, long v, long c)
 {
  register long __res __asm ("%eax") = c;
   __asm__ __volatile__ ("\n\
lock cmpxchgl %2,%1\n\
": "+a" (__res), "=m" (*t) : "q" (v), "m" (*t) : "memory", "cc");
   return __res;
 }
#elif 0
// Original version
 extern __inline__ long
 ilockcmpexch (volatile long *t, long v, long c)
 {
  register int __res;
   __asm__ __volatile__ ("\n\
lock cmpxchgl %3,(%1)\n\
": "=a" (__res), "=q" (t) : "1" (t), "q" (v), "0" (c): "cc");
   return __res;
 }
#elif 0
// GlibC/uClibC version
extern __inline__ long
ilockcmpexch (volatile long *t, long v, long c)
{
  return ({
__typeof (*t) ret;
__asm __volatile ("lock cmpxchgl %2, %1"
: "=a" (ret), "=m" (*t)
: "r" (v), "m" (*t), "0" (c));
ret;
});
}
#elif 01
// Linux-2.6.8-1 version
struct __xchg_dummy { unsigned long a[100]; };
#define __xg(x) ((struct __xchg_dummy *)(x))
#define LOCK_PREFIX " lock "
static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
  unsigned long newv)
{
unsigned long prev;
__asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
 : "=a"(prev)
 : "q"(newv), "m"(*__xg(ptr)), "0"(old)
 : "memory");
return prev;
}
extern __inline__ long
ilockcmpexch (volatile long *t, long v, long c)
{
  return __cmpxchg (t, c, v);
}
#endif

class pthread_mutexattr: public verifyable_object
{
public:
  int pshared;
  int mutextype;
  pthread_mutexattr ();
  ~pthread_mutexattr ();
};

template <class list_node> inline void
List_insert (list_node *&head, list_node *node)
{
  if (!node)
return;
  do
node->next = head;
  while ((PVOID)ilockcmpexch((LONG volatile 
*)(&head),(LONG)(node),(LONG)(node->next)) != node->next);
}

template <class list_node> class List
{
 public:
  List() : head(__null)
  {
  }

  ~List()
  {
  }

  void insert (list_node *node)
  {
List_insert (head, node);
  }

  list_node *head;

};

class pthread_mutex: public verifyable_object
{
public:

  unsigned long lock_counter;
  HANDLE win32_obj_id;
  unsigned int recursion_counter;
  LONG condwaits;
  pthread_t owner;



  int type;
  int pshared;


  pthread_mutex (pthread_mutexattr * = __null);
  ~pthread_mutex ();

  class pthread_mutex * next;

private:
  static List<pthread_mutex> mutexes;
};

List<pthread_mutex> pthread_mutex::mutexes;

pthread_mutex::pthread_mutex (pthread_mutexattr *attr) :
  verifyable_object (0

Re: [PATCH?] Separate pthread patches, #2 take 3

2009-06-03 Thread Christopher Faylor
On Thu, Jun 04, 2009 at 12:47:48AM +0100, Dave Korn wrote:
>Dave Korn wrote:
>>   The attached patch implements ilockexch and ilockcmpexch, using the inline
>> asm definition from __arch_compare_and_exchange_val_32_acq in
>> glibc-2.10.1/sysdeps/i386/i486/bits/atomic.h, trivially expanded inline 
>> rather
>> than in its original preprocessor macro form.
>
>  The attached patch does likewise, but adds a "memory" clobber.  It generates
>correct code:
>
>L186:
>   .loc 3 127 0
>   movl__ZN13pthread_mutex7mutexesE+8, %eax # mutexes.head, D.28599
>   movl%eax, 36(%ebx)   # D.28599, .next
>   .loc 2 60 0
>/APP
> # 60 "/gnu/winsup/src/winsup/cygwin/winbase.h" 1
>   lock cmpxchgl %ebx, __ZN13pthread_mutex7mutexesE+8   # this,
> # 0 "" 2
>/NO_APP
>   movl%eax, -12(%ebp)  # tmp68, ret
>   .loc 2 61 0
>   movl-12(%ebp), %eax  # ret, D.28596
>   .loc 3 126 0
>   cmpl%eax, 36(%ebx)   # D.28596, .next
>   jne L186 #,
>
>
>although as you see it has some needless register motion as it stores %eax to
>the stack slot for ret and reloads it.  Still, this is now almost as good as
>the code generated by my original patch.
>
>winsup/cygwin/ChangeLog
>
>   * winbase.h (ilockexch):  Fix asm constraints.
>   (ilockcmpexch):  Likewise.
>
>
>  Ok-ish?

I thought that, given your last message to cygwin-developers, you were
going to go off and figure out the best of four implementations.  Is this
the result of that investigation?

cgf


Re: [PATCH?] Separate pthread patches, #2 take 2.

2009-06-03 Thread Dave Korn
Dave Korn wrote:
> Dave Korn wrote:
>>   The attached patch implements ilockexch and ilockcmpexch, using the inline
>> asm definition from __arch_compare_and_exchange_val_32_acq in
>> glibc-2.10.1/sysdeps/i386/i486/bits/atomic.h, trivially expanded inline 
>> rather
>> than in its original preprocessor macro form.
>>
>>   It generates incorrect code.
> 
>   This much looks like it's probably a compiler bug.  

  Let's see whether anyone else agrees:

http://gcc.gnu.org/ml/gcc/2009-06/msg00053.html

cheers,
  DaveK


Re: [PATCH] Separate pthread fixes #1

2009-06-03 Thread Christopher Faylor
On Thu, Jun 04, 2009 at 12:11:24AM +0100, Dave Korn wrote:
>
>  The attached patch separates out the uncontroversial change to the
>__cygwin_lock* functions.
>
>winsup/cygwin/ChangeLog
>
>   * thread.cc (__cygwin_lock_lock):  Delete racy optimisation.
>   (__cygwin_lock_unlock):  Likewise.
>
>  OK?

Yes.  Thanks.

FWIW, I have made this same change many times over the years but I was
always afraid of the performance hit so I've ended up reverting it.
Since you've demonstrated a real problem, performance concerns obviously
don't matter.

cgf


Re: [PATCH?] Separate pthread patches, #2 take 2.

2009-06-03 Thread Dave Korn
Dave Korn wrote:
>   The attached patch implements ilockexch and ilockcmpexch, using the inline
> asm definition from __arch_compare_and_exchange_val_32_acq in
> glibc-2.10.1/sysdeps/i386/i486/bits/atomic.h, trivially expanded inline rather
> than in its original preprocessor macro form.
> 
>   It generates incorrect code.

  This much looks like it's probably a compiler bug.  This version, compiled
with current GCC HEAD, generates the same results as in the take 3 version,
without needing the explicit memory clobber added:

L469:
.loc 2 127 0
movl__ZN13pthread_mutex7mutexesE+8, %eax # mutexes.head, D.30413
movl%eax, 36(%ebx)   # D.30413, .next
.loc 5 58 0
/APP
 # 58 "/gnu/winsup/src/winsup/cygwin/winbase.h" 1
lock cmpxchgl %ebx, __ZN13pthread_mutex7mutexesE+8   # this,
 # 0 "" 2
/NO_APP
movl%eax, -12(%ebp)  # tmp76, ret
.loc 5 59 0
movl-12(%ebp), %eax  # ret, D.30414
.loc 2 126 0
cmpl%eax, 36(%ebx)   # D.30414, .next
jne L469 #,

... right down to the unoptimised register motion.  This is what we would have
hoped to see: the "memory" clobber ought to be superfluous, and the
write-output operand in *t should have told GCC not to move the store to
node->next after the loop.

  I checked 4.3.3; it behaves the same as 4.3.2, i.e. it incorrectly lowers
the store without a memory clobber present.

cheers,
  DaveK


Re: [PATCH?] Separate pthread patches, #2 take 1 redux

2009-06-03 Thread Dave Korn
Dave Korn wrote:
> Dave Korn wrote:
>>   The attached patch implements ilockexch and ilockcmpexch, using the inline
>> asm definition from __arch_compare_and_exchange_val_32_acq in
>> glibc-2.10.1/sysdeps/i386/i486/bits/atomic.h, trivially expanded inline 
>> rather
>> than in its original preprocessor macro form.

  And this one, just to have the full set in the same place, is the version
that I originally suggested.  It generates correct and efficient code:

L215:
.loc 3 127 0
movl__ZN13pthread_mutex7mutexesE+8, %eax # mutexes.head, D.28638
movl%eax, 36(%ebx)   # D.28638, .next
.loc 2 53 0
/APP
 # 53 "/gnu/winsup/src/winsup/cygwin/winbase.h" 1

lock cmpxchgl %ebx,__ZN13pthread_mutex7mutexesE+8# this,

 # 0 "" 2
/NO_APP
.loc 3 126 0
cmpl%eax, 36(%ebx)   # D.28635, .next
jne L215 #,

but is more risky.  No ChangeLog because it's not going to be approved, I'm
posting it just for completeness and future reference.

cheers,
  DaveK
Index: winsup/cygwin/winbase.h
===
RCS file: /cvs/src/src/winsup/cygwin/winbase.h,v
retrieving revision 1.14
diff -p -u -r1.14 winbase.h
--- winsup/cygwin/winbase.h	12 Jul 2008 18:09:17 -	1.14
+++ winsup/cygwin/winbase.h	3 Jun 2009 17:38:06 -
@@ -38,21 +38,21 @@ ilockdecr (volatile long *m)
 extern __inline__ long
 ilockexch (volatile long *t, long v)
 {
-  register int __res;
+  register long __res __asm__ ("%eax") = *t;
   __asm__ __volatile__ ("\n\
-1:	lock	cmpxchgl %3,(%1)\n\
+1:	lock	cmpxchgl %2,%1\n\
 	jne 1b\n\
- 	": "=a" (__res), "=q" (t): "1" (t), "q" (v), "0" (*t): "cc");
+ 	": "+a" (__res), "=m" (*t): "q" (v), "m" (*t) : "memory", "cc");
   return __res;
 }
 
 extern __inline__ long
 ilockcmpexch (volatile long *t, long v, long c)
 {
-  register int __res;
+  register long __res __asm ("%eax") = c;
   __asm__ __volatile__ ("\n\
-	lock cmpxchgl %3,(%1)\n\
-	": "=a" (__res), "=q" (t) : "1" (t), "q" (v), "0" (c): "cc");
+	lock cmpxchgl %2,%1\n\
+	": "+a" (__res), "=m" (*t) : "q" (v), "m" (*t) : "memory", "cc");
   return __res;
 }
 


Re: [PATCH?] Separate pthread patches, #2 take 3

2009-06-03 Thread Dave Korn
Dave Korn wrote:
>   The attached patch implements ilockexch and ilockcmpexch, using the inline
> asm definition from __arch_compare_and_exchange_val_32_acq in
> glibc-2.10.1/sysdeps/i386/i486/bits/atomic.h, trivially expanded inline rather
> than in its original preprocessor macro form.

  The attached patch does likewise, but adds a "memory" clobber.  It generates
correct code:

L186:
.loc 3 127 0
movl__ZN13pthread_mutex7mutexesE+8, %eax # mutexes.head, D.28599
movl%eax, 36(%ebx)   # D.28599, .next
.loc 2 60 0
/APP
 # 60 "/gnu/winsup/src/winsup/cygwin/winbase.h" 1
lock cmpxchgl %ebx, __ZN13pthread_mutex7mutexesE+8   # this,
 # 0 "" 2
/NO_APP
movl%eax, -12(%ebp)  # tmp68, ret
.loc 2 61 0
movl-12(%ebp), %eax  # ret, D.28596
.loc 3 126 0
cmpl%eax, 36(%ebx)   # D.28596, .next
jne L186 #,


although as you see it has some needless register motion as it stores %eax to
the stack slot for ret and reloads it.  Still, this is now almost as good as
the code generated by my original patch.

winsup/cygwin/ChangeLog

* winbase.h (ilockexch):  Fix asm constraints.
(ilockcmpexch):  Likewise.


  Ok-ish?

cheers,
  DaveK

Index: winsup/cygwin/winbase.h
===
RCS file: /cvs/src/src/winsup/cygwin/winbase.h,v
retrieving revision 1.14
diff -p -u -r1.14 winbase.h
--- winsup/cygwin/winbase.h	12 Jul 2008 18:09:17 -	1.14
+++ winsup/cygwin/winbase.h	3 Jun 2009 23:28:02 -
@@ -38,22 +38,28 @@ ilockdecr (volatile long *m)
 extern __inline__ long
 ilockexch (volatile long *t, long v)
 {
-  register int __res;
-  __asm__ __volatile__ ("\n\
-1:	lock	cmpxchgl %3,(%1)\n\
-	jne 1b\n\
- 	": "=a" (__res), "=q" (t): "1" (t), "q" (v), "0" (*t): "cc");
-  return __res;
+  return ({
+		__typeof (*t) ret;
+		__asm __volatile ("1:	lock cmpxchgl %2, %1\n"
+"	jne 1b\n"
+			: "=a" (ret), "=m" (*t)
+			: "r" (v), "m" (*t), "0" (*t)
+			: "memory");
+		ret;
+	});
 }
 
 extern __inline__ long
 ilockcmpexch (volatile long *t, long v, long c)
 {
-  register int __res;
-  __asm__ __volatile__ ("\n\
-	lock cmpxchgl %3,(%1)\n\
-	": "=a" (__res), "=q" (t) : "1" (t), "q" (v), "0" (c): "cc");
-  return __res;
+  return ({
+		__typeof (*t) ret;
+		__asm __volatile ("lock cmpxchgl %2, %1"
+			: "=a" (ret), "=m" (*t)
+			: "r" (v), "m" (*t), "0" (c)
+			: "memory");
+		ret;
+	});
 }
 
 #undef InterlockedIncrement


[PATCH?] Separate pthread patches, #2 take 2.

2009-06-03 Thread Dave Korn

  The attached patch implements ilockexch and ilockcmpexch, using the inline
asm definition from __arch_compare_and_exchange_val_32_acq in
glibc-2.10.1/sysdeps/i386/i486/bits/atomic.h, trivially expanded inline rather
than in its original preprocessor macro form.

  It generates incorrect code.  The sequence discussed before,

126   do
127 node->next = head;
128   while (InterlockedCompareExchangePointer (&head, node, node->next) !=
node->next);

is now compiled down to this loop:

L186:
.loc 3 127 0
movl__ZN13pthread_mutex7mutexesE+8, %edx # mutexes.head, D.28599
.loc 2 58 0
movl%edx, %eax   # D.28599, tmp68
/APP
 # 58 "/gnu/winsup/src/winsup/cygwin/winbase.h" 1
lock cmpxchgl %ebx, __ZN13pthread_mutex7mutexesE+8   # this,
 # 0 "" 2
/NO_APP
movl%eax, -12(%ebp)  # tmp68, ret
.loc 2 59 0
movl-12(%ebp), %eax  # ret, D.28596
.loc 3 126 0
cmpl%eax, %edx   # D.28596, D.28599
jne L186 #,
movl%edx, 36(%ebx)   # D.28599, .next


... which is in fact the equivalent of

126   do
127 ;
128   while (InterlockedCompareExchangePointer (&head, node, node->next) !=
node->next);
(126) node->next = head;

  As it caches the values of head in %edx during the spin loop, and only
stores it to node->next after having overwritten *head with node, there is a
short window after the new node is linked to the front of the chain during
which its chain pointer is incorrect.

  Not OK for head?

cheers,
  DaveK


Index: winsup/cygwin/winbase.h
===
RCS file: /cvs/src/src/winsup/cygwin/winbase.h,v
retrieving revision 1.14
diff -p -u -r1.14 winbase.h
--- winsup/cygwin/winbase.h	12 Jul 2008 18:09:17 -	1.14
+++ winsup/cygwin/winbase.h	3 Jun 2009 22:54:38 -
@@ -34,27 +34,31 @@ ilockdecr (volatile long *m)
 	": "=&r" (__res), "=m" (*m): "m" (*m): "cc");
   return __res;
 }
-
-extern __inline__ long
-ilockexch (volatile long *t, long v)
-{
-  register int __res;
-  __asm__ __volatile__ ("\n\
-1:	lock	cmpxchgl %3,(%1)\n\
-	jne 1b\n\
- 	": "=a" (__res), "=q" (t): "1" (t), "q" (v), "0" (*t): "cc");
-  return __res;
-}
-
-extern __inline__ long
-ilockcmpexch (volatile long *t, long v, long c)
-{
-  register int __res;
-  __asm__ __volatile__ ("\n\
-	lock cmpxchgl %3,(%1)\n\
-	": "=a" (__res), "=q" (t) : "1" (t), "q" (v), "0" (c): "cc");
-  return __res;
-}
+
+extern __inline__ long
+ilockexch (volatile long *t, long v)
+{
+  return ({
+		__typeof (*t) ret;
+		__asm __volatile ("1:	lock cmpxchgl %2, %1\n"
+"	jne 1b\n"
+			: "=a" (ret), "=m" (*t)
+			: "r" (v), "m" (*t), "0" (*t));
+		ret;
+	});
+}
+
+extern __inline__ long
+ilockcmpexch (volatile long *t, long v, long c)
+{
+  return ({
+		__typeof (*t) ret;
+		__asm __volatile ("lock cmpxchgl %2, %1"
+			: "=a" (ret), "=m" (*t)
+			: "r" (v), "m" (*t), "0" (c));
+		ret;
+	});
+}
 
 #undef InterlockedIncrement
 #define InterlockedIncrement ilockincr


[PATCH] Separate pthread fixes #1

2009-06-03 Thread Dave Korn

  The attached patch separates out the uncontroversial change to the
__cygwin_lock* functions.

winsup/cygwin/ChangeLog

* thread.cc (__cygwin_lock_lock):  Delete racy optimisation.
(__cygwin_lock_unlock):  Likewise.

  OK?

cheers,
  DaveK

Index: winsup/cygwin/thread.cc
===
RCS file: /cvs/src/src/winsup/cygwin/thread.cc,v
retrieving revision 1.215
diff -p -u -r1.215 thread.cc
--- winsup/cygwin/thread.cc	20 Jan 2009 12:40:31 -	1.215
+++ winsup/cygwin/thread.cc	3 Jun 2009 22:53:40 -
@@ -76,13 +76,8 @@ __cygwin_lock_fini (_LOCK_T *lock)
 extern "C" void
 __cygwin_lock_lock (_LOCK_T *lock)
 {
-  if (MT_INTERFACE->threadcount <= 1)
-paranoid_printf ("threadcount %d.  not locking", MT_INTERFACE->threadcount);
-  else
-{
-  paranoid_printf ("threadcount %d.  locking", MT_INTERFACE->threadcount);
-  pthread_mutex_lock ((pthread_mutex_t*) lock);
-}
+  paranoid_printf ("threadcount %d.  locking", MT_INTERFACE->threadcount);
+  pthread_mutex_lock ((pthread_mutex_t*) lock);
 }
 
 extern "C" int
@@ -95,13 +90,8 @@ __cygwin_lock_trylock (_LOCK_T *lock)
 extern "C" void
 __cygwin_lock_unlock (_LOCK_T *lock)
 {
-  if (MT_INTERFACE->threadcount <= 1)
-paranoid_printf ("threadcount %d.  not unlocking", MT_INTERFACE->threadcount);
-  else
-{
-  pthread_mutex_unlock ((pthread_mutex_t*) lock);
-  paranoid_printf ("threadcount %d.  unlocked", MT_INTERFACE->threadcount);
-}
+  pthread_mutex_unlock ((pthread_mutex_t*) lock);
+  paranoid_printf ("threadcount %d.  unlocked", MT_INTERFACE->threadcount);
 }
 
 static inline verifyable_object_state


Re: [PATCH] Pthread fixes arising.

2009-06-03 Thread Christopher Faylor
On Wed, Jun 03, 2009 at 07:16:04PM +0100, Dave Korn wrote:
>winsup/cygwin/ChangeLog
>
>   * thread.cc (__cygwin_lock_lock):  Delete racy optimisation.
>   (__cygwin_lock_unlock):  Likewise.
>
>   * winbase.h (ilockexch):  Fix asm constraints.
>   (ilockcmpexch):  Likewise.
>
>  OK for head?

Which version of the asm constraints are these?  If this is the "I'm
pretty sure I know what I'm doing" version then no.  If these are the
versions from some tried-and-true OS/library then yes.

Actually maybe this should be two checkins since the __cygwin_lock_lock
stuff is uncontroversial.

cgf


Re: [PATCH] Pthread fixes arising.

2009-06-03 Thread Dave Korn
Dave Korn wrote:

>   *** main is now taking and releasing it recursively with a bias of one ***

  Two, not one.

cheers,
  DaveK



[PATCH] Pthread fixes arising.

2009-06-03 Thread Dave Korn

Hi everyone,

  The attached patch fixes the pthread bugs discovered and reported between
Thomas Stalder and myself.  The winbase.h inline asms bug has already been the
subject of a long thread.

  The latest fix is a simple racy optimisation in __cygwin_lock_{,un}lock;
they attempt to save themselves the bother of locking if multiple threads
aren't in operation, but when you have threads asynchronously being born and
dying, this can mean that the number of locks and unlocks becomes mismatched,
as the optimisation decision can be taken differently for both halves of a
matching lock/unlock pair of calls.  Here's an example of it going wrong in
practice:

   84   47795 [main] stalder3 3612 __cygwin_lock_lock: threadcount 2.  locking
 4981   47948 [main] stalder3 3612 __cygwin_lock_lock: threadcount 2.  locking

   *** other thread dies here ***

-4876   43704 [main] stalder3 3612 __cygwin_lock_unlock: threadcount 1.  not
unlocking
   48   43752 [main] stalder3 3612 __cygwin_lock_unlock: threadcount 1.  not
unlocking

   *** main thinks it has unlocked the lock but it still holds it ***

-4874   44571 [main] stalder3 3612 __cygwin_lock_lock: threadcount 2.  locking
 4982   49599 [main] stalder3 3612 __cygwin_lock_lock: threadcount 2.  locking
 4968   50294 [main] stalder3 3612 __cygwin_lock_unlock: threadcount 2.  
unlocked
 4969   50398 [main] stalder3 3612 __cygwin_lock_unlock: threadcount 2.  
unlocked

  *** main is now taking and releasing it recursively with a bias of one ***

  105   50554 [unknown (0x7F4)] stalder3 3612 __cygwin_lock_lock: threadcount
2.  locking

  Now this thread waits potentially forever to take the lock.  But while it is
doing this, it happens to be holding the user-level mutex.  As soon as the
main thread next waits on this mutex, while still recursively holding the
__cygwin_lock_* mutex, we have a classic deadlock caused by priority
inversion.  I don't figure the optimisation is worth the trouble - in theory
we could perhaps still try and hang onto it from program startup until the
first user thread gets created and then make sure it never gets re-enabled for
the rest of the runtime even if the number of threads drops back down again,
and the worst that would happen is excess unlock calls, but even that seems a
bit ugly.  So I dropped it altogether.

winsup/cygwin/ChangeLog

* thread.cc (__cygwin_lock_lock):  Delete racy optimisation.
(__cygwin_lock_unlock):  Likewise.

* winbase.h (ilockexch):  Fix asm constraints.
(ilockcmpexch):  Likewise.

  OK for head?

cheers,
  DaveK


? winsup/cygwin/cygwin-cxx.h
? winsup/cygwin/mutex
Index: winsup/cygwin/thread.cc
===
RCS file: /cvs/src/src/winsup/cygwin/thread.cc,v
retrieving revision 1.215
diff -p -u -r1.215 thread.cc
--- winsup/cygwin/thread.cc	20 Jan 2009 12:40:31 -	1.215
+++ winsup/cygwin/thread.cc	3 Jun 2009 17:38:06 -
@@ -76,13 +76,8 @@ __cygwin_lock_fini (_LOCK_T *lock)
 extern "C" void
 __cygwin_lock_lock (_LOCK_T *lock)
 {
-  if (MT_INTERFACE->threadcount <= 1)
-paranoid_printf ("threadcount %d.  not locking", MT_INTERFACE->threadcount);
-  else
-{
-  paranoid_printf ("threadcount %d.  locking", MT_INTERFACE->threadcount);
-  pthread_mutex_lock ((pthread_mutex_t*) lock);
-}
+  paranoid_printf ("threadcount %d.  locking", MT_INTERFACE->threadcount);
+  pthread_mutex_lock ((pthread_mutex_t*) lock);
 }
 
 extern "C" int
@@ -95,13 +90,8 @@ __cygwin_lock_trylock (_LOCK_T *lock)
 extern "C" void
 __cygwin_lock_unlock (_LOCK_T *lock)
 {
-  if (MT_INTERFACE->threadcount <= 1)
-paranoid_printf ("threadcount %d.  not unlocking", MT_INTERFACE->threadcount);
-  else
-{
-  pthread_mutex_unlock ((pthread_mutex_t*) lock);
-  paranoid_printf ("threadcount %d.  unlocked", MT_INTERFACE->threadcount);
-}
+  pthread_mutex_unlock ((pthread_mutex_t*) lock);
+  paranoid_printf ("threadcount %d.  unlocked", MT_INTERFACE->threadcount);
 }
 
 static inline verifyable_object_state
Index: winsup/cygwin/winbase.h
===
RCS file: /cvs/src/src/winsup/cygwin/winbase.h,v
retrieving revision 1.14
diff -p -u -r1.14 winbase.h
--- winsup/cygwin/winbase.h	12 Jul 2008 18:09:17 -	1.14
+++ winsup/cygwin/winbase.h	3 Jun 2009 17:38:06 -
@@ -38,21 +38,21 @@ ilockdecr (volatile long *m)
 extern __inline__ long
 ilockexch (volatile long *t, long v)
 {
-  register int __res;
+  register long __res __asm__ ("%eax") = *t;
   __asm__ __volatile__ ("\n\
-1:	lock	cmpxchgl %3,(%1)\n\
+1:	lock	cmpxchgl %2,%1\n\
 	jne 1b\n\
- 	": "=a" (__res), "=q" (t): "1" (t), "q" (v), "0" (*t): "cc");
+ 	": "+a" (__res), "=m" (*t): "q" (v), "m" (*t) : "memory", "cc");
   return __res;
 }
 
 extern __inline__ long
 ilockcmpexch (volatile long *t, long v, long c)
 {
-  register int __res;
+  register long __res __asm ("%eax") = c;
   __asm__ __volati