Hi Jakub,
> Actually, I see a problem, but not related to this patch.  I bet
> e.g. tsan would complain heavily on the wrappers, because the code
> is racy:
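To make the race concrete, here is a minimal, self-contained sketch of the
pattern (hypothetical names and a stand-in feature check, not the libgfortran
code itself).  The racy variant tests and assigns the shared function pointer
directly from every thread; the sketch below instead keeps the selection in a
local variable and only touches the shared pointer through relaxed atomic
loads and stores, which is the same pattern the patch uses:

/* race-sketch.c -- hypothetical illustration, not the libgfortran code.
   Build with:  gcc -O2 -pthread -fsanitize=thread race-sketch.c  */
#include <pthread.h>
#include <stdio.h>

typedef void (*impl_fn) (void);

static void impl_generic (void) { puts ("generic"); }
static void impl_fast (void)    { puts ("fast"); }

/* Shared dispatch pointer, filled in lazily by whichever thread calls
   dispatch() first.  The racy variant would read and write dispatch_p
   directly from every thread; here the shared pointer is only accessed
   through relaxed atomic loads and stores, so tsan stays quiet.  */
static impl_fn dispatch_p = NULL;

static void
dispatch (void)
{
  impl_fn fn = __atomic_load_n (&dispatch_p, __ATOMIC_RELAXED);
  if (fn == NULL)
    {
      /* Pick the best implementation; this stands in for the
         CPU-feature checks in the real code.  */
      fn = impl_fast;
      __atomic_store_n (&dispatch_p, fn, __ATOMIC_RELAXED);
    }
  (*fn) ();
}

static void *
worker (void *arg)
{
  (void) arg;
  dispatch ();
  return NULL;
}

int
main (void)
{
  pthread_t t[4];
  for (int i = 0; i < 4; i++)
    pthread_create (&t[i], NULL, worker, NULL);
  for (int i = 0; i < 4; i++)
    pthread_join (t[i], NULL);
  return 0;
}

Relaxed ordering is sufficient here because every value the pointer can hold
is a valid implementation and no other data is published through it; the
worst case is that two threads redo the selection independently.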
Here is a patch implementing your suggestion.  So far I have tested that
all matmul test cases pass on my machine.

OK for trunk?

Regards

	Thomas

2017-03-02  Thomas Koenig  <tkoe...@gcc.gnu.org>
	    Jakub Jelinek  <ja...@redhat.com>

	* m4/matmul.m4 (matmul_'rtype_code`): Avoid race condition
	on storing function pointer.
	* generated/matmul_c10.c: Regenerated.
	* generated/matmul_c16.c: Regenerated.
	* generated/matmul_c4.c: Regenerated.
	* generated/matmul_c8.c: Regenerated.
	* generated/matmul_i1.c: Regenerated.
	* generated/matmul_i16.c: Regenerated.
	* generated/matmul_i2.c: Regenerated.
	* generated/matmul_i4.c: Regenerated.
	* generated/matmul_i8.c: Regenerated.
	* generated/matmul_r10.c: Regenerated.
	* generated/matmul_r16.c: Regenerated.
	* generated/matmul_r4.c: Regenerated.
	* generated/matmul_r8.c: Regenerated.
Index: generated/matmul_c10.c
===================================================================
--- generated/matmul_c10.c	(Revision 245836)
+++ generated/matmul_c10.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_c10 (gfc_array_c10 * const restrict re
 	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_c10 * const restrict retarray,
+	gfc_array_c10 * const restrict a, gfc_array_c10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
     {
-      matmul_p = matmul_c10_vanilla;
+      matmul_fn = matmul_c10_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_c10 (gfc_array_c10 * const restrict re
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_c10_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_c10_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_c10 (gfc_array_c10 * const restrict re
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_c10_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_c10_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_c10 (gfc_array_c10 * const restrict re
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_c10_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_c10_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_c16.c
===================================================================
--- generated/matmul_c16.c	(Revision 245836)
+++ generated/matmul_c16.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_c16 (gfc_array_c16 * const restrict re
 	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_c16 * const restrict retarray,
+	gfc_array_c16 * const restrict a, gfc_array_c16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
     {
-      matmul_p = matmul_c16_vanilla;
+      matmul_fn = matmul_c16_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_c16 (gfc_array_c16 * const restrict re
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_c16_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_c16_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_c16 (gfc_array_c16 * const restrict re
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_c16_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_c16_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_c16 (gfc_array_c16 * const restrict re
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_c16_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_c16_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_c4.c
===================================================================
--- generated/matmul_c4.c	(Revision 245836)
+++ generated/matmul_c4.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_c4 (gfc_array_c4 * const restrict reta
 	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_c4 * const restrict retarray,
+	gfc_array_c4 * const restrict a, gfc_array_c4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
     {
-      matmul_p = matmul_c4_vanilla;
+      matmul_fn = matmul_c4_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_c4 (gfc_array_c4 * const restrict reta
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_c4_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_c4_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_c4 (gfc_array_c4 * const restrict reta
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_c4_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_c4_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_c4 (gfc_array_c4 * const restrict reta
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_c4_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_c4_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_c8.c
===================================================================
--- generated/matmul_c8.c	(Revision 245836)
+++ generated/matmul_c8.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_c8 (gfc_array_c8 * const restrict reta
 	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_c8 * const restrict retarray,
+	gfc_array_c8 * const restrict a, gfc_array_c8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
     {
-      matmul_p = matmul_c8_vanilla;
+      matmul_fn = matmul_c8_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_c8 (gfc_array_c8 * const restrict reta
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_c8_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_c8_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_c8 (gfc_array_c8 * const restrict reta
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_c8_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_c8_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_c8 (gfc_array_c8 * const restrict reta
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_c8_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_c8_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_i1.c
===================================================================
--- generated/matmul_i1.c	(Revision 245836)
+++ generated/matmul_i1.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_i1 (gfc_array_i1 * const restrict reta
 	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_i1 * const restrict retarray,
+	gfc_array_i1 * const restrict a, gfc_array_i1 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
     {
-      matmul_p = matmul_i1_vanilla;
+      matmul_fn = matmul_i1_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_i1 (gfc_array_i1 * const restrict reta
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_i1_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_i1_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_i1 (gfc_array_i1 * const restrict reta
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_i1_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_i1_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_i1 (gfc_array_i1 * const restrict reta
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_i1_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_i1_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_i16.c
===================================================================
--- generated/matmul_i16.c	(Revision 245836)
+++ generated/matmul_i16.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_i16 (gfc_array_i16 * const restrict re
 	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_i16 * const restrict retarray,
+	gfc_array_i16 * const restrict a, gfc_array_i16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
    {
-      matmul_p = matmul_i16_vanilla;
+      matmul_fn = matmul_i16_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_i16 (gfc_array_i16 * const restrict re
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_i16_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_i16_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_i16 (gfc_array_i16 * const restrict re
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_i16_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_i16_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_i16 (gfc_array_i16 * const restrict re
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_i16_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_i16_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_i2.c
===================================================================
--- generated/matmul_i2.c	(Revision 245836)
+++ generated/matmul_i2.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_i2 (gfc_array_i2 * const restrict reta
 	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_i2 * const restrict retarray,
+	gfc_array_i2 * const restrict a, gfc_array_i2 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
     {
-      matmul_p = matmul_i2_vanilla;
+      matmul_fn = matmul_i2_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_i2 (gfc_array_i2 * const restrict reta
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_i2_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_i2_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_i2 (gfc_array_i2 * const restrict reta
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_i2_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_i2_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_i2 (gfc_array_i2 * const restrict reta
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_i2_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_i2_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_i4.c
===================================================================
--- generated/matmul_i4.c	(Revision 245836)
+++ generated/matmul_i4.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_i4 (gfc_array_i4 * const restrict reta
 	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_i4 * const restrict retarray,
+	gfc_array_i4 * const restrict a, gfc_array_i4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
    {
-      matmul_p = matmul_i4_vanilla;
+      matmul_fn = matmul_i4_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_i4 (gfc_array_i4 * const restrict reta
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_i4_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_i4_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_i4 (gfc_array_i4 * const restrict reta
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_i4_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_i4_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_i4 (gfc_array_i4 * const restrict reta
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_i4_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_i4_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_i8.c
===================================================================
--- generated/matmul_i8.c	(Revision 245836)
+++ generated/matmul_i8.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_i8 (gfc_array_i8 * const restrict reta
 	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_i8 * const restrict retarray,
+	gfc_array_i8 * const restrict a, gfc_array_i8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
    {
-      matmul_p = matmul_i8_vanilla;
+      matmul_fn = matmul_i8_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_i8 (gfc_array_i8 * const restrict reta
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_i8_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_i8_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_i8 (gfc_array_i8 * const restrict reta
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_i8_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_i8_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_i8 (gfc_array_i8 * const restrict reta
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_i8_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_i8_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_r10.c
===================================================================
--- generated/matmul_r10.c	(Revision 245836)
+++ generated/matmul_r10.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_r10 (gfc_array_r10 * const restrict re
 	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_r10 * const restrict retarray,
+	gfc_array_r10 * const restrict a, gfc_array_r10 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
    {
-      matmul_p = matmul_r10_vanilla;
+      matmul_fn = matmul_r10_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_r10 (gfc_array_r10 * const restrict re
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_r10_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_r10_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_r10 (gfc_array_r10 * const restrict re
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_r10_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_r10_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_r10 (gfc_array_r10 * const restrict re
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_r10_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_r10_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_r16.c
===================================================================
--- generated/matmul_r16.c	(Revision 245836)
+++ generated/matmul_r16.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_r16 (gfc_array_r16 * const restrict re
 	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_r16 * const restrict retarray,
+	gfc_array_r16 * const restrict a, gfc_array_r16 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
    {
-      matmul_p = matmul_r16_vanilla;
+      matmul_fn = matmul_r16_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_r16 (gfc_array_r16 * const restrict re
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_r16_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_r16_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_r16 (gfc_array_r16 * const restrict re
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_r16_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_r16_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_r16 (gfc_array_r16 * const restrict re
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_r16_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_r16_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_r4.c
===================================================================
--- generated/matmul_r4.c	(Revision 245836)
+++ generated/matmul_r4.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_r4 (gfc_array_r4 * const restrict reta
 	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_r4 * const restrict retarray,
+	gfc_array_r4 * const restrict a, gfc_array_r4 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
    {
-      matmul_p = matmul_r4_vanilla;
+      matmul_fn = matmul_r4_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_r4 (gfc_array_r4 * const restrict reta
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_r4_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_r4_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_r4 (gfc_array_r4 * const restrict reta
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_r4_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_r4_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_r4 (gfc_array_r4 * const restrict reta
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_r4_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_r4_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: generated/matmul_r8.c
===================================================================
--- generated/matmul_r8.c	(Revision 245836)
+++ generated/matmul_r8.c	(Arbeitskopie)
@@ -2258,9 +2258,14 @@ void matmul_r8 (gfc_array_r8 * const restrict reta
 	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) (gfc_array_r8 * const restrict retarray,
+	gfc_array_r8 * const restrict a, gfc_array_r8 * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
    {
-      matmul_p = matmul_r8_vanilla;
+      matmul_fn = matmul_r8_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -2267,8 +2272,8 @@ void matmul_r8 (gfc_array_r8 * const restrict reta
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_r8_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_r8_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -2277,8 +2282,8 @@ void matmul_r8 (gfc_array_r8 * const restrict reta
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_r8_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_r8_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -2286,14 +2291,15 @@ void matmul_r8 (gfc_array_r8 * const restrict reta
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_r8_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_r8_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }
Index: m4/matmul.m4
===================================================================
--- m4/matmul.m4	(Revision 245836)
+++ m4/matmul.m4	(Arbeitskopie)
@@ -123,9 +123,14 @@ void matmul_'rtype_code` ('rtype` * const restrict
 	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
 	int blas_limit, blas_call gemm) = NULL;
 
+  void (*matmul_fn) ('rtype` * const restrict retarray,
+	'rtype` * const restrict a, 'rtype` * const restrict b, int try_blas,
+	int blas_limit, blas_call gemm) = NULL;
+
+  matmul_fn = __atomic_load_n (&matmul_p, __ATOMIC_RELAXED);
   if (matmul_p == NULL)
    {
-      matmul_p = matmul_'rtype_code`_vanilla;
+      matmul_fn = matmul_'rtype_code`_vanilla;
       if (__cpu_model.__cpu_vendor == VENDOR_INTEL)
 	{
 	  /* Run down the available processors in order of preference.  */
@@ -132,8 +137,8 @@ void matmul_'rtype_code` ('rtype` * const restrict
 #ifdef HAVE_AVX512F
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX512F))
 	    {
-	      matmul_p = matmul_'rtype_code`_avx512f;
-	      goto tailcall;
+	      matmul_fn = matmul_'rtype_code`_avx512f;
+	      goto store;
 	    }
 
 #endif  /* HAVE_AVX512F */
@@ -142,8 +147,8 @@ void matmul_'rtype_code` ('rtype` * const restrict
 	  if ((__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX2))
 	     && (__cpu_model.__cpu_features[0] & (1 << FEATURE_FMA)))
 	    {
-	      matmul_p = matmul_'rtype_code`_avx2;
-	      goto tailcall;
+	      matmul_fn = matmul_'rtype_code`_avx2;
+	      goto store;
 	    }
 
 #endif
@@ -151,14 +156,15 @@ void matmul_'rtype_code` ('rtype` * const restrict
 #ifdef HAVE_AVX
 	  if (__cpu_model.__cpu_features[0] & (1 << FEATURE_AVX))
 	    {
-	      matmul_p = matmul_'rtype_code`_avx;
-	      goto tailcall;
+	      matmul_fn = matmul_'rtype_code`_avx;
+	      goto store;
 	    }
 #endif  /* HAVE_AVX */
 	}
+   store:
+      __atomic_store_n (&matmul_p, matmul_fn, __ATOMIC_RELAXED);
     }
 
-tailcall:
   (*matmul_p) (retarray, a, b, try_blas, blas_limit, gemm);
 }