[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
This revision was automatically updated to reflect the committed changes. Closed by commit rOMP345867: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records (authored by gbercea, committed by ). Changed prior to commit: https://reviews.llvm.org/D53141?vs=172007&id=172184#toc Repository: rOMP OpenMP https://reviews.llvm.org/D53141 Files: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu libomptarget/deviceRTLs/nvptx/src/interface.h libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h libomptarget/deviceRTLs/nvptx/src/supporti.h Index: libomptarget/deviceRTLs/nvptx/src/interface.h === --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -478,6 +478,8 @@ EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, +int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -40,8 +40,6 @@ INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); - ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, - "Expected number of SMs is less than reported."); return id; } Index: libomptarget/deviceRTLs/nvptx/src/supporti.h === --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -188,7 +188,6 @@ { void *ptr = malloc(size); PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; } Index: 
libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -123,7 +123,7 @@ struct DataSharingStateTy { __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; void *StackPtr[DS_Max_Warp_Number]; - void *FramePtr[DS_Max_Warp_Number]; + void * volatile FramePtr[DS_Max_Warp_Number]; int32_t ActiveThreads[DS_Max_Warp_Number]; }; // Additional worker slot type which is initialized with the default worker slot Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu === --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -129,7 +129,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); @@ -283,7 +283,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; SlotP = *SavedSharedSlot; StackP = *SavedSharedStack; @@ -321,7 +321,7 @@ DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); - void *P = DataSharingState.FramePtr[SourceWID]; + void * volatile P = DataSharingState.FramePtr[SourceWID]; DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); return P; } @@ -369,47 +369,31 @@ __threadfence_block(); } -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. 
If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, -int16_t UseSharedMemory) { +INLINE void* data_sharing_push_stack_common(size_t PushSize) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode with uninitialized runtime."); -return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize); +return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize); } + // Only warp active maste
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
ABataev accepted this revision. ABataev added a comment. LG Repository: rOMP OpenMP https://reviews.llvm.org/D53141 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
gtbercea marked an inline comment as done. gtbercea added inline comments. Comment at: libomptarget/deviceRTLs/nvptx/src/supporti.h:191 PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; ABataev wrote: > Same, do you really need to remove this? Yes, leaving this in will lead to the whole malloc being optimized out after inlining. Repository: rOMP OpenMP https://reviews.llvm.org/D53141 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
gtbercea updated this revision to Diff 172007. gtbercea added a comment. Reinstate assert. Repository: rOMP OpenMP https://reviews.llvm.org/D53141 Files: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu libomptarget/deviceRTLs/nvptx/src/interface.h libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h libomptarget/deviceRTLs/nvptx/src/supporti.h Index: libomptarget/deviceRTLs/nvptx/src/supporti.h === --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -188,7 +188,6 @@ { void *ptr = malloc(size); PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -123,7 +123,7 @@ struct DataSharingStateTy { __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; void *StackPtr[DS_Max_Warp_Number]; - void *FramePtr[DS_Max_Warp_Number]; + void * volatile FramePtr[DS_Max_Warp_Number]; int32_t ActiveThreads[DS_Max_Warp_Number]; }; // Additional worker slot type which is initialized with the default worker slot Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -40,8 +40,6 @@ INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); - ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, - "Expected number of SMs is less than reported."); return id; } Index: libomptarget/deviceRTLs/nvptx/src/interface.h === --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -478,6 +478,8 @@ EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void 
*__kmpc_data_sharing_coalesced_push_stack(size_t size, +int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu === --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -129,7 +129,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); @@ -283,7 +283,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; SlotP = *SavedSharedSlot; StackP = *SavedSharedStack; @@ -321,7 +321,7 @@ DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); - void *P = DataSharingState.FramePtr[SourceWID]; + void * volatile P = DataSharingState.FramePtr[SourceWID]; DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); return P; } @@ -369,47 +369,31 @@ __threadfence_block(); } -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. 
-EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, -int16_t UseSharedMemory) { +INLINE void* data_sharing_push_stack_common(size_t PushSize) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode with uninitialized runtime."); -return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize); +return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize); } + // Only warp active master threads manage the stack. + bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0; + // Add worst-case padding to DataSize so that future stack allocations are // correctly aligned. const size_t Alignme
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
ABataev added inline comments. Comment at: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu:159 omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, Do you really need to remove this? Comment at: libomptarget/deviceRTLs/nvptx/src/supporti.h:191 PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; Same, do you really need to remove this? Repository: rOMP OpenMP https://reviews.llvm.org/D53141 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
gtbercea updated this revision to Diff 171993. gtbercea added a comment. Move while on else branch. Repository: rOMP OpenMP https://reviews.llvm.org/D53141 Files: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu libomptarget/deviceRTLs/nvptx/src/interface.h libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h libomptarget/deviceRTLs/nvptx/src/supporti.h Index: libomptarget/deviceRTLs/nvptx/src/supporti.h === --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -188,7 +188,6 @@ { void *ptr = malloc(size); PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -123,7 +123,7 @@ struct DataSharingStateTy { __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; void *StackPtr[DS_Max_Warp_Number]; - void *FramePtr[DS_Max_Warp_Number]; + void * volatile FramePtr[DS_Max_Warp_Number]; int32_t ActiveThreads[DS_Max_Warp_Number]; }; // Additional worker slot type which is initialized with the default worker slot Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -40,8 +40,6 @@ INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); - ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, - "Expected number of SMs is less than reported."); return id; } @@ -156,7 +154,6 @@ // omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, currTeamDescr.LevelZeroTaskDescr()); newTaskDescr->ThreadLimit() = 
ThreadLimit; Index: libomptarget/deviceRTLs/nvptx/src/interface.h === --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -478,6 +478,8 @@ EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, +int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu === --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -129,7 +129,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); @@ -283,7 +283,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; SlotP = *SavedSharedSlot; StackP = *SavedSharedStack; @@ -321,7 +321,7 @@ DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); - void *P = DataSharingState.FramePtr[SourceWID]; + void * volatile P = DataSharingState.FramePtr[SourceWID]; DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); return P; } @@ -369,47 +369,31 @@ __threadfence_block(); } -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. 
-// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, -int16_t UseSharedMemory) { +INLINE void* data_sharing_push_stack_common(size_t PushSize) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode with uninitialized runtime."); -return om
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
ABataev added inline comments. Comment at: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu:442 +} + } while (!FrameP); Do you really need the loop here? Seems to me it is better to have it in the `else` branch of the `if (IsWarpMaster)` statement. Repository: rOMP OpenMP https://reviews.llvm.org/D53141 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
gtbercea updated this revision to Diff 171988. gtbercea added a comment. Address comments. Repository: rOMP OpenMP https://reviews.llvm.org/D53141 Files: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu libomptarget/deviceRTLs/nvptx/src/interface.h libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h libomptarget/deviceRTLs/nvptx/src/supporti.h Index: libomptarget/deviceRTLs/nvptx/src/supporti.h === --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -188,7 +188,6 @@ { void *ptr = malloc(size); PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -123,7 +123,7 @@ struct DataSharingStateTy { __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; void *StackPtr[DS_Max_Warp_Number]; - void *FramePtr[DS_Max_Warp_Number]; + void * volatile FramePtr[DS_Max_Warp_Number]; int32_t ActiveThreads[DS_Max_Warp_Number]; }; // Additional worker slot type which is initialized with the default worker slot Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -40,8 +40,6 @@ INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); - ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, - "Expected number of SMs is less than reported."); return id; } @@ -156,7 +154,6 @@ // omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, currTeamDescr.LevelZeroTaskDescr()); newTaskDescr->ThreadLimit() = ThreadLimit; 
Index: libomptarget/deviceRTLs/nvptx/src/interface.h === --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -478,6 +478,8 @@ EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, +int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); Index: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu === --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -129,7 +129,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; int32_t &ActiveT = DataSharingState.ActiveThreads[WID]; DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); @@ -283,7 +283,7 @@ __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; void *&StackP = DataSharingState.StackPtr[WID]; - void *&FrameP = DataSharingState.FramePtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; SlotP = *SavedSharedSlot; StackP = *SavedSharedStack; @@ -321,7 +321,7 @@ DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID); - void *P = DataSharingState.FramePtr[SourceWID]; + void * volatile P = DataSharingState.FramePtr[SourceWID]; DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); return P; } @@ -369,96 +369,107 @@ __threadfence_block(); } -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. 
If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, -int16_t UseSharedMemory) { +INLINE void* data_sharing_push_stack_common(size_t PushSize) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode with uninitialized runtime."); -return omptarge
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
ABataev added inline comments. Comment at: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu:389 unsigned WID = getWarpId(); + // void * volatile FramePointer = 0; void *&FrameP = DataSharingState.FramePtr[WID]; This must be removed Comment at: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu:438 +// point to the start of the new frame held in StackP. +//atomicExch((unsigned long long *)&FrameP, (unsigned long long)StackP); +FrameP = StackP; This must also be removed Comment at: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu:444 +} + } while (!FrameP); It is a very bad idea to have something like this without atomic instructions. Also, for writing, you need to use atomic instructions (plus, possibly, a `volatile` data type). Otherwise, it leads to undefined behavior and problems during optimization. Repository: rOMP OpenMP https://reviews.llvm.org/D53141 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
gtbercea updated this revision to Diff 170238. gtbercea added a comment. Refactor. Repository: rOMP OpenMP https://reviews.llvm.org/D53141 Files: libomptarget/deviceRTLs/nvptx/src/data_sharing.cu libomptarget/deviceRTLs/nvptx/src/interface.h libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu libomptarget/deviceRTLs/nvptx/src/supporti.h Index: libomptarget/deviceRTLs/nvptx/src/supporti.h === --- libomptarget/deviceRTLs/nvptx/src/supporti.h +++ libomptarget/deviceRTLs/nvptx/src/supporti.h @@ -188,7 +188,6 @@ { void *ptr = malloc(size); PRINT(LD_MEM, "malloc data of size %zu for %s: 0x%llx\n", size, msg, P64(ptr)); - ASSERT(LT_SAFETY, ptr, "failed to allocate %zu bytes for %s\n", size, msg); return ptr; } Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -40,8 +40,6 @@ INLINE unsigned smid() { unsigned id; asm("mov.u32 %0, %%smid;" : "=r"(id)); - ASSERT0(LT_FUSSY, nsmid() <= MAX_SM, - "Expected number of SMs is less than reported."); return id; } @@ -156,7 +154,6 @@ // omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, currTeamDescr.LevelZeroTaskDescr()); newTaskDescr->ThreadLimit() = ThreadLimit; Index: libomptarget/deviceRTLs/nvptx/src/interface.h === --- libomptarget/deviceRTLs/nvptx/src/interface.h +++ libomptarget/deviceRTLs/nvptx/src/interface.h @@ -478,6 +478,8 @@ EXTERN void __kmpc_data_sharing_init_stack(); EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, +int16_t UseSharedMemory); EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); EXTERN void __kmpc_data_sharing_pop_stack(void *a); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); Index: 
libomptarget/deviceRTLs/nvptx/src/data_sharing.cu === --- libomptarget/deviceRTLs/nvptx/src/data_sharing.cu +++ libomptarget/deviceRTLs/nvptx/src/data_sharing.cu @@ -369,96 +369,109 @@ __threadfence_block(); } -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void* __kmpc_data_sharing_push_stack(size_t DataSize, -int16_t UseSharedMemory) { +INLINE void* data_sharing_push_stack_common(size_t PushSize) { if (isRuntimeUninitialized()) { ASSERT0(LT_FUSSY, isSPMDMode(), "Expected SPMD mode with uninitialized runtime."); -return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(DataSize); +return omptarget_nvptx_SimpleThreadPrivateContext::Allocate(PushSize); } + // Only warp active master threads manage the stack. + bool IsWarpMaster = (getThreadId() % WARPSIZE) == 0; + // Add worst-case padding to DataSize so that future stack allocations are // correctly aligned. const size_t Alignment = 8; - if (DataSize % Alignment != 0) { -DataSize += (Alignment - DataSize % Alignment); - } + PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; // Frame pointer must be visible to all workers in the same warp. unsigned WID = getWarpId(); + // void * volatile FramePointer = 0; void *&FrameP = DataSharingState.FramePtr[WID]; - // Only warp active master threads manage the stack. - if (getThreadId() % WARPSIZE == 0) { -// SlotP will point to either the shared memory slot or an existing -// global memory slot. -__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; -void *&StackP = DataSharingState.StackPtr[WID]; - -// Compute the total memory footprint of the requested data. 
-// The master thread requires a stack only for itself. A worker -// thread (which at this point is a warp master) will require -// space for the variables of each thread in the warp, -// i.e. one DataSize chunk per warp lane. -// TODO: change WARPSIZE to the number of active threads in the warp. -size_t PushSize = IsMasterThread() ? DataSize : WARPSIZE * DataSize; + do { +if (IsWarpMaster) { + // SlotP will point to either the shared memory slot or an existing + // global memory slot
[PATCH] D53141: [OpenMP][libomptarget] Add runtime function for pushing coalesced global records
gtbercea updated this revision to Diff 170236. gtbercea added a comment. Herald added subscribers: cfe-commits, jholewinski. Refactor. Repository: rC Clang https://reviews.llvm.org/D53141 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp === --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -4238,16 +4238,17 @@ Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF), CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), S.getIterationVariable()->getType(), S.getBeginLoc()); +return; } + CGOpenMPRuntime::getDefaultDistScheduleAndChunk( + CGF, S, ScheduleKind, Chunk); } void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk( CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const { - if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) { -ScheduleKind = OMPC_SCHEDULE_static; -Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize( -S.getIterationVariable()->getType()), 1); - } + ScheduleKind = OMPC_SCHEDULE_static; + Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize( + S.getIterationVariable()->getType()), 1); } Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp === --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -4238,16 +4238,17 @@ Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF), CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0), S.getIterationVariable()->getType(), S.getBeginLoc()); +return; } + CGOpenMPRuntime::getDefaultDistScheduleAndChunk( + CGF, S, ScheduleKind, Chunk); } void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk( CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const { - if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) { -ScheduleKind = OMPC_SCHEDULE_static; -Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize( -S.getIterationVariable()->getType()), 1); - } + ScheduleKind = 
OMPC_SCHEDULE_static; + Chunk = CGF.Builder.getIntN(CGF.getContext().getTypeSize( + S.getIterationVariable()->getType()), 1); } ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits