Commit: 93147cab0350151a376a65d244dc5b275a0baa3e
Author: Bastien Montagne
Date:   Tue Nov 5 12:18:05 2019 +0100
Branches: tmp-task-foreach-pool
https://developer.blender.org/rB93147cab0350151a376a65d244dc5b275a0baa3e
BLI_task: Initial implementation of pooled threaded index range iterator.

This code allows pushing a set of different operations, all based on
iterations over a range of indices, and then processing them all at once
over multiple threads.

As expected, this is mainly interesting for a relatively low number of
individual tasks.

E.g. performance tests on a 32-thread machine, for a set of 10 different
tasks, show the following improvements when using the pooled version
instead of ten sequential calls to `BLI_task_parallel_range()`:

| Num Items | Sequential |  Pooled | Speed-up |
| --------- | ---------- | ------- | -------- |
|       10K |     365 us |  138 us |    2.5 x |
|      100K |     877 us |  530 us |   1.66 x |
|     1000K |    5521 us | 4625 us |   1.25 x |

Differential Revision: https://developer.blender.org/D6189

===================================================================

M  source/blender/blenlib/BLI_task.h
M  source/blender/blenlib/intern/task.c
M  tests/gtests/blenlib/BLI_task_performance_test.cc
M  tests/gtests/blenlib/BLI_task_test.cc

===================================================================

diff --git a/source/blender/blenlib/BLI_task.h b/source/blender/blenlib/BLI_task.h
index 7ef5e518cc8..05c3d43a0de 100644
--- a/source/blender/blenlib/BLI_task.h
+++ b/source/blender/blenlib/BLI_task.h
@@ -196,9 +196,22 @@ void BLI_task_parallel_range(const int start,
                              const int stop,
                              void *userdata,
                              TaskParallelRangeFunc func,
-                             const TaskParallelSettings *settings);
-
-/* This data is shared between all tasks, its access needs thread lock or similar protection. */
+                             TaskParallelSettings *settings);
+
+typedef struct TaskParallelRangePool TaskParallelRangePool;
+struct TaskParallelRangePool *BLI_task_parallel_range_pool_init(
+    const struct TaskParallelSettings *settings);
+void BLI_task_parallel_range_pool_push(struct TaskParallelRangePool *range_pool,
+                                       const int start,
+                                       const int stop,
+                                       void *userdata,
+                                       TaskParallelRangeFunc func,
+                                       const struct TaskParallelSettings *settings);
+void BLI_task_parallel_range_pool_work_and_wait(struct TaskParallelRangePool *range_pool);
+void BLI_task_parallel_range_pool_free(struct TaskParallelRangePool *range_pool);
+
+/* This data is shared between all tasks, its access needs thread lock or similar protection.
+ */
 typedef struct TaskParallelIteratorStateShared {
   /* Maximum amount of items to acquire at once. */
   int chunk_size;
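[Editor's note: for illustration, a minimal usage sketch of the pooled API declared in the header hunk above. The callback, data and function names below are hypothetical and not part of this commit; it assumes settings are initialized with the existing `BLI_parallel_range_settings_defaults()`, exactly as for a plain `BLI_task_parallel_range()` call.]

#include "BLI_task.h"
#include "BLI_utildefines.h"

/* Hypothetical callback: square one element of a float array. */
static void square_func(void *__restrict userdata,
                        const int i,
                        const TaskParallelTLS *__restrict UNUSED(tls))
{
  float *values = userdata;
  values[i] *= values[i];
}

/* Push two independent ranges into one pool, then process them together
 * over the available worker threads. */
static void square_two_arrays(float *array_a, int len_a, float *array_b, int len_b)
{
  TaskParallelSettings settings;
  BLI_parallel_range_settings_defaults(&settings);

  TaskParallelRangePool *range_pool = BLI_task_parallel_range_pool_init(&settings);
  BLI_task_parallel_range_pool_push(range_pool, 0, len_a, array_a, square_func, &settings);
  BLI_task_parallel_range_pool_push(range_pool, 0, len_b, array_b, square_func, &settings);
  BLI_task_parallel_range_pool_work_and_wait(range_pool);
  BLI_task_parallel_range_pool_free(range_pool);
}

The intent, per the commit message, is to amortize the threading overhead across many small ranges instead of paying it once per `BLI_task_parallel_range()` call.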
diff --git a/source/blender/blenlib/intern/task.c b/source/blender/blenlib/intern/task.c
index bb69dc6452f..0088fd55648 100644
--- a/source/blender/blenlib/intern/task.c
+++ b/source/blender/blenlib/intern/task.c
@@ -1042,15 +1042,49 @@ void BLI_task_pool_delayed_push_end(TaskPool *pool, int thread_id)
   if (((_mem) != NULL) && ((_size) > 8192)) \
     MEM_freeN((_mem))
 
-typedef struct ParallelRangeState {
+/* Stores all needed data to perform a parallelized iteration,
+ * with a same operation (callback function).
+ * It can be chained with other tasks in a single-linked list way. */
+typedef struct TaskParallelRangeState {
+  struct TaskParallelRangeState *next;
+
   int start, stop;
   void *userdata;
   TaskParallelRangeFunc func;
 
+  /* Each instance of looping chunks will get a copy of this data
+   * (similar to OpenMP's firstprivate).
+   */
+  void *userdata_chunk;       /* Pointer to actual data. */
+  size_t userdata_chunk_size; /* Size of that data. */
+  void *userdata_chunk_array; /* Array of 'tls' copies of userdata_chunk for each running task. */
+  size_t userdata_chunk_len;  /* Number of items in the array, i.e. number of worker threads. */
+
+  /* Function called from calling thread once whole range have been processed. */
+  TaskParallelFinalizeFunc func_finalize;
+
   int iter;
   int chunk_size;
-} ParallelRangeState;
+} TaskParallelRangeState;
+
+/* Stores all the parallel tasks for a single pool. */
+typedef struct TaskParallelRangePool {
+  /* The workers' task pool. */
+  TaskPool *pool;
+  /* The number of worker tasks we need to create. */
+  int num_tasks;
+  /* The total number of iterations in all the added ranges. */
+  int num_iters;
+  /* The size (number of items) processed at once by a worker task. */
+  int chunk_size;
+
+  /* Linked list of range tasks to process. */
+  TaskParallelRangeState *parallel_range_tasks;
+  /* Current range task beeing processed, swapped atomically. */
+  TaskParallelRangeState *current_task;
+  /* Scheduling settings common to all tasks. */
+  TaskParallelSettings *settings;
+} TaskParallelRangePool;
 
 BLI_INLINE void task_parallel_calc_chunk_size(const TaskParallelSettings *settings,
                                               const int tot_items,
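[Editor's note: to make the 'firstprivate'-style userdata_chunk and the finalize callback described in the struct comments above concrete, here is a hedged reduction sketch. All names in it are hypothetical, and it assumes the pooled code keeps the existing `BLI_task_parallel_range()` behaviour of invoking `func_finalize` from the calling thread for each worker-local copy of the chunk; the corresponding part of the diff is truncated below.]

#include "BLI_task.h"

typedef struct SumData {
  const float *values;
  double total;
} SumData;

/* Per-worker partial sum; each worker thread gets its own copy of this chunk. */
typedef struct SumChunk {
  double accum;
} SumChunk;

static void sum_range_func(void *__restrict userdata,
                           const int i,
                           const TaskParallelTLS *__restrict tls)
{
  const SumData *data = userdata;
  SumChunk *chunk = tls->userdata_chunk;
  chunk->accum += (double)data->values[i];
}

/* Runs on the calling thread once the range is done; merges one worker's copy. */
static void sum_finalize(void *__restrict userdata, void *__restrict userdata_chunk)
{
  SumData *data = userdata;
  const SumChunk *chunk = userdata_chunk;
  data->total += chunk->accum;
}

static double sum_two_arrays(const float *a, int len_a, const float *b, int len_b)
{
  SumData data_a = {.values = a, .total = 0.0};
  SumData data_b = {.values = b, .total = 0.0};
  SumChunk chunk_template = {.accum = 0.0};

  TaskParallelSettings settings;
  BLI_parallel_range_settings_defaults(&settings);
  settings.userdata_chunk = &chunk_template;
  settings.userdata_chunk_size = sizeof(chunk_template);
  settings.func_finalize = sum_finalize;

  TaskParallelRangePool *range_pool = BLI_task_parallel_range_pool_init(&settings);
  BLI_task_parallel_range_pool_push(range_pool, 0, len_a, &data_a, sum_range_func, &settings);
  BLI_task_parallel_range_pool_push(range_pool, 0, len_b, &data_b, sum_range_func, &settings);
  BLI_task_parallel_range_pool_work_and_wait(range_pool);
  BLI_task_parallel_range_pool_free(range_pool);

  return data_a.total + data_b.total;
}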
@@ -1113,66 +1147,95 @@ BLI_INLINE void task_parallel_calc_chunk_size(const TaskParallelSettin
   }
 }
 
-BLI_INLINE void task_parallel_range_calc_chunk_size(const TaskParallelSettings *settings,
-                                                    const int num_tasks,
-                                                    ParallelRangeState *state)
+BLI_INLINE void task_parallel_range_calc_chunk_size(TaskParallelRangePool *range_pool)
 {
+  int num_iters = 0;
+  for (TaskParallelRangeState *state = range_pool->parallel_range_tasks; state != NULL;
+       state = state->next) {
+    num_iters += state->stop - state->start;
+  }
+  range_pool->num_iters = num_iters;
   task_parallel_calc_chunk_size(
-      settings, state->stop - state->start, num_tasks, &state->chunk_size);
+      range_pool->settings, num_iters, range_pool->num_tasks, &range_pool->chunk_size);
 }
 
-BLI_INLINE bool parallel_range_next_iter_get(ParallelRangeState *__restrict state,
-                                             int *__restrict iter,
-                                             int *__restrict count)
+BLI_INLINE bool parallel_range_next_iter_get(TaskParallelRangePool *__restrict range_pool,
+                                             int *__restrict r_iter,
+                                             int *__restrict r_count,
+                                             TaskParallelRangeState **__restrict r_state)
 {
-  int previter = atomic_fetch_and_add_int32(&state->iter, state->chunk_size);
+  TaskParallelRangeState *state;
+  int previter = INT32_MAX;
 
-  *iter = previter;
-  *count = max_ii(0, min_ii(state->chunk_size, state->stop - previter));
+  do {
+    if ((state = range_pool->current_task) == NULL) {
+      break;
+    }
 
-  return (previter < state->stop);
+    previter = atomic_fetch_and_add_int32(&state->iter, range_pool->chunk_size);
+    *r_iter = previter;
+    *r_count = max_ii(0, min_ii(range_pool->chunk_size, state->stop - previter));
+
+    if (previter >= state->stop) {
+      /* At that point the state we got is done, we need to go to the next one. In case some other
+       * thread already did it, then this does nothing, and we'll just get current valid state
+       * at start of the next loop.
+       */
+      atomic_cas_ptr((void **)&range_pool->current_task, state, state->next);
+    }
+  } while (state != NULL && previter >= state->stop);
+
+  *r_state = state;
+  return (state != NULL && previter < state->stop);
 }
 
-static void parallel_range_func(TaskPool *__restrict pool, void *userdata_chunk, int thread_id)
+static void parallel_range_func(TaskPool *__restrict pool, void *userdata_chunk_idx, int thread_id)
 {
-  ParallelRangeState *__restrict state = BLI_task_pool_userdata(pool);
+  TaskParallelRangePool *__restrict range_pool = BLI_task_pool_userdata(pool);
   TaskParallelTLS tls = {
       .thread_id = thread_id,
-      .userdata_chunk = userdata_chunk,
+      .userdata_chunk = NULL,
   };
+  TaskParallelRangeState *state;
   int iter, count;
-  while (parallel_range_next_iter_get(state, &iter, &count)) {
+  while (parallel_range_next_iter_get(range_pool, &iter, &count, &state)) {
+    tls.userdata_chunk = (char *)state->userdata_chunk_array +
+                         (((size_t)POINTER_AS_INT(userdata_chunk_idx)) *
+                          state->userdata_chunk_size);
     for (int i = 0; i < count; i++) {
       state->func(state->userdata, iter + i, &tls);
     }
   }
 }
 
-static void parallel_range_single_thread(const int start,
-                                         int const stop,
-                                         void *userdata,
-                                         TaskParallelRangeFunc func,
-                                         const TaskParallelSettings *settings)
+static void parallel_range_single_thread(TaskParallelRangePool *range_pool)
 {
-  void *userdata_chunk = settings->userdata_chunk;
-  const size_t userdata_chunk_size = settings->userdata_chunk_size;
-  void *userdata_chunk_local = NULL;
-  const bool use_userdata_chunk = (userdata_chunk_size != 0) && (userdata_chunk != NULL);
-  if (use_userdata_chunk) {
-    userdata_chunk_local = MALLOCA(userdata_chunk_size);
-    memcpy(userdata_chunk_local, userdata_chunk, userdata_chunk_size);
-  }
-  TaskParallelTLS tls = {
-      .thread_id = 0,
-      .userdata_chunk = userdata_chunk_local,
-  };
-  for (int i = start; i < stop; i++) {
-    func(userdata, i, &tls);
-  }
-  if (settings->func_finalize != NULL) {
-    settings->func_finalize(userdata, userdata_chunk_local);
+  for (TaskParallelRangeState *state = range_pool->parallel_range_tasks; state != NULL;
+       state = state->next) {
+    const int start = state->start;
+    const int stop = state->stop;
+    void *userdata = state->userdata;
+    TaskParallelRangeFunc func = state->func;
+
+    void *userdata_chunk = state->userdata_chunk;
+    const size_t userdata_chunk_size = state->userdata_chunk_size;
+    void *userdata_chunk_local = NULL;
+    const bool use_userdata_chunk = (userdata_chunk_size != 0) && (userdata_chunk != NULL);
+    if (use_userdata_chunk) {
+      userdata_chunk_local = MALLOCA(userdata_chunk_size);
+      memcpy(userdata_chunk_local, userdata_chunk, userdata_chunk_size);
+    }
+    TaskParallelTLS tls = {
+        .thread_id = 0,
+        .userdata_chunk = userdata_chunk_local,
+    };
+    for (int i = start; i < stop; i++) {
+      func(userdata, i, &tls);
+    }
+    if (state->func_finalize != NULL) {
+      state->func_finalize(userdata, userdata_chunk_local);
+    }
+    MALLOCA_FREE(userdata_chunk_local, userdata_chunk_size);
   }
-  MALLOCA_FREE(userdata_chunk_local, userdata_chunk_size);
 }
 
 /**
@@ -1185,78 +1248,84 @@ void BLI_task_parallel_range(const int start,
                              const int stop,
                              void *userdata,
                              TaskParallelRangeFunc func,
-                             const TaskParallelSettings *settings)
+                             TaskParallelSettings *settings)
 {
-  TaskScheduler *task_scheduler;
-  TaskPool *task_pool;
-  ParallelRangeState state;
-  int i, num_threads, num_tasks;
-
-  void *userdata_chunk = settings->userdata_chunk;
-  const size_t userdata_chunk_size = settings->userdata_chunk_size;
-  void *userdata_chunk_local = NULL
@@ Diff output truncated at 10240 characters. @@

_______________________________________________
Bf-blender-cvs mailing list
Bf-blender-cvs@blender.org
https://lists.blender.org/mailman/listinfo/bf-blender-cvs