Commit: 30f4576d51bd247488da8d250b541539a3bd5fa8 Author: Lukas Stockner Date: Thu Mar 30 00:25:45 2017 +0200 Branches: temp-cycles-denoising https://developer.blender.org/rB30f4576d51bd247488da8d250b541539a3bd5fa8
Cycles Denoising: Refactor design row construction This change improves the shared memory access pattern, reduces the local memory requirement by 11 floats per thread and saves some memory copying. =================================================================== M intern/cycles/filter/filter_features.h M intern/cycles/filter/filter_reconstruction.h M intern/cycles/filter/filter_transform.h M intern/cycles/filter/filter_transform_gpu.h M intern/cycles/filter/filter_transform_sse.h M intern/cycles/util/util_math_matrix.h =================================================================== diff --git a/intern/cycles/filter/filter_features.h b/intern/cycles/filter/filter_features.h index b25649f497a..f4f6e1f7639 100644 --- a/intern/cycles/filter/filter_features.h +++ b/intern/cycles/filter/filter_features.h @@ -16,7 +16,7 @@ CCL_NAMESPACE_BEGIN -#define ccl_get_feature(pass) buffer[(pass)*pass_stride] +#define ccl_get_feature(buffer, pass) buffer[(pass)*pass_stride] /* Loop over the pixels in the range [low.x, high.x) x [low.y, high.y). * pixel_buffer always points to the current pixel in the first pass. 
*/ @@ -28,32 +28,18 @@ pixel_buffer += buffer_w - (high.x - low.x); \ } -ccl_device_inline void filter_get_feature_mean(int2 pixel, ccl_global float ccl_readonly_ptr buffer, float *features, int pass_stride) -{ - features[0] = pixel.x; - features[1] = pixel.y; - features[2] = ccl_get_feature(0); - features[3] = ccl_get_feature(1); - features[4] = ccl_get_feature(2); - features[5] = ccl_get_feature(3); - features[6] = ccl_get_feature(4); - features[7] = ccl_get_feature(5); - features[8] = ccl_get_feature(6); - features[9] = ccl_get_feature(7); -} - ccl_device_inline void filter_get_features(int2 pixel, ccl_global float ccl_readonly_ptr buffer, ccl_local_param float *features, float ccl_readonly_ptr mean, int pass_stride) { features[0] = pixel.x; features[1] = pixel.y; - features[2] = ccl_get_feature(0); - features[3] = ccl_get_feature(1); - features[4] = ccl_get_feature(2); - features[5] = ccl_get_feature(3); - features[6] = ccl_get_feature(4); - features[7] = ccl_get_feature(5); - features[8] = ccl_get_feature(6); - features[9] = ccl_get_feature(7); + features[2] = ccl_get_feature(buffer, 0); + features[3] = ccl_get_feature(buffer, 1); + features[4] = ccl_get_feature(buffer, 2); + features[5] = ccl_get_feature(buffer, 3); + features[6] = ccl_get_feature(buffer, 4); + features[7] = ccl_get_feature(buffer, 5); + features[8] = ccl_get_feature(buffer, 6); + features[9] = ccl_get_feature(buffer, 7); if(mean) { for(int i = 0; i < DENOISE_FEATURES; i++) features[i] -= mean[i]; @@ -64,14 +50,14 @@ ccl_device_inline void filter_get_feature_scales(int2 pixel, ccl_global float cc { scales[0] = fabsf(pixel.x - mean[0]); scales[1] = fabsf(pixel.y - mean[1]); - scales[2] = fabsf(ccl_get_feature(0) - mean[2]); - scales[3] = len_squared(make_float3(ccl_get_feature(1) - mean[3], - ccl_get_feature(2) - mean[4], - ccl_get_feature(3) - mean[5])); - scales[4] = fabsf(ccl_get_feature(4) - mean[6]); - scales[5] = len_squared(make_float3(ccl_get_feature(5) - mean[7], - ccl_get_feature(6) 
- mean[8], - ccl_get_feature(7) - mean[9])); + scales[2] = fabsf(ccl_get_feature(buffer, 0) - mean[2]); + scales[3] = len_squared(make_float3(ccl_get_feature(buffer, 1) - mean[3], + ccl_get_feature(buffer, 2) - mean[4], + ccl_get_feature(buffer, 3) - mean[5])); + scales[4] = fabsf(ccl_get_feature(buffer, 4) - mean[6]); + scales[5] = len_squared(make_float3(ccl_get_feature(buffer, 5) - mean[7], + ccl_get_feature(buffer, 6) - mean[8], + ccl_get_feature(buffer, 7) - mean[9])); } ccl_device_inline void filter_calculate_scale(float *scale) @@ -86,12 +72,12 @@ ccl_device_inline void filter_calculate_scale(float *scale) ccl_device_inline float3 filter_get_pixel_color(ccl_global float ccl_readonly_ptr buffer, int pass_stride) { - return make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2)); + return make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2)); } ccl_device_inline float filter_get_pixel_variance(ccl_global float ccl_readonly_ptr buffer, int pass_stride) { - return average(make_float3(ccl_get_feature(0), ccl_get_feature(1), ccl_get_feature(2))); + return average(make_float3(ccl_get_feature(buffer, 0), ccl_get_feature(buffer, 1), ccl_get_feature(buffer, 2))); } ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_variance, float3 center_color, float sqrt_center_variance) @@ -101,27 +87,41 @@ ccl_device_inline bool filter_firefly_rejection(float3 pixel_color, float pixel_ return (color_diff > 3.0f*variance); } +ccl_device_inline void design_row_add(float ccl_local_param *design_row, + int rank, + ccl_global float ccl_readonly_ptr transform, + int stride, + int row, + float feature) +{ + for(int i = 0; i < rank; i++) { + design_row[1+i] += transform[(row*DENOISE_FEATURES + i)*stride]*feature; + } +} + /* Fill the design row without computing the weight. 
*/ -ccl_device_inline void filter_get_design_row_transform(int2 pixel, - ccl_global float ccl_readonly_ptr buffer, - float ccl_readonly_ptr feature_means, +ccl_device_inline void filter_get_design_row_transform(int2 p_pixel, + ccl_global float ccl_readonly_ptr p_buffer, + int2 q_pixel, + ccl_global float ccl_readonly_ptr q_buffer, int pass_stride, - ccl_local_param float *features, int rank, - float *design_row, - ccl_global float ccl_readonly_ptr feature_transform, - int transform_stride) + float ccl_local_param *design_row, + ccl_global float ccl_readonly_ptr transform, + int stride) { - filter_get_features(pixel, buffer, features, feature_means, pass_stride); design_row[0] = 1.0f; - for(int d = 0; d < rank; d++) { -#ifdef __KERNEL_GPU__ - float x = math_vector_dot_strided(features, feature_transform + d*DENOISE_FEATURES*transform_stride, transform_stride, DENOISE_FEATURES); -#else - float x = math_vector_dot(features, feature_transform + d*DENOISE_FEATURES, DENOISE_FEATURES); -#endif - design_row[1+d] = x; - } + math_local_vector_zero(design_row+1, rank); + design_row_add(design_row, rank, transform, stride, 0, q_pixel.x - p_pixel.x); + design_row_add(design_row, rank, transform, stride, 1, q_pixel.y - p_pixel.y); + design_row_add(design_row, rank, transform, stride, 2, ccl_get_feature(q_buffer, 0) - ccl_get_feature(p_buffer, 0)); + design_row_add(design_row, rank, transform, stride, 3, ccl_get_feature(q_buffer, 1) - ccl_get_feature(p_buffer, 1)); + design_row_add(design_row, rank, transform, stride, 4, ccl_get_feature(q_buffer, 2) - ccl_get_feature(p_buffer, 2)); + design_row_add(design_row, rank, transform, stride, 5, ccl_get_feature(q_buffer, 3) - ccl_get_feature(p_buffer, 3)); + design_row_add(design_row, rank, transform, stride, 6, ccl_get_feature(q_buffer, 4) - ccl_get_feature(p_buffer, 4)); + design_row_add(design_row, rank, transform, stride, 7, ccl_get_feature(q_buffer, 5) - ccl_get_feature(p_buffer, 5)); + design_row_add(design_row, rank, transform, 
stride, 8, ccl_get_feature(q_buffer, 6) - ccl_get_feature(p_buffer, 6)); + design_row_add(design_row, rank, transform, stride, 9, ccl_get_feature(q_buffer, 7) - ccl_get_feature(p_buffer, 7)); } CCL_NAMESPACE_END diff --git a/intern/cycles/filter/filter_reconstruction.h b/intern/cycles/filter/filter_reconstruction.h index f665e7e5f1e..70dfedce453 100644 --- a/intern/cycles/filter/filter_reconstruction.h +++ b/intern/cycles/filter/filter_reconstruction.h @@ -38,11 +38,11 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, const int stride = 1; (void)storage_stride; (void)localIdx; - float features[DENOISE_FEATURES]; + float design_row[DENOISE_FEATURES+1]; #else const int stride = storage_stride; - ccl_local float shared_features[DENOISE_FEATURES*CCL_MAX_LOCAL_SIZE]; - ccl_local_param float *features = shared_features + localIdx*DENOISE_FEATURES; + ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE]; + ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1); #endif float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride); @@ -55,11 +55,9 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y, return; } - float feature_means[DENOISE_FEATURES]; - filter_get_feature_mean(make_int2(x, y), buffer + p_offset, feature_means, pass_stride); - - float design_row[DENOISE_FEATURES+1]; - filter_get_design_row_transform(make_int2(x+dx, y+dy), buffer + q_offset, feature_means, pass_stride, features, *rank, design_row, transform, stride); + filter_get_design_row_transform(make_int2(x, y), buffer + p_offset, + make_int2(x+dx, y+dy), buffer + q_offset, + pass_stride, *rank, design_row, transform, stride); math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride); math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride); diff --git a/intern/cycles/filter/filter_transform.h b/intern/cycles/filter/filter_transform.h index 
3f161b06024..7e2504612e7 100644 --- a/intern/cycles/filter/filter_transform.h +++ b/intern/cycles/filter/filter_transform.h @@ -106,6 +106,7 @@ ccl_device void kernel_filter_construct_transform(int sample, float ccl_readonly math_vector_mul(transform + (*rank)*DENOISE_FEATURES, feature_scale, DENOISE_FEATURES); } } + math_matrix_transpose(transform, DENOISE_FEATURES, 1); } CCL_NAMESPACE_END diff @@ Diff output truncated at 10240 characters. @@ _______________________________________________ Bf-blender-cvs mailing list Bf-blender-cvs@blender.org https://lists.blender.org/mailman/listinfo/bf-blender-cvs