diff options
Diffstat (limited to 'src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp')
-rw-r--r-- | src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp | 60 |
1 files changed, 39 insertions, 21 deletions
diff --git a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp index 8021476ed..559a213b9 100644 --- a/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp +++ b/src/video_core/host_shaders/queries_prefix_scan_sum_nosubgroups.comp @@ -32,25 +32,30 @@ #endif BEGIN_PUSH_CONSTANTS -UNIFORM(0) uint max_accumulation_base; -UNIFORM(1) uint accumulation_limit; +UNIFORM(0) uint min_accumulation_base; +UNIFORM(1) uint max_accumulation_base; +UNIFORM(2) uint accumulation_limit; +UNIFORM(3) uint buffer_offset; END_PUSH_CONSTANTS -layout(local_size_x = 32) in; +#define LOCAL_RESULTS 4 +#define QUERIES_PER_INVOC 2048 + +layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in; layout(std430, binding = 0) readonly buffer block1 { - uvec2 input_data[gl_WorkGroupSize.x]; + uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; }; layout(std430, binding = 1) writeonly coherent buffer block2 { - uvec2 output_data[gl_WorkGroupSize.x]; + uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; }; layout(std430, binding = 2) coherent buffer block3 { uvec2 accumulated_data; }; -shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; +shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS]; uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { uint carry = 0; @@ -62,23 +67,31 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { void main(void) { uint id = gl_LocalInvocationID.x; - uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); - uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); + uvec2 base_value[LOCAL_RESULTS]; + const uvec2 accum = accumulated_data; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base + ? accumulated_data + : uvec2(0); + } uint work_size = gl_WorkGroupSize.x; uint rd_id; uint wr_id; uint mask; - uvec2 input_1 = input_data[id * 2]; - uvec2 input_2 = input_data[id * 2 + 1]; + uvec2 inputs[LOCAL_RESULTS]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i]; + } // The number of steps is the log base 2 of the // work group size, which should be a power of 2 - const uint steps = uint(log2(work_size)) + 1; + const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS)); uint step = 0; // Each invocation is responsible for the content of // two elements of the output array - shared_data[id * 2] = input_1; - shared_data[id * 2 + 1] = input_2; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + shared_data[id * LOCAL_RESULTS + i] = inputs[i]; + } // Synchronize to make sure that everyone has initialized // their elements of shared_data[] with data loaded from // the input arrays @@ -100,21 +113,26 @@ void main(void) { memoryBarrierShared(); } // Add the accumulation - shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); - shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); + for (uint i = 0; i < LOCAL_RESULTS; i++) { + shared_data[id * LOCAL_RESULTS + i] = + AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]); + } barrier(); memoryBarrierShared(); // Finally write our data back to the output buffer - output_data[id * 2] = shared_data[id * 2]; - output_data[id * 2 + 1] = shared_data[id * 2 + 1]; + for (uint i = 0; i < LOCAL_RESULTS; i++) { + output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i]; + } if (id == 0) { - if (max_accumulation_base >= accumulation_limit + 1) { + if (min_accumulation_base >= accumulation_limit + 1) { accumulated_data = shared_data[accumulation_limit]; return; } - uvec2 value_1 = shared_data[max_accumulation_base]; - uvec2 value_2 = shared_data[accumulation_limit]; - accumulated_data = AddUint64(value_1, -value_2); + uvec2 reset_value = shared_data[max_accumulation_base - 1]; + uvec2 final_value = shared_data[accumulation_limit]; + // Two complements + reset_value = AddUint64(uvec2(1, 0), ~reset_value); + accumulated_data = AddUint64(final_value, reset_value); } }
\ No newline at end of file |