// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#version 450

// The same source is compiled for Vulkan and OpenGL. The macros below hide the
// difference between push constants (Vulkan) and plain located uniforms (OpenGL).
#ifdef VULKAN

#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1

#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0

#endif

layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;

BEGIN_PUSH_CONSTANTS
UNIFORM(1) uvec2 block_dims;
UNIFORM(2) uint layer_stride;
UNIFORM(3) uint block_size;
UNIFORM(4) uint x_shift;
UNIFORM(5) uint block_height;
UNIFORM(6) uint block_height_mask;
END_PUSH_CONSTANTS

// One decoded integer-sequence element packed into 32 bits:
//   bits  0..7  encoding kind (JUST_BITS / QUINT / TRIT)
//   bits  8..15 number of plain bits
//   bits 16..23 value of the plain bits
//   bits 24..31 quint or trit value
struct EncodingData {
    uint data;
};

layout(binding = BINDING_INPUT_BUFFER, std430) readonly restrict buffer InputBufferU32 {
    uvec4 astc_data[];
};

layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly restrict image2DArray dest_image;

const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;

// Each compressed block is a uvec4, i.e. 16 bytes
const uint BYTES_PER_BLOCK_LOG2 = 4;

const uint JUST_BITS = 0u;
const uint QUINT = 1u;
const uint TRIT = 2u;

// ASTC Encodings data, sorted in ascending order based on their BitLength value
// (see GetBitLength() function)
const uvec4 encoding_values[6] = uvec4[](
    uvec4((JUST_BITS), (JUST_BITS | (1u << 8u)), (TRIT), (JUST_BITS | (2u << 8u))),
    uvec4((QUINT), (TRIT | (1u << 8u)), (JUST_BITS | (3u << 8u)), (QUINT | (1u << 8u))),
    uvec4((TRIT | (2u << 8u)), (JUST_BITS | (4u << 8u)), (QUINT | (2u << 8u)),
          (TRIT | (3u << 8u))),
    uvec4((JUST_BITS | (5u << 8u)), (QUINT | (3u << 8u)), (TRIT | (4u << 8u)),
          (JUST_BITS | (6u << 8u))),
    uvec4((QUINT | (4u << 8u)), (TRIT | (5u << 8u)), (JUST_BITS | (7u << 8u)),
          (QUINT | (5u << 8u))),
    uvec4((TRIT | (6u << 8u)), (JUST_BITS | (8u << 8u)), 0u, 0u));

// Input ASTC texture globals
int total_bitsread = 0;
uvec4 local_buff;

// Color data globals
uvec4 color_endpoint_data;
int color_bitsread = 0;

// Global "vector" to be pushed into when decoding
// At most will require BLOCK_WIDTH x BLOCK_HEIGHT in single plane mode
// At most will require BLOCK_WIDTH x BLOCK_HEIGHT x 2 in dual plane mode
// So the maximum would be 144 (12 x 12) elements, x 2 for two planes
// Arguments and result are fully parenthesized so the macro is safe with any expression.
#define DIVCEIL(number, divisor) (((number) + (divisor) - 1) / (divisor))
#define ARRAY_NUM_ELEMENTS 144
#define VECTOR_ARRAY_SIZE DIVCEIL(ARRAY_NUM_ELEMENTS * 2, 4)
uvec4 result_vector[VECTOR_ARRAY_SIZE];

uint result_index = 0;
uint result_vector_max_index;
bool result_limit_reached = false;

// EncodingData helpers: getters
uint Encoding(EncodingData val) {
    return bitfieldExtract(val.data, 0, 8);
}
uint NumBits(EncodingData val) {
    return bitfieldExtract(val.data, 8, 8);
}
uint BitValue(EncodingData val) {
    return bitfieldExtract(val.data, 16, 8);
}
uint QuintTritValue(EncodingData val) {
    return bitfieldExtract(val.data, 24, 8);
}

// EncodingData helpers: setters (overloads of the getters taking the new field value)
void Encoding(inout EncodingData val, uint v) {
    val.data = bitfieldInsert(val.data, v, 0, 8);
}
void NumBits(inout EncodingData val, uint v) {
    val.data = bitfieldInsert(val.data, v, 8, 8);
}
void BitValue(inout EncodingData val, uint v) {
    val.data = bitfieldInsert(val.data, v, 16, 8);
}
void QuintTritValue(inout EncodingData val, uint v) {
    val.data = bitfieldInsert(val.data, v, 24, 8);
}

// Packs the four fields into a single EncodingData. Fields are not masked, callers
// are expected to pass values that fit in 8 bits each.
EncodingData CreateEncodingData(uint encoding, uint num_bits, uint bit_val, uint quint_trit_val) {
    return EncodingData(((encoding) << 0u) | ((num_bits) << 8u) | ((bit_val) << 16u) |
                        ((quint_trit_val) << 24u));
}

// Appends val to result_vector unless the per-phase limit has been reached, in which
// case result_limit_reached is raised and the value is dropped.
void ResultEmplaceBack(EncodingData val) {
    if (result_index >= result_vector_max_index) {
        // Alert callers to avoid decoding more than needed by this phase
        result_limit_reached = true;
        return;
    }
    const uint array_index = result_index / 4u;
    const uint vector_index = result_index % 4u;
    result_vector[array_index][vector_index] = val.data;
    ++result_index;
}

// Expands each 8-bit component to 16 bits by repeating the byte (0xAB -> 0xABAB)
uvec4 ReplicateByteTo16(uvec4 value) {
    return value * 0x101;
}

// Expands a single bit to 7 identical bits
uint ReplicateBitTo7(uint value) {
    return value * 127;
}

// Expands a single bit to 9 identical bits
uint ReplicateBitTo9(uint value) {
    return value * 511;
}

// Table driven bit replication of a num_bits wide value to 8 bits.
// Widths without a table (e.g. 8) return the value unchanged.
uint FastReplicateTo8(uint value, uint num_bits) {
    if (value == 0) {
        return 0;
    }
    const uint array_index = value / 4;
    const uint vector_index = bitfieldExtract(value, 0, 2);
    switch (num_bits) {
    case 1:
        return 255;
    case 2: {
        const uvec4 REPLICATE_2_BIT_TO_8_TABLE = (uvec4(0, 85, 170, 255));
        return REPLICATE_2_BIT_TO_8_TABLE[vector_index];
    }
    case 3: {
        const uvec4 REPLICATE_3_BIT_TO_8_TABLE[2] =
            uvec4[](uvec4(0, 36, 73, 109), uvec4(146, 182, 219, 255));
        return REPLICATE_3_BIT_TO_8_TABLE[array_index][vector_index];
    }
    case 4: {
        const uvec4 REPLICATE_4_BIT_TO_8_TABLE[4] =
            uvec4[](uvec4(0, 17, 34, 51), uvec4(68, 85, 102, 119), uvec4(136, 153, 170, 187),
                    uvec4(204, 221, 238, 255));
        return REPLICATE_4_BIT_TO_8_TABLE[array_index][vector_index];
    }
    case 5: {
        const uvec4 REPLICATE_5_BIT_TO_8_TABLE[8] =
            uvec4[](uvec4(0, 8, 16, 24), uvec4(33, 41, 49, 57), uvec4(66, 74, 82, 90),
                    uvec4(99, 107, 115, 123), uvec4(132, 140, 148, 156),
                    uvec4(165, 173, 181, 189), uvec4(198, 206, 214, 222),
                    uvec4(231, 239, 247, 255));
        return REPLICATE_5_BIT_TO_8_TABLE[array_index][vector_index];
    }
    case 6: {
        const uvec4 REPLICATE_6_BIT_TO_8_TABLE[16] = uvec4[](
            uvec4(0, 4, 8, 12), uvec4(16, 20, 24, 28), uvec4(32, 36, 40, 44),
            uvec4(48, 52, 56, 60), uvec4(65, 69, 73, 77), uvec4(81, 85, 89, 93),
            uvec4(97, 101, 105, 109), uvec4(113, 117, 121, 125), uvec4(130, 134, 138, 142),
            uvec4(146, 150, 154, 158), uvec4(162, 166, 170, 174), uvec4(178, 182, 186, 190),
            uvec4(195, 199, 203, 207), uvec4(211, 215, 219, 223), uvec4(227, 231, 235, 239),
            uvec4(243, 247, 251, 255));
        return REPLICATE_6_BIT_TO_8_TABLE[array_index][vector_index];
    }
    case 7: {
        const uvec4 REPLICATE_7_BIT_TO_8_TABLE[32] =
            uvec4[](uvec4(0, 2, 4, 6), uvec4(8, 10, 12, 14), uvec4(16, 18, 20, 22),
                    uvec4(24, 26, 28, 30), uvec4(32, 34, 36, 38), uvec4(40, 42, 44, 46),
                    uvec4(48, 50, 52, 54), uvec4(56, 58, 60, 62), uvec4(64, 66, 68, 70),
                    uvec4(72, 74, 76, 78), uvec4(80, 82, 84, 86), uvec4(88, 90, 92, 94),
                    uvec4(96, 98, 100, 102), uvec4(104, 106, 108, 110),
                    uvec4(112, 114, 116, 118), uvec4(120, 122, 124, 126),
                    uvec4(129, 131, 133, 135), uvec4(137, 139, 141, 143),
                    uvec4(145, 147, 149, 151), uvec4(153, 155, 157, 159),
                    uvec4(161, 163, 165, 167), uvec4(169, 171, 173, 175),
                    uvec4(177, 179, 181, 183), uvec4(185, 187, 189, 191),
                    uvec4(193, 195, 197, 199), uvec4(201, 203, 205, 207),
                    uvec4(209, 211, 213, 215), uvec4(217, 219, 221, 223),
                    uvec4(225, 227, 229, 231), uvec4(233, 235, 237, 239),
                    uvec4(241, 243, 245, 247), uvec4(249, 251, 253, 255));
        return REPLICATE_7_BIT_TO_8_TABLE[array_index][vector_index];
    }
    }
    return value;
}

// Table driven bit replication of a num_bits wide value to 6 bits.
// Widths without a table return the value unchanged.
uint FastReplicateTo6(uint value, uint num_bits) {
    if (value == 0) {
        return 0;
    }
    const uint array_index = value / 4;
    const uint vector_index = bitfieldExtract(value, 0, 2);
    switch (num_bits) {
    case 1:
        return 63;
    case 2: {
        const uvec4 REPLICATE_2_BIT_TO_6_TABLE = uvec4(0, 21, 42, 63);
        return REPLICATE_2_BIT_TO_6_TABLE[vector_index];
    }
    case 3: {
        const uvec4 REPLICATE_3_BIT_TO_6_TABLE[2] =
            uvec4[](uvec4(0, 9, 18, 27), uvec4(36, 45, 54, 63));
        return REPLICATE_3_BIT_TO_6_TABLE[array_index][vector_index];
    }
    case 4: {
        const uvec4 REPLICATE_4_BIT_TO_6_TABLE[4] =
            uvec4[](uvec4(0, 4, 8, 12), uvec4(17, 21, 25, 29), uvec4(34, 38, 42, 46),
                    uvec4(51, 55, 59, 63));
        return REPLICATE_4_BIT_TO_6_TABLE[array_index][vector_index];
    }
    case 5: {
        const uvec4 REPLICATE_5_BIT_TO_6_TABLE[8] =
            uvec4[](uvec4(0, 2, 4, 6), uvec4(8, 10, 12, 14), uvec4(16, 18, 20, 22),
                    uvec4(24, 26, 28, 30), uvec4(33, 35, 37, 39), uvec4(41, 43, 45, 47),
                    uvec4(49, 51, 53, 55), uvec4(57, 59, 61, 63));
        return REPLICATE_5_BIT_TO_6_TABLE[array_index][vector_index];
    }
    }
    return value;
}

// Multiply-and-shift approximations of v / 3 and v / 5.
// NOTE(review): only exact for small v; callers here pass bit counts of one block.
uint Div3Floor(uint v) {
    return (v * 0x5556) >> 16;
}
uint Div3Ceil(uint v) {
    return Div3Floor(v + 2);
}
uint Div5Floor(uint v) {
    return (v * 0x3334) >> 16;
}
uint Div5Ceil(uint v) {
    return Div5Floor(v + 4);
}

// Integer hash used to derive the partition pattern seeds
uint Hash52(uint p) {
    p ^= p >> 15;
    p -= p << 17;
    p += p << 7;
    p += p << 4;
    p ^= p >> 5;
    p += p << 16;
    p ^= p >> 7;
    p ^= p >> 3;
    p ^= p << 6;
    p ^= p >> 17;
    return p;
}

// Returns the partition (0..3) that texel (x, y) belongs to for the given
// partition seed and partition count.
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count) {
    // Small blocks sample the pattern at twice the frequency
    if ((block_dims.y * block_dims.x) < 32) {
        x <<= 1;
        y <<= 1;
    }

    seed += (partition_count - 1) * 1024;

    const uint rnum = Hash52(uint(seed));
    uint seed1 = uint(rnum & 0xF);
    uint seed2 = uint((rnum >> 4) & 0xF);
    uint seed3 = uint((rnum >> 8) & 0xF);
    uint seed4 = uint((rnum >> 12) & 0xF);
    uint seed5 = uint((rnum >> 16) & 0xF);
    uint seed6 = uint((rnum >> 20) & 0xF);
    uint seed7 = uint((rnum >> 24) & 0xF);
    uint seed8 = uint((rnum >> 28) & 0xF);

    seed1 = (seed1 * seed1);
    seed2 = (seed2 * seed2);
    seed3 = (seed3 * seed3);
    seed4 = (seed4 * seed4);
    seed5 = (seed5 * seed5);
    seed6 = (seed6 * seed6);
    seed7 = (seed7 * seed7);
    seed8 = (seed8 * seed8);

    uint sh1, sh2;
    if ((seed & 1) > 0) {
        sh1 = (seed & 2) > 0 ? 4 : 5;
        sh2 = (partition_count == 3) ? 6 : 5;
    } else {
        sh1 = (partition_count == 3) ? 6 : 5;
        sh2 = (seed & 2) > 0 ? 4 : 5;
    }
    seed1 >>= sh1;
    seed2 >>= sh2;
    seed3 >>= sh1;
    seed4 >>= sh2;
    seed5 >>= sh1;
    seed6 >>= sh2;
    seed7 >>= sh1;
    seed8 >>= sh2;

    uint a = seed1 * x + seed2 * y + (rnum >> 14);
    uint b = seed3 * x + seed4 * y + (rnum >> 10);
    uint c = seed5 * x + seed6 * y + (rnum >> 6);
    uint d = seed7 * x + seed8 * y + (rnum >> 2);

    a &= 0x3F;
    b &= 0x3F;
    c &= 0x3F;
    d &= 0x3F;

    // Partitions that do not exist must never win the comparison below
    if (partition_count < 4) {
        d = 0;
    }
    if (partition_count < 3) {
        c = 0;
    }

    if (a >= b && a >= c && a >= d) {
        return 0;
    } else if (b >= c && b >= d) {
        return 1;
    } else if (c >= d) {
        return 2;
    } else {
        return 3;
    }
}

// Reads `bits` (1..32) bits starting at bit `offset` of the 128-bit payload,
// least significant bit first. Returns 0 when `bits` is outside 1..32.
// Bits located outside of the payload (offset < 0 or past bit 127) read as zero
// instead of indexing past the end of the uvec4, which malformed blocks can request.
uint ExtractBits(uvec4 payload, int offset, int bits) {
    if (bits <= 0 || bits > 32) {
        return 0u;
    }
    const int first_word = offset >> 5;
    if (offset < 0 || first_word > 3) {
        return 0u;
    }
    const int bit_in_word = offset & 31;
    const int last_word = (offset + bits - 1) >> 5;
    if (last_word == first_word) {
        return bitfieldExtract(payload[first_word], bit_in_word, bits);
    }
    // The read straddles two words; bit_in_word is non-zero here so first_bits < 32
    const int first_bits = 32 - bit_in_word;
    const uint low_part = bitfieldExtract(payload[first_word], bit_in_word, first_bits);
    if (last_word > 3) {
        // The upper part lies beyond the payload
        return low_part;
    }
    const uint high_part = bitfieldExtract(payload[last_word], 0, bits - first_bits);
    return low_part | (high_part << first_bits);
}

// Reads num_bits from the input block and advances the block read cursor
uint StreamBits(uint num_bits) {
    const int int_bits = int(num_bits);
    const uint ret = ExtractBits(local_buff, total_bitsread, int_bits);
    total_bitsread += int_bits;
    return ret;
}

// Advances the block read cursor without reading
void SkipBits(uint num_bits) {
    const int int_bits = int(num_bits);
    total_bitsread += int_bits;
}

// Reads num_bits from color_endpoint_data and advances its read cursor
uint StreamColorBits(uint num_bits) {
    const int int_bits = int(num_bits);
    const uint ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits);
    color_bitsread += int_bits;
    return ret;
}

// Fetches element `index` of result_vector as an EncodingData
EncodingData GetEncodingFromVector(uint index) {
    const uint array_index = index / 4;
    const uint vector_index = index % 4;

    const uint data = result_vector[array_index][vector_index];
    return EncodingData(data);
}

// Returns the number of bits required to encode n_vals values.
// encoding_index selects an entry of encoding_values; trit and quint encodings
// add their packed (8 bits per 5 trits / 7 bits per 3 quints) overhead.
uint GetBitLength(uint n_vals, uint encoding_index) {
    const EncodingData encoding_value =
        EncodingData(encoding_values[encoding_index / 4][encoding_index % 4]);
    const uint encoding = Encoding(encoding_value);
    uint total_bits = NumBits(encoding_value) * n_vals;
    if (encoding == TRIT) {
        total_bits += Div5Ceil(n_vals * 8);
    } else if (encoding == QUINT) {
        total_bits += Div3Ceil(n_vals * 7);
    }
    return total_bits;
}

// Number of stored weights for a weight grid of `size`, doubled in dual plane mode
uint GetNumWeightValues(uvec2 size, bool dual_plane) {
    uint n_vals = size.x * size.y;
    if (dual_plane) {
        n_vals *= 2;
    }
    return n_vals;
}

// Number of bits occupied by the weight grid when encoded with range index max_weight
uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
    const uint n_vals = GetNumWeightValues(size, dual_plane);
    return GetBitLength(n_vals, max_weight);
}

// Returns bit `pos` of `bits`
uint BitsBracket(uint bits, uint pos) {
    return ((bits >> pos) & 1);
}

// Returns the inclusive bit range [start, end] of `bits`
uint BitsOp(uint bits, uint start, uint end) {
    const uint mask = (1 << (end - start + 1)) - 1;
    return ((bits >> start) & mask);
}

// Decodes one quint block (three values sharing 7 packed quint bits) from the
// color stream and pushes the three results.
void DecodeQuintBlock(uint num_bits) {
    uvec3 m;
    uvec4 qQ;
    // The plain bits of each value are interleaved with the packed quint bits (qQ.w)
    m[0] = StreamColorBits(num_bits);
    qQ.w = StreamColorBits(3);
    m[1] = StreamColorBits(num_bits);
    qQ.w |= StreamColorBits(2) << 3;
    m[2] = StreamColorBits(num_bits);
    qQ.w |= StreamColorBits(2) << 5;
    if (BitsOp(qQ.w, 1, 2) == 3 && BitsOp(qQ.w, 5, 6) == 0) {
        qQ.x = 4;
        qQ.y = 4;
        qQ.z = (BitsBracket(qQ.w, 0) << 2) |
               ((BitsBracket(qQ.w, 4) & ~BitsBracket(qQ.w, 0)) << 1) |
               (BitsBracket(qQ.w, 3) & ~BitsBracket(qQ.w, 0));
    } else {
        uint C = 0;
        if (BitsOp(qQ.w, 1, 2) == 3) {
            qQ.z = 4;
            C = (BitsOp(qQ.w, 3, 4) << 3) | ((~BitsOp(qQ.w, 5, 6) & 3) << 1) |
                BitsBracket(qQ.w, 0);
        } else {
            qQ.z = BitsOp(qQ.w, 5, 6);
            C = BitsOp(qQ.w, 0, 4);
        }
        if (BitsOp(C, 0, 2) == 5) {
            qQ.y = 4;
            qQ.x = BitsOp(C, 3, 4);
        } else {
            qQ.y = BitsOp(C, 3, 4);
            qQ.x = BitsOp(C, 0, 2);
        }
    }
    for (uint i = 0; i < 3; i++) {
        const EncodingData val = CreateEncodingData(QUINT, num_bits, m[i], qQ[i]);
        ResultEmplaceBack(val);
    }
}

// Decodes one trit block (five values sharing 8 packed trit bits) from the
// color stream and pushes the five results.
void DecodeTritBlock(uint num_bits) {
    uvec4 m;
    uvec4 t;
    // x: packed trit bits, y: plain bits of the fifth value, z: trit of the fifth value
    uvec3 Tm5t5;
    m[0] = StreamColorBits(num_bits);
    Tm5t5.x = StreamColorBits(2);
    m[1] = StreamColorBits(num_bits);
    Tm5t5.x |= StreamColorBits(2) << 2;
    m[2] = StreamColorBits(num_bits);
    Tm5t5.x |= StreamColorBits(1) << 4;
    m[3] = StreamColorBits(num_bits);
    Tm5t5.x |= StreamColorBits(2) << 5;
    Tm5t5.y = StreamColorBits(num_bits);
    Tm5t5.x |= StreamColorBits(1) << 7;
    uint C = 0;
    if (BitsOp(Tm5t5.x, 2, 4) == 7) {
        C = (BitsOp(Tm5t5.x, 5, 7) << 2) | BitsOp(Tm5t5.x, 0, 1);
        Tm5t5.z = 2;
        t[3] = 2;
    } else {
        C = BitsOp(Tm5t5.x, 0, 4);
        if (BitsOp(Tm5t5.x, 5, 6) == 3) {
            Tm5t5.z = 2;
            t[3] = BitsBracket(Tm5t5.x, 7);
        } else {
            Tm5t5.z = BitsBracket(Tm5t5.x, 7);
            t[3] = BitsOp(Tm5t5.x, 5, 6);
        }
    }
    if (BitsOp(C, 0, 1) == 3) {
        t[2] = 2;
        t[1] = BitsBracket(C, 4);
        t[0] = (BitsBracket(C, 3) << 1) | (BitsBracket(C, 2) & ~BitsBracket(C, 3));
    } else if (BitsOp(C, 2, 3) == 3) {
        t[2] = 2;
        t[1] = 2;
        t[0] = BitsOp(C, 0, 1);
    } else {
        t[2] = BitsBracket(C, 4);
        t[1] = BitsOp(C, 2, 3);
        t[0] = (BitsBracket(C, 1) << 1) | (BitsBracket(C, 0) & ~BitsBracket(C, 1));
    }
    for (uint i = 0; i < 4; i++) {
        const EncodingData val = CreateEncodingData(TRIT, num_bits, m[i], t[i]);
        ResultEmplaceBack(val);
    }
    const EncodingData val = CreateEncodingData(TRIT, num_bits, Tm5t5.y, Tm5t5.z);
    ResultEmplaceBack(val);
}

// Decodes num_values values of the encoding at index max_range from the color
// stream into result_vector. Stops early once the result limit is reached.
void DecodeIntegerSequence(uint max_range, uint num_values) {
    EncodingData val = EncodingData(encoding_values[max_range / 4][max_range % 4]);
    const uint encoding = Encoding(val);
    const uint num_bits = NumBits(val);
    uint vals_decoded = 0;
    while (vals_decoded < num_values && !result_limit_reached) {
        switch (encoding) {
        case QUINT:
            DecodeQuintBlock(num_bits);
            vals_decoded += 3;
            break;
        case TRIT:
            DecodeTritBlock(num_bits);
            vals_decoded += 5;
            break;
        case JUST_BITS:
            BitValue(val, StreamColorBits(num_bits));
            ResultEmplaceBack(val);
            vals_decoded++;
            break;
        }
    }
}

// Unquantized (8-bit) color endpoint values of the current block
uvec4 color_values[8];

// Decodes and unquantizes the color endpoint values of every partition into
// color_values, using the largest encoding that fits in color_data_bits.
void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
    uint num_values = 0;
    for (uint i = 0; i < num_partitions; i++) {
        // Each endpoint mode class uses 2, 4, 6 or 8 values
        num_values += ((modes[i] >> 2) + 1) << 1;
    }
    // Find the largest encoding that's within color_data_bits
    // TODO(ameerj): profile with binary search
    int range = 0;
    while (++range < ((encoding_values.length() * 4) - 2)) {
        const uint bit_length = GetBitLength(num_values, range);
        if (bit_length > color_data_bits) {
            break;
        }
    }
    DecodeIntegerSequence(range - 1, num_values);
    uint out_index = 0;
    // Unsigned counter: result_index is unsigned, avoid a mixed-sign comparison
    for (uint itr = 0; itr < result_index; ++itr) {
        if (out_index >= num_values) {
            break;
        }
        const EncodingData val = GetEncodingFromVector(itr);
        const uint encoding = Encoding(val);
        const uint bitlen = NumBits(val);
        const uint bitval = BitValue(val);
        uint A = 0, B = 0, C = 0, D = 0;
        A = ReplicateBitTo9((bitval & 1));
        switch (encoding) {
        case JUST_BITS:
            color_values[out_index / 4][out_index % 4] = FastReplicateTo8(bitval, bitlen);
            ++out_index;
            break;
        case TRIT: {
            D = QuintTritValue(val);
            switch (bitlen) {
            case 1:
                C = 204;
                break;
            case 2: {
                C = 93;
                const uint b = (bitval >> 1) & 1;
                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
                break;
            }
            case 3: {
                C = 44;
                const uint cb = (bitval >> 1) & 3;
                B = (cb << 7) | (cb << 2) | cb;
                break;
            }
            case 4: {
                C = 22;
                const uint dcb = (bitval >> 1) & 7;
                B = (dcb << 6) | dcb;
                break;
            }
            case 5: {
                C = 11;
                const uint edcb = (bitval >> 1) & 0xF;
                B = (edcb << 5) | (edcb >> 2);
                break;
            }
            case 6: {
                C = 5;
                const uint fedcb = (bitval >> 1) & 0x1F;
                B = (fedcb << 4) | (fedcb >> 4);
                break;
            }
            }
            break;
        }
        case QUINT: {
            D = QuintTritValue(val);
            switch (bitlen) {
            case 1:
                C = 113;
                break;
            case 2: {
                C = 54;
                const uint b = (bitval >> 1) & 1;
                B = (b << 8) | (b << 3) | (b << 2);
                break;
            }
            case 3: {
                C = 26;
                const uint cb = (bitval >> 1) & 3;
                B = (cb << 7) | (cb << 1) | (cb >> 1);
                break;
            }
            case 4: {
                C = 13;
                const uint dcb = (bitval >> 1) & 7;
                B = (dcb << 6) | (dcb >> 1);
                break;
            }
            case 5: {
                C = 6;
                const uint edcb = (bitval >> 1) & 0xF;
                B = (edcb << 5) | (edcb >> 3);
                break;
            }
            }
            break;
        }
        }
        if (encoding != JUST_BITS) {
            uint T = (D * C) + B;
            T ^= A;
            T = (A & 0x80) | (T >> 2);
            color_values[out_index / 4][out_index % 4] = T;
            ++out_index;
        }
    }
}

// Moves the top bit of `a` into `b` and sign extends the remaining 6-bit offset in `a`.
// Returns (new a, new b).
ivec2 BitTransferSigned(int a, int b) {
    ivec2 transferred;
    transferred.y = b >> 1;
    transferred.y |= a & 0x80;
    transferred.x = a >> 1;
    transferred.x &= 0x3F;
    if ((transferred.x & 0x20) > 0) {
        transferred.x -= 0x40;
    }
    return transferred;
}

// Clamps every component to the 0..255 range
uvec4 ClampByte(ivec4 color) {
    return uvec4(clamp(color, 0, 255));
}

// Averages red and green with blue; alpha and blue are passed through
ivec4 BlueContract(int a, int r, int g, int b) {
    return ivec4(a, (r + b) >> 1, (g + b) >> 1, b);
}

// Builds the two endpoints (stored as A, R, G, B) of one partition from color_values,
// starting at colvals_index, which is advanced past the values that were consumed.
void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode,
                      inout uint colvals_index) {
#define READ_UINT_VALUES(N)                                                                        \
    uvec4 V[2];                                                                                    \
    for (uint i = 0; i < N; i++) {                                                                 \
        V[i / 4][i % 4] = color_values[colvals_index / 4][colvals_index % 4];                      \
        ++colvals_index;                                                                           \
    }
#define READ_INT_VALUES(N)                                                                         \
    ivec4 V[2];                                                                                    \
    for (uint i = 0; i < N; i++) {                                                                 \
        V[i / 4][i % 4] = int(color_values[colvals_index / 4][colvals_index % 4]);                 \
        ++colvals_index;                                                                           \
    }

    switch (color_endpoint_mode) {
    case 0: {
        READ_UINT_VALUES(2)
        ep1 = uvec4(0xFF, V[0].x, V[0].x, V[0].x);
        ep2 = uvec4(0xFF, V[0].y, V[0].y, V[0].y);
        break;
    }
    case 1: {
        READ_UINT_VALUES(2)
        const uint L0 = (V[0].x >> 2) | (V[0].y & 0xC0);
        const uint L1 = min(L0 + (V[0].y & 0x3F), 0xFFU);
        ep1 = uvec4(0xFF, L0, L0, L0);
        ep2 = uvec4(0xFF, L1, L1, L1);
        break;
    }
    case 4: {
        READ_UINT_VALUES(4)
        ep1 = uvec4(V[0].z, V[0].x, V[0].x, V[0].x);
        ep2 = uvec4(V[0].w, V[0].y, V[0].y, V[0].y);
        break;
    }
    case 5: {
        READ_INT_VALUES(4)
        ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
        V[0].y = transferred.x;
        V[0].x = transferred.y;
        transferred = BitTransferSigned(V[0].w, V[0].z);
        V[0].w = transferred.x;
        V[0].z = transferred.y;
        ep1 = ClampByte(ivec4(V[0].z, V[0].x, V[0].x, V[0].x));
        ep2 = ClampByte(
            ivec4(V[0].z + V[0].w, V[0].x + V[0].y, V[0].x + V[0].y, V[0].x + V[0].y));
        break;
    }
    case 6: {
        READ_UINT_VALUES(4)
        ep1 = uvec4(0xFF, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8,
                    (V[0].z * V[0].w) >> 8);
        ep2 = uvec4(0xFF, V[0].x, V[0].y, V[0].z);
        break;
    }
    case 8: {
        READ_UINT_VALUES(6)
        if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) {
            ep1 = uvec4(0xFF, V[0].x, V[0].z, V[1].x);
            ep2 = uvec4(0xFF, V[0].y, V[0].w, V[1].y);
        } else {
            ep1 = uvec4(BlueContract(0xFF, int(V[0].y), int(V[0].w), int(V[1].y)));
            ep2 = uvec4(BlueContract(0xFF, int(V[0].x), int(V[0].z), int(V[1].x)));
        }
        break;
    }
    case 9: {
        READ_INT_VALUES(6)
        ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
        V[0].y = transferred.x;
        V[0].x = transferred.y;
        transferred = BitTransferSigned(V[0].w, V[0].z);
        V[0].w = transferred.x;
        V[0].z = transferred.y;
        transferred = BitTransferSigned(V[1].y, V[1].x);
        V[1].y = transferred.x;
        V[1].x = transferred.y;
        if ((V[0].y + V[0].w + V[1].y) >= 0) {
            ep1 = ClampByte(ivec4(0xFF, V[0].x, V[0].z, V[1].x));
            ep2 = ClampByte(ivec4(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
        } else {
            ep1 = ClampByte(
                BlueContract(0xFF, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
            ep2 = ClampByte(BlueContract(0xFF, V[0].x, V[0].z, V[1].x));
        }
        break;
    }
    case 10: {
        READ_UINT_VALUES(6)
        ep1 = uvec4(V[1].x, (V[0].x * V[0].w) >> 8, (V[0].y * V[0].w) >> 8,
                    (V[0].z * V[0].w) >> 8);
        ep2 = uvec4(V[1].y, V[0].x, V[0].y, V[0].z);
        break;
    }
    case 12: {
        READ_UINT_VALUES(8)
        if ((V[0].y + V[0].w + V[1].y) >= (V[0].x + V[0].z + V[1].x)) {
            ep1 = uvec4(V[1].z, V[0].x, V[0].z, V[1].x);
            ep2 = uvec4(V[1].w, V[0].y, V[0].w, V[1].y);
        } else {
            ep1 = uvec4(BlueContract(int(V[1].w), int(V[0].y), int(V[0].w), int(V[1].y)));
            ep2 = uvec4(BlueContract(int(V[1].z), int(V[0].x), int(V[0].z), int(V[1].x)));
        }
        break;
    }
    case 13: {
        READ_INT_VALUES(8)
        ivec2 transferred = BitTransferSigned(V[0].y, V[0].x);
        V[0].y = transferred.x;
        V[0].x = transferred.y;
        transferred = BitTransferSigned(V[0].w, V[0].z);
        V[0].w = transferred.x;
        V[0].z = transferred.y;
        transferred = BitTransferSigned(V[1].y, V[1].x);
        V[1].y = transferred.x;
        V[1].x = transferred.y;
        transferred = BitTransferSigned(V[1].w, V[1].z);
        V[1].w = transferred.x;
        V[1].z = transferred.y;
        if ((V[0].y + V[0].w + V[1].y) >= 0) {
            ep1 = ClampByte(ivec4(V[1].z, V[0].x, V[0].z, V[1].x));
            ep2 = ClampByte(
                ivec4(V[1].w + V[1].z, V[0].x + V[0].y, V[0].z + V[0].w, V[1].x + V[1].y));
        } else {
            ep1 = ClampByte(BlueContract(V[1].z + V[1].w, V[0].x + V[0].y, V[0].z + V[0].w,
                                         V[1].x + V[1].y));
            ep2 = ClampByte(BlueContract(V[1].z, V[0].x, V[0].z, V[1].x));
        }
        break;
    }
    default: {
        // HDR mode, or more likely a bug computing the color_endpoint_mode
        // Still consume the values that belong to this mode (same count as used by
        // DecodeColorValues) so the partitions that follow read their own values.
        colvals_index += ((color_endpoint_mode >> 2) + 1) << 1;
        ep1 = uvec4(0xFF, 0xFF, 0, 0);
        ep2 = uvec4(0xFF, 0xFF, 0, 0);
        break;
    }
    }
#undef READ_UINT_VALUES
#undef READ_INT_VALUES
}

// Converts one decoded weight element to its unquantized value in the 0..64 range
uint UnquantizeTexelWeight(EncodingData val) {
    const uint encoding = Encoding(val);
    const uint bitlen = NumBits(val);
    const uint bitval = BitValue(val);
    const uint A = ReplicateBitTo7((bitval & 1));
    uint B = 0, C = 0, D = 0;
    uint result = 0;
    switch (encoding) {
    case JUST_BITS:
        result = FastReplicateTo6(bitval, bitlen);
        break;
    case TRIT: {
        D = QuintTritValue(val);
        switch (bitlen) {
        case 0: {
            const uint results[3] = {0, 32, 63};
            result = results[D];
            break;
        }
        case 1: {
            C = 50;
            break;
        }
        case 2: {
            C = 23;
            const uint b = (bitval >> 1) & 1;
            B = (b << 6) | (b << 2) | b;
            break;
        }
        case 3: {
            C = 11;
            const uint cb = (bitval >> 1) & 3;
            B = (cb << 5) | cb;
            break;
        }
        default:
            break;
        }
        break;
    }
    case QUINT: {
        D = QuintTritValue(val);
        switch (bitlen) {
        case 0: {
            const uint results[5] = {0, 16, 32, 47, 63};
            result = results[D];
            break;
        }
        case 1: {
            C = 28;
            break;
        }
        case 2: {
            C = 13;
            const uint b = (bitval >> 1) & 1;
            B = (b << 6) | (b << 1);
            break;
        }
        }
        break;
    }
    }
    if (encoding != JUST_BITS && bitlen > 0) {
        result = D * C + B;
        result ^= A;
        result = (A & 0x20) | (result >> 2);
    }
    // Stretch 0..63 to 0..64
    if (result > 32) {
        result += 1;
    }
    return result;
}

// Per-texel weights of the block; the second plane starts at ARRAY_NUM_ELEMENTS
uvec4 unquantized_texel_weights[VECTOR_ARRAY_SIZE];

// Unquantizes the decoded weight grid in place (result_vector) and bilinearly
// upsamples it from `size` to block_dims into unquantized_texel_weights.
void UnquantizeTexelWeights(uvec2 size, bool is_dual_plane) {
    const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
    const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
    const uint num_planes = is_dual_plane ? 2 : 1;
    const uint area = size.x * size.y;
    const uint loop_count = min(result_index, area * num_planes);
    for (uint itr = 0; itr < loop_count; ++itr) {
        const uint array_index = itr / 4;
        const uint vector_index = itr % 4;
        result_vector[array_index][vector_index] =
            UnquantizeTexelWeight(GetEncodingFromVector(itr));
    }
    for (uint plane = 0; plane < num_planes; ++plane) {
        for (uint t = 0; t < block_dims.y; t++) {
            for (uint s = 0; s < block_dims.x; s++) {
                const uint cs = Ds * s;
                const uint ct = Dt * t;
                const uint gs = (cs * (size.x - 1) + 32) >> 6;
                const uint gt = (ct * (size.y - 1) + 32) >> 6;
                // Integer (j*) and fractional (f*) grid coordinates
                const uint js = gs >> 4;
                const uint fs = gs & 0xF;
                const uint jt = gt >> 4;
                const uint ft = gt & 0x0F;
                const uint w11 = (fs * ft + 8) >> 4;
                const uint w10 = ft - w11;
                const uint w01 = fs - w11;
                const uint w00 = 16 - fs - ft + w11;
                const uvec4 w = uvec4(w00, w01, w10, w11);
                const uint v0 = jt * size.x + js;

                uvec4 p = uvec4(0);

// In dual plane mode the weights of both planes are interleaved in result_vector
#define VectorIndicesFromBase(offset_base)                                                         \
    const uint offset = is_dual_plane ? 2 * offset_base + plane : offset_base;                     \
    const uint array_index = offset / 4;                                                           \
    const uint vector_index = offset % 4;

                if (v0 < area) {
                    const uint offset_base = v0;
                    VectorIndicesFromBase(offset_base);
                    p.x = result_vector[array_index][vector_index];
                }
                if ((v0 + 1) < (area)) {
                    const uint offset_base = v0 + 1;
                    VectorIndicesFromBase(offset_base);
                    p.y = result_vector[array_index][vector_index];
                }
                if ((v0 + size.x) < (area)) {
                    const uint offset_base = v0 + size.x;
                    VectorIndicesFromBase(offset_base);
                    p.z = result_vector[array_index][vector_index];
                }
                if ((v0 + size.x + 1) < (area)) {
                    const uint offset_base = v0 + size.x + 1;
                    VectorIndicesFromBase(offset_base);
                    p.w = result_vector[array_index][vector_index];
                }
                const uint offset = (t * block_dims.x + s) + ARRAY_NUM_ELEMENTS * plane;
                const uint array_index = offset / 4;
                const uint vector_index = offset % 4;
                unquantized_texel_weights[array_index][vector_index] =
                    (uint(dot(p, w)) + 8) >> 4;
            }
        }
    }
#undef VectorIndicesFromBase
}

// Classifies the block mode bits into one of the ten layouts (0..9) that
// DecodeBlockSize and DecodeMaxWeight switch on.
int FindLayout(uint mode) {
    if ((mode & 3) != 0) {
        if ((mode & 8) != 0) {
            if ((mode & 4) != 0) {
                if ((mode & 0x100) != 0) {
                    return 4;
                }
                return 3;
            }
            return 2;
        }
        if ((mode & 4) != 0) {
            return 1;
        }
        return 0;
    }
    if ((mode & 0x100) != 0) {
        if ((mode & 0x80) != 0) {
            if ((mode & 0x20) != 0) {
                return 8;
            }
            return 7;
        }
        return 9;
    }
    if ((mode & 0x80) != 0) {
        return 6;
    }
    return 5;
}

// Writes zeroes to every texel of the block located at coord
void FillError(ivec3 coord) {
    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            imageStore(dest_image, coord + ivec3(i, j, 0), vec4(0.0, 0.0, 0.0, 0.0));
        }
    }
}

// Fills the block with the constant color of an LDR void-extent block
void FillVoidExtentLDR(ivec3 coord) {
    // Skip the 52 bits that precede the four 16-bit color channels
    SkipBits(52);
    const uint r_u = StreamBits(16);
    const uint g_u = StreamBits(16);
    const uint b_u = StreamBits(16);
    const uint a_u = StreamBits(16);
    const float a = float(a_u) / 65535.0f;
    const float r = float(r_u) / 65535.0f;
    const float g = float(g_u) / 65535.0f;
    const float b = float(b_u) / 65535.0f;
    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            imageStore(dest_image, coord + ivec3(i, j, 0), vec4(r, g, b, a));
        }
    }
}

// Returns true when the block mode is reserved, HDR void-extent or otherwise invalid.
// Note: for void-extent blocks this may consume one extra bit from the input stream.
bool IsError(uint mode) {
    if ((mode & 0x1ff) == 0x1fc) {
        if ((mode & 0x200) != 0) {
            // params.void_extent_hdr = true;
            return true;
        }
        if ((mode & 0x400) == 0 || StreamBits(1) == 0) {
            return true;
        }
        return false;
    }
    if ((mode & 0xf) == 0) {
        return true;
    }
    if ((mode & 3) == 0 && (mode & 0x1c0) == 0x1c0) {
        return true;
    }
    return false;
}

// Returns the weight grid dimensions (width, height) encoded in the block mode
uvec2 DecodeBlockSize(uint mode) {
    uint A, B;
    switch (FindLayout(mode)) {
    case 0:
        A = (mode >> 5) & 0x3;
        B = (mode >> 7) & 0x3;
        return uvec2(B + 4, A + 2);
    case 1:
        A = (mode >> 5) & 0x3;
        B = (mode >> 7) & 0x3;
        return uvec2(B + 8, A + 2);
    case 2:
        A = (mode >> 5) & 0x3;
        B = (mode >> 7) & 0x3;
        return uvec2(A + 2, B + 8);
    case 3:
        A = (mode >> 5) & 0x3;
        B = (mode >> 7) & 0x1;
        return uvec2(A + 2, B + 6);
    case 4:
        A = (mode >> 5) & 0x3;
        B = (mode >> 7) & 0x1;
        return uvec2(B + 2, A + 2);
    case 5:
        A = (mode >> 5) & 0x3;
        return uvec2(12, A + 2);
    case 6:
        A = (mode >> 5) & 0x3;
        return uvec2(A + 2, 12);
    case 7:
        return uvec2(6, 10);
    case 8:
        return uvec2(10, 6);
    case 9:
        A = (mode >> 5) & 0x3;
        B = (mode >> 9) & 0x3;
        return uvec2(A + 6, B + 6);
    default:
        return uvec2(0);
    }
}

// Returns the index into encoding_values of the weight range encoded in the block mode
uint DecodeMaxWeight(uint mode) {
    const uint mode_layout = FindLayout(mode);
    uint weight_index = (mode & 0x10) != 0 ? 1 : 0;
    if (mode_layout < 5) {
        weight_index |= (mode & 0x3) << 1;
    } else {
        weight_index |= (mode & 0xc) >> 1;
    }
    weight_index -= 2;
    if ((mode_layout != 9) && ((mode & 0x200) != 0)) {
        weight_index += 6;
    }
    return weight_index + 1;
}

// Decodes the 128-bit block held in local_buff and stores its texels at coord.
// Every path that rejects the block fills it through FillError so that no texel
// of the destination image is left unwritten.
void DecompressBlock(ivec3 coord) {
    uint mode = StreamBits(11);
    if (IsError(mode)) {
        FillError(coord);
        return;
    }
    if ((mode & 0x1ff) == 0x1fc) {
        // params.void_extent_ldr = true;
        FillVoidExtentLDR(coord);
        return;
    }
    const uvec2 size_params = DecodeBlockSize(mode);
    if ((size_params.x > block_dims.x) || (size_params.y > block_dims.y)) {
        FillError(coord);
        return;
    }
    const uint num_partitions = StreamBits(2) + 1;
    const uint mode_layout = FindLayout(mode);
    const bool dual_plane = (mode_layout != 9) && ((mode & 0x400) != 0);
    if (num_partitions > 4 || (num_partitions == 4 && dual_plane)) {
        FillError(coord);
        return;
    }
    uint partition_index = 1;
    uvec4 color_endpoint_mode = uvec4(0);
    uint ced_pointer = 0;
    uint base_cem = 0;
    if (num_partitions == 1) {
        color_endpoint_mode.x = StreamBits(4);
        partition_index = 0;
    } else {
        partition_index = StreamBits(10);
        base_cem = StreamBits(6);
    }
    const uint base_mode = base_cem & 3;
    const uint max_weight = DecodeMaxWeight(mode);
    const uint weight_bits = GetPackedBitSize(size_params, dual_plane, max_weight);
    uint remaining_bits = 128 - weight_bits - total_bitsread;
    uint extra_cem_bits = 0;
    if (base_mode > 0) {
        switch (num_partitions) {
        case 2:
            extra_cem_bits += 2;
            break;
        case 3:
            extra_cem_bits += 5;
            break;
        case 4:
            extra_cem_bits += 8;
            break;
        default:
            FillError(coord);
            return;
        }
    }
    remaining_bits -= extra_cem_bits;
    const uint plane_selector_bits = dual_plane ? 2 : 0;
    remaining_bits -= plane_selector_bits;
    if (remaining_bits > 128) {
        // Bad data, more remaining bits than the 16 bytes of a block
        // (the unsigned subtractions above wrapped around)
        FillError(coord);
        return;
    }
    // Read color data...
    // Start from zero so that words not covered by color_data_bits are defined
    color_endpoint_data = uvec4(0);
    const uint color_data_bits = remaining_bits;
    while (remaining_bits > 0) {
        const int nb = int(min(remaining_bits, 32U));
        const uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
        ++ced_pointer;
        remaining_bits -= nb;
    }
    const uint plane_index = uint(StreamBits(plane_selector_bits));
    if (base_mode > 0) {
        const uint extra_cem = StreamBits(extra_cem_bits);
        uint cem = (extra_cem << 6) | base_cem;
        cem >>= 2;
        // One class-select bit per partition, followed by two mode bits per partition
        uvec4 C = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            C[i] = (cem & 1);
            cem >>= 1;
        }
        uvec4 M = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            M[i] = cem & 3;
            cem >>= 2;
        }
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = base_mode;
            if (C[i] == 0) {
                --color_endpoint_mode[i];
            }
            color_endpoint_mode[i] <<= 2;
            color_endpoint_mode[i] |= M[i];
        }
    } else if (num_partitions > 1) {
        // All partitions share the same endpoint mode
        const uint cem = base_cem >> 2;
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = cem;
        }
    }

    uvec4 endpoints0[4];
    uvec4 endpoints1[4];
    {
        // This decode phase should at most push 32 elements into the vector
        result_vector_max_index = 32;
        uint colvals_index = 0;
        DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);
        for (uint i = 0; i < num_partitions; i++) {
            ComputeEndpoints(endpoints0[i], endpoints1[i], color_endpoint_mode[i],
                             colvals_index);
        }
    }
    // The weights are stored from the end of the block backwards: reverse the whole
    // block so they can be streamed forwards, then clear everything past weight_bits.
    color_endpoint_data = local_buff;
    color_endpoint_data = bitfieldReverse(color_endpoint_data).wzyx;
    const uint clear_byte_start = (weight_bits >> 3) + 1;

    const uint byte_insert = ExtractBits(color_endpoint_data, int(clear_byte_start - 1) * 8, 8) &
                             uint(((1 << (weight_bits % 8)) - 1));
    const uint vec_index = (clear_byte_start - 1) >> 2;
    color_endpoint_data[vec_index] = bitfieldInsert(color_endpoint_data[vec_index], byte_insert,
                                                    int((clear_byte_start - 1) % 4) * 8, 8);
    for (uint i = clear_byte_start; i < 16; ++i) {
        const uint idx = i >> 2;
        color_endpoint_data[idx] = bitfieldInsert(color_endpoint_data[idx], 0, int(i % 4) * 8, 8);
    }

    // Re-init vector variables for next decode phase
    result_index = 0;
    color_bitsread = 0;
    result_limit_reached = false;

    // The limit for the Unquantize phase, avoids decoding more data than needed.
    result_vector_max_index = size_params.x * size_params.y;
    if (dual_plane) {
        result_vector_max_index *= 2;
    }
    DecodeIntegerSequence(max_weight, GetNumWeightValues(size_params, dual_plane));

    UnquantizeTexelWeights(size_params, dual_plane);
    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            uint local_partition = 0;
            if (num_partitions > 1) {
                local_partition = Select2DPartition(partition_index, i, j, num_partitions);
            }
            const uvec4 C0 = ReplicateByteTo16(endpoints0[local_partition]);
            const uvec4 C1 = ReplicateByteTo16(endpoints1[local_partition]);
            const uint weight_offset = (j * block_dims.x + i);
            const uint array_index = weight_offset / 4;
            const uint vector_index = weight_offset % 4;
            const uint primary_weight = unquantized_texel_weights[array_index][vector_index];
            uvec4 weight_vec = uvec4(primary_weight);
            if (dual_plane) {
                const uint secondary_weight_offset = (j * block_dims.x + i) + ARRAY_NUM_ELEMENTS;
                const uint secondary_array_index = secondary_weight_offset / 4;
                const uint secondary_vector_index = secondary_weight_offset % 4;
                const uint secondary_weight =
                    unquantized_texel_weights[secondary_array_index][secondary_vector_index];
                for (uint c = 0; c < 4; c++) {
                    // Endpoints are stored as A, R, G, B, hence the rotation by one
                    const bool is_secondary = ((plane_index + 1u) & 3u) == c;
                    weight_vec[c] = is_secondary ? secondary_weight : primary_weight;
                }
            }
            const vec4 Cf =
                vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
            const vec4 p = (Cf / 65535.0);
            // Swizzle A, R, G, B into R, G, B, A
            imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
        }
    }
}

// Byte offset of `pos` (x in bytes, y in rows) inside its 64x8 byte GOB
uint SwizzleOffset(uvec2 pos) {
    const uint x = pos.x;
    const uint y = pos.y;
    return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + (y % 2) * 16 +
           (x % 16);
}

// One invocation decodes one compressed block: it locates the block in the
// block-linear input buffer and decompresses it into dest_image.
void main() {
    uvec3 pos = gl_GlobalInvocationID;
    // Convert the block index on X to a byte offset
    pos.x <<= BYTES_PER_BLOCK_LOG2;

    const uint swizzle = SwizzleOffset(pos.xy);
    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;

    uint offset = 0;
    offset += pos.z * layer_stride;
    offset += (block_y >> block_height) * block_size;
    offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
    offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
    offset += swizzle;

    const ivec3 coord = ivec3(gl_GlobalInvocationID * uvec3(block_dims, 1));
    // Skip blocks that start outside of the destination image
    if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
        return;
    }
    local_buff = astc_data[offset / 16];
    DecompressBlock(coord);
}