7 files changed, 138 insertions, 68 deletions
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h
index 7f50ac2f8..d03d480b4 100644
--- a/src/video_core/macro/macro_jit_x64.h
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -6,7 +6,7 @@
 
 #include <array>
 #include <bitset>
-#include <xbyak.h>
+#include <xbyak/xbyak.h>
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "common/x64/xbyak_abi.h"
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 882eff880..c60ed6453 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -463,6 +463,7 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
         ++page_index;
         page_offset = 0;
         remaining_size -= num_bytes;
+        old_page_addr = page_addr;
     }
     split();
     return result;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index a37ca1fdf..f316c4f92 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -281,7 +281,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, Tegra::Engines::Maxw
         .supported_spirv = device.IsKhrSpirv1_4Supported() ? 0x00010400U : 0x00010000U,
         .unified_descriptor_binding = true,
         .support_descriptor_aliasing = true,
-        .support_int8 = true,
+        .support_int8 = device.IsInt8Supported(),
         .support_int16 = device.IsShaderInt16Supported(),
         .support_int64 = device.IsShaderInt64Supported(),
         .support_vertex_instance_id = false,
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index c32ae956a..c010b9353 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -84,56 +84,31 @@ template <bool TO_LINEAR>
 void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
              u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
     switch (bytes_per_pixel) {
-    case 1:
-        return SwizzleImpl<TO_LINEAR, 1>(output, input, width, height, depth, block_height,
+#define BPP_CASE(x)                                                                                \
+    case x:                                                                                        \
+        return SwizzleImpl<TO_LINEAR, x>(output, input, width, height, depth, block_height,        \
                                          block_depth, stride_alignment);
-    case 2:
-        return SwizzleImpl<TO_LINEAR, 2>(output, input, width, height, depth, block_height,
-                                         block_depth, stride_alignment);
-    case 3:
-        return SwizzleImpl<TO_LINEAR, 3>(output, input, width, height, depth, block_height,
-                                         block_depth, stride_alignment);
-    case 4:
-        return SwizzleImpl<TO_LINEAR, 4>(output, input, width, height, depth, block_height,
-                                         block_depth, stride_alignment);
-    case 6:
-        return SwizzleImpl<TO_LINEAR, 6>(output, input, width, height, depth, block_height,
-                                         block_depth, stride_alignment);
-    case 8:
-        return SwizzleImpl<TO_LINEAR, 8>(output, input, width, height, depth, block_height,
-                                         block_depth, stride_alignment);
-    case 12:
-        return SwizzleImpl<TO_LINEAR, 12>(output, input, width, height, depth, block_height,
-                                          block_depth, stride_alignment);
-    case 16:
-        return SwizzleImpl<TO_LINEAR, 16>(output, input, width, height, depth, block_height,
-                                          block_depth, stride_alignment);
+        BPP_CASE(1)
+        BPP_CASE(2)
+        BPP_CASE(3)
+        BPP_CASE(4)
+        BPP_CASE(6)
+        BPP_CASE(8)
+        BPP_CASE(12)
+        BPP_CASE(16)
+#undef BPP_CASE
     default:
         UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
     }
 }
-} // Anonymous namespace
-
-void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
-                      u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth,
-                      u32 stride_alignment) {
-    Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
-                   stride_alignment);
-}
-
-void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
-                    u32 height, u32 depth, u32 block_height, u32 block_depth,
-                    u32 stride_alignment) {
-    Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
-                  stride_alignment);
-}
 
+template <u32 BYTES_PER_PIXEL>
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
-                    u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
-                    u32 block_height_bit, u32 offset_x, u32 offset_y) {
+                    u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit,
+                    u32 offset_x, u32 offset_y) {
     const u32 block_height = 1U << block_height_bit;
     const u32 image_width_in_gobs =
-        (swizzled_width * bytes_per_pixel + (GOB_SIZE_X - 1)) / GOB_SIZE_X;
+        (swizzled_width * BYTES_PER_PIXEL + (GOB_SIZE_X - 1)) / GOB_SIZE_X;
     for (u32 line = 0; line < subrect_height; ++line) {
         const u32 dst_y = line + offset_y;
         const u32 gob_address_y =
@@ -143,20 +118,21 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32
         for (u32 x = 0; x < subrect_width; ++x) {
             const u32 dst_x = x + offset_x;
             const u32 gob_address =
-                gob_address_y + (dst_x * bytes_per_pixel / GOB_SIZE_X) * GOB_SIZE * block_height;
-            const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % GOB_SIZE_X];
-            const u32 unswizzled_offset = line * source_pitch + x * bytes_per_pixel;
+                gob_address_y + (dst_x * BYTES_PER_PIXEL / GOB_SIZE_X) * GOB_SIZE * block_height;
+            const u32 swizzled_offset = gob_address + table[(dst_x * BYTES_PER_PIXEL) % GOB_SIZE_X];
+            const u32 unswizzled_offset = line * source_pitch + x * BYTES_PER_PIXEL;
 
             const u8* const source_line = unswizzled_data + unswizzled_offset;
             u8* const dest_addr = swizzled_data + swizzled_offset;
-            std::memcpy(dest_addr, source_line, bytes_per_pixel);
+            std::memcpy(dest_addr, source_line, BYTES_PER_PIXEL);
         }
     }
 }
 
-void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
-                      u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) {
-    const u32 stride = width * bytes_per_pixel;
+template <u32 BYTES_PER_PIXEL>
+void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 block_height,
+                      u32 origin_x, u32 origin_y, u8* output, const u8* input) {
+    const u32 stride = width * BYTES_PER_PIXEL;
     const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);
 
@@ -171,24 +147,25 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width,
         const u32 src_offset_y = (block_y >> block_height) * block_size +
                                  ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
         for (u32 column = 0; column < line_length_in; ++column) {
-            const u32 src_x = (column + origin_x) * bytes_per_pixel;
+            const u32 src_x = (column + origin_x) * BYTES_PER_PIXEL;
             const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift;
 
             const u32 swizzled_offset = src_offset_y + src_offset_x + table[src_x % GOB_SIZE_X];
-            const u32 unswizzled_offset = line * pitch + column * bytes_per_pixel;
+            const u32 unswizzled_offset = line * pitch + column * BYTES_PER_PIXEL;
 
-            std::memcpy(output + unswizzled_offset, input + swizzled_offset, bytes_per_pixel);
+            std::memcpy(output + unswizzled_offset, input + swizzled_offset, BYTES_PER_PIXEL);
         }
     }
 }
 
+template <u32 BYTES_PER_PIXEL>
 void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
-                         u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
-                         u32 origin_y, u8* output, const u8* input) {
+                         u32 block_height, u32 block_depth, u32 origin_x, u32 origin_y, u8* output,
+                         const u8* input) {
     UNIMPLEMENTED_IF(origin_x > 0);
     UNIMPLEMENTED_IF(origin_y > 0);
 
-    const u32 stride = width * bytes_per_pixel;
+    const u32 stride = width * BYTES_PER_PIXEL;
     const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
 
@@ -203,11 +180,93 @@ void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 widt
         for (u32 x = 0; x < line_length_in; ++x) {
             const u32 dst_offset =
                 ((x / GOB_SIZE_X) << x_shift) + dst_offset_y + table[x % GOB_SIZE_X];
-            const u32 src_offset = x * bytes_per_pixel + line * pitch;
-            std::memcpy(output + dst_offset, input + src_offset, bytes_per_pixel);
+            const u32 src_offset = x * BYTES_PER_PIXEL + line * pitch;
+            std::memcpy(output + dst_offset, input + src_offset, BYTES_PER_PIXEL);
         }
     }
 }
+} // Anonymous namespace
+
+void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
+                      u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth,
+                      u32 stride_alignment) {
+    Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
+                   stride_alignment);
+}
+
+void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
+                    u32 height, u32 depth, u32 block_height, u32 block_depth,
+                    u32 stride_alignment) {
+    Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth,
+                  stride_alignment);
+}
+
+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
+                    u32 block_height_bit, u32 offset_x, u32 offset_y) {
+    switch (bytes_per_pixel) {
+#define BPP_CASE(x)                                                                                \
+    case x:                                                                                        \
+        return SwizzleSubrect<x>(subrect_width, subrect_height, source_pitch, swizzled_width,      \
+                                 swizzled_data, unswizzled_data, block_height_bit, offset_x,       \
+                                 offset_y);
+        BPP_CASE(1)
+        BPP_CASE(2)
+        BPP_CASE(3)
+        BPP_CASE(4)
+        BPP_CASE(6)
+        BPP_CASE(8)
+        BPP_CASE(12)
+        BPP_CASE(16)
+#undef BPP_CASE
+    default:
+        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
+    }
+}
+
+void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
+                      u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) {
+    switch (bytes_per_pixel) {
+#define BPP_CASE(x)                                                                                \
+    case x:                                                                                        \
+        return UnswizzleSubrect<x>(line_length_in, line_count, pitch, width, block_height,         \
+                                   origin_x, origin_y, output, input);
+        BPP_CASE(1)
+        BPP_CASE(2)
+        BPP_CASE(3)
+        BPP_CASE(4)
+        BPP_CASE(6)
+        BPP_CASE(8)
+        BPP_CASE(12)
+        BPP_CASE(16)
+#undef BPP_CASE
+    default:
+        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
+    }
+}
+
+void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
+                         u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
+                         u32 origin_y, u8* output, const u8* input) {
+    switch (bytes_per_pixel) {
+#define BPP_CASE(x)                                                                                \
+    case x:                                                                                        \
+        return SwizzleSliceToVoxel<x>(line_length_in, line_count, pitch, width, height,            \
+                                      block_height, block_depth, origin_x, origin_y, output,       \
+                                      input);
+        BPP_CASE(1)
+        BPP_CASE(2)
+        BPP_CASE(3)
+        BPP_CASE(4)
+        BPP_CASE(6)
+        BPP_CASE(8)
+        BPP_CASE(12)
+        BPP_CASE(16)
+#undef BPP_CASE
+    default:
+        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
+    }
+}
 
 void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
                    const u32 block_height_bit, const std::size_t copy_size, const u8* source_data,
@@ -228,7 +287,7 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32
             u8* dest_addr = swizzle_data + swizzled_offset;
             count++;
 
-            std::memcpy(dest_addr, source_line, 1);
+            *dest_addr = *source_line;
         }
     }
 }
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index 1a9399455..7994cb859 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -159,7 +159,7 @@ static_assert(sizeof(TextureHandle) == 4, "TextureHandle has wrong size");
         return {raw, raw};
     } else {
         const Tegra::Texture::TextureHandle handle{raw};
-        return {handle.tic_id, via_header_index ? handle.tic_id : handle.tsc_id};
+        return {handle.tic_id, handle.tsc_id};
     }
 }
 
diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp
index 8e56a89e1..86ca4be54 100644
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -368,18 +368,24 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
     };
     SetNext(next, demote);
 
+    // Declared outside the branch on purpose: SetNext links the struct into the `next` chain
+    // (presumably by address), so it has to outlive the block that fills it in.
     VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8;
-    if (is_float16_supported) {
+    if (is_int8_supported || is_float16_supported) {
         float16_int8 = {
             .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR,
             .pNext = nullptr,
-            .shaderFloat16 = true,
-            .shaderInt8 = false,
+            .shaderFloat16 = is_float16_supported,
+            .shaderInt8 = is_int8_supported,
         };
         SetNext(next, float16_int8);
-    } else {
+    }
+    if (!is_float16_supported) {
         LOG_INFO(Render_Vulkan, "Device doesn't support float16 natively");
     }
+    if (!is_int8_supported) {
+        LOG_INFO(Render_Vulkan, "Device doesn't support int8 natively");
+    }
 
     if (!nv_viewport_swizzle) {
         LOG_INFO(Render_Vulkan, "Device doesn't support viewport swizzles");
@@ -909,6 +915,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
 
         physical.GetFeatures2KHR(features);
         is_float16_supported = float16_int8_features.shaderFloat16;
+        is_int8_supported = float16_int8_features.shaderInt8;
         extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
     }
     if (has_ext_subgroup_size_control) {
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index c19f40746..234d74129 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -139,11 +139,16 @@ public:
         return is_optimal_astc_supported;
     }
 
-    /// Returns true if the device supports float16 natively
+    /// Returns true if the device supports float16 natively.
     bool IsFloat16Supported() const {
         return is_float16_supported;
     }
 
+    /// Returns true if the device supports int8 natively.
+    bool IsInt8Supported() const {
+        return is_int8_supported;
+    }
+
     /// Returns true if the device warp size can potentially be bigger than guest's warp size.
     bool IsWarpSizePotentiallyBiggerThanGuest() const {
         return is_warp_potentially_bigger;
@@ -367,7 +372,8 @@ private:
     u64 device_access_memory{};                 ///< Total size of device local memory in bytes.
     u32 max_push_descriptors{};                 ///< Maximum number of push descriptors
     bool is_optimal_astc_supported{};           ///< Support for native ASTC.
-    bool is_float16_supported{};                ///< Support for float16 arithmetics.
+    bool is_float16_supported{};                ///< Support for float16 arithmetic.
+    bool is_int8_supported{};                   ///< Support for int8 arithmetic.
     bool is_warp_potentially_bigger{};          ///< Host warp size can be bigger than guest.
     bool is_formatless_image_load_supported{};  ///< Support for shader image read without format.
     bool is_depth_bounds_supported{};           ///< Support for depth bounds.