Diffstat (limited to 'src')
39 files changed, 974 insertions, 247 deletions
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 9b0c3db68..9afc6105d 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -15,6 +15,10 @@ endif () if (DEFINED ENV{DISPLAYVERSION}) set(DISPLAY_VERSION $ENV{DISPLAYVERSION}) endif () + +# Pass the path to git to the GenerateSCMRev.cmake as well +find_package(Git QUIET) + add_custom_command(OUTPUT scm_rev.cpp COMMAND ${CMAKE_COMMAND} -DSRC_DIR="${CMAKE_SOURCE_DIR}" @@ -23,6 +27,7 @@ add_custom_command(OUTPUT scm_rev.cpp -DTITLE_BAR_FORMAT_RUNNING="${TITLE_BAR_FORMAT_RUNNING}" -DBUILD_TAG="${BUILD_TAG}" -DBUILD_ID="${DISPLAY_VERSION}" + -DGIT_EXECUTABLE="${GIT_EXECUTABLE}" -P "${CMAKE_SOURCE_DIR}/CMakeModules/GenerateSCMRev.cmake" DEPENDS # WARNING! It was too much work to try and make a common location for this list, diff --git a/src/common/telemetry.cpp b/src/common/telemetry.cpp index f53a8d193..200c6489a 100644 --- a/src/common/telemetry.cpp +++ b/src/common/telemetry.cpp @@ -44,20 +44,6 @@ template class Field<std::string>; template class Field<const char*>; template class Field<std::chrono::microseconds>; -#ifdef ARCHITECTURE_x86_64 -static const char* CpuVendorToStr(Common::CPUVendor vendor) { - switch (vendor) { - case Common::CPUVendor::INTEL: - return "Intel"; - case Common::CPUVendor::AMD: - return "Amd"; - case Common::CPUVendor::OTHER: - return "Other"; - } - UNREACHABLE(); -} -#endif - void AppendBuildInfo(FieldCollection& fc) { const bool is_git_dirty{std::strstr(Common::g_scm_desc, "dirty") != nullptr}; fc.AddField(FieldType::App, "Git_IsDirty", is_git_dirty); @@ -71,7 +57,6 @@ void AppendCPUInfo(FieldCollection& fc) { #ifdef ARCHITECTURE_x86_64 fc.AddField(FieldType::UserSystem, "CPU_Model", Common::GetCPUCaps().cpu_string); fc.AddField(FieldType::UserSystem, "CPU_BrandString", Common::GetCPUCaps().brand_string); - fc.AddField(FieldType::UserSystem, "CPU_Vendor", CpuVendorToStr(Common::GetCPUCaps().vendor)); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AES", Common::GetCPUCaps().aes); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX", Common::GetCPUCaps().avx); fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX2", Common::GetCPUCaps().avx2); diff --git a/src/common/x64/cpu_detect.cpp b/src/common/x64/cpu_detect.cpp index 2dfcd39c8..c9349a6b4 100644 --- a/src/common/x64/cpu_detect.cpp +++ b/src/common/x64/cpu_detect.cpp @@ -3,8 +3,6 @@ // Refer to the license.txt file included. #include <cstring> -#include <string> -#include <thread> #include "common/common_types.h" #include "common/x64/cpu_detect.h" @@ -51,8 +49,6 @@ namespace Common { static CPUCaps Detect() { CPUCaps caps = {}; - caps.num_cores = std::thread::hardware_concurrency(); - // Assumes the CPU supports the CPUID instruction. 
Those that don't would likely not support // yuzu at all anyway @@ -70,12 +66,6 @@ static CPUCaps Detect() { __cpuid(cpu_id, 0x80000000); u32 max_ex_fn = cpu_id[0]; - if (!strcmp(caps.brand_string, "GenuineIntel")) - caps.vendor = CPUVendor::INTEL; - else if (!strcmp(caps.brand_string, "AuthenticAMD")) - caps.vendor = CPUVendor::AMD; - else - caps.vendor = CPUVendor::OTHER; // Set reasonable default brand string even if brand string not available strcpy(caps.cpu_string, caps.brand_string); @@ -96,15 +86,9 @@ static CPUCaps Detect() { caps.sse4_1 = true; if ((cpu_id[2] >> 20) & 1) caps.sse4_2 = true; - if ((cpu_id[2] >> 22) & 1) - caps.movbe = true; if ((cpu_id[2] >> 25) & 1) caps.aes = true; - if ((cpu_id[3] >> 24) & 1) { - caps.fxsave_fxrstor = true; - } - // AVX support requires 3 separate checks: // - Is the AVX bit set in CPUID? // - Is the XSAVE bit set in CPUID? @@ -129,8 +113,6 @@ static CPUCaps Detect() { } } - caps.flush_to_zero = caps.sse; - if (max_ex_fn >= 0x80000004) { // Extract CPU model string __cpuid(cpu_id, 0x80000002); @@ -144,14 +126,8 @@ static CPUCaps Detect() { if (max_ex_fn >= 0x80000001) { // Check for more features __cpuid(cpu_id, 0x80000001); - if (cpu_id[2] & 1) - caps.lahf_sahf_64 = true; - if ((cpu_id[2] >> 5) & 1) - caps.lzcnt = true; if ((cpu_id[2] >> 16) & 1) caps.fma4 = true; - if ((cpu_id[3] >> 29) & 1) - caps.long_mode = true; } return caps; @@ -162,48 +138,4 @@ const CPUCaps& GetCPUCaps() { return caps; } -std::string GetCPUCapsString() { - auto caps = GetCPUCaps(); - - std::string sum(caps.cpu_string); - sum += " ("; - sum += caps.brand_string; - sum += ")"; - - if (caps.sse) - sum += ", SSE"; - if (caps.sse2) { - sum += ", SSE2"; - if (!caps.flush_to_zero) - sum += " (without DAZ)"; - } - - if (caps.sse3) - sum += ", SSE3"; - if (caps.ssse3) - sum += ", SSSE3"; - if (caps.sse4_1) - sum += ", SSE4.1"; - if (caps.sse4_2) - sum += ", SSE4.2"; - if (caps.avx) - sum += ", AVX"; - if (caps.avx2) - sum += ", AVX2"; - if (caps.bmi1) - sum += ", BMI1"; - if (caps.bmi2) - sum += ", BMI2"; - if (caps.fma) - sum += ", FMA"; - if (caps.aes) - sum += ", AES"; - if (caps.movbe) - sum += ", MOVBE"; - if (caps.long_mode) - sum += ", 64-bit support"; - - return sum; -} - } // namespace Common diff --git a/src/common/x64/cpu_detect.h b/src/common/x64/cpu_detect.h index 0af3a8adb..20f2ba234 100644 --- a/src/common/x64/cpu_detect.h +++ b/src/common/x64/cpu_detect.h @@ -4,23 +4,12 @@ #pragma once -#include <string> - namespace Common { -/// x86/x64 CPU vendors that may be detected by this module -enum class CPUVendor { - INTEL, - AMD, - OTHER, -}; - /// x86/x64 CPU capabilities that may be detected by this module struct CPUCaps { - CPUVendor vendor; char cpu_string[0x21]; char brand_string[0x41]; - int num_cores; bool sse; bool sse2; bool sse3; @@ -35,20 +24,6 @@ struct CPUCaps { bool fma; bool fma4; bool aes; - - // Support for the FXSAVE and FXRSTOR instructions - bool fxsave_fxrstor; - - bool movbe; - - // This flag indicates that the hardware supports some mode in which denormal inputs and outputs - // are automatically set to (signed) zero. 
- bool flush_to_zero; - - // Support for LAHF and SAHF instructions in 64-bit mode - bool lahf_sahf_64; - - bool long_mode; }; /** @@ -57,10 +32,4 @@ struct CPUCaps { */ const CPUCaps& GetCPUCaps(); -/** - * Gets a string summary of the name and supported capabilities of the host CPU - * @return String summary - */ -std::string GetCPUCapsString(); - } // namespace Common diff --git a/src/core/hle/kernel/physical_memory.h b/src/core/hle/kernel/physical_memory.h index 090565310..b689e8e8b 100644 --- a/src/core/hle/kernel/physical_memory.h +++ b/src/core/hle/kernel/physical_memory.h @@ -14,6 +14,9 @@ namespace Kernel { // - Second to ensure all host backing memory used is aligned to 256 bytes due // to strict alignment restrictions on GPU memory. -using PhysicalMemory = std::vector<u8, Common::AlignmentAllocator<u8, 256>>; +using PhysicalMemoryVector = std::vector<u8, Common::AlignmentAllocator<u8, 256>>; +class PhysicalMemory final : public PhysicalMemoryVector { + using PhysicalMemoryVector::PhysicalMemoryVector; +}; } // namespace Kernel diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp index 12ea4ebe3..b9035a0be 100644 --- a/src/core/hle/kernel/process.cpp +++ b/src/core/hle/kernel/process.cpp @@ -317,6 +317,8 @@ void Process::FreeTLSRegion(VAddr tls_address) { } void Process::LoadModule(CodeSet module_, VAddr base_addr) { + code_memory_size += module_.memory.size(); + const auto memory = std::make_shared<PhysicalMemory>(std::move(module_.memory)); const auto MapSegment = [&](const CodeSet::Segment& segment, VMAPermission permissions, @@ -332,8 +334,6 @@ void Process::LoadModule(CodeSet module_, VAddr base_addr) { MapSegment(module_.CodeSegment(), VMAPermission::ReadExecute, MemoryState::Code); MapSegment(module_.RODataSegment(), VMAPermission::Read, MemoryState::CodeData); MapSegment(module_.DataSegment(), VMAPermission::ReadWrite, MemoryState::CodeData); - - code_memory_size += module_.memory.size(); } Process::Process(Core::System& system) diff --git a/src/core/hle/kernel/vm_manager.cpp b/src/core/hle/kernel/vm_manager.cpp index a9a20ef76..0b3500fce 100644 --- a/src/core/hle/kernel/vm_manager.cpp +++ b/src/core/hle/kernel/vm_manager.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <algorithm> +#include <cstring> #include <iterator> #include <utility> #include "common/alignment.h" @@ -269,18 +270,9 @@ ResultVal<VAddr> VMManager::SetHeapSize(u64 size) { // If necessary, expand backing vector to cover new heap extents in // the case of allocating. Otherwise, shrink the backing memory, // if a smaller heap has been requested. - const u64 old_heap_size = GetCurrentHeapSize(); - if (size > old_heap_size) { - const u64 alloc_size = size - old_heap_size; - - heap_memory->insert(heap_memory->end(), alloc_size, 0); - RefreshMemoryBlockMappings(heap_memory.get()); - } else if (size < old_heap_size) { - heap_memory->resize(size); - heap_memory->shrink_to_fit(); - - RefreshMemoryBlockMappings(heap_memory.get()); - } + heap_memory->resize(size); + heap_memory->shrink_to_fit(); + RefreshMemoryBlockMappings(heap_memory.get()); heap_end = heap_region_base + size; ASSERT(GetCurrentHeapSize() == heap_memory->size()); @@ -752,24 +744,20 @@ void VMManager::MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryAre // Always merge allocated memory blocks, even when they don't share the same backing block. 
if (left.type == VMAType::AllocatedMemoryBlock && (left.backing_block != right.backing_block || left.offset + left.size != right.offset)) { - const auto right_begin = right.backing_block->begin() + right.offset; - const auto right_end = right_begin + right.size; // Check if we can save work. if (left.offset == 0 && left.size == left.backing_block->size()) { // Fast case: left is an entire backing block. - left.backing_block->insert(left.backing_block->end(), right_begin, right_end); + left.backing_block->resize(left.size + right.size); + std::memcpy(left.backing_block->data() + left.size, + right.backing_block->data() + right.offset, right.size); } else { // Slow case: make a new memory block for left and right. - const auto left_begin = left.backing_block->begin() + left.offset; - const auto left_end = left_begin + left.size; - const auto left_size = static_cast<std::size_t>(std::distance(left_begin, left_end)); - const auto right_size = static_cast<std::size_t>(std::distance(right_begin, right_end)); - auto new_memory = std::make_shared<PhysicalMemory>(); - new_memory->reserve(left_size + right_size); - new_memory->insert(new_memory->end(), left_begin, left_end); - new_memory->insert(new_memory->end(), right_begin, right_end); + new_memory->resize(left.size + right.size); + std::memcpy(new_memory->data(), left.backing_block->data() + left.offset, left.size); + std::memcpy(new_memory->data() + left.size, right.backing_block->data() + right.offset, + right.size); left.backing_block = std::move(new_memory); left.offset = 0; @@ -792,8 +780,7 @@ void VMManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) { memory.UnmapRegion(page_table, vma.base, vma.size); break; case VMAType::AllocatedMemoryBlock: - memory.MapMemoryRegion(page_table, vma.base, vma.size, - vma.backing_block->data() + vma.offset); + memory.MapMemoryRegion(page_table, vma.base, vma.size, *vma.backing_block, vma.offset); break; case VMAType::BackingMemory: memory.MapMemoryRegion(page_table, vma.base, vma.size, vma.backing_memory); diff --git a/src/core/loader/elf.cpp b/src/core/loader/elf.cpp index f1795fdd6..8908e5328 100644 --- a/src/core/loader/elf.cpp +++ b/src/core/loader/elf.cpp @@ -335,7 +335,8 @@ Kernel::CodeSet ElfReader::LoadInto(VAddr vaddr) { codeset_segment->addr = segment_addr; codeset_segment->size = aligned_size; - memcpy(&program_image[current_image_position], GetSegmentPtr(i), p->p_filesz); + std::memcpy(program_image.data() + current_image_position, GetSegmentPtr(i), + p->p_filesz); current_image_position += aligned_size; } } diff --git a/src/core/loader/kip.cpp b/src/core/loader/kip.cpp index 474b55cb1..092103abe 100644 --- a/src/core/loader/kip.cpp +++ b/src/core/loader/kip.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
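The loader changes in this area (ELF above, KIP here, NSO below) all drop iterator-based vector::insert in favor of sizing the image once and copying with std::memcpy; this plays well with Kernel::PhysicalMemory, which the physical_memory.h hunk above turns from a type alias into a real class precisely so it can be forward-declared and overloaded on (see the core/memory.h hunk further down). A minimal sketch of the load pattern, assuming a plain byte vector stands in for PhysicalMemory and segments are laid out at increasing offsets:

    #include <cstdint>
    #include <cstring>
    #include <vector>

    using PhysicalMemory = std::vector<std::uint8_t>; // stand-in for Kernel::PhysicalMemory

    void LoadSegment(PhysicalMemory& program_image, const std::vector<std::uint8_t>& data,
                     std::size_t offset) {
        // Size the image once; resize() zero-fills anything between the previous
        // end and `offset`, then the segment bytes are copied into place.
        program_image.resize(offset + data.size());
        std::memcpy(program_image.data() + offset, data.data(), data.size());
    }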
+#include <cstring> #include "core/file_sys/kernel_executable.h" #include "core/file_sys/program_metadata.h" #include "core/gdbstub/gdbstub.h" @@ -76,8 +77,8 @@ AppLoader::LoadResult AppLoader_KIP::Load(Kernel::Process& process) { segment.addr = offset; segment.offset = offset; segment.size = PageAlignSize(static_cast<u32>(data.size())); - program_image.resize(offset); - program_image.insert(program_image.end(), data.begin(), data.end()); + program_image.resize(offset + data.size()); + std::memcpy(program_image.data() + offset, data.data(), data.size()); }; load_segment(codeset.CodeSegment(), kip->GetTextSection(), kip->GetTextOffset()); diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp index f629892ae..515c5accb 100644 --- a/src/core/loader/nso.cpp +++ b/src/core/loader/nso.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include <cinttypes> +#include <cstring> #include <vector> #include "common/common_funcs.h" @@ -96,8 +97,9 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process, if (nso_header.IsSegmentCompressed(i)) { data = DecompressSegment(data, nso_header.segments[i]); } - program_image.resize(nso_header.segments[i].location); - program_image.insert(program_image.end(), data.begin(), data.end()); + program_image.resize(nso_header.segments[i].location + data.size()); + std::memcpy(program_image.data() + nso_header.segments[i].location, data.data(), + data.size()); codeset.segments[i].addr = nso_header.segments[i].location; codeset.segments[i].offset = nso_header.segments[i].location; codeset.segments[i].size = PageAlignSize(static_cast<u32>(data.size())); @@ -139,12 +141,12 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process, std::vector<u8> pi_header; pi_header.insert(pi_header.begin(), reinterpret_cast<u8*>(&nso_header), reinterpret_cast<u8*>(&nso_header) + sizeof(NSOHeader)); - pi_header.insert(pi_header.begin() + sizeof(NSOHeader), program_image.begin(), - program_image.end()); + pi_header.insert(pi_header.begin() + sizeof(NSOHeader), program_image.data(), + program_image.data() + program_image.size()); pi_header = pm->PatchNSO(pi_header, file.GetName()); - std::copy(pi_header.begin() + sizeof(NSOHeader), pi_header.end(), program_image.begin()); + std::copy(pi_header.begin() + sizeof(NSOHeader), pi_header.end(), program_image.data()); } // Apply cheats if they exist and the program has a valid title ID diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 3c2a29d9b..f0888327f 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -14,6 +14,7 @@ #include "common/swap.h" #include "core/arm/arm_interface.h" #include "core/core.h" +#include "core/hle/kernel/physical_memory.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/vm_manager.h" #include "core/memory.h" @@ -38,6 +39,11 @@ struct Memory::Impl { system.ArmInterface(3).PageTableChanged(*current_page_table, address_space_width); } + void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, + Kernel::PhysicalMemory& memory, VAddr offset) { + MapMemoryRegion(page_table, base, size, memory.data() + offset); + } + void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, u8* target) { ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: {:016X}", size); ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: {:016X}", base); @@ -601,6 +607,11 @@ void Memory::SetCurrentPageTable(Kernel::Process& process) { impl->SetCurrentPageTable(process); } +void Memory::MapMemoryRegion(Common::PageTable& 
page_table, VAddr base, u64 size, + Kernel::PhysicalMemory& memory, VAddr offset) { + impl->MapMemoryRegion(page_table, base, size, memory, offset); +} + void Memory::MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, u8* target) { impl->MapMemoryRegion(page_table, base, size, target); } diff --git a/src/core/memory.h b/src/core/memory.h index 1428a6d60..8913a9da4 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -19,8 +19,9 @@ class System; } namespace Kernel { +class PhysicalMemory; class Process; -} +} // namespace Kernel namespace Memory { @@ -66,6 +67,19 @@ public: void SetCurrentPageTable(Kernel::Process& process); /** + * Maps a physical buffer onto a region of the emulated process address space. + * + * @param page_table The page table of the emulated process. + * @param base The address to start mapping at. Must be page-aligned. + * @param size The amount of bytes to map. Must be page-aligned. + * @param memory Physical buffer with the memory backing the mapping. Must be of length + * at least `size + offset`. + * @param offset The offset within the physical memory. Must be page-aligned. + */ + void MapMemoryRegion(Common::PageTable& page_table, VAddr base, u64 size, + Kernel::PhysicalMemory& memory, VAddr offset); + + /** * Maps an allocated buffer onto a region of the emulated process address space. * * @param page_table The page table of the emulated process. diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 142852082..729ee4a01 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -190,8 +190,11 @@ if (ENABLE_VULKAN) renderer_vulkan/vk_stream_buffer.h renderer_vulkan/vk_swapchain.cpp renderer_vulkan/vk_swapchain.h + renderer_vulkan/vk_texture_cache.cpp + renderer_vulkan/vk_texture_cache.h renderer_vulkan/vk_update_descriptor.cpp - renderer_vulkan/vk_update_descriptor.h) + renderer_vulkan/vk_update_descriptor.h + ) target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) target_compile_definitions(video_core PRIVATE HAS_VULKAN) diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index a35e7a195..ee79260fc 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -1018,7 +1018,14 @@ public: } } instanced_arrays; - INSERT_UNION_PADDING_WORDS(0x6); + INSERT_UNION_PADDING_WORDS(0x4); + + union { + BitField<0, 1, u32> enable; + BitField<4, 8, u32> unk4; + } vp_point_size; + + INSERT_UNION_PADDING_WORDS(1); Cull cull; @@ -1271,8 +1278,6 @@ public: } dirty{}; - std::array<u8, Regs::NUM_REGS> dirty_pointers{}; - /// Reads a register value located at the input method address u32 GetRegisterValue(u32 method) const; @@ -1367,6 +1372,8 @@ private: bool execute_on{true}; + std::array<u8, Regs::NUM_REGS> dirty_pointers{}; + /// Retrieves information about a specific TIC entry from the TIC buffer.
Texture::TICEntry GetTICEntry(u32 tic_index) const; @@ -1503,6 +1510,7 @@ ASSERT_REG_POSITION(primitive_restart, 0x591); ASSERT_REG_POSITION(index_array, 0x5F2); ASSERT_REG_POSITION(polygon_offset_clamp, 0x61F); ASSERT_REG_POSITION(instanced_arrays, 0x620); +ASSERT_REG_POSITION(vp_point_size, 0x644); ASSERT_REG_POSITION(cull, 0x646); ASSERT_REG_POSITION(pixel_center_integer, 0x649); ASSERT_REG_POSITION(viewport_transform_enabled, 0x64B); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index 57b57c647..6f98bd827 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -215,6 +215,18 @@ enum class F2fRoundingOp : u64 { Trunc = 11, }; +enum class AtomicOp : u64 { + Add = 0, + Min = 1, + Max = 2, + Inc = 3, + Dec = 4, + And = 5, + Or = 6, + Xor = 7, + Exch = 8, +}; + enum class UniformType : u64 { UnsignedByte = 0, SignedByte = 1, @@ -236,6 +248,13 @@ enum class StoreType : u64 { Bits128 = 6, }; +enum class AtomicType : u64 { + U32 = 0, + S32 = 1, + U64 = 2, + S64 = 3, +}; + enum class IMinMaxExchange : u64 { None = 0, XLo = 1, @@ -939,6 +958,16 @@ union Instruction { } stg; union { + BitField<52, 4, AtomicOp> operation; + BitField<28, 2, AtomicType> type; + BitField<30, 22, s64> offset; + + s32 GetImmediateOffset() const { + return static_cast<s32>(offset << 2); + } + } atoms; + + union { BitField<32, 1, PhysicalAttributeDirection> direction; BitField<47, 3, AttributeSize> size; BitField<20, 11, u64> address; @@ -1659,9 +1688,10 @@ public: ST_A, ST_L, ST_S, - ST, // Store in generic memory - STG, // Store in global memory - AL2P, // Transforms attribute memory into physical memory + ST, // Store in generic memory + STG, // Store in global memory + ATOMS, // Atomic operation on shared memory + AL2P, // Transforms attribute memory into physical memory TEX, TEX_B, // Texture Load Bindless TXQ, // Texture Query @@ -1964,6 +1994,7 @@ private: INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"), INST("101-------------", Id::ST, Type::Memory, "ST"), INST("1110111011011---", Id::STG, Type::Memory, "STG"), + INST("11101100--------", Id::ATOMS, Type::Memory, "ATOMS"), INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"), INST("110000----111---", Id::TEX, Type::Texture, "TEX"), INST("1101111010111---", Id::TEX_B, Type::Texture, "TEX_B"), diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 672051102..926bccd42 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -1272,6 +1272,7 @@ void RasterizerOpenGL::SyncPointState() { const auto& regs = system.GPU().Maxwell3D().regs; // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid // in OpenGL). + state.point.program_control = regs.vp_point_size.enable ? 
GL_TRUE : GL_FALSE; state.point.size = std::max(1.0f, regs.point_size); } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index de742d11c..a4acb3796 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -34,9 +34,6 @@ using VideoCommon::Shader::ShaderIR; namespace { -// One UBO is always reserved for emulation values on staged shaders -constexpr u32 STAGE_RESERVED_UBOS = 1; - constexpr u32 STAGE_MAIN_OFFSET = 10; constexpr u32 KERNEL_MAIN_OFFSET = 0; @@ -243,7 +240,6 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp if (!code_b.empty()) { ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker); } - const auto entries = GLShader::GetEntries(ir); std::string source = fmt::format(R"(// {} #version 430 core @@ -314,9 +310,10 @@ std::unordered_set<GLenum> GetSupportedFormats() { CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type, GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b) - : RasterizerCacheObject{params.host_ptr}, system{params.system}, disk_cache{params.disk_cache}, - device{params.device}, cpu_addr{params.cpu_addr}, unique_identifier{params.unique_identifier}, - shader_type{shader_type}, entries{entries}, code{std::move(code)}, code_b{std::move(code_b)} { + : RasterizerCacheObject{params.host_ptr}, system{params.system}, + disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr}, + unique_identifier{params.unique_identifier}, shader_type{shader_type}, + entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} { if (!params.precompiled_variants) { return; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index f9f7a97b5..19751939a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -1856,6 +1856,16 @@ private: Type::Uint}; } + template <const std::string_view& opname, Type type> + Expression Atomic(Operation operation) { + ASSERT(stage == ShaderType::Compute); + auto& smem = std::get<SmemNode>(*operation[0]); + + return {fmt::format("atomic{}(smem[{} >> 2], {})", opname, Visit(smem.GetAddress()).AsInt(), + Visit(operation[1]).As(type)), + type}; + } + Expression Branch(Operation operation) { const auto target = std::get_if<ImmediateNode>(&*operation[0]); UNIMPLEMENTED_IF(!target); @@ -2194,6 +2204,8 @@ private: &GLSLDecompiler::AtomicImage<Func::Xor>, &GLSLDecompiler::AtomicImage<Func::Exchange>, + &GLSLDecompiler::Atomic<Func::Add, Type::Uint>, + &GLSLDecompiler::Branch, &GLSLDecompiler::BranchIndirect, &GLSLDecompiler::PushFlowStack, diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index df2e2395a..cc185e9e1 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -127,6 +127,7 @@ void OpenGLState::ApplyClipDistances() { } void OpenGLState::ApplyPointSize() { + Enable(GL_PROGRAM_POINT_SIZE, cur_state.point.program_control, point.program_control); if (UpdateValue(cur_state.point.size, point.size)) { glPointSize(point.size); } diff --git a/src/video_core/renderer_opengl/gl_state.h b/src/video_core/renderer_opengl/gl_state.h index fb180f302..71d418776 100644 --- a/src/video_core/renderer_opengl/gl_state.h +++ b/src/video_core/renderer_opengl/gl_state.h @@ 
-131,7 +131,8 @@ public: std::array<Viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports; struct { - float size = 1.0f; // GL_POINT_SIZE + GLboolean program_control = GL_FALSE; // GL_PROGRAM_POINT_SIZE + GLfloat size = 1.0f; // GL_POINT_SIZE } point; struct { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b790b0ef4..e95eb069e 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -44,7 +44,7 @@ struct FormatTuple { constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, false}, // ABGR8U - {GL_RGBA8, GL_RGBA, GL_BYTE, false}, // ABGR8S + {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE, false}, // ABGR8S {GL_RGBA8UI, GL_RGBA_INTEGER, GL_UNSIGNED_BYTE, false}, // ABGR8UI {GL_RGB565, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, false}, // B5G6R5U {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV, false}, // A2B10G10R10U @@ -83,9 +83,9 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_RGB32F, GL_RGB, GL_FLOAT, false}, // RGB32F {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV, false}, // RGBA8_SRGB {GL_RG8, GL_RG, GL_UNSIGNED_BYTE, false}, // RG8U - {GL_RG8, GL_RG, GL_BYTE, false}, // RG8S + {GL_RG8_SNORM, GL_RG, GL_BYTE, false}, // RG8S {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, false}, // RG32UI - {GL_RGB16F, GL_RGBA16, GL_HALF_FLOAT, false}, // RGBX16F + {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBX16F {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, false}, // R32UI {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X5 @@ -253,14 +253,12 @@ void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { glPixelStorei(GL_PACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level))); glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level); + u8* const mip_data = staging_buffer.data() + mip_offset; + const GLsizei size = static_cast<GLsizei>(params.GetHostMipmapSize(level)); if (is_compressed) { - glGetCompressedTextureImage(texture.handle, level, - static_cast<GLsizei>(params.GetHostMipmapSize(level)), - staging_buffer.data() + mip_offset); + glGetCompressedTextureImage(texture.handle, level, size, mip_data); } else { - glGetTextureImage(texture.handle, level, format, type, - static_cast<GLsizei>(params.GetHostMipmapSize(level)), - staging_buffer.data() + mip_offset); + glGetTextureImage(texture.handle, level, format, type, size, mip_data); } } } diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp index 9770dda1c..ac99e6385 100644 --- a/src/video_core/renderer_opengl/utils.cpp +++ b/src/video_core/renderer_opengl/utils.cpp @@ -6,16 +6,20 @@ #include <vector> #include <fmt/format.h> - #include <glad/glad.h> -#include "common/assert.h" #include "common/common_types.h" -#include "common/scope_exit.h" #include "video_core/renderer_opengl/utils.h" namespace OpenGL { +struct VertexArrayPushBuffer::Entry { + GLuint binding_index{}; + const GLuint* buffer{}; + GLintptr offset{}; + GLsizei stride{}; +}; + VertexArrayPushBuffer::VertexArrayPushBuffer() = default; VertexArrayPushBuffer::~VertexArrayPushBuffer() = default; @@ -47,6 +51,13 @@ void VertexArrayPushBuffer::Bind() { } } +struct 
BindBuffersRangePushBuffer::Entry { + GLuint binding; + const GLuint* buffer; + GLintptr offset; + GLsizeiptr size; +}; + BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {} BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default; diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h index d56153fe7..3ad7c02d4 100644 --- a/src/video_core/renderer_opengl/utils.h +++ b/src/video_core/renderer_opengl/utils.h @@ -26,12 +26,7 @@ public: void Bind(); private: - struct Entry { - GLuint binding_index{}; - const GLuint* buffer{}; - GLintptr offset{}; - GLsizei stride{}; - }; + struct Entry; GLuint vao{}; const GLuint* index_buffer{}; @@ -50,12 +45,7 @@ public: void Bind(); private: - struct Entry { - GLuint binding; - const GLuint* buffer; - GLintptr offset; - GLsizeiptr size; - }; + struct Entry; GLenum target; std::vector<Entry> entries; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 000e3616d..331808113 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -44,7 +44,7 @@ vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filt return {}; } -vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, +vk::SamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter) { switch (wrap_mode) { case Tegra::Texture::WrapMode::Wrap: @@ -56,7 +56,12 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, case Tegra::Texture::WrapMode::Border: return vk::SamplerAddressMode::eClampToBorder; case Tegra::Texture::WrapMode::Clamp: - // TODO(Rodrigo): Emulate GL_CLAMP properly + if (device.GetDriverID() == vk::DriverIdKHR::eNvidiaProprietary) { + // Nvidia's Vulkan driver defaults to GL_CLAMP on invalid enumerations, we can hack this + // by sending an invalid enumeration. 
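For context: classic GL_CLAMP clamps coordinates to [0, 1] and, under linear filtering, blends edge texels with the border color, which none of Vulkan's standard address modes reproduce. Callers now thread the device through so the helper can apply this workaround; a hedged sketch of a call site (names mirror the vk_sampler_cache.cpp hunk below):

    // Hypothetical call site: the device parameter lets WrapMode() special-case
    // the Nvidia proprietary driver.
    const vk::SamplerAddressMode wrap_u =
        MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter);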
+ return static_cast<vk::SamplerAddressMode>(0xcafe); + } + // TODO(Rodrigo): Emulate GL_CLAMP properly on other vendors switch (filter) { case Tegra::Texture::TextureFilter::Nearest: return vk::SamplerAddressMode::eClampToEdge; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 1534b738b..7e9678b7b 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -22,7 +22,7 @@ vk::Filter Filter(Tegra::Texture::TextureFilter filter); vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter); -vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode, +vk::SamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter); vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func); diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index 1ce583f75..0a8ec8398 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -46,9 +46,9 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) {}, MaxwellToVK::Sampler::Filter(tsc.mag_filter), MaxwellToVK::Sampler::Filter(tsc.min_filter), MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_u, tsc.mag_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_v, tsc.mag_filter), - MaxwellToVK::Sampler::WrapMode(tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), + MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(), has_anisotropy, max_anisotropy, tsc.depth_compare_enabled, MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), tsc.GetMinLod(), tsc.GetMaxLod(), vk_border_color.value_or(vk::BorderColor::eFloatTransparentBlack), diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 8fe852ce8..0cf97cafa 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -1796,6 +1796,11 @@ private: return {}; } + Expression UAtomicAdd(Operation) { + UNIMPLEMENTED(); + return {}; + } + Expression Branch(Operation operation) { const auto& target = std::get<ImmediateNode>(*operation[0]); OpStore(jmp_to, Constant(t_uint, target.GetValue())); @@ -2373,6 +2378,8 @@ private: &SPIRVDecompiler::AtomicImageXor, &SPIRVDecompiler::AtomicImageExchange, + &SPIRVDecompiler::UAtomicAdd, + &SPIRVDecompiler::Branch, &SPIRVDecompiler::BranchIndirect, &SPIRVDecompiler::PushFlowStack, diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 02310375f..4d9488f49 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -13,6 +13,7 @@ #include "video_core/renderer_vulkan/declarations.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_resource_manager.h" namespace Vulkan { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp new file mode 100644 index 
000000000..51b0d38a6 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -0,0 +1,475 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <array> +#include <cstddef> +#include <cstring> +#include <memory> +#include <variant> +#include <vector> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/common_types.h" +#include "core/core.h" +#include "core/memory.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/morton.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/maxwell_to_vk.h" +#include "video_core/renderer_vulkan/vk_device.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/surface.h" +#include "video_core/textures/convert.h" + +namespace Vulkan { + +using VideoCore::MortonSwizzle; +using VideoCore::MortonSwizzleMode; + +using Tegra::Texture::SwizzleSource; +using VideoCore::Surface::PixelFormat; +using VideoCore::Surface::SurfaceCompression; +using VideoCore::Surface::SurfaceTarget; + +namespace { + +vk::ImageType SurfaceTargetToImage(SurfaceTarget target) { + switch (target) { + case SurfaceTarget::Texture1D: + case SurfaceTarget::Texture1DArray: + return vk::ImageType::e1D; + case SurfaceTarget::Texture2D: + case SurfaceTarget::Texture2DArray: + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubeArray: + return vk::ImageType::e2D; + case SurfaceTarget::Texture3D: + return vk::ImageType::e3D; + } + UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target)); + return {}; +} + +vk::ImageAspectFlags PixelFormatToImageAspect(PixelFormat pixel_format) { + if (pixel_format < PixelFormat::MaxColorFormat) { + return vk::ImageAspectFlagBits::eColor; + } else if (pixel_format < PixelFormat::MaxDepthFormat) { + return vk::ImageAspectFlagBits::eDepth; + } else if (pixel_format < PixelFormat::MaxDepthStencilFormat) { + return vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil; + } else { + UNREACHABLE_MSG("Invalid pixel format={}", static_cast<u32>(pixel_format)); + return vk::ImageAspectFlagBits::eColor; + } +} + +vk::ImageViewType GetImageViewType(SurfaceTarget target) { + switch (target) { + case SurfaceTarget::Texture1D: + return vk::ImageViewType::e1D; + case SurfaceTarget::Texture2D: + return vk::ImageViewType::e2D; + case SurfaceTarget::Texture3D: + return vk::ImageViewType::e3D; + case SurfaceTarget::Texture1DArray: + return vk::ImageViewType::e1DArray; + case SurfaceTarget::Texture2DArray: + return vk::ImageViewType::e2DArray; + case SurfaceTarget::TextureCubemap: + return vk::ImageViewType::eCube; + case SurfaceTarget::TextureCubeArray: + return vk::ImageViewType::eCubeArray; + case SurfaceTarget::TextureBuffer: + break; + } + UNREACHABLE(); + return {}; +} + +UniqueBuffer CreateBuffer(const VKDevice& device, const SurfaceParams& params) { + // TODO(Rodrigo): Move texture buffer creation to the buffer cache + const vk::BufferCreateInfo buffer_ci({}, params.GetHostSizeInBytes(), + vk::BufferUsageFlagBits::eUniformTexelBuffer | + vk::BufferUsageFlagBits::eTransferSrc | + vk::BufferUsageFlagBits::eTransferDst, + vk::SharingMode::eExclusive, 0, nullptr); + const auto dev = device.GetLogical(); + const auto& dld = 
device.GetDispatchLoader(); + return dev.createBufferUnique(buffer_ci, nullptr, dld); +} + +vk::BufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, + const SurfaceParams& params, + vk::Buffer buffer) { + ASSERT(params.IsBuffer()); + + const auto format = + MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format; + return vk::BufferViewCreateInfo({}, buffer, format, 0, params.GetHostSizeInBytes()); +} + +vk::ImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceParams& params) { + constexpr auto sample_count = vk::SampleCountFlagBits::e1; + constexpr auto tiling = vk::ImageTiling::eOptimal; + + ASSERT(!params.IsBuffer()); + + const auto [format, attachable, storage] = + MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.pixel_format); + + auto image_usage = vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferDst | + vk::ImageUsageFlagBits::eTransferSrc; + if (attachable) { + image_usage |= params.IsPixelFormatZeta() ? vk::ImageUsageFlagBits::eDepthStencilAttachment + : vk::ImageUsageFlagBits::eColorAttachment; + } + if (storage) { + image_usage |= vk::ImageUsageFlagBits::eStorage; + } + + vk::ImageCreateFlags flags; + vk::Extent3D extent; + switch (params.target) { + case SurfaceTarget::TextureCubemap: + case SurfaceTarget::TextureCubeArray: + flags |= vk::ImageCreateFlagBits::eCubeCompatible; + [[fallthrough]]; + case SurfaceTarget::Texture1D: + case SurfaceTarget::Texture1DArray: + case SurfaceTarget::Texture2D: + case SurfaceTarget::Texture2DArray: + extent = vk::Extent3D(params.width, params.height, 1); + break; + case SurfaceTarget::Texture3D: + extent = vk::Extent3D(params.width, params.height, params.depth); + break; + case SurfaceTarget::TextureBuffer: + UNREACHABLE(); + } + + return vk::ImageCreateInfo(flags, SurfaceTargetToImage(params.target), format, extent, + params.num_levels, static_cast<u32>(params.GetNumLayers()), + sample_count, tiling, image_usage, vk::SharingMode::eExclusive, 0, + nullptr, vk::ImageLayout::eUndefined); +} + +} // Anonymous namespace + +CachedSurface::CachedSurface(Core::System& system, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool, + GPUVAddr gpu_addr, const SurfaceParams& params) + : SurfaceBase<View>{gpu_addr, params}, system{system}, device{device}, + resource_manager{resource_manager}, memory_manager{memory_manager}, scheduler{scheduler}, + staging_pool{staging_pool} { + if (params.IsBuffer()) { + buffer = CreateBuffer(device, params); + commit = memory_manager.Commit(*buffer, false); + + const auto buffer_view_ci = GenerateBufferViewCreateInfo(device, params, *buffer); + format = buffer_view_ci.format; + + const auto dev = device.GetLogical(); + const auto& dld = device.GetDispatchLoader(); + buffer_view = dev.createBufferViewUnique(buffer_view_ci, nullptr, dld); + } else { + const auto image_ci = GenerateImageCreateInfo(device, params); + format = image_ci.format; + + image.emplace(device, scheduler, image_ci, PixelFormatToImageAspect(params.pixel_format)); + commit = memory_manager.Commit(image->GetHandle(), false); + } + + // TODO(Rodrigo): Move this to a virtual function. 
+ main_view = CreateViewInner( + ViewParams(params.target, 0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels), + true); +} + +CachedSurface::~CachedSurface() = default; + +void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { + // To upload data we have to be outside of a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + if (params.IsBuffer()) { + UploadBuffer(staging_buffer); + } else { + UploadImage(staging_buffer); + } +} + +void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { + UNIMPLEMENTED_IF(params.IsBuffer()); + + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + LOG_WARNING(Render_Vulkan, "A1B5G5R5 flushing is stubbed"); + } + + // We can't copy images to buffers inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + FullTransition(vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferRead, + vk::ImageLayout::eTransferSrcOptimal); + + const auto& buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + // TODO(Rodrigo): Do this in a single copy + for (u32 level = 0; level < params.num_levels; ++level) { + scheduler.Record([image = image->GetHandle(), buffer = *buffer.handle, + copy = GetBufferImageCopy(level)](auto cmdbuf, auto& dld) { + cmdbuf.copyImageToBuffer(image, vk::ImageLayout::eTransferSrcOptimal, buffer, {copy}, + dld); + }); + } + scheduler.Finish(); + + // TODO(Rodrigo): Use an intern buffer for staging buffers and avoid this unnecessary memcpy. + std::memcpy(staging_buffer.data(), buffer.commit->Map(host_memory_size), host_memory_size); +} + +void CachedSurface::DecorateSurfaceName() { + // TODO(Rodrigo): Add name decorations +} + +View CachedSurface::CreateView(const ViewParams& params) { + return CreateViewInner(params, false); +} + +View CachedSurface::CreateViewInner(const ViewParams& params, bool is_proxy) { + // TODO(Rodrigo): Add name decorations + return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params, is_proxy); +} + +void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { + const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); + + scheduler.Record([src_buffer = *src_buffer.handle, dst_buffer = *buffer, + size = params.GetHostSizeInBytes()](auto cmdbuf, auto& dld) { + const vk::BufferCopy copy(0, 0, size); + cmdbuf.copyBuffer(src_buffer, dst_buffer, {copy}, dld); + + cmdbuf.pipelineBarrier( + vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eVertexShader, {}, {}, + {vk::BufferMemoryBarrier(vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eShaderRead, 0, 0, dst_buffer, 0, size)}, + {}, dld); + }); +} + +void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { + const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); + std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); + + FullTransition(vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferWrite, + vk::ImageLayout::eTransferDstOptimal); + + for (u32 level = 0; level < params.num_levels; ++level) { + vk::BufferImageCopy copy = GetBufferImageCopy(level); + const auto& dld = device.GetDispatchLoader(); + if (image->GetAspectMask() == + (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { + vk::BufferImageCopy depth = copy; + vk::BufferImageCopy stencil = copy; + 
depth.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eDepth; + stencil.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eStencil; + scheduler.Record([buffer = *src_buffer.handle, image = image->GetHandle(), depth, + stencil](auto cmdbuf, auto& dld) { + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + {depth, stencil}, dld); + }); + } else { + scheduler.Record([buffer = *src_buffer.handle, image = image->GetHandle(), + copy](auto cmdbuf, auto& dld) { + cmdbuf.copyBufferToImage(buffer, image, vk::ImageLayout::eTransferDstOptimal, + {copy}, dld); + }); + } + } +} + +vk::BufferImageCopy CachedSurface::GetBufferImageCopy(u32 level) const { + const u32 vk_depth = params.target == SurfaceTarget::Texture3D ? params.GetMipDepth(level) : 1; + const auto compression_type = params.GetCompressionType(); + const std::size_t mip_offset = compression_type == SurfaceCompression::Converted + ? params.GetConvertedMipmapOffset(level) + : params.GetHostMipmapLevelOffset(level); + + return vk::BufferImageCopy( + mip_offset, 0, 0, + {image->GetAspectMask(), level, 0, static_cast<u32>(params.GetNumLayers())}, {0, 0, 0}, + {params.GetMipWidth(level), params.GetMipHeight(level), vk_depth}); +} + +vk::ImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { + return {image->GetAspectMask(), 0, params.num_levels, 0, + static_cast<u32>(params.GetNumLayers())}; +} + +CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, + const ViewParams& params, bool is_proxy) + : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, + image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, + aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, + base_layer{params.base_layer}, num_layers{params.num_layers}, base_level{params.base_level}, + num_levels{params.num_levels}, image_view_type{image ? GetImageViewType(params.target) + : vk::ImageViewType{}} {} + +CachedSurfaceView::~CachedSurfaceView() = default; + +vk::ImageView CachedSurfaceView::GetHandle(SwizzleSource x_source, SwizzleSource y_source, + SwizzleSource z_source, SwizzleSource w_source) { + const u32 swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); + if (last_image_view && last_swizzle == swizzle) { + return last_image_view; + } + last_swizzle = swizzle; + + const auto [entry, is_cache_miss] = view_cache.try_emplace(swizzle); + auto& image_view = entry->second; + if (!is_cache_miss) { + return last_image_view = *image_view; + } + + auto swizzle_x = MaxwellToVK::SwizzleSource(x_source); + auto swizzle_y = MaxwellToVK::SwizzleSource(y_source); + auto swizzle_z = MaxwellToVK::SwizzleSource(z_source); + auto swizzle_w = MaxwellToVK::SwizzleSource(w_source); + + if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5U) { + // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here. + std::swap(swizzle_x, swizzle_z); + } + + // Games can sample depth or stencil values on textures. This is decided by the swizzle value on + // hardware. To emulate this on Vulkan we specify it in the aspect. 
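Concretely, for a combined depth-stencil format the X swizzle source selects the aspect, as the code continuing below implements:

    // x_source == SwizzleSource::R on Z24S8 / Z32FS8 -> depth aspect
    // x_source == SwizzleSource::G on Z24S8 / Z32FS8 -> stencil aspect
    // x_source == SwizzleSource::R on S8Z24          -> stencil aspect (component order flipped)
    // x_source == SwizzleSource::G on S8Z24          -> depth aspect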
+ vk::ImageAspectFlags aspect = aspect_mask; + if (aspect == (vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil)) { + UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); + const bool is_first = x_source == SwizzleSource::R; + switch (params.pixel_format) { + case VideoCore::Surface::PixelFormat::Z24S8: + case VideoCore::Surface::PixelFormat::Z32FS8: + aspect = is_first ? vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eStencil; + break; + case VideoCore::Surface::PixelFormat::S8Z24: + aspect = is_first ? vk::ImageAspectFlagBits::eStencil : vk::ImageAspectFlagBits::eDepth; + break; + default: + aspect = vk::ImageAspectFlagBits::eDepth; + UNIMPLEMENTED(); + } + + // Vulkan doesn't seem to understand swizzling of a depth stencil image, use identity + swizzle_x = vk::ComponentSwizzle::eR; + swizzle_y = vk::ComponentSwizzle::eG; + swizzle_z = vk::ComponentSwizzle::eB; + swizzle_w = vk::ComponentSwizzle::eA; + } + + const vk::ImageViewCreateInfo image_view_ci( + {}, surface.GetImageHandle(), image_view_type, surface.GetImage().GetFormat(), + {swizzle_x, swizzle_y, swizzle_z, swizzle_w}, + {aspect, base_level, num_levels, base_layer, num_layers}); + + const auto dev = device.GetLogical(); + image_view = dev.createImageViewUnique(image_view_ci, nullptr, device.GetDispatchLoader()); + return last_image_view = *image_view; +} + +VKTextureCache::VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + const VKDevice& device, VKResourceManager& resource_manager, + VKMemoryManager& memory_manager, VKScheduler& scheduler, + VKStagingBufferPool& staging_pool) + : TextureCache(system, rasterizer), device{device}, resource_manager{resource_manager}, + memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{staging_pool} {} + +VKTextureCache::~VKTextureCache() = default; + +Surface VKTextureCache::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { + return std::make_shared<CachedSurface>(system, device, resource_manager, memory_manager, + scheduler, staging_pool, gpu_addr, params); +} + +void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, + const VideoCommon::CopyParams& copy_params) { + const bool src_3d = src_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; + const bool dst_3d = dst_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; + UNIMPLEMENTED_IF(src_3d); + + // The texture cache handles depth in OpenGL terms, we have to handle it as subresource and + // dimension respectively. + const u32 dst_base_layer = dst_3d ? 0 : copy_params.dest_z; + const u32 dst_offset_z = dst_3d ? copy_params.dest_z : 0; + + const u32 extent_z = dst_3d ? copy_params.depth : 1; + const u32 num_layers = dst_3d ? 
1 : copy_params.depth; + + // We can't copy inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + src_surface->Transition(copy_params.source_z, copy_params.depth, copy_params.source_level, 1, + vk::PipelineStageFlagBits::eTransfer, vk::AccessFlagBits::eTransferRead, + vk::ImageLayout::eTransferSrcOptimal); + dst_surface->Transition( + dst_base_layer, num_layers, copy_params.dest_level, 1, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferWrite, vk::ImageLayout::eTransferDstOptimal); + + const auto& dld{device.GetDispatchLoader()}; + const vk::ImageSubresourceLayers src_subresource( + src_surface->GetAspectMask(), copy_params.source_level, copy_params.source_z, num_layers); + const vk::ImageSubresourceLayers dst_subresource( + dst_surface->GetAspectMask(), copy_params.dest_level, dst_base_layer, num_layers); + const vk::Offset3D src_offset(copy_params.source_x, copy_params.source_y, 0); + const vk::Offset3D dst_offset(copy_params.dest_x, copy_params.dest_y, dst_offset_z); + const vk::Extent3D extent(copy_params.width, copy_params.height, extent_z); + const vk::ImageCopy copy(src_subresource, src_offset, dst_subresource, dst_offset, extent); + const vk::Image src_image = src_surface->GetImageHandle(); + const vk::Image dst_image = dst_surface->GetImageHandle(); + scheduler.Record([src_image, dst_image, copy](auto cmdbuf, auto& dld) { + cmdbuf.copyImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, + vk::ImageLayout::eTransferDstOptimal, {copy}, dld); + }); +} + void VKTextureCache::ImageBlit(View& src_view, View& dst_view, + const Tegra::Engines::Fermi2D::Config& copy_config) { + // We can't blit inside a renderpass + scheduler.RequestOutsideRenderPassOperationContext(); + + src_view->Transition(vk::ImageLayout::eTransferSrcOptimal, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferRead); + dst_view->Transition(vk::ImageLayout::eTransferDstOptimal, vk::PipelineStageFlagBits::eTransfer, + vk::AccessFlagBits::eTransferWrite); + + const auto& cfg = copy_config; + const auto src_top_left = vk::Offset3D(cfg.src_rect.left, cfg.src_rect.top, 0); + const auto src_bot_right = vk::Offset3D(cfg.src_rect.right, cfg.src_rect.bottom, 1); + const auto dst_top_left = vk::Offset3D(cfg.dst_rect.left, cfg.dst_rect.top, 0); + const auto dst_bot_right = vk::Offset3D(cfg.dst_rect.right, cfg.dst_rect.bottom, 1); + const vk::ImageBlit blit(src_view->GetImageSubresourceLayers(), {src_top_left, src_bot_right}, + dst_view->GetImageSubresourceLayers(), {dst_top_left, dst_bot_right}); + const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; + + const auto& dld{device.GetDispatchLoader()}; + scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit, + is_linear](auto cmdbuf, auto& dld) { + cmdbuf.blitImage(src_image, vk::ImageLayout::eTransferSrcOptimal, dst_image, + vk::ImageLayout::eTransferDstOptimal, {blit}, + is_linear ? vk::Filter::eLinear : vk::Filter::eNearest, dld); + }); +} + void VKTextureCache::BufferCopy(Surface& src_surface, Surface& dst_surface) { + // Currently unimplemented. PBO copies should be dropped and we should use a render pass to + // convert from color to depth and vice versa.
+ LOG_WARNING(Render_Vulkan, "Unimplemented"); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h new file mode 100644 index 000000000..d3edbe80c --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -0,0 +1,239 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> + +#include "common/assert.h" +#include "common/common_types.h" +#include "common/logging/log.h" +#include "common/math_util.h" +#include "video_core/gpu.h" +#include "video_core/rasterizer_cache.h" +#include "video_core/renderer_vulkan/declarations.h" +#include "video_core/renderer_vulkan/vk_image.h" +#include "video_core/renderer_vulkan/vk_memory_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/texture_cache/surface_base.h" +#include "video_core/texture_cache/texture_cache.h" +#include "video_core/textures/decoders.h" + +namespace Core { +class System; +} + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Vulkan { + +class RasterizerVulkan; +class VKDevice; +class VKResourceManager; +class VKScheduler; +class VKStagingBufferPool; + +class CachedSurfaceView; +class CachedSurface; + +using Surface = std::shared_ptr<CachedSurface>; +using View = std::shared_ptr<CachedSurfaceView>; +using TextureCacheBase = VideoCommon::TextureCache<Surface, View>; + +using VideoCommon::SurfaceParams; +using VideoCommon::ViewParams; + +class CachedSurface final : public VideoCommon::SurfaceBase<View> { + friend CachedSurfaceView; + +public: + explicit CachedSurface(Core::System& system, const VKDevice& device, + VKResourceManager& resource_manager, VKMemoryManager& memory_manager, + VKScheduler& scheduler, VKStagingBufferPool& staging_pool, + GPUVAddr gpu_addr, const SurfaceParams& params); + ~CachedSurface(); + + void UploadTexture(const std::vector<u8>& staging_buffer) override; + void DownloadTexture(std::vector<u8>& staging_buffer) override; + + void FullTransition(vk::PipelineStageFlags new_stage_mask, vk::AccessFlags new_access, + vk::ImageLayout new_layout) { + image->Transition(0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels, + new_stage_mask, new_access, new_layout); + } + + void Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, + vk::PipelineStageFlags new_stage_mask, vk::AccessFlags new_access, + vk::ImageLayout new_layout) { + image->Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask, + new_access, new_layout); + } + + VKImage& GetImage() { + return *image; + } + + const VKImage& GetImage() const { + return *image; + } + + vk::Image GetImageHandle() const { + return image->GetHandle(); + } + + vk::ImageAspectFlags GetAspectMask() const { + return image->GetAspectMask(); + } + + vk::BufferView GetBufferViewHandle() const { + return *buffer_view; + } + +protected: + void DecorateSurfaceName(); + + View CreateView(const ViewParams& params) override; + View CreateViewInner(const ViewParams& params, bool is_proxy); + +private: + void UploadBuffer(const std::vector<u8>& staging_buffer); + + void UploadImage(const std::vector<u8>& staging_buffer); + + vk::BufferImageCopy GetBufferImageCopy(u32 level) const; + + vk::ImageSubresourceRange GetImageSubresourceRange() const; + + Core::System& system; + const VKDevice& device; + VKResourceManager& resource_manager; + 
+    VKMemoryManager& memory_manager;
+    VKScheduler& scheduler;
+    VKStagingBufferPool& staging_pool;
+
+    std::optional<VKImage> image;
+    UniqueBuffer buffer;
+    UniqueBufferView buffer_view;
+    VKMemoryCommit commit;
+
+    vk::Format format;
+};
+
+class CachedSurfaceView final : public VideoCommon::ViewBase {
+public:
+    explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface,
+                               const ViewParams& params, bool is_proxy);
+    ~CachedSurfaceView();
+
+    vk::ImageView GetHandle(Tegra::Texture::SwizzleSource x_source,
+                            Tegra::Texture::SwizzleSource y_source,
+                            Tegra::Texture::SwizzleSource z_source,
+                            Tegra::Texture::SwizzleSource w_source);
+
+    bool IsSameSurface(const CachedSurfaceView& rhs) const {
+        return &surface == &rhs.surface;
+    }
+
+    vk::ImageView GetHandle() {
+        return GetHandle(Tegra::Texture::SwizzleSource::R, Tegra::Texture::SwizzleSource::G,
+                         Tegra::Texture::SwizzleSource::B, Tegra::Texture::SwizzleSource::A);
+    }
+
+    u32 GetWidth() const {
+        return params.GetMipWidth(base_level);
+    }
+
+    u32 GetHeight() const {
+        return params.GetMipHeight(base_level);
+    }
+
+    bool IsBufferView() const {
+        return buffer_view;
+    }
+
+    vk::Image GetImage() const {
+        return image;
+    }
+
+    vk::BufferView GetBufferView() const {
+        return buffer_view;
+    }
+
+    vk::ImageSubresourceRange GetImageSubresourceRange() const {
+        return {aspect_mask, base_level, num_levels, base_layer, num_layers};
+    }
+
+    vk::ImageSubresourceLayers GetImageSubresourceLayers() const {
+        return {surface.GetAspectMask(), base_level, base_layer, num_layers};
+    }
+
+    void Transition(vk::ImageLayout new_layout, vk::PipelineStageFlags new_stage_mask,
+                    vk::AccessFlags new_access) const {
+        surface.Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask,
+                           new_access, new_layout);
+    }
+
+    void MarkAsModified(u64 tick) {
+        surface.MarkAsModified(true, tick);
+    }
+
+private:
+    static u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source,
+                             Tegra::Texture::SwizzleSource y_source,
+                             Tegra::Texture::SwizzleSource z_source,
+                             Tegra::Texture::SwizzleSource w_source) {
+        return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+               (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+    }
+
+    // Store a copy of these values to avoid double dereference when reading them
+    const SurfaceParams params;
+    const vk::Image image;
+    const vk::BufferView buffer_view;
+    const vk::ImageAspectFlags aspect_mask;
+
+    const VKDevice& device;
+    CachedSurface& surface;
+    const u32 base_layer;
+    const u32 num_layers;
+    const u32 base_level;
+    const u32 num_levels;
+    const vk::ImageViewType image_view_type;
+
+    vk::ImageView last_image_view;
+    u32 last_swizzle{};
+
+    std::unordered_map<u32, UniqueImageView> view_cache;
+};
+
+class VKTextureCache final : public TextureCacheBase {
+public:
+    explicit VKTextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                            const VKDevice& device, VKResourceManager& resource_manager,
+                            VKMemoryManager& memory_manager, VKScheduler& scheduler,
+                            VKStagingBufferPool& staging_pool);
+    ~VKTextureCache();
+
+private:
+    Surface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) override;
+
+    void ImageCopy(Surface& src_surface, Surface& dst_surface,
+                   const VideoCommon::CopyParams& copy_params) override;
+
+    void ImageBlit(View& src_view, View& dst_view,
+                   const Tegra::Engines::Fermi2D::Config& copy_config) override;
+
+    void BufferCopy(Surface& src_surface, Surface& dst_surface) override;
+
+    const VKDevice& device;
+    VKResourceManager& resource_manager;
+    VKMemoryManager& memory_manager;
+    VKScheduler& scheduler;
+    VKStagingBufferPool& staging_pool;
+};
+
+} // namespace Vulkan
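CachedSurfaceView::GetHandle keys view_cache on the four swizzle selectors packed into a single u32, so repeated requests for the same swizzle reuse one VkImageView instead of creating a new one. A self-contained sketch of that packing and cache lookup; the enum values below are illustrative stand-ins for Tegra::Texture::SwizzleSource, and the int payload stands in for UniqueImageView:

    #include <cstdint>
    #include <unordered_map>

    // Illustrative stand-in for Tegra::Texture::SwizzleSource.
    enum class SwizzleSource : std::uint32_t { Zero = 0, R = 2, G = 3, B = 4, A = 5 };

    // Pack four swizzle selectors into one 32-bit cache key, as EncodeSwizzle does.
    constexpr std::uint32_t EncodeSwizzle(SwizzleSource x, SwizzleSource y, SwizzleSource z,
                                          SwizzleSource w) {
        return (static_cast<std::uint32_t>(x) << 24) | (static_cast<std::uint32_t>(y) << 16) |
               (static_cast<std::uint32_t>(z) << 8) | static_cast<std::uint32_t>(w);
    }

    int main() {
        // Asking for the same swizzle twice hits the cache instead of building a new view.
        std::unordered_map<std::uint32_t, int> view_cache;
        const auto key = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B,
                                       SwizzleSource::A);
        const auto [it, is_new] = view_cache.try_emplace(key, /*view=*/42);
        return is_new ? 0 : 1;
    }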
diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp
index b427ac873..0229733b6 100644
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -65,7 +65,7 @@ struct BlockInfo {
 struct CFGRebuildState {
     explicit CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker)
-        : program_code{program_code}, start{start}, locker{locker} {}
+        : program_code{program_code}, locker{locker}, start{start} {}
 
     const ProgramCode& program_code;
     ConstBufferLocker& locker;
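The CFGRebuildState change only reorders the constructor's initializer list to match the member declaration order (program_code, locker, start). C++ always initializes members in declaration order regardless of how the list is written, so a mismatched list is misleading and trips -Wreorder. A minimal reproduction of the hazard the reordering avoids:

    // Members initialize in declaration order, not in initializer-list order.
    struct Example {
        int first;  // declared first, so initialized first...
        int second;

        // ...even though this list names `second` before `first`. With -Wreorder
        // (enabled by -Wall on GCC/Clang) the mismatch produces a warning, which is
        // what the reordered CFGRebuildState initializer list silences.
        Example(int a, int b) : second{b}, first{a} {}
    };

    int main() {
        return Example{1, 2}.first == 1 ? 0 : 1;
    }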
diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp
index c934d0719..7591a715f 100644
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -6,6 +6,7 @@
 #include <vector>
 
 #include <fmt/format.h>
 
+#include "common/alignment.h"
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
@@ -15,6 +16,8 @@
 
 namespace VideoCommon::Shader {
 
+using Tegra::Shader::AtomicOp;
+using Tegra::Shader::AtomicType;
 using Tegra::Shader::Attribute;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
@@ -22,34 +25,39 @@ using Tegra::Shader::Register;
 
 namespace {
 
-u32 GetLdgMemorySize(Tegra::Shader::UniformType uniform_type) {
+bool IsUnaligned(Tegra::Shader::UniformType uniform_type) {
+    return uniform_type == Tegra::Shader::UniformType::UnsignedByte ||
+           uniform_type == Tegra::Shader::UniformType::UnsignedShort;
+}
+
+u32 GetUnalignedMask(Tegra::Shader::UniformType uniform_type) {
     switch (uniform_type) {
     case Tegra::Shader::UniformType::UnsignedByte:
-    case Tegra::Shader::UniformType::Single:
-        return 1;
-    case Tegra::Shader::UniformType::Double:
-        return 2;
-    case Tegra::Shader::UniformType::Quad:
-    case Tegra::Shader::UniformType::UnsignedQuad:
-        return 4;
+        return 0b11;
+    case Tegra::Shader::UniformType::UnsignedShort:
+        return 0b10;
     default:
-        UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
-        return 1;
+        UNREACHABLE();
+        return 0;
     }
 }
 
-u32 GetStgMemorySize(Tegra::Shader::UniformType uniform_type) {
+u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) {
     switch (uniform_type) {
+    case Tegra::Shader::UniformType::UnsignedByte:
+        return 8;
+    case Tegra::Shader::UniformType::UnsignedShort:
+        return 16;
     case Tegra::Shader::UniformType::Single:
-        return 1;
+        return 32;
     case Tegra::Shader::UniformType::Double:
-        return 2;
+        return 64;
     case Tegra::Shader::UniformType::Quad:
     case Tegra::Shader::UniformType::UnsignedQuad:
-        return 4;
+        return 128;
     default:
         UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
-        return 1;
+        return 32;
     }
 }
@@ -184,9 +192,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
         }();
 
         const auto [real_address_base, base_address, descriptor] =
-            TrackGlobalMemory(bb, instr, false);
+            TrackGlobalMemory(bb, instr, true, false);
 
-        const u32 count = GetLdgMemorySize(type);
+        const u32 size = GetMemorySize(type);
+        const u32 count = Common::AlignUp(size, 32) / 32;
 
         if (!real_address_base || !base_address) {
             // Tracking failed, load zeroes.
             for (u32 i = 0; i < count; ++i) {
@@ -200,14 +209,15 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset);
             Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
 
-            if (type == Tegra::Shader::UniformType::UnsignedByte) {
-                // To handle unaligned loads get the byte used to dereferenced global memory
-                // and extract that byte from the loaded uint32.
-                Node byte = Operation(OperationCode::UBitwiseAnd, real_address, Immediate(3));
-                byte = Operation(OperationCode::ULogicalShiftLeft, std::move(byte), Immediate(3));
+            // To handle unaligned loads get the bytes used to dereference global memory and extract
+            // those bytes from the loaded u32.
+            if (IsUnaligned(type)) {
+                Node mask = Immediate(GetUnalignedMask(type));
+                Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask));
+                offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3));
 
-                gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem), std::move(byte),
-                                 Immediate(8));
+                gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem),
+                                 std::move(offset), Immediate(size));
             }
 
             SetTemporary(bb, i, gmem);
@@ -295,23 +305,53 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
             }
         }();
 
+        // For unaligned reads we have to read memory too.
+        const bool is_read = IsUnaligned(type);
        const auto [real_address_base, base_address, descriptor] =
-            TrackGlobalMemory(bb, instr, true);
+            TrackGlobalMemory(bb, instr, is_read, true);
 
         if (!real_address_base || !base_address) {
             // Tracking failed, skip the store.
             break;
         }
 
-        const u32 count = GetStgMemorySize(type);
+        const u32 size = GetMemorySize(type);
+        const u32 count = Common::AlignUp(size, 32) / 32;
         for (u32 i = 0; i < count; ++i) {
             const Node it_offset = Immediate(i * 4);
             const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset);
             const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
-            const Node value = GetRegister(instr.gpr0.Value() + i);
+            Node value = GetRegister(instr.gpr0.Value() + i);
+
+            if (IsUnaligned(type)) {
+                Node mask = Immediate(GetUnalignedMask(type));
+                Node offset = Operation(OperationCode::UBitwiseAnd, real_address, std::move(mask));
+                offset = Operation(OperationCode::ULogicalShiftLeft, offset, Immediate(3));
+
+                value = Operation(OperationCode::UBitfieldInsert, gmem, std::move(value), offset,
+                                  Immediate(size));
+            }
+
             bb.push_back(Operation(OperationCode::Assign, gmem, value));
         }
         break;
     }
+    case OpCode::Id::ATOMS: {
+        UNIMPLEMENTED_IF_MSG(instr.atoms.operation != AtomicOp::Add, "operation={}",
+                             static_cast<int>(instr.atoms.operation.Value()));
+        UNIMPLEMENTED_IF_MSG(instr.atoms.type != AtomicType::U32, "type={}",
+                             static_cast<int>(instr.atoms.type.Value()));
+
+        const s32 offset = instr.atoms.GetImmediateOffset();
+        Node address = GetRegister(instr.gpr8);
+        address = Operation(OperationCode::IAdd, std::move(address), Immediate(offset));
+
+        Node memory = GetSharedMemory(std::move(address));
+        Node data = GetRegister(instr.gpr20);
+
+        Node value = Operation(OperationCode::UAtomicAdd, std::move(memory), std::move(data));
+        SetRegister(bb, instr.gpr0, std::move(value));
+        break;
+    }
     case OpCode::Id::AL2P: {
         // Ignore al2p.direction since we don't care about it.
@@ -336,7 +376,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
 
 std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock& bb,
                                                                      Instruction instr,
-                                                                     bool is_write) {
+                                                                     bool is_read, bool is_write) {
     const auto addr_register{GetRegister(instr.gmem.gpr)};
     const auto immediate_offset{static_cast<u32>(instr.gmem.offset)};
@@ -351,11 +391,8 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackGlobalMemory(NodeBlock&
     const GlobalMemoryBase descriptor{index, offset};
     const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
     auto& usage = entry->second;
-    if (is_write) {
-        usage.is_written = true;
-    } else {
-        usage.is_read = true;
-    }
+    usage.is_written |= is_write;
+    usage.is_read |= is_read;
 
     const auto real_address =
         Operation(OperationCode::UAdd, NO_PRECISE, Immediate(immediate_offset), addr_register);
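With GetMemorySize now returning bits, an access of width size touches Common::AlignUp(size, 32) / 32 words (so a 32-bit LDG still reads one word), while U8 and U16 accesses read one word and extract or splice the addressed bits; this is also why sub-word stores now mark the region as read, since they are read-modify-write. A plain C++ sketch of the same mask/shift arithmetic the emitted IR performs, as a host-side illustration rather than the Node machinery:

    #include <cstdint>

    // Mirror of GetUnalignedMask: U8 -> 0b11 (byte within the word), U16 -> 0b10.
    // `size` is the access width in bits, as returned by GetMemorySize.
    std::uint32_t ExtractUnaligned(std::uint32_t loaded_word, std::uint32_t address,
                                   std::uint32_t unaligned_mask, std::uint32_t size) {
        // Byte offset inside the aligned word, converted to bits (<< 3 == * 8).
        const std::uint32_t bit_offset = (address & unaligned_mask) << 3;
        const std::uint32_t mask = (size == 32) ? ~0U : ((1U << size) - 1);
        return (loaded_word >> bit_offset) & mask; // UBitfieldExtract equivalent
    }

    // Store counterpart: splice `value` into the word, as UBitfieldInsert does for STG.
    std::uint32_t InsertUnaligned(std::uint32_t old_word, std::uint32_t value,
                                  std::uint32_t address, std::uint32_t unaligned_mask,
                                  std::uint32_t size) {
        const std::uint32_t bit_offset = (address & unaligned_mask) << 3;
        const std::uint32_t mask = ((size == 32) ? ~0U : ((1U << size) - 1)) << bit_offset;
        return (old_word & ~mask) | ((value << bit_offset) & mask);
    }

    int main() {
        // Load the byte at address 0x1002 out of the word stored at 0x1000.
        const std::uint32_t word = 0xAABBCCDD; // little-endian bytes DD CC BB AA
        return ExtractUnaligned(word, 0x1002, 0b11, 8) == 0xBB ? 0 : 1;
    }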
diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp
index 4b14cdf58..cd984f763 100644
--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -794,14 +794,10 @@ std::tuple<std::size_t, std::size_t> ShaderIR::ValidateAndGetCoordinateElement(
 
 std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coord_count,
                                                 bool is_tld4) {
-    const auto [coord_offsets, size, wrap_value,
-                diff_value] = [is_tld4]() -> std::tuple<std::array<u32, 3>, u32, s32, s32> {
-        if (is_tld4) {
-            return {{0, 8, 16}, 6, 32, 64};
-        } else {
-            return {{0, 4, 8}, 4, 8, 16};
-        }
-    }();
+    const std::array coord_offsets = is_tld4 ? std::array{0U, 8U, 16U} : std::array{0U, 4U, 8U};
+    const u32 size = is_tld4 ? 6 : 4;
+    const s32 wrap_value = is_tld4 ? 32 : 8;
+    const s32 diff_value = is_tld4 ? 64 : 16;
     const u32 mask = (1U << size) - 1;
 
     std::vector<Node> aoffi;
@@ -814,7 +810,7 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor
         LOG_WARNING(HW_GPU,
                     "AOFFI constant folding failed, some hardware might have graphical issues");
         for (std::size_t coord = 0; coord < coord_count; ++coord) {
-            const Node value = BitfieldExtract(aoffi_reg, coord_offsets.at(coord), size);
+            const Node value = BitfieldExtract(aoffi_reg, coord_offsets[coord], size);
             const Node condition =
                 Operation(OperationCode::LogicalIGreaterEqual, value, Immediate(wrap_value));
             const Node negative = Operation(OperationCode::IAdd, value, Immediate(-diff_value));
@@ -824,7 +820,7 @@ std::vector<Node> ShaderIR::GetAoffiCoordinates(Node aoffi_reg, std::size_t coor
     }
 
     for (std::size_t coord = 0; coord < coord_count; ++coord) {
-        s32 value = (*aoffi_immediate >> coord_offsets.at(coord)) & mask;
+        s32 value = (*aoffi_immediate >> coord_offsets[coord]) & mask;
         if (value >= wrap_value) {
             value -= diff_value;
         }
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 4e155542a..075c7d07c 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -162,6 +162,8 @@ enum class OperationCode {
     AtomicImageXor,      /// (MetaImage, int[N] coords) -> void
     AtomicImageExchange, /// (MetaImage, int[N] coords) -> void
 
+    UAtomicAdd, /// (smem, uint) -> uint
+
     Branch,         /// (uint branch_target) -> void
     BranchIndirect, /// (uint branch_target) -> void
     PushFlowStack,  /// (uint branch_target) -> void
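The GetAoffiCoordinates rewrite replaces the tuple-returning lambda with plain ternaries; the decoding itself is unchanged. Packed AOFFI fields are two's-complement: a raw value of wrap_value or more is really negative and gets diff_value (2^size) subtracted. A worked immediate-mode example using the non-TLD4 parameters (4-bit fields at bit offsets 0/4/8), as a standalone sketch of the constant-folded path:

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Decode packed AOFFI texture offsets from an immediate. Non-TLD4 parameters:
    // 4-bit fields, wrap at 8, bias of 16 to sign-extend.
    std::array<std::int32_t, 2> DecodeAoffi2D(std::uint32_t imm) {
        constexpr std::array<std::uint32_t, 3> coord_offsets{0, 4, 8};
        constexpr std::uint32_t size = 4;
        constexpr std::int32_t wrap_value = 8;  // first raw value that is negative
        constexpr std::int32_t diff_value = 16; // 2^size, subtracted to sign-extend
        const std::uint32_t mask = (1U << size) - 1;

        std::array<std::int32_t, 2> out{};
        for (std::size_t coord = 0; coord < out.size(); ++coord) {
            auto value = static_cast<std::int32_t>((imm >> coord_offsets[coord]) & mask);
            if (value >= wrap_value) {
                value -= diff_value; // e.g. raw 0xF decodes to -1
            }
            out[coord] = value;
        }
        return out;
    }

    int main() {
        // Packed offsets (x = -1, y = +2): 0xF in bits 0..3, 0x2 in bits 4..7.
        const auto [x, y] = DecodeAoffi2D(0x2F);
        return (x == -1 && y == 2) ? 0 : 1;
    }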
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index aacd0a0da..ba1db4c11 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -394,7 +394,7 @@ private:
     std::tuple<Node, Node, GlobalMemoryBase> TrackGlobalMemory(NodeBlock& bb,
                                                                Tegra::Shader::Instruction instr,
-                                                               bool is_write);
+                                                               bool is_read, bool is_write);
 
     /// Register new amending code and obtain the reference id.
     std::size_t DeclareAmend(Node new_amend);
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 271e67533..81fb9f633 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -95,7 +95,7 @@ constexpr std::array<Table, 74> DefinitionTable = {{
     {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F},
     {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16},
     {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24},
-    {TextureFormat::ZF32_X24S8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z32FS8},
+    {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8},
 
     {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1},
     {TextureFormat::DXT1, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1_SRGB},
diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h
index 992b5c022..9256fd6d9 100644
--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -209,6 +209,11 @@ public:
         return target == VideoCore::Surface::SurfaceTarget::TextureBuffer;
     }
 
+    /// Returns the number of layers in the surface.
+    std::size_t GetNumLayers() const {
+        return is_layered ? depth : 1;
+    }
+
     /// Returns the debug name of the texture for use in graphic debuggers.
     std::string TargetName() const;
 
@@ -287,10 +292,6 @@ private:
     /// Returns the size of a layer
     std::size_t GetLayerSize(bool as_host_size, bool uncompressed) const;
 
-    std::size_t GetNumLayers() const {
-        return is_layered ? depth : 1;
-    }
-
     /// Returns true if these parameters are from a layered surface.
     bool IsLayered() const;
 };
diff --git a/src/yuzu/configuration/configure_hotkeys.cpp b/src/yuzu/configuration/configure_hotkeys.cpp
index 3ea0b8d67..fa9052136 100644
--- a/src/yuzu/configuration/configure_hotkeys.cpp
+++ b/src/yuzu/configuration/configure_hotkeys.cpp
@@ -48,6 +48,7 @@ void ConfigureHotkeys::Populate(const HotkeyRegistry& registry) {
     }
 
     ui->hotkey_list->expandAll();
+    ui->hotkey_list->resizeColumnToContents(0);
 }
 
 void ConfigureHotkeys::changeEvent(QEvent* event) {
diff --git a/src/yuzu/main.ui b/src/yuzu/main.ui
index 581a10ddc..a2c9e4547 100644
--- a/src/yuzu/main.ui
+++ b/src/yuzu/main.ui
@@ -15,7 +15,7 @@
   </property>
   <property name="windowIcon">
    <iconset>
-    <normaloff>src/pcafe/res/icon3_64x64.ico</normaloff>src/pcafe/res/icon3_64x64.ico</iconset>
+    <normaloff>../dist/yuzu.ico</normaloff>../dist/yuzu.ico</iconset>
   </property>
   <property name="tabShape">
    <enum>QTabWidget::Rounded</enum>
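The format_lookup_table.cpp fix above reflects that ZF32_X24S8 is a mixed-type depth-stencil format: the depth component samples as a 32-bit float and the stencil component as an unsigned integer, so tagging every component UNORM mistyped both. A host-side sketch of reinterpreting one such texel; the exact placement of the stencil byte within the second word is assumed here for illustration:

    #include <cstdint>
    #include <cstring>

    // One ZF32_X24S8 texel: a 32-bit float depth word, then an 8-bit stencil value
    // padded to 32 bits (the X24 part is unused).
    struct Z32FS8Texel {
        float depth;         // component 0: FLOAT, not UNORM
        std::uint32_t x24s8; // assumed: low 8 bits hold the stencil as UINT
    };

    int main() {
        // Reinterpret 8 raw bytes as one texel, the way a sampler with the corrected
        // component types (FLOAT, UINT) conceptually would.
        const std::uint8_t raw[8] = {0x00, 0x00, 0x80, 0x3F,  // 1.0f, little-endian
                                     0x7F, 0x00, 0x00, 0x00}; // stencil = 0x7F
        Z32FS8Texel texel;
        std::memcpy(&texel, raw, sizeof(texel));
        const std::uint32_t stencil = texel.x24s8 & 0xFF;
        return (texel.depth == 1.0f && stencil == 0x7F) ? 0 : 1;
    }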