diff options
-rw-r--r-- | src/citra_qt/configure.ui | 12 | ||||
-rw-r--r-- | src/common/file_util.cpp | 43 | ||||
-rw-r--r-- | src/common/file_util.h | 26 | ||||
-rw-r--r-- | src/common/thread.h | 46 | ||||
-rw-r--r-- | src/common/x64/emitter.cpp | 28 | ||||
-rw-r--r-- | src/common/x64/emitter.h | 2 | ||||
-rw-r--r-- | src/core/hle/config_mem.cpp | 7 | ||||
-rw-r--r-- | src/core/hle/hle.cpp | 2 | ||||
-rw-r--r-- | src/core/hle/service/soc_u.cpp | 100 | ||||
-rw-r--r-- | src/core/hw/y2r.cpp | 2 | ||||
-rw-r--r-- | src/core/loader/3dsx.cpp | 6 | ||||
-rw-r--r-- | src/core/loader/ncch.cpp | 4 | ||||
-rw-r--r-- | src/video_core/command_processor.cpp | 4 | ||||
-rw-r--r-- | src/video_core/debug_utils/debug_utils.cpp | 19 | ||||
-rw-r--r-- | src/video_core/rasterizer.cpp | 99 | ||||
-rw-r--r-- | src/video_core/shader/shader.cpp | 34 | ||||
-rw-r--r-- | src/video_core/shader/shader.h | 3 | ||||
-rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 302 | ||||
-rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 58 |
19 files changed, 474 insertions, 323 deletions
diff --git a/src/citra_qt/configure.ui b/src/citra_qt/configure.ui index 3c1f2ebba..6ae056ff9 100644 --- a/src/citra_qt/configure.ui +++ b/src/citra_qt/configure.ui @@ -10,24 +10,12 @@ <height>501</height> </rect> </property> - <property name="minimumSize"> - <size> - <width>370</width> - <height>219</height> - </size> - </property> <property name="windowTitle"> <string>Citra Configuration</string> </property> <layout class="QVBoxLayout" name="verticalLayout"> <item> <widget class="QTabWidget" name="tabWidget"> - <property name="minimumSize"> - <size> - <width>371</width> - <height>221</height> - </size> - </property> <property name="currentIndex"> <number>0</number> </property> diff --git a/src/common/file_util.cpp b/src/common/file_util.cpp index 687b7ae5a..6e2867658 100644 --- a/src/common/file_util.cpp +++ b/src/common/file_util.cpp @@ -833,13 +833,12 @@ size_t WriteStringToFile(bool text_file, const std::string &str, const char *fil size_t ReadFileToString(bool text_file, const char *filename, std::string &str) { - FileUtil::IOFile file(filename, text_file ? "r" : "rb"); - auto const f = file.GetHandle(); + IOFile file(filename, text_file ? "r" : "rb"); - if (!f) + if (!file) return false; - str.resize(static_cast<u32>(GetSize(f))); + str.resize(static_cast<u32>(file.GetSize())); return file.ReadArray(&str[0], str.size()); } @@ -886,15 +885,10 @@ void SplitFilename83(const std::string& filename, std::array<char, 9>& short_nam } IOFile::IOFile() - : m_file(nullptr), m_good(true) -{} - -IOFile::IOFile(std::FILE* file) - : m_file(file), m_good(true) -{} +{ +} IOFile::IOFile(const std::string& filename, const char openmode[]) - : m_file(nullptr), m_good(true) { Open(filename, openmode); } @@ -905,7 +899,6 @@ IOFile::~IOFile() } IOFile::IOFile(IOFile&& other) - : m_file(nullptr), m_good(true) { Swap(other); } @@ -944,26 +937,12 @@ bool IOFile::Close() return m_good; } -std::FILE* IOFile::ReleaseHandle() -{ - std::FILE* const ret = m_file; - m_file = nullptr; - return ret; -} - -void IOFile::SetHandle(std::FILE* file) -{ - Close(); - Clear(); - m_file = file; -} - -u64 IOFile::GetSize() +u64 IOFile::GetSize() const { if (IsOpen()) return FileUtil::GetSize(m_file); - else - return 0; + + return 0; } bool IOFile::Seek(s64 off, int origin) @@ -974,12 +953,12 @@ bool IOFile::Seek(s64 off, int origin) return m_good; } -u64 IOFile::Tell() +u64 IOFile::Tell() const { if (IsOpen()) return ftello(m_file); - else - return -1; + + return -1; } bool IOFile::Flush() diff --git a/src/common/file_util.h b/src/common/file_util.h index 880b8a1e3..b54a9fb72 100644 --- a/src/common/file_util.h +++ b/src/common/file_util.h @@ -176,7 +176,6 @@ class IOFile : public NonCopyable { public: IOFile(); - explicit IOFile(std::FILE* file); IOFile(const std::string& filename, const char openmode[]); ~IOFile(); @@ -192,6 +191,9 @@ public: template <typename T> size_t ReadArray(T* data, size_t length) { + static_assert(std::is_standard_layout<T>(), "Given array does not consist of standard layout objects"); + static_assert(std::is_trivially_copyable<T>(), "Given array does not consist of trivially copyable objects"); + if (!IsOpen()) { m_good = false; return -1; @@ -207,9 +209,8 @@ public: template <typename T> size_t WriteArray(const T* data, size_t length) { - static_assert(std::is_standard_layout<T>::value, "Given array does not consist of standard layout objects"); - // TODO: gcc 4.8 does not support is_trivially_copyable, but we really should check for it here. - //static_assert(std::is_trivially_copyable<T>::value, "Given array does not consist of trivially copyable objects"); + static_assert(std::is_standard_layout<T>(), "Given array does not consist of standard layout objects"); + static_assert(std::is_trivially_copyable<T>(), "Given array does not consist of trivially copyable objects"); if (!IsOpen()) { m_good = false; @@ -243,25 +244,20 @@ public: // m_good is set to false when a read, write or other function fails bool IsGood() const { return m_good; } - operator void*() { return m_good ? m_file : nullptr; } - - std::FILE* ReleaseHandle(); - - std::FILE* GetHandle() { return m_file; } - - void SetHandle(std::FILE* file); + explicit operator bool() const { return IsGood(); } bool Seek(s64 off, int origin); - u64 Tell(); - u64 GetSize(); + u64 Tell() const; + u64 GetSize() const; bool Resize(u64 size); bool Flush(); // clear error state void Clear() { m_good = true; std::clearerr(m_file); } - std::FILE* m_file; - bool m_good; +private: + std::FILE* m_file = nullptr; + bool m_good = true; }; } // namespace diff --git a/src/common/thread.h b/src/common/thread.h index 8255ee6d3..bbfa8befa 100644 --- a/src/common/thread.h +++ b/src/common/thread.h @@ -30,8 +30,7 @@ # endif #endif -namespace Common -{ +namespace Common { int CurrentThreadId(); @@ -43,55 +42,55 @@ public: Event() : is_set(false) {} void Set() { - std::lock_guard<std::mutex> lk(m_mutex); + std::lock_guard<std::mutex> lk(mutex); if (!is_set) { is_set = true; - m_condvar.notify_one(); + condvar.notify_one(); } } void Wait() { - std::unique_lock<std::mutex> lk(m_mutex); - m_condvar.wait(lk, [&]{ return is_set; }); + std::unique_lock<std::mutex> lk(mutex); + condvar.wait(lk, [&]{ return is_set; }); is_set = false; } void Reset() { - std::unique_lock<std::mutex> lk(m_mutex); + std::unique_lock<std::mutex> lk(mutex); // no other action required, since wait loops on the predicate and any lingering signal will get cleared on the first iteration is_set = false; } private: bool is_set; - std::condition_variable m_condvar; - std::mutex m_mutex; + std::condition_variable condvar; + std::mutex mutex; }; class Barrier { public: - Barrier(size_t count) : m_count(count), m_waiting(0) {} + explicit Barrier(size_t count_) : count(count_), waiting(0), generation(0) {} /// Blocks until all "count" threads have called Sync() void Sync() { - std::unique_lock<std::mutex> lk(m_mutex); + std::unique_lock<std::mutex> lk(mutex); + const size_t current_generation = generation; - // TODO: broken when next round of Sync()s - // is entered before all waiting threads return from the notify_all - - if (++m_waiting == m_count) { - m_waiting = 0; - m_condvar.notify_all(); + if (++waiting == count) { + generation++; + waiting = 0; + condvar.notify_all(); } else { - m_condvar.wait(lk, [&]{ return m_waiting == 0; }); + condvar.wait(lk, [this, current_generation]{ return current_generation != generation; }); } } private: - std::condition_variable m_condvar; - std::mutex m_mutex; - const size_t m_count; - size_t m_waiting; + std::condition_variable condvar; + std::mutex mutex; + const size_t count; + size_t waiting; + size_t generation; // Incremented once each time the barrier is used }; void SleepCurrentThread(int ms); @@ -100,8 +99,7 @@ void SwitchCurrentThread(); // On Linux, this is equal to sleep 1ms // Use this function during a spin-wait to make the current thread // relax while another thread is working. This may be more efficient // than using events because event functions use kernel calls. -inline void YieldCPU() -{ +inline void YieldCPU() { std::this_thread::yield(); } diff --git a/src/common/x64/emitter.cpp b/src/common/x64/emitter.cpp index 1dcf2416c..5662f7f86 100644 --- a/src/common/x64/emitter.cpp +++ b/src/common/x64/emitter.cpp @@ -455,6 +455,18 @@ void XEmitter::CALL(const void* fnptr) Write32(u32(distance)); } +FixupBranch XEmitter::CALL() +{ + FixupBranch branch; + branch.type = 1; + branch.ptr = code + 5; + + Write8(0xE8); + Write32(0); + + return branch; +} + FixupBranch XEmitter::J(bool force5bytes) { FixupBranch branch; @@ -531,6 +543,22 @@ void XEmitter::SetJumpTarget(const FixupBranch& branch) } } +void XEmitter::SetJumpTarget(const FixupBranch& branch, const u8* target) +{ + if (branch.type == 0) + { + s64 distance = (s64)(target - branch.ptr); + ASSERT_MSG(distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true"); + branch.ptr[-1] = (u8)(s8)distance; + } + else if (branch.type == 1) + { + s64 distance = (s64)(target - branch.ptr); + ASSERT_MSG(distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register"); + ((s32*)branch.ptr)[-1] = (s32)distance; + } +} + //Single byte opcodes //There is no PUSHAD/POPAD in 64-bit mode. void XEmitter::INT3() {Write8(0xCC);} diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h index 7c6548fb5..a33724146 100644 --- a/src/common/x64/emitter.h +++ b/src/common/x64/emitter.h @@ -425,12 +425,14 @@ public: #undef CALL #endif void CALL(const void* fnptr); + FixupBranch CALL(); void CALLptr(OpArg arg); FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false); void J_CC(CCFlags conditionCode, const u8* addr, bool force5Bytes = false); void SetJumpTarget(const FixupBranch& branch); + void SetJumpTarget(const FixupBranch& branch, const u8* target); void SETcc(CCFlags flag, OpArg dest); // Note: CMOV brings small if any benefit on current cpus. diff --git a/src/core/hle/config_mem.cpp b/src/core/hle/config_mem.cpp index b1a72dc0c..ccd73cfcb 100644 --- a/src/core/hle/config_mem.cpp +++ b/src/core/hle/config_mem.cpp @@ -3,13 +3,6 @@ // Refer to the license.txt file included. #include <cstring> - -#include "common/assert.h" -#include "common/common_types.h" -#include "common/common_funcs.h" - -#include "core/core.h" -#include "core/memory.h" #include "core/hle/config_mem.h" //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/core/hle/hle.cpp b/src/core/hle/hle.cpp index 331b1b22a..e545de3b5 100644 --- a/src/core/hle/hle.cpp +++ b/src/core/hle/hle.cpp @@ -8,8 +8,6 @@ #include "core/arm/arm_interface.h" #include "core/core.h" #include "core/hle/hle.h" -#include "core/hle/config_mem.h" -#include "core/hle/shared_page.h" #include "core/hle/service/service.h" //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/core/hle/service/soc_u.cpp b/src/core/hle/service/soc_u.cpp index ff0af8f12..d3e5d4bca 100644 --- a/src/core/hle/service/soc_u.cpp +++ b/src/core/hle/service/soc_u.cpp @@ -151,6 +151,34 @@ static int TranslateError(int error) { return error; } +/// Holds the translation from system network socket options to 3DS network socket options +/// Note: -1 = No effect/unavailable +static const std::unordered_map<int, int> sockopt_map = { { + { 0x0004, SO_REUSEADDR }, + { 0x0080, -1 }, + { 0x0100, -1 }, + { 0x1001, SO_SNDBUF }, + { 0x1002, SO_RCVBUF }, + { 0x1003, -1 }, +#ifdef _WIN32 + /// Unsupported in WinSock2 + { 0x1004, -1 }, +#else + { 0x1004, SO_RCVLOWAT }, +#endif + { 0x1008, SO_TYPE }, + { 0x1009, SO_ERROR }, +}}; + +/// Converts a socket option from 3ds-specific to platform-specific +static int TranslateSockOpt(int console_opt_name) { + auto found = sockopt_map.find(console_opt_name); + if (found != sockopt_map.end()) { + return found->second; + } + return console_opt_name; +} + /// Holds information about a particular socket struct SocketHolder { u32 socket_fd; ///< The socket descriptor @@ -568,7 +596,7 @@ static void RecvFrom(Service::Interface* self) { socklen_t src_addr_len = sizeof(src_addr); int ret = ::recvfrom(socket_handle, (char*)output_buff, len, flags, &src_addr, &src_addr_len); - if (buffer_parameters.output_src_address_buffer != 0) { + if (ret >= 0 && buffer_parameters.output_src_address_buffer != 0 && src_addr_len > 0) { CTRSockAddr* ctr_src_addr = reinterpret_cast<CTRSockAddr*>(Memory::GetPointer(buffer_parameters.output_src_address_buffer)); *ctr_src_addr = CTRSockAddr::FromPlatform(src_addr); } @@ -724,6 +752,72 @@ static void ShutdownSockets(Service::Interface* self) { cmd_buffer[1] = 0; } +static void GetSockOpt(Service::Interface* self) { + u32* cmd_buffer = Kernel::GetCommandBuffer(); + u32 socket_handle = cmd_buffer[1]; + u32 level = cmd_buffer[2]; + int optname = TranslateSockOpt(cmd_buffer[3]); + socklen_t optlen = (socklen_t)cmd_buffer[4]; + + int ret = -1; + int err = 0; + + if(optname < 0) { +#ifdef _WIN32 + err = WSAEINVAL; +#else + err = EINVAL; +#endif + } else { + // 0x100 = static buffer offset (bytes) + // + 0x4 = 2nd pointer (u32) position + // >> 2 = convert to u32 offset instead of byte offset (cmd_buffer = u32*) + char* optval = reinterpret_cast<char *>(Memory::GetPointer(cmd_buffer[0x104 >> 2])); + + ret = ::getsockopt(socket_handle, level, optname, optval, &optlen); + err = 0; + if (ret == SOCKET_ERROR_VALUE) { + err = TranslateError(GET_ERRNO); + } + } + + cmd_buffer[0] = IPC::MakeHeader(0x11, 4, 2); + cmd_buffer[1] = ret; + cmd_buffer[2] = err; + cmd_buffer[3] = optlen; +} + +static void SetSockOpt(Service::Interface* self) { + u32* cmd_buffer = Kernel::GetCommandBuffer(); + u32 socket_handle = cmd_buffer[1]; + u32 level = cmd_buffer[2]; + int optname = TranslateSockOpt(cmd_buffer[3]); + + int ret = -1; + int err = 0; + + if(optname < 0) { +#ifdef _WIN32 + err = WSAEINVAL; +#else + err = EINVAL; +#endif + } else { + socklen_t optlen = static_cast<socklen_t>(cmd_buffer[4]); + const char* optval = reinterpret_cast<const char *>(Memory::GetPointer(cmd_buffer[8])); + + ret = static_cast<u32>(::setsockopt(socket_handle, level, optname, optval, optlen)); + err = 0; + if (ret == SOCKET_ERROR_VALUE) { + err = TranslateError(GET_ERRNO); + } + } + + cmd_buffer[0] = IPC::MakeHeader(0x12, 4, 4); + cmd_buffer[1] = ret; + cmd_buffer[2] = err; +} + const Interface::FunctionInfo FunctionTable[] = { {0x00010044, InitializeSockets, "InitializeSockets"}, {0x000200C2, Socket, "Socket"}, @@ -741,8 +835,8 @@ const Interface::FunctionInfo FunctionTable[] = { {0x000E00C2, nullptr, "GetHostByAddr"}, {0x000F0106, nullptr, "GetAddrInfo"}, {0x00100102, nullptr, "GetNameInfo"}, - {0x00110102, nullptr, "GetSockOpt"}, - {0x00120104, nullptr, "SetSockOpt"}, + {0x00110102, GetSockOpt, "GetSockOpt"}, + {0x00120104, SetSockOpt, "SetSockOpt"}, {0x001300C2, Fcntl, "Fcntl"}, {0x00140084, Poll, "Poll"}, {0x00150042, nullptr, "SockAtMark"}, diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp index 48c45564f..083391e83 100644 --- a/src/core/hw/y2r.cpp +++ b/src/core/hw/y2r.cpp @@ -261,7 +261,7 @@ void PerformConversion(ConversionConfiguration& cvt) { ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0); // Tiles per row size_t num_tiles = cvt.input_line_width / 8; - ASSERT(num_tiles < MAX_TILES); + ASSERT(num_tiles <= MAX_TILES); // Buffer used as a CDMA source/target. std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]); diff --git a/src/core/loader/3dsx.cpp b/src/core/loader/3dsx.cpp index 8eed6a50a..5fb3b9e2b 100644 --- a/src/core/loader/3dsx.cpp +++ b/src/core/loader/3dsx.cpp @@ -10,13 +10,9 @@ #include "core/file_sys/archive_romfs.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/resource_limit.h" -#include "core/hle/service/fs/archive.h" -#include "core/loader/elf.h" -#include "core/loader/ncch.h" +#include "core/loader/3dsx.h" #include "core/memory.h" -#include "3dsx.h" - namespace Loader { /* diff --git a/src/core/loader/ncch.cpp b/src/core/loader/ncch.cpp index e63cab33f..a4b47ef8c 100644 --- a/src/core/loader/ncch.cpp +++ b/src/core/loader/ncch.cpp @@ -174,7 +174,7 @@ ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>& return ResultStatus::Error; LOG_DEBUG(Loader, "%d sections:", kMaxSections); - // Iterate through the ExeFs archive until we find the .code file... + // Iterate through the ExeFs archive until we find a section with the specified name... for (unsigned section_number = 0; section_number < kMaxSections; section_number++) { const auto& section = exefs_header.section[section_number]; @@ -186,7 +186,7 @@ ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>& s64 section_offset = (section.offset + exefs_offset + sizeof(ExeFs_Header) + ncch_offset); file.Seek(section_offset, SEEK_SET); - if (is_compressed) { + if (strcmp(section.name, ".code") == 0 && is_compressed) { // Section is compressed, read compressed .code section... std::unique_ptr<u8[]> temp_buffer; try { diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 08ec2907a..3abe79c09 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -140,7 +140,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { immediate_attribute_id = 0; Shader::UnitState<false> shader_unit; - Shader::Setup(shader_unit); + Shader::Setup(); if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, static_cast<void*>(&immediate_input)); @@ -300,7 +300,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { vertex_cache_ids.fill(-1); Shader::UnitState<false> shader_unit; - Shader::Setup(shader_unit); + Shader::Setup(); for (unsigned int index = 0; index < regs.num_vertices; ++index) { diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp index 693f93597..c3a9c9598 100644 --- a/src/video_core/debug_utils/debug_utils.cpp +++ b/src/video_core/debug_utils/debug_utils.cpp @@ -286,7 +286,7 @@ void StartPicaTracing() } std::lock_guard<std::mutex> lock(pica_trace_mutex); - pica_trace = std::unique_ptr<PicaTrace>(new PicaTrace); + pica_trace = std::make_unique<PicaTrace>(); is_pica_tracing = true; } @@ -586,6 +586,21 @@ TextureInfo TextureInfo::FromPicaRegister(const Regs::TextureConfig& config, return info; } +#ifdef HAVE_PNG +// Adapter functions to libpng to write/flush to File::IOFile instances. +static void WriteIOFile(png_structp png_ptr, png_bytep data, png_size_t length) { + auto* fp = static_cast<FileUtil::IOFile*>(png_get_io_ptr(png_ptr)); + if (!fp->WriteBytes(data, length)) + png_error(png_ptr, "Failed to write to output PNG file."); +} + +static void FlushIOFile(png_structp png_ptr) { + auto* fp = static_cast<FileUtil::IOFile*>(png_get_io_ptr(png_ptr)); + if (!fp->Flush()) + png_error(png_ptr, "Failed to flush to output PNG file."); +} +#endif + void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) { #ifndef HAVE_PNG return; @@ -629,7 +644,7 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data) { goto finalise; } - png_init_io(png_ptr, fp.GetHandle()); + png_set_write_fn(png_ptr, static_cast<void*>(&fp), WriteIOFile, FlushIOFile); // Write header (8 bit color depth) png_set_IHDR(png_ptr, info_ptr, texture_config.width, texture_config.height, diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index 5b9ed7c64..0434ad05a 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -923,92 +923,72 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, if (output_merger.alphablend_enable) { auto params = output_merger.alpha_blending; - auto LookupFactorRGB = [&](Regs::BlendFactor factor) -> Math::Vec3<u8> { + auto LookupFactor = [&](unsigned channel, Regs::BlendFactor factor) -> u8 { + DEBUG_ASSERT(channel < 4); + + const Math::Vec4<u8> blend_const = { + static_cast<u8>(output_merger.blend_const.r), + static_cast<u8>(output_merger.blend_const.g), + static_cast<u8>(output_merger.blend_const.b), + static_cast<u8>(output_merger.blend_const.a) + }; + switch (factor) { - case Regs::BlendFactor::Zero : - return Math::Vec3<u8>(0, 0, 0); + case Regs::BlendFactor::Zero: + return 0; - case Regs::BlendFactor::One : - return Math::Vec3<u8>(255, 255, 255); + case Regs::BlendFactor::One: + return 255; case Regs::BlendFactor::SourceColor: - return combiner_output.rgb(); + return combiner_output[channel]; case Regs::BlendFactor::OneMinusSourceColor: - return Math::Vec3<u8>(255 - combiner_output.r(), 255 - combiner_output.g(), 255 - combiner_output.b()); + return 255 - combiner_output[channel]; case Regs::BlendFactor::DestColor: - return dest.rgb(); + return dest[channel]; case Regs::BlendFactor::OneMinusDestColor: - return Math::Vec3<u8>(255 - dest.r(), 255 - dest.g(), 255 - dest.b()); + return 255 - dest[channel]; case Regs::BlendFactor::SourceAlpha: - return Math::Vec3<u8>(combiner_output.a(), combiner_output.a(), combiner_output.a()); + return combiner_output.a(); case Regs::BlendFactor::OneMinusSourceAlpha: - return Math::Vec3<u8>(255 - combiner_output.a(), 255 - combiner_output.a(), 255 - combiner_output.a()); + return 255 - combiner_output.a(); case Regs::BlendFactor::DestAlpha: - return Math::Vec3<u8>(dest.a(), dest.a(), dest.a()); + return dest.a(); case Regs::BlendFactor::OneMinusDestAlpha: - return Math::Vec3<u8>(255 - dest.a(), 255 - dest.a(), 255 - dest.a()); + return 255 - dest.a(); case Regs::BlendFactor::ConstantColor: - return Math::Vec3<u8>(output_merger.blend_const.r, output_merger.blend_const.g, output_merger.blend_const.b); + return blend_const[channel]; case Regs::BlendFactor::OneMinusConstantColor: - return Math::Vec3<u8>(255 - output_merger.blend_const.r, 255 - output_merger.blend_const.g, 255 - output_merger.blend_const.b); + return 255 - blend_const[channel]; case Regs::BlendFactor::ConstantAlpha: - return Math::Vec3<u8>(output_merger.blend_const.a, output_merger.blend_const.a, output_merger.blend_const.a); + return blend_const.a(); case Regs::BlendFactor::OneMinusConstantAlpha: - return Math::Vec3<u8>(255 - output_merger.blend_const.a, 255 - output_merger.blend_const.a, 255 - output_merger.blend_const.a); - - default: - LOG_CRITICAL(HW_GPU, "Unknown color blend factor %x", factor); - UNIMPLEMENTED(); - break; - } - - return {}; - }; - - auto LookupFactorA = [&](Regs::BlendFactor factor) -> u8 { - switch (factor) { - case Regs::BlendFactor::Zero: - return 0; - - case Regs::BlendFactor::One: - return 255; - - case Regs::BlendFactor::SourceAlpha: - return combiner_output.a(); - - case Regs::BlendFactor::OneMinusSourceAlpha: - return 255 - combiner_output.a(); + return 255 - blend_const.a(); - case Regs::BlendFactor::DestAlpha: - return dest.a(); - - case Regs::BlendFactor::OneMinusDestAlpha: - return 255 - dest.a(); - - case Regs::BlendFactor::ConstantAlpha: - return output_merger.blend_const.a; - - case Regs::BlendFactor::OneMinusConstantAlpha: - return 255 - output_merger.blend_const.a; + case Regs::BlendFactor::SourceAlphaSaturate: + // Returns 1.0 for the alpha channel + if (channel == 3) + return 255; + return std::min(combiner_output.a(), static_cast<u8>(255 - dest.a())); default: - LOG_CRITICAL(HW_GPU, "Unknown alpha blend factor %x", factor); + LOG_CRITICAL(HW_GPU, "Unknown blend factor %x", factor); UNIMPLEMENTED(); break; } - return {}; + return combiner_output[channel]; }; static auto EvaluateBlendEquation = [](const Math::Vec4<u8>& src, const Math::Vec4<u8>& srcfactor, @@ -1060,10 +1040,15 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, MathUtil::Clamp(result.a(), 0, 255)); }; - auto srcfactor = Math::MakeVec(LookupFactorRGB(params.factor_source_rgb), - LookupFactorA(params.factor_source_a)); - auto dstfactor = Math::MakeVec(LookupFactorRGB(params.factor_dest_rgb), - LookupFactorA(params.factor_dest_a)); + auto srcfactor = Math::MakeVec(LookupFactor(0, params.factor_source_rgb), + LookupFactor(1, params.factor_source_rgb), + LookupFactor(2, params.factor_source_rgb), + LookupFactor(3, params.factor_source_a)); + + auto dstfactor = Math::MakeVec(LookupFactor(0, params.factor_dest_rgb), + LookupFactor(1, params.factor_dest_rgb), + LookupFactor(2, params.factor_dest_rgb), + LookupFactor(3, params.factor_dest_a)); blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_rgb); blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor, params.blend_equation_a).a(); diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 78d295c76..75301accd 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -28,36 +28,24 @@ namespace Pica { namespace Shader { #ifdef ARCHITECTURE_x86_64 -static std::unordered_map<u64, CompiledShader*> shader_map; -static JitCompiler jit; -static CompiledShader* jit_shader; - -static void ClearCache() { - shader_map.clear(); - jit.Clear(); - LOG_INFO(HW_GPU, "Shader JIT cache cleared"); -} +static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map; +static const JitShader* jit_shader; #endif // ARCHITECTURE_x86_64 -void Setup(UnitState<false>& state) { +void Setup() { #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) { u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ - Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)) ^ - g_state.regs.vs.main_offset); + Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data))); auto iter = shader_map.find(cache_key); if (iter != shader_map.end()) { - jit_shader = iter->second; + jit_shader = iter->second.get(); } else { - // Check if remaining JIT code space is enough for at least one more (massive) shader - if (jit.GetSpaceLeft() < jit_shader_size) { - // If not, clear the cache of all previously compiled shaders - ClearCache(); - } - - jit_shader = jit.Compile(); - shader_map.emplace(cache_key, jit_shader); + auto shader = std::make_unique<JitShader>(); + shader->Compile(); + jit_shader = shader.get(); + shader_map[cache_key] = std::move(shader); } } #endif // ARCHITECTURE_x86_64 @@ -65,7 +53,7 @@ void Setup(UnitState<false>& state) { void Shutdown() { #ifdef ARCHITECTURE_x86_64 - ClearCache(); + shader_map.clear(); #endif // ARCHITECTURE_x86_64 } @@ -109,7 +97,7 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr #ifdef ARCHITECTURE_x86_64 if (VideoCore::g_shader_jit_enabled) - jit_shader(&state.registers); + jit_shader->Run(&state.registers, g_state.regs.vs.main_offset); else RunInterpreter(state); #else diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 7af8f1fa1..9c5bd97bd 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -339,9 +339,8 @@ struct UnitState { /** * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per * vertex, which would happen within the `Run` function). - * @param state Shader unit state, must be setup per shader and per shader unit */ -void Setup(UnitState<false>& state); +void Setup(); /// Performs any cleanup when the emulator is shutdown void Shutdown(); diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index dffe051ef..b47d3beda 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <algorithm> #include <smmintrin.h> #include "common/x64/abi.h" @@ -19,73 +20,73 @@ namespace Shader { using namespace Gen; -typedef void (JitCompiler::*JitFunction)(Instruction instr); +typedef void (JitShader::*JitFunction)(Instruction instr); const JitFunction instr_table[64] = { - &JitCompiler::Compile_ADD, // add - &JitCompiler::Compile_DP3, // dp3 - &JitCompiler::Compile_DP4, // dp4 - &JitCompiler::Compile_DPH, // dph + &JitShader::Compile_ADD, // add + &JitShader::Compile_DP3, // dp3 + &JitShader::Compile_DP4, // dp4 + &JitShader::Compile_DPH, // dph nullptr, // unknown - &JitCompiler::Compile_EX2, // ex2 - &JitCompiler::Compile_LG2, // lg2 + &JitShader::Compile_EX2, // ex2 + &JitShader::Compile_LG2, // lg2 nullptr, // unknown - &JitCompiler::Compile_MUL, // mul - &JitCompiler::Compile_SGE, // sge - &JitCompiler::Compile_SLT, // slt - &JitCompiler::Compile_FLR, // flr - &JitCompiler::Compile_MAX, // max - &JitCompiler::Compile_MIN, // min - &JitCompiler::Compile_RCP, // rcp - &JitCompiler::Compile_RSQ, // rsq + &JitShader::Compile_MUL, // mul + &JitShader::Compile_SGE, // sge + &JitShader::Compile_SLT, // slt + &JitShader::Compile_FLR, // flr + &JitShader::Compile_MAX, // max + &JitShader::Compile_MIN, // min + &JitShader::Compile_RCP, // rcp + &JitShader::Compile_RSQ, // rsq nullptr, // unknown nullptr, // unknown - &JitCompiler::Compile_MOVA, // mova - &JitCompiler::Compile_MOV, // mov + &JitShader::Compile_MOVA, // mova + &JitShader::Compile_MOV, // mov nullptr, // unknown nullptr, // unknown nullptr, // unknown nullptr, // unknown - &JitCompiler::Compile_DPH, // dphi + &JitShader::Compile_DPH, // dphi nullptr, // unknown - &JitCompiler::Compile_SGE, // sgei - &JitCompiler::Compile_SLT, // slti + &JitShader::Compile_SGE, // sgei + &JitShader::Compile_SLT, // slti nullptr, // unknown nullptr, // unknown nullptr, // unknown nullptr, // unknown nullptr, // unknown - &JitCompiler::Compile_NOP, // nop - &JitCompiler::Compile_END, // end + &JitShader::Compile_NOP, // nop + &JitShader::Compile_END, // end nullptr, // break - &JitCompiler::Compile_CALL, // call - &JitCompiler::Compile_CALLC, // callc - &JitCompiler::Compile_CALLU, // callu - &JitCompiler::Compile_IF, // ifu - &JitCompiler::Compile_IF, // ifc - &JitCompiler::Compile_LOOP, // loop + &JitShader::Compile_CALL, // call + &JitShader::Compile_CALLC, // callc + &JitShader::Compile_CALLU, // callu + &JitShader::Compile_IF, // ifu + &JitShader::Compile_IF, // ifc + &JitShader::Compile_LOOP, // loop nullptr, // emit nullptr, // sete - &JitCompiler::Compile_JMP, // jmpc - &JitCompiler::Compile_JMP, // jmpu - &JitCompiler::Compile_CMP, // cmp - &JitCompiler::Compile_CMP, // cmp - &JitCompiler::Compile_MAD, // madi - &JitCompiler::Compile_MAD, // madi - &JitCompiler::Compile_MAD, // madi - &JitCompiler::Compile_MAD, // madi - &JitCompiler::Compile_MAD, // madi - &JitCompiler::Compile_MAD, // madi - &JitCompiler::Compile_MAD, // madi - &JitCompiler::Compile_MAD, // madi - &JitCompiler::Compile_MAD, // mad - &JitCompiler::Compile_MAD, // mad - &JitCompiler::Compile_MAD, // mad - &JitCompiler::Compile_MAD, // mad - &JitCompiler::Compile_MAD, // mad - &JitCompiler::Compile_MAD, // mad - &JitCompiler::Compile_MAD, // mad - &JitCompiler::Compile_MAD, // mad + &JitShader::Compile_JMP, // jmpc + &JitShader::Compile_JMP, // jmpu + &JitShader::Compile_CMP, // cmp + &JitShader::Compile_CMP, // cmp + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // madi + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad + &JitShader::Compile_MAD, // mad }; // The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can @@ -138,13 +139,32 @@ static const u8 NO_SRC_REG_SWIZZLE = 0x1b; static const u8 NO_DEST_REG_MASK = 0xf; /** + * Get the vertex shader instruction for a given offset in the current shader program + * @param offset Offset in the current shader program of the instruction + * @return Instruction at the specified offset + */ +static Instruction GetVertexShaderInstruction(size_t offset) { + return { g_state.vs.program_code[offset] }; +} + +static void LogCritical(const char* msg) { + LOG_CRITICAL(HW_GPU, msg); +} + +void JitShader::Compile_Assert(bool condition, const char* msg) { + if (!condition) { + ABI_CallFunctionP(reinterpret_cast<const void*>(LogCritical), const_cast<char*>(msg)); + } +} + +/** * Loads and swizzles a source register into the specified XMM register. * @param instr VS instruction, used for determining how to load the source register * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) * @param src_reg SourceRegister object corresponding to the source register to load * @param dest Destination XMM register to store the loaded, swizzled source register */ -void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { +void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, X64Reg dest) { X64Reg src_ptr; size_t src_offset; @@ -216,7 +236,7 @@ void JitCompiler::Compile_SwizzleSrc(Instruction instr, unsigned src_num, Source } } -void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { +void JitShader::Compile_DestEnable(Instruction instr,X64Reg src) { DestRegister dest; unsigned operand_desc_id; if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || @@ -263,7 +283,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) { } } -void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { +void JitShader::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { MOVAPS(scratch, R(src1)); CMPPS(scratch, R(src2), CMP_ORD); @@ -276,7 +296,7 @@ void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen:: ANDPS(src1, R(scratch)); } -void JitCompiler::Compile_EvaluateCondition(Instruction instr) { +void JitShader::Compile_EvaluateCondition(Instruction instr) { // Note: NXOR is used below to check for equality switch (instr.flow_control.op) { case Instruction::FlowControlType::Or: @@ -307,23 +327,23 @@ void JitCompiler::Compile_EvaluateCondition(Instruction instr) { } } -void JitCompiler::Compile_UniformCondition(Instruction instr) { +void JitShader::Compile_UniformCondition(Instruction instr) { int offset = offsetof(decltype(g_state.vs.uniforms), b) + (instr.flow_control.bool_uniform_id * sizeof(bool)); CMP(sizeof(bool) * 8, MDisp(UNIFORMS, offset), Imm8(0)); } -BitSet32 JitCompiler::PersistentCallerSavedRegs() { +BitSet32 JitShader::PersistentCallerSavedRegs() { return persistent_regs & ABI_ALL_CALLER_SAVED; } -void JitCompiler::Compile_ADD(Instruction instr) { +void JitShader::Compile_ADD(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); ADDPS(SRC1, R(SRC2)); Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_DP3(Instruction instr) { +void JitShader::Compile_DP3(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); @@ -342,7 +362,7 @@ void JitCompiler::Compile_DP3(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_DP4(Instruction instr) { +void JitShader::Compile_DP4(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); @@ -359,7 +379,7 @@ void JitCompiler::Compile_DP4(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_DPH(Instruction instr) { +void JitShader::Compile_DPH(Instruction instr) { if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); @@ -391,7 +411,7 @@ void JitCompiler::Compile_DPH(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_EX2(Instruction instr) { +void JitShader::Compile_EX2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); MOVSS(XMM0, R(SRC1)); @@ -404,7 +424,7 @@ void JitCompiler::Compile_EX2(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_LG2(Instruction instr) { +void JitShader::Compile_LG2(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); MOVSS(XMM0, R(SRC1)); @@ -417,14 +437,14 @@ void JitCompiler::Compile_LG2(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_MUL(Instruction instr) { +void JitShader::Compile_MUL(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); Compile_SanitizedMul(SRC1, SRC2, SCRATCH); Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_SGE(Instruction instr) { +void JitShader::Compile_SGE(Instruction instr) { if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); @@ -439,7 +459,7 @@ void JitCompiler::Compile_SGE(Instruction instr) { Compile_DestEnable(instr, SRC2); } -void JitCompiler::Compile_SLT(Instruction instr) { +void JitShader::Compile_SLT(Instruction instr) { if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); @@ -454,7 +474,7 @@ void JitCompiler::Compile_SLT(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_FLR(Instruction instr) { +void JitShader::Compile_FLR(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); if (Common::GetCPUCaps().sse4_1) { @@ -467,7 +487,7 @@ void JitCompiler::Compile_FLR(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_MAX(Instruction instr) { +void JitShader::Compile_MAX(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. @@ -475,7 +495,7 @@ void JitCompiler::Compile_MAX(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_MIN(Instruction instr) { +void JitShader::Compile_MIN(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. @@ -483,7 +503,7 @@ void JitCompiler::Compile_MIN(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_MOVA(Instruction instr) { +void JitShader::Compile_MOVA(Instruction instr) { SwizzlePattern swiz = { g_state.vs.swizzle_data[instr.common.operand_desc_id] }; if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { @@ -528,12 +548,12 @@ void JitCompiler::Compile_MOVA(Instruction instr) { } } -void JitCompiler::Compile_MOV(Instruction instr) { +void JitShader::Compile_MOV(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_RCP(Instruction instr) { +void JitShader::Compile_RCP(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica @@ -544,7 +564,7 @@ void JitCompiler::Compile_RCP(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_RSQ(Instruction instr) { +void JitShader::Compile_RSQ(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica @@ -555,36 +575,41 @@ void JitCompiler::Compile_RSQ(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_NOP(Instruction instr) { +void JitShader::Compile_NOP(Instruction instr) { } -void JitCompiler::Compile_END(Instruction instr) { +void JitShader::Compile_END(Instruction instr) { ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); RET(); } -void JitCompiler::Compile_CALL(Instruction instr) { - unsigned offset = instr.flow_control.dest_offset; - while (offset < (instr.flow_control.dest_offset + instr.flow_control.num_instructions)) { - Compile_NextInstr(&offset); - } +void JitShader::Compile_CALL(Instruction instr) { + // Push offset of the return + PUSH(64, Imm32(instr.flow_control.dest_offset + instr.flow_control.num_instructions)); + + // Call the subroutine + FixupBranch b = CALL(); + fixup_branches.push_back({ b, instr.flow_control.dest_offset }); + + // Skip over the return offset that's on the stack + ADD(64, R(RSP), Imm32(8)); } -void JitCompiler::Compile_CALLC(Instruction instr) { +void JitShader::Compile_CALLC(Instruction instr) { Compile_EvaluateCondition(instr); FixupBranch b = J_CC(CC_Z, true); Compile_CALL(instr); SetJumpTarget(b); } -void JitCompiler::Compile_CALLU(Instruction instr) { +void JitShader::Compile_CALLU(Instruction instr) { Compile_UniformCondition(instr); FixupBranch b = J_CC(CC_Z, true); Compile_CALL(instr); SetJumpTarget(b); } -void JitCompiler::Compile_CMP(Instruction instr) { +void JitShader::Compile_CMP(Instruction instr) { using Op = Instruction::Common::CompareOpType::Op; Op op_x = instr.common.compare_op.x; Op op_y = instr.common.compare_op.y; @@ -627,7 +652,7 @@ void JitCompiler::Compile_CMP(Instruction instr) { SHR(64, R(COND1), Imm8(63)); } -void JitCompiler::Compile_MAD(Instruction instr) { +void JitShader::Compile_MAD(Instruction instr) { Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { @@ -644,9 +669,8 @@ void JitCompiler::Compile_MAD(Instruction instr) { Compile_DestEnable(instr, SRC1); } -void JitCompiler::Compile_IF(Instruction instr) { - ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards if-statements (%d -> %d) not supported", - *offset_ptr, instr.flow_control.dest_offset.Value()); +void JitShader::Compile_IF(Instruction instr) { + Compile_Assert(instr.flow_control.dest_offset >= program_counter, "Backwards if-statements not supported"); // Evaluate the "IF" condition if (instr.opcode.Value() == OpCode::Id::IFU) { @@ -676,10 +700,9 @@ void JitCompiler::Compile_IF(Instruction instr) { SetJumpTarget(b2); } -void JitCompiler::Compile_LOOP(Instruction instr) { - ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards loops (%d -> %d) not supported", - *offset_ptr, instr.flow_control.dest_offset.Value()); - ASSERT_MSG(!looping, "Nested loops not supported"); +void JitShader::Compile_LOOP(Instruction instr) { + Compile_Assert(instr.flow_control.dest_offset >= program_counter, "Backwards loops not supported"); + Compile_Assert(!looping, "Nested loops not supported"); looping = true; @@ -705,10 +728,7 @@ void JitCompiler::Compile_LOOP(Instruction instr) { looping = false; } -void JitCompiler::Compile_JMP(Instruction instr) { - ASSERT_MSG(instr.flow_control.dest_offset > *offset_ptr, "Backwards jumps (%d -> %d) not supported", - *offset_ptr, instr.flow_control.dest_offset.Value()); - +void JitShader::Compile_JMP(Instruction instr) { if (instr.opcode.Value() == OpCode::Id::JMPC) Compile_EvaluateCondition(instr); else if (instr.opcode.Value() == OpCode::Id::JMPU) @@ -718,30 +738,38 @@ void JitCompiler::Compile_JMP(Instruction instr) { bool inverted_condition = (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); + FixupBranch b = J_CC(inverted_condition ? CC_Z : CC_NZ, true); + fixup_branches.push_back({ b, instr.flow_control.dest_offset }); +} - Compile_Block(instr.flow_control.dest_offset); +void JitShader::Compile_Block(unsigned end) { + while (program_counter < end) { + Compile_NextInstr(); + } +} + +void JitShader::Compile_Return() { + // Peek return offset on the stack and check if we're at that offset + MOV(64, R(RAX), MDisp(RSP, 8)); + CMP(32, R(RAX), Imm32(program_counter)); + // If so, jump back to before CALL + FixupBranch b = J_CC(CC_NZ, true); + RET(); SetJumpTarget(b); } -void JitCompiler::Compile_Block(unsigned end) { - // Save current offset pointer - unsigned* prev_offset_ptr = offset_ptr; - unsigned offset = *prev_offset_ptr; +void JitShader::Compile_NextInstr() { + if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { + Compile_Return(); + } - while (offset < end) - Compile_NextInstr(&offset); + ASSERT_MSG(code_ptr[program_counter] == nullptr, "Tried to compile already compiled shader location!"); + code_ptr[program_counter] = GetCodePtr(); - // Restore current offset pointer - offset_ptr = prev_offset_ptr; - *offset_ptr = offset; -} + Instruction instr = GetVertexShaderInstruction(program_counter++); -void JitCompiler::Compile_NextInstr(unsigned* offset) { - offset_ptr = offset; - - Instruction instr = *(Instruction*)&g_state.vs.program_code[(*offset_ptr)++]; OpCode::Id opcode = instr.opcode.Value(); auto instr_func = instr_table[static_cast<unsigned>(opcode)]; @@ -755,9 +783,35 @@ void JitCompiler::Compile_NextInstr(unsigned* offset) { } } -CompiledShader* JitCompiler::Compile() { - const u8* start = GetCodePtr(); - unsigned offset = g_state.regs.vs.main_offset; +void JitShader::FindReturnOffsets() { + return_offsets.clear(); + + for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { + Instruction instr = GetVertexShaderInstruction(offset); + + switch (instr.opcode.Value()) { + case OpCode::Id::CALL: + case OpCode::Id::CALLC: + case OpCode::Id::CALLU: + return_offsets.push_back(instr.flow_control.dest_offset + instr.flow_control.num_instructions); + break; + } + } + + // Sort for efficient binary search later + std::sort(return_offsets.begin(), return_offsets.end()); +} + +void JitShader::Compile() { + // Reset flow control state + program = (CompiledShader*)GetCodePtr(); + program_counter = 0; + looping = false; + code_ptr.fill(nullptr); + fixup_branches.clear(); + + // Find all `CALL` instructions and identify return locations + FindReturnOffsets(); // The stack pointer is 8 modulo 16 at the entry of a procedure ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); @@ -780,21 +834,31 @@ CompiledShader* JitCompiler::Compile() { MOV(PTRBITS, R(RAX), ImmPtr(&neg)); MOVAPS(NEGBIT, MatR(RAX)); - looping = false; + // Jump to start of the shader program + JMPptr(R(ABI_PARAM2)); + + // Compile entire program + Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size())); - while (offset < g_state.vs.program_code.size()) { - Compile_NextInstr(&offset); + // Set the target for any incomplete branches now that the entire shader program has been emitted + for (const auto& branch : fixup_branches) { + SetJumpTarget(branch.first, code_ptr[branch.second]); } - return (CompiledShader*)start; -} + // Free memory that's no longer needed + return_offsets.clear(); + return_offsets.shrink_to_fit(); + fixup_branches.clear(); + fixup_branches.shrink_to_fit(); + + uintptr_t size = reinterpret_cast<uintptr_t>(GetCodePtr()) - reinterpret_cast<uintptr_t>(program); + ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); -JitCompiler::JitCompiler() { - AllocCodeSpace(jit_cache_size); + LOG_DEBUG(HW_GPU, "Compiled shader size=%d", size); } -void JitCompiler::Clear() { - ClearCodeSpace(); +JitShader::JitShader() { + AllocCodeSpace(MAX_SHADER_SIZE); } } // namespace Shader diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index 5357c964b..cd6280ade 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h @@ -4,6 +4,9 @@ #pragma once +#include <utility> +#include <vector> + #include <nihstro/shader_bytecode.h> #include "common/x64/emitter.h" @@ -19,24 +22,22 @@ namespace Pica { namespace Shader { -/// Memory needed to be available to compile the next shader (otherwise, clear the cache) -constexpr size_t jit_shader_size = 1024 * 512; -/// Memory allocated for the JIT code space cache -constexpr size_t jit_cache_size = 1024 * 1024 * 8; - -using CompiledShader = void(void* registers); +/// Memory allocated for each compiled shader (64Kb) +constexpr size_t MAX_SHADER_SIZE = 1024 * 64; /** * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 * code that can be executed on the host machine directly. */ -class JitCompiler : public Gen::XCodeBlock { +class JitShader : public Gen::XCodeBlock { public: - JitCompiler(); + JitShader(); - CompiledShader* Compile(); + void Run(void* registers, unsigned offset) const { + program(registers, code_ptr[offset]); + } - void Clear(); + void Compile(); void Compile_ADD(Instruction instr); void Compile_DP3(Instruction instr); @@ -66,8 +67,9 @@ public: void Compile_MAD(Instruction instr); private: + void Compile_Block(unsigned end); - void Compile_NextInstr(unsigned* offset); + void Compile_NextInstr(); void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); @@ -81,13 +83,39 @@ private: void Compile_EvaluateCondition(Instruction instr); void Compile_UniformCondition(Instruction instr); + /** + * Emits the code to conditionally return from a subroutine envoked by the `CALL` instruction. + */ + void Compile_Return(); + BitSet32 PersistentCallerSavedRegs(); - /// Pointer to the variable that stores the current Pica code offset. Used to handle nested code blocks. - unsigned* offset_ptr = nullptr; + /** + * Assertion evaluated at compile-time, but only triggered if executed at runtime. + * @param msg Message to be logged if the assertion fails. + */ + void Compile_Assert(bool condition, const char* msg); + + /** + * Analyzes the entire shader program for `CALL` instructions before emitting any code, + * identifying the locations where a return needs to be inserted. + */ + void FindReturnOffsets(); + + /// Mapping of Pica VS instructions to pointers in the emitted code + std::array<const u8*, 1024> code_ptr; + + /// Offsets in code where a return needs to be inserted + std::vector<unsigned> return_offsets; + + unsigned program_counter = 0; ///< Offset of the next instruction to decode + bool looping = false; ///< True if compiling a loop, used to check for nested loops + + /// Branches that need to be fixed up once the entire shader program is compiled + std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches; - /// Set to true if currently in a loop, used to check for the existence of nested loops - bool looping = false; + using CompiledShader = void(void* registers, const u8* start_addr); + CompiledShader* program = nullptr; }; } // Shader |