diff options
37 files changed, 740 insertions, 264 deletions
diff --git a/src/audio_core/algorithm/interpolate.cpp b/src/audio_core/algorithm/interpolate.cpp index 5005ba519..a58f24169 100644 --- a/src/audio_core/algorithm/interpolate.cpp +++ b/src/audio_core/algorithm/interpolate.cpp @@ -5,6 +5,7 @@ #define _USE_MATH_DEFINES #include <algorithm> +#include <climits> #include <cmath> #include <vector> #include "audio_core/algorithm/interpolate.h" @@ -13,13 +14,131 @@ namespace AudioCore { -/// The Lanczos kernel -static double Lanczos(std::size_t a, double x) { - if (x == 0.0) - return 1.0; - const double px = M_PI * x; - return a * std::sin(px) * std::sin(px / a) / (px * px); -} +constexpr std::array<s16, 512> curve_lut0 = { + 6600, 19426, 6722, 3, 6479, 19424, 6845, 9, 6359, 19419, 6968, 15, 6239, + 19412, 7093, 22, 6121, 19403, 7219, 28, 6004, 19391, 7345, 34, 5888, 19377, + 7472, 41, 5773, 19361, 7600, 48, 5659, 19342, 7728, 55, 5546, 19321, 7857, + 62, 5434, 19298, 7987, 69, 5323, 19273, 8118, 77, 5213, 19245, 8249, 84, + 5104, 19215, 8381, 92, 4997, 19183, 8513, 101, 4890, 19148, 8646, 109, 4785, + 19112, 8780, 118, 4681, 19073, 8914, 127, 4579, 19031, 9048, 137, 4477, 18988, + 9183, 147, 4377, 18942, 9318, 157, 4277, 18895, 9454, 168, 4179, 18845, 9590, + 179, 4083, 18793, 9726, 190, 3987, 18738, 9863, 202, 3893, 18682, 10000, 215, + 3800, 18624, 10137, 228, 3709, 18563, 10274, 241, 3618, 18500, 10411, 255, 3529, + 18436, 10549, 270, 3441, 18369, 10687, 285, 3355, 18300, 10824, 300, 3269, 18230, + 10962, 317, 3186, 18157, 11100, 334, 3103, 18082, 11238, 351, 3022, 18006, 11375, + 369, 2942, 17927, 11513, 388, 2863, 17847, 11650, 408, 2785, 17765, 11788, 428, + 2709, 17681, 11925, 449, 2635, 17595, 12062, 471, 2561, 17507, 12198, 494, 2489, + 17418, 12334, 517, 2418, 17327, 12470, 541, 2348, 17234, 12606, 566, 2280, 17140, + 12741, 592, 2213, 17044, 12876, 619, 2147, 16946, 13010, 647, 2083, 16846, 13144, + 675, 2020, 16745, 13277, 704, 1958, 16643, 13409, 735, 1897, 16539, 13541, 766, + 1838, 16434, 13673, 798, 1780, 16327, 13803, 832, 1723, 16218, 13933, 866, 1667, + 16109, 14062, 901, 1613, 15998, 14191, 937, 1560, 15885, 14318, 975, 1508, 15772, + 14445, 1013, 1457, 15657, 14571, 1052, 1407, 15540, 14695, 1093, 1359, 15423, 14819, + 1134, 1312, 15304, 14942, 1177, 1266, 15185, 15064, 1221, 1221, 15064, 15185, 1266, + 1177, 14942, 15304, 1312, 1134, 14819, 15423, 1359, 1093, 14695, 15540, 1407, 1052, + 14571, 15657, 1457, 1013, 14445, 15772, 1508, 975, 14318, 15885, 1560, 937, 14191, + 15998, 1613, 901, 14062, 16109, 1667, 866, 13933, 16218, 1723, 832, 13803, 16327, + 1780, 798, 13673, 16434, 1838, 766, 13541, 16539, 1897, 735, 13409, 16643, 1958, + 704, 13277, 16745, 2020, 675, 13144, 16846, 2083, 647, 13010, 16946, 2147, 619, + 12876, 17044, 2213, 592, 12741, 17140, 2280, 566, 12606, 17234, 2348, 541, 12470, + 17327, 2418, 517, 12334, 17418, 2489, 494, 12198, 17507, 2561, 471, 12062, 17595, + 2635, 449, 11925, 17681, 2709, 428, 11788, 17765, 2785, 408, 11650, 17847, 2863, + 388, 11513, 17927, 2942, 369, 11375, 18006, 3022, 351, 11238, 18082, 3103, 334, + 11100, 18157, 3186, 317, 10962, 18230, 3269, 300, 10824, 18300, 3355, 285, 10687, + 18369, 3441, 270, 10549, 18436, 3529, 255, 10411, 18500, 3618, 241, 10274, 18563, + 3709, 228, 10137, 18624, 3800, 215, 10000, 18682, 3893, 202, 9863, 18738, 3987, + 190, 9726, 18793, 4083, 179, 9590, 18845, 4179, 168, 9454, 18895, 4277, 157, + 9318, 18942, 4377, 147, 9183, 18988, 4477, 137, 9048, 19031, 4579, 127, 8914, + 19073, 4681, 118, 8780, 19112, 4785, 109, 8646, 19148, 4890, 101, 8513, 19183, + 4997, 92, 8381, 19215, 5104, 84, 8249, 19245, 5213, 77, 8118, 19273, 5323, + 69, 7987, 19298, 5434, 62, 7857, 19321, 5546, 55, 7728, 19342, 5659, 48, + 7600, 19361, 5773, 41, 7472, 19377, 5888, 34, 7345, 19391, 6004, 28, 7219, + 19403, 6121, 22, 7093, 19412, 6239, 15, 6968, 19419, 6359, 9, 6845, 19424, + 6479, 3, 6722, 19426, 6600}; + +constexpr std::array<s16, 512> curve_lut1 = { + -68, 32639, 69, -5, -200, 32630, 212, -15, -328, 32613, 359, -26, -450, + 32586, 512, -36, -568, 32551, 669, -47, -680, 32507, 832, -58, -788, 32454, + 1000, -69, -891, 32393, 1174, -80, -990, 32323, 1352, -92, -1084, 32244, 1536, + -103, -1173, 32157, 1724, -115, -1258, 32061, 1919, -128, -1338, 31956, 2118, -140, + -1414, 31844, 2322, -153, -1486, 31723, 2532, -167, -1554, 31593, 2747, -180, -1617, + 31456, 2967, -194, -1676, 31310, 3192, -209, -1732, 31157, 3422, -224, -1783, 30995, + 3657, -240, -1830, 30826, 3897, -256, -1874, 30649, 4143, -272, -1914, 30464, 4393, + -289, -1951, 30272, 4648, -307, -1984, 30072, 4908, -325, -2014, 29866, 5172, -343, + -2040, 29652, 5442, -362, -2063, 29431, 5716, -382, -2083, 29203, 5994, -403, -2100, + 28968, 6277, -424, -2114, 28727, 6565, -445, -2125, 28480, 6857, -468, -2133, 28226, + 7153, -490, -2139, 27966, 7453, -514, -2142, 27700, 7758, -538, -2142, 27428, 8066, + -563, -2141, 27151, 8378, -588, -2136, 26867, 8694, -614, -2130, 26579, 9013, -641, + -2121, 26285, 9336, -668, -2111, 25987, 9663, -696, -2098, 25683, 9993, -724, -2084, + 25375, 10326, -753, -2067, 25063, 10662, -783, -2049, 24746, 11000, -813, -2030, 24425, + 11342, -844, -2009, 24100, 11686, -875, -1986, 23771, 12033, -907, -1962, 23438, 12382, + -939, -1937, 23103, 12733, -972, -1911, 22764, 13086, -1005, -1883, 22422, 13441, -1039, + -1855, 22077, 13798, -1072, -1825, 21729, 14156, -1107, -1795, 21380, 14516, -1141, -1764, + 21027, 14877, -1176, -1732, 20673, 15239, -1211, -1700, 20317, 15602, -1246, -1667, 19959, + 15965, -1282, -1633, 19600, 16329, -1317, -1599, 19239, 16694, -1353, -1564, 18878, 17058, + -1388, -1530, 18515, 17423, -1424, -1495, 18151, 17787, -1459, -1459, 17787, 18151, -1495, + -1424, 17423, 18515, -1530, -1388, 17058, 18878, -1564, -1353, 16694, 19239, -1599, -1317, + 16329, 19600, -1633, -1282, 15965, 19959, -1667, -1246, 15602, 20317, -1700, -1211, 15239, + 20673, -1732, -1176, 14877, 21027, -1764, -1141, 14516, 21380, -1795, -1107, 14156, 21729, + -1825, -1072, 13798, 22077, -1855, -1039, 13441, 22422, -1883, -1005, 13086, 22764, -1911, + -972, 12733, 23103, -1937, -939, 12382, 23438, -1962, -907, 12033, 23771, -1986, -875, + 11686, 24100, -2009, -844, 11342, 24425, -2030, -813, 11000, 24746, -2049, -783, 10662, + 25063, -2067, -753, 10326, 25375, -2084, -724, 9993, 25683, -2098, -696, 9663, 25987, + -2111, -668, 9336, 26285, -2121, -641, 9013, 26579, -2130, -614, 8694, 26867, -2136, + -588, 8378, 27151, -2141, -563, 8066, 27428, -2142, -538, 7758, 27700, -2142, -514, + 7453, 27966, -2139, -490, 7153, 28226, -2133, -468, 6857, 28480, -2125, -445, 6565, + 28727, -2114, -424, 6277, 28968, -2100, -403, 5994, 29203, -2083, -382, 5716, 29431, + -2063, -362, 5442, 29652, -2040, -343, 5172, 29866, -2014, -325, 4908, 30072, -1984, + -307, 4648, 30272, -1951, -289, 4393, 30464, -1914, -272, 4143, 30649, -1874, -256, + 3897, 30826, -1830, -240, 3657, 30995, -1783, -224, 3422, 31157, -1732, -209, 3192, + 31310, -1676, -194, 2967, 31456, -1617, -180, 2747, 31593, -1554, -167, 2532, 31723, + -1486, -153, 2322, 31844, -1414, -140, 2118, 31956, -1338, -128, 1919, 32061, -1258, + -115, 1724, 32157, -1173, -103, 1536, 32244, -1084, -92, 1352, 32323, -990, -80, + 1174, 32393, -891, -69, 1000, 32454, -788, -58, 832, 32507, -680, -47, 669, + 32551, -568, -36, 512, 32586, -450, -26, 359, 32613, -328, -15, 212, 32630, + -200, -5, 69, 32639, -68}; + +constexpr std::array<s16, 512> curve_lut2 = { + 3195, 26287, 3329, -32, 3064, 26281, 3467, -34, 2936, 26270, 3608, -38, 2811, + 26253, 3751, -42, 2688, 26230, 3897, -46, 2568, 26202, 4046, -50, 2451, 26169, + 4199, -54, 2338, 26130, 4354, -58, 2227, 26085, 4512, -63, 2120, 26035, 4673, + -67, 2015, 25980, 4837, -72, 1912, 25919, 5004, -76, 1813, 25852, 5174, -81, + 1716, 25780, 5347, -87, 1622, 25704, 5522, -92, 1531, 25621, 5701, -98, 1442, + 25533, 5882, -103, 1357, 25440, 6066, -109, 1274, 25342, 6253, -115, 1193, 25239, + 6442, -121, 1115, 25131, 6635, -127, 1040, 25018, 6830, -133, 967, 24899, 7027, + -140, 897, 24776, 7227, -146, 829, 24648, 7430, -153, 764, 24516, 7635, -159, + 701, 24379, 7842, -166, 641, 24237, 8052, -174, 583, 24091, 8264, -181, 526, + 23940, 8478, -187, 472, 23785, 8695, -194, 420, 23626, 8914, -202, 371, 23462, + 9135, -209, 324, 23295, 9358, -215, 279, 23123, 9583, -222, 236, 22948, 9809, + -230, 194, 22769, 10038, -237, 154, 22586, 10269, -243, 117, 22399, 10501, -250, + 81, 22208, 10735, -258, 47, 22015, 10970, -265, 15, 21818, 11206, -271, -16, + 21618, 11444, -277, -44, 21415, 11684, -283, -71, 21208, 11924, -290, -97, 20999, + 12166, -296, -121, 20786, 12409, -302, -143, 20571, 12653, -306, -163, 20354, 12898, + -311, -183, 20134, 13143, -316, -201, 19911, 13389, -321, -218, 19686, 13635, -325, + -234, 19459, 13882, -328, -248, 19230, 14130, -332, -261, 18998, 14377, -335, -273, + 18765, 14625, -337, -284, 18531, 14873, -339, -294, 18295, 15121, -341, -302, 18057, + 15369, -341, -310, 17817, 15617, -341, -317, 17577, 15864, -340, -323, 17335, 16111, + -340, -328, 17092, 16357, -338, -332, 16848, 16603, -336, -336, 16603, 16848, -332, + -338, 16357, 17092, -328, -340, 16111, 17335, -323, -340, 15864, 17577, -317, -341, + 15617, 17817, -310, -341, 15369, 18057, -302, -341, 15121, 18295, -294, -339, 14873, + 18531, -284, -337, 14625, 18765, -273, -335, 14377, 18998, -261, -332, 14130, 19230, + -248, -328, 13882, 19459, -234, -325, 13635, 19686, -218, -321, 13389, 19911, -201, + -316, 13143, 20134, -183, -311, 12898, 20354, -163, -306, 12653, 20571, -143, -302, + 12409, 20786, -121, -296, 12166, 20999, -97, -290, 11924, 21208, -71, -283, 11684, + 21415, -44, -277, 11444, 21618, -16, -271, 11206, 21818, 15, -265, 10970, 22015, + 47, -258, 10735, 22208, 81, -250, 10501, 22399, 117, -243, 10269, 22586, 154, + -237, 10038, 22769, 194, -230, 9809, 22948, 236, -222, 9583, 23123, 279, -215, + 9358, 23295, 324, -209, 9135, 23462, 371, -202, 8914, 23626, 420, -194, 8695, + 23785, 472, -187, 8478, 23940, 526, -181, 8264, 24091, 583, -174, 8052, 24237, + 641, -166, 7842, 24379, 701, -159, 7635, 24516, 764, -153, 7430, 24648, 829, + -146, 7227, 24776, 897, -140, 7027, 24899, 967, -133, 6830, 25018, 1040, -127, + 6635, 25131, 1115, -121, 6442, 25239, 1193, -115, 6253, 25342, 1274, -109, 6066, + 25440, 1357, -103, 5882, 25533, 1442, -98, 5701, 25621, 1531, -92, 5522, 25704, + 1622, -87, 5347, 25780, 1716, -81, 5174, 25852, 1813, -76, 5004, 25919, 1912, + -72, 4837, 25980, 2015, -67, 4673, 26035, 2120, -63, 4512, 26085, 2227, -58, + 4354, 26130, 2338, -54, 4199, 26169, 2451, -50, 4046, 26202, 2568, -46, 3897, + 26230, 2688, -42, 3751, 26253, 2811, -38, 3608, 26270, 2936, -34, 3467, 26281, + 3064, -32, 3329, 26287, 3195}; std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input, double ratio) { if (input.size() < 2) @@ -30,40 +149,39 @@ std::vector<s16> Interpolate(InterpolationState& state, std::vector<s16> input, ratio = 1.0; } - if (ratio != state.current_ratio) { - const double cutoff_frequency = std::min(0.5 / ratio, 0.5 * ratio); - state.nyquist = CascadingFilter::LowPass(std::clamp(cutoff_frequency, 0.0, 0.4), 3); - state.current_ratio = ratio; - } - state.nyquist.Process(input); - - constexpr std::size_t taps = InterpolationState::lanczos_taps; - const std::size_t num_frames = input.size() / 2; - - std::vector<s16> output; - output.reserve(static_cast<std::size_t>(input.size() / ratio + 4)); - - double& pos = state.position; - auto& h = state.history; - for (std::size_t i = 0; i < num_frames; ++i) { - std::rotate(h.begin(), h.end() - 1, h.end()); - h[0][0] = input[i * 2 + 0]; - h[0][1] = input[i * 2 + 1]; - - while (pos <= 1.0) { - double l = 0.0; - double r = 0.0; - for (std::size_t j = 0; j < h.size(); j++) { - const double lanczos_calc = Lanczos(taps, pos + j - taps + 1); - l += lanczos_calc * h[j][0]; - r += lanczos_calc * h[j][1]; - } - output.emplace_back(static_cast<s16>(std::clamp(l, -32768.0, 32767.0))); - output.emplace_back(static_cast<s16>(std::clamp(r, -32768.0, 32767.0))); - - pos += ratio; + const int step = static_cast<int>(ratio * 0x8000); + const std::array<s16, 512>& lut = [step] { + if (step > 0xaaaa) { + return curve_lut0; + } + if (step <= 0x8000) { + return curve_lut1; } - pos -= 1.0; + return curve_lut2; + }(); + + std::vector<s16> output(static_cast<std::size_t>(input.size() / ratio)); + int in_offset = 0; + for (std::size_t out_offset = 0; out_offset < output.size(); out_offset += 2) { + const int lut_index = (state.fraction >> 8) * 4; + + const int l = input[(in_offset + 0) * 2 + 0] * lut[lut_index + 0] + + input[(in_offset + 1) * 2 + 0] * lut[lut_index + 1] + + input[(in_offset + 2) * 2 + 0] * lut[lut_index + 2] + + input[(in_offset + 3) * 2 + 0] * lut[lut_index + 3]; + + const int r = input[(in_offset + 0) * 2 + 1] * lut[lut_index + 0] + + input[(in_offset + 1) * 2 + 1] * lut[lut_index + 1] + + input[(in_offset + 2) * 2 + 1] * lut[lut_index + 2] + + input[(in_offset + 3) * 2 + 1] * lut[lut_index + 3]; + + const int new_offset = state.fraction + step; + + in_offset += new_offset >> 15; + state.fraction = new_offset & 0x7fff; + + output[out_offset + 0] = static_cast<s16>(std::clamp(l >> 15, SHRT_MIN, SHRT_MAX)); + output[out_offset + 1] = static_cast<s16>(std::clamp(r >> 15, SHRT_MIN, SHRT_MAX)); } return output; diff --git a/src/audio_core/algorithm/interpolate.h b/src/audio_core/algorithm/interpolate.h index edbd6460f..1b9831a75 100644 --- a/src/audio_core/algorithm/interpolate.h +++ b/src/audio_core/algorithm/interpolate.h @@ -6,19 +6,12 @@ #include <array> #include <vector> -#include "audio_core/algorithm/filter.h" #include "common/common_types.h" namespace AudioCore { struct InterpolationState { - static constexpr std::size_t lanczos_taps = 4; - static constexpr std::size_t history_size = lanczos_taps * 2 - 1; - - double current_ratio = 0.0; - CascadingFilter nyquist; - std::array<std::array<s16, 2>, history_size> history = {}; - double position = 0; + int fraction = 0; }; /// Interpolates input signal to produce output signal. diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 26612e692..88c06b2ce 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -187,6 +187,8 @@ add_library(core STATIC hle/kernel/synchronization.h hle/kernel/thread.cpp hle/kernel/thread.h + hle/kernel/time_manager.cpp + hle/kernel/time_manager.h hle/kernel/transfer_memory.cpp hle/kernel/transfer_memory.h hle/kernel/vm_manager.cpp diff --git a/src/core/core.cpp b/src/core/core.cpp index 0eb0c0dca..86e314c94 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -707,4 +707,12 @@ const Service::SM::ServiceManager& System::ServiceManager() const { return *impl->service_manager; } +void System::RegisterCoreThread(std::size_t id) { + impl->kernel.RegisterCoreThread(id); +} + +void System::RegisterHostThread() { + impl->kernel.RegisterHostThread(); +} + } // namespace Core diff --git a/src/core/core.h b/src/core/core.h index e69d68fcf..8d862a8e6 100644 --- a/src/core/core.h +++ b/src/core/core.h @@ -360,6 +360,12 @@ public: const CurrentBuildProcessID& GetCurrentProcessBuildID() const; + /// Register a host thread as an emulated CPU Core. + void RegisterCoreThread(std::size_t id); + + /// Register a host thread as an auxiliary thread. + void RegisterHostThread(); + private: System(); diff --git a/src/core/hardware_properties.h b/src/core/hardware_properties.h index 213461b6a..b04e046ed 100644 --- a/src/core/hardware_properties.h +++ b/src/core/hardware_properties.h @@ -20,6 +20,8 @@ constexpr u32 NUM_CPU_CORES = 4; // Number of CPU Cores } // namespace Hardware +constexpr u32 INVALID_HOST_THREAD_ID = 0xFFFFFFFF; + struct EmuThreadHandle { u32 host_handle; u32 guest_handle; diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp index 4eb1d8703..9232f4d7e 100644 --- a/src/core/hle/kernel/kernel.cpp +++ b/src/core/hle/kernel/kernel.cpp @@ -3,9 +3,12 @@ // Refer to the license.txt file included. #include <atomic> +#include <bitset> #include <functional> #include <memory> #include <mutex> +#include <thread> +#include <unordered_map> #include <utility> #include "common/assert.h" @@ -15,6 +18,7 @@ #include "core/core.h" #include "core/core_timing.h" #include "core/core_timing_util.h" +#include "core/hardware_properties.h" #include "core/hle/kernel/client_port.h" #include "core/hle/kernel/errors.h" #include "core/hle/kernel/handle_table.h" @@ -25,6 +29,7 @@ #include "core/hle/kernel/scheduler.h" #include "core/hle/kernel/synchronization.h" #include "core/hle/kernel/thread.h" +#include "core/hle/kernel/time_manager.h" #include "core/hle/lock.h" #include "core/hle/result.h" #include "core/memory.h" @@ -44,7 +49,7 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_ std::lock_guard lock{HLE::g_hle_lock}; std::shared_ptr<Thread> thread = - system.Kernel().RetrieveThreadFromWakeupCallbackHandleTable(proper_handle); + system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle); if (thread == nullptr) { LOG_CRITICAL(Kernel, "Callback fired for invalid thread {:08X}", proper_handle); return; @@ -97,8 +102,8 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_ } struct KernelCore::Impl { - explicit Impl(Core::System& system) - : system{system}, global_scheduler{system}, synchronization{system} {} + explicit Impl(Core::System& system, KernelCore& kernel) + : system{system}, global_scheduler{kernel}, synchronization{system}, time_manager{system} {} void Initialize(KernelCore& kernel) { Shutdown(); @@ -120,7 +125,7 @@ struct KernelCore::Impl { system_resource_limit = nullptr; - thread_wakeup_callback_handle_table.Clear(); + global_handle_table.Clear(); thread_wakeup_event_type = nullptr; preemption_event = nullptr; @@ -138,8 +143,8 @@ struct KernelCore::Impl { void InitializePhysicalCores() { exclusive_monitor = - Core::MakeExclusiveMonitor(system.Memory(), global_scheduler.CpuCoresCount()); - for (std::size_t i = 0; i < global_scheduler.CpuCoresCount(); i++) { + Core::MakeExclusiveMonitor(system.Memory(), Core::Hardware::NUM_CPU_CORES); + for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) { cores.emplace_back(system, i, *exclusive_monitor); } } @@ -184,6 +189,50 @@ struct KernelCore::Impl { system.Memory().SetCurrentPageTable(*process); } + void RegisterCoreThread(std::size_t core_id) { + std::unique_lock lock{register_thread_mutex}; + const std::thread::id this_id = std::this_thread::get_id(); + const auto it = host_thread_ids.find(this_id); + ASSERT(core_id < Core::Hardware::NUM_CPU_CORES); + ASSERT(it == host_thread_ids.end()); + ASSERT(!registered_core_threads[core_id]); + host_thread_ids[this_id] = static_cast<u32>(core_id); + registered_core_threads.set(core_id); + } + + void RegisterHostThread() { + std::unique_lock lock{register_thread_mutex}; + const std::thread::id this_id = std::this_thread::get_id(); + const auto it = host_thread_ids.find(this_id); + ASSERT(it == host_thread_ids.end()); + host_thread_ids[this_id] = registered_thread_ids++; + } + + u32 GetCurrentHostThreadID() const { + const std::thread::id this_id = std::this_thread::get_id(); + const auto it = host_thread_ids.find(this_id); + if (it == host_thread_ids.end()) { + return Core::INVALID_HOST_THREAD_ID; + } + return it->second; + } + + Core::EmuThreadHandle GetCurrentEmuThreadID() const { + Core::EmuThreadHandle result = Core::EmuThreadHandle::InvalidHandle(); + result.host_handle = GetCurrentHostThreadID(); + if (result.host_handle >= Core::Hardware::NUM_CPU_CORES) { + return result; + } + const Kernel::Scheduler& sched = cores[result.host_handle].Scheduler(); + const Kernel::Thread* current = sched.GetCurrentThread(); + if (current != nullptr) { + result.guest_handle = current->GetGlobalHandle(); + } else { + result.guest_handle = InvalidHandle; + } + return result; + } + std::atomic<u32> next_object_id{0}; std::atomic<u64> next_kernel_process_id{Process::InitialKIPIDMin}; std::atomic<u64> next_user_process_id{Process::ProcessIDMin}; @@ -194,15 +243,16 @@ struct KernelCore::Impl { Process* current_process = nullptr; Kernel::GlobalScheduler global_scheduler; Kernel::Synchronization synchronization; + Kernel::TimeManager time_manager; std::shared_ptr<ResourceLimit> system_resource_limit; std::shared_ptr<Core::Timing::EventType> thread_wakeup_event_type; std::shared_ptr<Core::Timing::EventType> preemption_event; - // TODO(yuriks): This can be removed if Thread objects are explicitly pooled in the future, - // allowing us to simply use a pool index or similar. - Kernel::HandleTable thread_wakeup_callback_handle_table; + // This is the kernel's handle table or supervisor handle table which + // stores all the objects in place. + Kernel::HandleTable global_handle_table; /// Map of named ports managed by the kernel, which can be retrieved using /// the ConnectToPort SVC. @@ -211,11 +261,17 @@ struct KernelCore::Impl { std::unique_ptr<Core::ExclusiveMonitor> exclusive_monitor; std::vector<Kernel::PhysicalCore> cores; + // 0-3 IDs represent core threads, >3 represent others + std::unordered_map<std::thread::id, u32> host_thread_ids; + u32 registered_thread_ids{Core::Hardware::NUM_CPU_CORES}; + std::bitset<Core::Hardware::NUM_CPU_CORES> registered_core_threads; + std::mutex register_thread_mutex; + // System context Core::System& system; }; -KernelCore::KernelCore(Core::System& system) : impl{std::make_unique<Impl>(system)} {} +KernelCore::KernelCore(Core::System& system) : impl{std::make_unique<Impl>(system, *this)} {} KernelCore::~KernelCore() { Shutdown(); } @@ -232,9 +288,8 @@ std::shared_ptr<ResourceLimit> KernelCore::GetSystemResourceLimit() const { return impl->system_resource_limit; } -std::shared_ptr<Thread> KernelCore::RetrieveThreadFromWakeupCallbackHandleTable( - Handle handle) const { - return impl->thread_wakeup_callback_handle_table.Get<Thread>(handle); +std::shared_ptr<Thread> KernelCore::RetrieveThreadFromGlobalHandleTable(Handle handle) const { + return impl->global_handle_table.Get<Thread>(handle); } void KernelCore::AppendNewProcess(std::shared_ptr<Process> process) { @@ -265,6 +320,14 @@ const Kernel::GlobalScheduler& KernelCore::GlobalScheduler() const { return impl->global_scheduler; } +Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) { + return impl->cores[id].Scheduler(); +} + +const Kernel::Scheduler& KernelCore::Scheduler(std::size_t id) const { + return impl->cores[id].Scheduler(); +} + Kernel::PhysicalCore& KernelCore::PhysicalCore(std::size_t id) { return impl->cores[id]; } @@ -281,6 +344,14 @@ const Kernel::Synchronization& KernelCore::Synchronization() const { return impl->synchronization; } +Kernel::TimeManager& KernelCore::TimeManager() { + return impl->time_manager; +} + +const Kernel::TimeManager& KernelCore::TimeManager() const { + return impl->time_manager; +} + Core::ExclusiveMonitor& KernelCore::GetExclusiveMonitor() { return *impl->exclusive_monitor; } @@ -338,12 +409,28 @@ const std::shared_ptr<Core::Timing::EventType>& KernelCore::ThreadWakeupCallback return impl->thread_wakeup_event_type; } -Kernel::HandleTable& KernelCore::ThreadWakeupCallbackHandleTable() { - return impl->thread_wakeup_callback_handle_table; +Kernel::HandleTable& KernelCore::GlobalHandleTable() { + return impl->global_handle_table; +} + +const Kernel::HandleTable& KernelCore::GlobalHandleTable() const { + return impl->global_handle_table; +} + +void KernelCore::RegisterCoreThread(std::size_t core_id) { + impl->RegisterCoreThread(core_id); +} + +void KernelCore::RegisterHostThread() { + impl->RegisterHostThread(); +} + +u32 KernelCore::GetCurrentHostThreadID() const { + return impl->GetCurrentHostThreadID(); } -const Kernel::HandleTable& KernelCore::ThreadWakeupCallbackHandleTable() const { - return impl->thread_wakeup_callback_handle_table; +Core::EmuThreadHandle KernelCore::GetCurrentEmuThreadID() const { + return impl->GetCurrentEmuThreadID(); } } // namespace Kernel diff --git a/src/core/hle/kernel/kernel.h b/src/core/hle/kernel/kernel.h index 1eede3063..c4f78ab71 100644 --- a/src/core/hle/kernel/kernel.h +++ b/src/core/hle/kernel/kernel.h @@ -11,6 +11,7 @@ #include "core/hle/kernel/object.h" namespace Core { +struct EmuThreadHandle; class ExclusiveMonitor; class System; } // namespace Core @@ -29,8 +30,10 @@ class HandleTable; class PhysicalCore; class Process; class ResourceLimit; +class Scheduler; class Synchronization; class Thread; +class TimeManager; /// Represents a single instance of the kernel. class KernelCore { @@ -64,7 +67,7 @@ public: std::shared_ptr<ResourceLimit> GetSystemResourceLimit() const; /// Retrieves a shared pointer to a Thread instance within the thread wakeup handle table. - std::shared_ptr<Thread> RetrieveThreadFromWakeupCallbackHandleTable(Handle handle) const; + std::shared_ptr<Thread> RetrieveThreadFromGlobalHandleTable(Handle handle) const; /// Adds the given shared pointer to an internal list of active processes. void AppendNewProcess(std::shared_ptr<Process> process); @@ -87,6 +90,12 @@ public: /// Gets the sole instance of the global scheduler const Kernel::GlobalScheduler& GlobalScheduler() const; + /// Gets the sole instance of the Scheduler assoviated with cpu core 'id' + Kernel::Scheduler& Scheduler(std::size_t id); + + /// Gets the sole instance of the Scheduler assoviated with cpu core 'id' + const Kernel::Scheduler& Scheduler(std::size_t id) const; + /// Gets the an instance of the respective physical CPU core. Kernel::PhysicalCore& PhysicalCore(std::size_t id); @@ -99,6 +108,12 @@ public: /// Gets the an instance of the Synchronization Interface. const Kernel::Synchronization& Synchronization() const; + /// Gets the an instance of the TimeManager Interface. + Kernel::TimeManager& TimeManager(); + + /// Gets the an instance of the TimeManager Interface. + const Kernel::TimeManager& TimeManager() const; + /// Stops execution of 'id' core, in order to reschedule a new thread. void PrepareReschedule(std::size_t id); @@ -120,6 +135,18 @@ public: /// Determines whether or not the given port is a valid named port. bool IsValidNamedPort(NamedPortTable::const_iterator port) const; + /// Gets the current host_thread/guest_thread handle. + Core::EmuThreadHandle GetCurrentEmuThreadID() const; + + /// Gets the current host_thread handle. + u32 GetCurrentHostThreadID() const; + + /// Register the current thread as a CPU Core Thread. + void RegisterCoreThread(std::size_t core_id); + + /// Register the current thread as a non CPU core thread. + void RegisterHostThread(); + private: friend class Object; friend class Process; @@ -140,11 +167,11 @@ private: /// Retrieves the event type used for thread wakeup callbacks. const std::shared_ptr<Core::Timing::EventType>& ThreadWakeupCallbackEventType() const; - /// Provides a reference to the thread wakeup callback handle table. - Kernel::HandleTable& ThreadWakeupCallbackHandleTable(); + /// Provides a reference to the global handle table. + Kernel::HandleTable& GlobalHandleTable(); - /// Provides a const reference to the thread wakeup callback handle table. - const Kernel::HandleTable& ThreadWakeupCallbackHandleTable() const; + /// Provides a const reference to the global handle table. + const Kernel::HandleTable& GlobalHandleTable() const; struct Impl; std::unique_ptr<Impl> impl; diff --git a/src/core/hle/kernel/scheduler.cpp b/src/core/hle/kernel/scheduler.cpp index 86f1421bf..c65f82fb7 100644 --- a/src/core/hle/kernel/scheduler.cpp +++ b/src/core/hle/kernel/scheduler.cpp @@ -18,10 +18,11 @@ #include "core/hle/kernel/kernel.h" #include "core/hle/kernel/process.h" #include "core/hle/kernel/scheduler.h" +#include "core/hle/kernel/time_manager.h" namespace Kernel { -GlobalScheduler::GlobalScheduler(Core::System& system) : system{system} {} +GlobalScheduler::GlobalScheduler(KernelCore& kernel) : kernel{kernel} {} GlobalScheduler::~GlobalScheduler() = default; @@ -35,7 +36,7 @@ void GlobalScheduler::RemoveThread(std::shared_ptr<Thread> thread) { } void GlobalScheduler::UnloadThread(std::size_t core) { - Scheduler& sched = system.Scheduler(core); + Scheduler& sched = kernel.Scheduler(core); sched.UnloadThread(); } @@ -50,7 +51,7 @@ void GlobalScheduler::SelectThread(std::size_t core) { sched.is_context_switch_pending = sched.selected_thread != sched.current_thread; std::atomic_thread_fence(std::memory_order_seq_cst); }; - Scheduler& sched = system.Scheduler(core); + Scheduler& sched = kernel.Scheduler(core); Thread* current_thread = nullptr; // Step 1: Get top thread in schedule queue. current_thread = scheduled_queue[core].empty() ? nullptr : scheduled_queue[core].front(); @@ -356,6 +357,32 @@ void GlobalScheduler::Shutdown() { thread_list.clear(); } +void GlobalScheduler::Lock() { + Core::EmuThreadHandle current_thread = kernel.GetCurrentEmuThreadID(); + if (current_thread == current_owner) { + ++scope_lock; + } else { + inner_lock.lock(); + current_owner = current_thread; + ASSERT(current_owner != Core::EmuThreadHandle::InvalidHandle()); + scope_lock = 1; + } +} + +void GlobalScheduler::Unlock() { + if (--scope_lock != 0) { + ASSERT(scope_lock > 0); + return; + } + for (std::size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) { + SelectThread(i); + } + current_owner = Core::EmuThreadHandle::InvalidHandle(); + scope_lock = 1; + inner_lock.unlock(); + // TODO(Blinkhawk): Setup the interrupts and change context on current core. +} + Scheduler::Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, std::size_t core_id) : system(system), cpu_core(cpu_core), core_id(core_id) {} @@ -485,4 +512,27 @@ void Scheduler::Shutdown() { selected_thread = nullptr; } +SchedulerLock::SchedulerLock(KernelCore& kernel) : kernel{kernel} { + kernel.GlobalScheduler().Lock(); +} + +SchedulerLock::~SchedulerLock() { + kernel.GlobalScheduler().Unlock(); +} + +SchedulerLockAndSleep::SchedulerLockAndSleep(KernelCore& kernel, Handle& event_handle, + Thread* time_task, s64 nanoseconds) + : SchedulerLock{kernel}, event_handle{event_handle}, time_task{time_task}, nanoseconds{ + nanoseconds} { + event_handle = InvalidHandle; +} + +SchedulerLockAndSleep::~SchedulerLockAndSleep() { + if (sleep_cancelled) { + return; + } + auto& time_manager = kernel.TimeManager(); + time_manager.ScheduleTimeEvent(event_handle, time_task, nanoseconds); +} + } // namespace Kernel diff --git a/src/core/hle/kernel/scheduler.h b/src/core/hle/kernel/scheduler.h index 96db049cb..1c93a838c 100644 --- a/src/core/hle/kernel/scheduler.h +++ b/src/core/hle/kernel/scheduler.h @@ -6,6 +6,7 @@ #include <atomic> #include <memory> +#include <mutex> #include <vector> #include "common/common_types.h" @@ -20,11 +21,13 @@ class System; namespace Kernel { +class KernelCore; class Process; +class SchedulerLock; class GlobalScheduler final { public: - explicit GlobalScheduler(Core::System& system); + explicit GlobalScheduler(KernelCore& kernel); ~GlobalScheduler(); /// Adds a new thread to the scheduler @@ -138,6 +141,14 @@ public: void Shutdown(); private: + friend class SchedulerLock; + + /// Lock the scheduler to the current thread. + void Lock(); + + /// Unlocks the scheduler, reselects threads, interrupts cores for rescheduling + /// and reschedules current core if needed. + void Unlock(); /** * Transfers a thread into an specific core. If the destination_core is -1 * it will be unscheduled from its source code and added into its suggested @@ -158,9 +169,14 @@ private: // ordered from Core 0 to Core 3. std::array<u32, Core::Hardware::NUM_CPU_CORES> preemption_priorities = {59, 59, 59, 62}; + /// Scheduler lock mechanisms. + std::mutex inner_lock{}; // TODO(Blinkhawk): Replace for a SpinLock + std::atomic<s64> scope_lock{}; + Core::EmuThreadHandle current_owner{Core::EmuThreadHandle::InvalidHandle()}; + /// Lists all thread ids that aren't deleted/etc. std::vector<std::shared_ptr<Thread>> thread_list; - Core::System& system; + KernelCore& kernel; }; class Scheduler final { @@ -227,4 +243,30 @@ private: bool is_context_switch_pending = false; }; +class SchedulerLock { +public: + explicit SchedulerLock(KernelCore& kernel); + ~SchedulerLock(); + +protected: + KernelCore& kernel; +}; + +class SchedulerLockAndSleep : public SchedulerLock { +public: + explicit SchedulerLockAndSleep(KernelCore& kernel, Handle& event_handle, Thread* time_task, + s64 nanoseconds); + ~SchedulerLockAndSleep(); + + void CancelSleep() { + sleep_cancelled = true; + } + +private: + Handle& event_handle; + Thread* time_task; + s64 nanoseconds; + bool sleep_cancelled{}; +}; + } // namespace Kernel diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp index ae5f2c8bd..bf850e0b2 100644 --- a/src/core/hle/kernel/thread.cpp +++ b/src/core/hle/kernel/thread.cpp @@ -46,9 +46,9 @@ Thread::~Thread() = default; void Thread::Stop() { // Cancel any outstanding wakeup events for this thread Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(), - callback_handle); - kernel.ThreadWakeupCallbackHandleTable().Close(callback_handle); - callback_handle = 0; + global_handle); + kernel.GlobalHandleTable().Close(global_handle); + global_handle = 0; SetStatus(ThreadStatus::Dead); Signal(); @@ -73,12 +73,12 @@ void Thread::WakeAfterDelay(s64 nanoseconds) { // thread-safe version of ScheduleEvent. const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds}); Core::System::GetInstance().CoreTiming().ScheduleEvent( - cycles, kernel.ThreadWakeupCallbackEventType(), callback_handle); + cycles, kernel.ThreadWakeupCallbackEventType(), global_handle); } void Thread::CancelWakeupTimer() { Core::System::GetInstance().CoreTiming().UnscheduleEvent(kernel.ThreadWakeupCallbackEventType(), - callback_handle); + global_handle); } void Thread::ResumeFromWait() { @@ -190,7 +190,7 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(KernelCore& kernel, std::strin thread->condvar_wait_address = 0; thread->wait_handle = 0; thread->name = std::move(name); - thread->callback_handle = kernel.ThreadWakeupCallbackHandleTable().Create(thread).Unwrap(); + thread->global_handle = kernel.GlobalHandleTable().Create(thread).Unwrap(); thread->owner_process = &owner_process; auto& scheduler = kernel.GlobalScheduler(); scheduler.AddThread(thread); diff --git a/src/core/hle/kernel/thread.h b/src/core/hle/kernel/thread.h index 7a4916318..129e7858a 100644 --- a/src/core/hle/kernel/thread.h +++ b/src/core/hle/kernel/thread.h @@ -453,6 +453,10 @@ public: is_sync_cancelled = value; } + Handle GetGlobalHandle() const { + return global_handle; + } + private: void SetSchedulingStatus(ThreadSchedStatus new_status); void SetCurrentPriority(u32 new_priority); @@ -514,7 +518,7 @@ private: VAddr arb_wait_address{0}; /// Handle used as userdata to reference this object when inserting into the CoreTiming queue. - Handle callback_handle = 0; + Handle global_handle = 0; /// Callback that will be invoked when the thread is resumed from a waiting state. If the thread /// was waiting via WaitSynchronization then the object will be the last object that became diff --git a/src/core/hle/kernel/time_manager.cpp b/src/core/hle/kernel/time_manager.cpp new file mode 100644 index 000000000..21b290468 --- /dev/null +++ b/src/core/hle/kernel/time_manager.cpp @@ -0,0 +1,44 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "core/core.h" +#include "core/core_timing.h" +#include "core/core_timing_util.h" +#include "core/hle/kernel/handle_table.h" +#include "core/hle/kernel/kernel.h" +#include "core/hle/kernel/thread.h" +#include "core/hle/kernel/time_manager.h" + +namespace Kernel { + +TimeManager::TimeManager(Core::System& system) : system{system} { + time_manager_event_type = Core::Timing::CreateEvent( + "Kernel::TimeManagerCallback", [this](u64 thread_handle, [[maybe_unused]] s64 cycles_late) { + Handle proper_handle = static_cast<Handle>(thread_handle); + std::shared_ptr<Thread> thread = + this->system.Kernel().RetrieveThreadFromGlobalHandleTable(proper_handle); + thread->ResumeFromWait(); + }); +} + +void TimeManager::ScheduleTimeEvent(Handle& event_handle, Thread* timetask, s64 nanoseconds) { + if (nanoseconds > 0) { + ASSERT(timetask); + event_handle = timetask->GetGlobalHandle(); + const s64 cycles = Core::Timing::nsToCycles(std::chrono::nanoseconds{nanoseconds}); + system.CoreTiming().ScheduleEvent(cycles, time_manager_event_type, event_handle); + } else { + event_handle = InvalidHandle; + } +} + +void TimeManager::UnscheduleTimeEvent(Handle event_handle) { + if (event_handle == InvalidHandle) { + return; + } + system.CoreTiming().UnscheduleEvent(time_manager_event_type, event_handle); +} + +} // namespace Kernel diff --git a/src/core/hle/kernel/time_manager.h b/src/core/hle/kernel/time_manager.h new file mode 100644 index 000000000..eaec486d1 --- /dev/null +++ b/src/core/hle/kernel/time_manager.h @@ -0,0 +1,43 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> + +#include "core/hle/kernel/object.h" + +namespace Core { +class System; +} // namespace Core + +namespace Core::Timing { +struct EventType; +} // namespace Core::Timing + +namespace Kernel { + +class Thread; + +/** + * The `TimeManager` takes care of scheduling time events on threads and executes their TimeUp + * method when the event is triggered. + */ +class TimeManager { +public: + explicit TimeManager(Core::System& system); + + /// Schedule a time event on `timetask` thread that will expire in 'nanoseconds' + /// returns a non-invalid handle in `event_handle` if correctly scheduled + void ScheduleTimeEvent(Handle& event_handle, Thread* timetask, s64 nanoseconds); + + /// Unschedule an existing time event + void UnscheduleTimeEvent(Handle event_handle); + +private: + Core::System& system; + std::shared_ptr<Core::Timing::EventType> time_manager_event_type; +}; + +} // namespace Kernel diff --git a/src/core/hle/service/hid/controllers/npad.cpp b/src/core/hle/service/hid/controllers/npad.cpp index 15c09f04c..c1e32b28c 100644 --- a/src/core/hle/service/hid/controllers/npad.cpp +++ b/src/core/hle/service/hid/controllers/npad.cpp @@ -287,13 +287,13 @@ void Controller_NPad::RequestPadStateUpdate(u32 npad_id) { analog_state[static_cast<std::size_t>(JoystickId::Joystick_Left)]->GetAnalogDirectionStatus( Input::AnalogDirection::DOWN)); - pad_state.r_stick_up.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] - ->GetAnalogDirectionStatus(Input::AnalogDirection::RIGHT)); - pad_state.r_stick_left.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] - ->GetAnalogDirectionStatus(Input::AnalogDirection::LEFT)); pad_state.r_stick_right.Assign( analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] - ->GetAnalogDirectionStatus(Input::AnalogDirection::UP)); + ->GetAnalogDirectionStatus(Input::AnalogDirection::RIGHT)); + pad_state.r_stick_left.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] + ->GetAnalogDirectionStatus(Input::AnalogDirection::LEFT)); + pad_state.r_stick_up.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] + ->GetAnalogDirectionStatus(Input::AnalogDirection::UP)); pad_state.r_stick_down.Assign(analog_state[static_cast<std::size_t>(JoystickId::Joystick_Right)] ->GetAnalogDirectionStatus(Input::AnalogDirection::DOWN)); diff --git a/src/input_common/analog_from_button.cpp b/src/input_common/analog_from_button.cpp index e1a260762..6cabdaa3c 100755 --- a/src/input_common/analog_from_button.cpp +++ b/src/input_common/analog_from_button.cpp @@ -34,6 +34,20 @@ public: y * coef * (x == 0 ? 1.0f : SQRT_HALF)); } + bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override { + switch (direction) { + case Input::AnalogDirection::RIGHT: + return right->GetStatus(); + case Input::AnalogDirection::LEFT: + return left->GetStatus(); + case Input::AnalogDirection::UP: + return up->GetStatus(); + case Input::AnalogDirection::DOWN: + return down->GetStatus(); + } + return false; + } + private: Button up; Button down; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 26939be3f..6ea7cc6a5 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -542,7 +542,7 @@ public: BitField<12, 1, InvMemoryLayout> type; } memory_layout; union { - BitField<0, 16, u32> array_mode; + BitField<0, 16, u32> layers; BitField<16, 1, u32> volume; }; u32 layer_stride; @@ -800,8 +800,12 @@ public: u32 zeta_width; u32 zeta_height; + union { + BitField<0, 16, u32> zeta_layers; + BitField<16, 1, u32> zeta_volume; + }; - INSERT_UNION_PADDING_WORDS(0x27); + INSERT_UNION_PADDING_WORDS(0x26); u32 depth_test_enable; @@ -1507,6 +1511,7 @@ ASSERT_REG_POSITION(vertex_attrib_format, 0x458); ASSERT_REG_POSITION(rt_control, 0x487); ASSERT_REG_POSITION(zeta_width, 0x48a); ASSERT_REG_POSITION(zeta_height, 0x48b); +ASSERT_REG_POSITION(zeta_layers, 0x48c); ASSERT_REG_POSITION(depth_test_enable, 0x4B3); ASSERT_REG_POSITION(independent_blend_enable, 0x4B9); ASSERT_REG_POSITION(depth_write_enabled, 0x4BA); diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 7d7137109..e8f763ce9 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -140,71 +140,6 @@ void GPU::FlushCommands() { renderer.Rasterizer().FlushCommands(); } -u32 RenderTargetBytesPerPixel(RenderTargetFormat format) { - ASSERT(format != RenderTargetFormat::NONE); - - switch (format) { - case RenderTargetFormat::RGBA32_FLOAT: - case RenderTargetFormat::RGBA32_UINT: - return 16; - case RenderTargetFormat::RGBA16_UINT: - case RenderTargetFormat::RGBA16_UNORM: - case RenderTargetFormat::RGBA16_FLOAT: - case RenderTargetFormat::RGBX16_FLOAT: - case RenderTargetFormat::RG32_FLOAT: - case RenderTargetFormat::RG32_UINT: - return 8; - case RenderTargetFormat::RGBA8_UNORM: - case RenderTargetFormat::RGBA8_SNORM: - case RenderTargetFormat::RGBA8_SRGB: - case RenderTargetFormat::RGBA8_UINT: - case RenderTargetFormat::RGB10_A2_UNORM: - case RenderTargetFormat::BGRA8_UNORM: - case RenderTargetFormat::BGRA8_SRGB: - case RenderTargetFormat::RG16_UNORM: - case RenderTargetFormat::RG16_SNORM: - case RenderTargetFormat::RG16_UINT: - case RenderTargetFormat::RG16_SINT: - case RenderTargetFormat::RG16_FLOAT: - case RenderTargetFormat::R32_FLOAT: - case RenderTargetFormat::R11G11B10_FLOAT: - case RenderTargetFormat::R32_UINT: - return 4; - case RenderTargetFormat::R16_UNORM: - case RenderTargetFormat::R16_SNORM: - case RenderTargetFormat::R16_UINT: - case RenderTargetFormat::R16_SINT: - case RenderTargetFormat::R16_FLOAT: - case RenderTargetFormat::RG8_UNORM: - case RenderTargetFormat::RG8_SNORM: - return 2; - case RenderTargetFormat::R8_UNORM: - case RenderTargetFormat::R8_UINT: - return 1; - default: - UNIMPLEMENTED_MSG("Unimplemented render target format {}", static_cast<u32>(format)); - return 1; - } -} - -u32 DepthFormatBytesPerPixel(DepthFormat format) { - switch (format) { - case DepthFormat::Z32_S8_X24_FLOAT: - return 8; - case DepthFormat::Z32_FLOAT: - case DepthFormat::S8_Z24_UNORM: - case DepthFormat::Z24_X8_UNORM: - case DepthFormat::Z24_S8_UNORM: - case DepthFormat::Z24_C8_UNORM: - return 4; - case DepthFormat::Z16_UNORM: - return 2; - default: - UNIMPLEMENTED_MSG("Unimplemented Depth format {}", static_cast<u32>(format)); - return 1; - } -} - // Note that, traditionally, methods are treated as 4-byte addressable locations, and hence // their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. // So the values you see in docs might be multiplied by 4. diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 07727210c..ba8c9d665 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -57,6 +57,7 @@ enum class RenderTargetFormat : u32 { RG16_UINT = 0xDD, RG16_FLOAT = 0xDE, R11G11B10_FLOAT = 0xE0, + R32_SINT = 0xE3, R32_UINT = 0xE4, R32_FLOAT = 0xE5, B5G6R5_UNORM = 0xE8, @@ -82,12 +83,6 @@ enum class DepthFormat : u32 { Z32_S8_X24_FLOAT = 0x19, }; -/// Returns the number of bytes per pixel of each rendertarget format. -u32 RenderTargetBytesPerPixel(RenderTargetFormat format); - -/// Returns the number of bytes per pixel of each depth format. -u32 DepthFormatBytesPerPixel(DepthFormat format); - struct CommandListHeader; class DebugContext; diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index 2f2fe6859..f2c83266e 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -85,6 +85,7 @@ static constexpr ConversionArray morton_to_linear_fns = { MortonCopy<true, PixelFormat::RG32UI>, MortonCopy<true, PixelFormat::RGBX16F>, MortonCopy<true, PixelFormat::R32UI>, + MortonCopy<true, PixelFormat::R32I>, MortonCopy<true, PixelFormat::ASTC_2D_8X8>, MortonCopy<true, PixelFormat::ASTC_2D_8X5>, MortonCopy<true, PixelFormat::ASTC_2D_5X4>, @@ -166,6 +167,7 @@ static constexpr ConversionArray linear_to_morton_fns = { MortonCopy<false, PixelFormat::RG32UI>, MortonCopy<false, PixelFormat::RGBX16F>, MortonCopy<false, PixelFormat::R32UI>, + MortonCopy<false, PixelFormat::R32I>, nullptr, nullptr, nullptr, diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index d4b81cd87..cf934b0d8 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -87,6 +87,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_RG32UI, GL_RG_INTEGER, GL_UNSIGNED_INT, false}, // RG32UI {GL_RGB16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBX16F {GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, false}, // R32UI + {GL_R32I, GL_RED_INTEGER, GL_INT, false}, // R32I {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X8 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_8X5 {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, false}, // ASTC_2D_5X4 @@ -260,6 +261,13 @@ CachedSurface::~CachedSurface() = default; void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { MICROPROFILE_SCOPE(OpenGL_Texture_Download); + if (params.IsBuffer()) { + glGetNamedBufferSubData(texture_buffer.handle, 0, + static_cast<GLsizeiptr>(params.GetHostSizeInBytes()), + staging_buffer.data()); + return; + } + SCOPE_EXIT({ glPixelStorei(GL_PACK_ROW_LENGTH, 0); }); for (u32 level = 0; level < params.emulated_levels; ++level) { @@ -398,24 +406,36 @@ CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& p CachedSurfaceView::~CachedSurfaceView() = default; void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const { - ASSERT(params.num_layers == 1 && params.num_levels == 1); + ASSERT(params.num_levels == 1); - const auto& owner_params = surface.GetSurfaceParams(); + const GLuint texture = surface.GetTexture(); + if (params.num_layers > 1) { + // Layered framebuffer attachments + UNIMPLEMENTED_IF(params.base_layer != 0); + + switch (params.target) { + case SurfaceTarget::Texture2DArray: + glFramebufferTexture(target, attachment, texture, params.base_level); + break; + default: + UNIMPLEMENTED(); + } + return; + } - switch (owner_params.target) { + const GLenum view_target = surface.GetTarget(); + switch (surface.GetSurfaceParams().target) { case SurfaceTarget::Texture1D: - glFramebufferTexture1D(target, attachment, surface.GetTarget(), surface.GetTexture(), - params.base_level); + glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture2D: - glFramebufferTexture2D(target, attachment, surface.GetTarget(), surface.GetTexture(), - params.base_level); + glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level); break; case SurfaceTarget::Texture1DArray: case SurfaceTarget::Texture2DArray: case SurfaceTarget::TextureCubemap: case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(target, attachment, surface.GetTexture(), params.base_level, + glFramebufferTextureLayer(target, attachment, texture, params.base_level, params.base_layer); break; default: diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 331808113..ef66dd141 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -159,12 +159,13 @@ struct FormatTuple { {vk::Format::eR32G32Uint, Attachable | Storage}, // RG32UI {vk::Format::eUndefined, {}}, // RGBX16F {vk::Format::eR32Uint, Attachable | Storage}, // R32UI + {vk::Format::eR32Sint, Attachable | Storage}, // R32I {vk::Format::eAstc8x8UnormBlock, {}}, // ASTC_2D_8X8 {vk::Format::eUndefined, {}}, // ASTC_2D_8X5 {vk::Format::eUndefined, {}}, // ASTC_2D_5X4 {vk::Format::eUndefined, {}}, // BGRA8_SRGB {vk::Format::eBc1RgbaSrgbBlock, {}}, // DXT1_SRGB - {vk::Format::eUndefined, {}}, // DXT23_SRGB + {vk::Format::eBc2SrgbBlock, {}}, // DXT23_SRGB {vk::Format::eBc3SrgbBlock, {}}, // DXT45_SRGB {vk::Format::eBc7SrgbBlock, {}}, // BC7U_SRGB {vk::Format::eR4G4B4A4UnormPack16, Attachable}, // R4G4B4A4U @@ -363,6 +364,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR8G8B8A8Uint; case Maxwell::VertexAttribute::Size::Size_32: return vk::Format::eR32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32_32_32: + return vk::Format::eR32G32B32A32Uint; default: break; } diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 588a6835f..886bde3b9 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -107,6 +107,8 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan features.occlusionQueryPrecise = true; features.fragmentStoresAndAtomics = true; features.shaderImageGatherExtended = true; + features.shaderStorageImageReadWithoutFormat = + is_shader_storage_img_read_without_format_supported; features.shaderStorageImageWriteWithoutFormat = true; features.textureCompressionASTC_LDR = is_optimal_astc_supported; @@ -465,6 +467,8 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) { const auto supported_features{physical.getFeatures(dldi)}; + is_shader_storage_img_read_without_format_supported = + supported_features.shaderStorageImageReadWithoutFormat; is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi); } @@ -519,6 +523,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eB10G11R11UfloatPack32, vk::Format::eR32Sfloat, vk::Format::eR32Uint, + vk::Format::eR32Sint, vk::Format::eR16Sfloat, vk::Format::eR16G16B16A16Sfloat, vk::Format::eB8G8R8A8Unorm, @@ -538,6 +543,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eBc6HUfloatBlock, vk::Format::eBc6HSfloatBlock, vk::Format::eBc1RgbaSrgbBlock, + vk::Format::eBc2SrgbBlock, vk::Format::eBc3SrgbBlock, vk::Format::eBc7SrgbBlock, vk::Format::eAstc4x4SrgbBlock, diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/renderer_vulkan/vk_device.h index 72603f9f6..2c27ad730 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/renderer_vulkan/vk_device.h @@ -122,6 +122,11 @@ public: return properties.limits.maxPushConstantsSize; } + /// Returns true if Shader storage Image Read Without Format supported. + bool IsShaderStorageImageReadWithoutFormatSupported() const { + return is_shader_storage_img_read_without_format_supported; + } + /// Returns true if ASTC is natively supported. bool IsOptimalAstcSupported() const { return is_optimal_astc_supported; @@ -227,6 +232,8 @@ private: bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. bool nv_device_diagnostic_checkpoints{}; ///< Support for VK_NV_device_diagnostic_checkpoints. + bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage + ///< image read without format // Telemetry parameters std::string vendor_name; ///< Device's driver name. diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 31c078f6a..3bf86da87 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -611,33 +611,34 @@ bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachmen std::tuple<vk::Framebuffer, vk::Extent2D> RasterizerVulkan::ConfigureFramebuffers( vk::RenderPass renderpass) { FramebufferCacheKey key{renderpass, std::numeric_limits<u32>::max(), - std::numeric_limits<u32>::max()}; + std::numeric_limits<u32>::max(), std::numeric_limits<u32>::max()}; - const auto MarkAsModifiedAndPush = [&](const View& view) { - if (view == nullptr) { + const auto try_push = [&](const View& view) { + if (!view) { return false; } key.views.push_back(view->GetHandle()); key.width = std::min(key.width, view->GetWidth()); key.height = std::min(key.height, view->GetHeight()); + key.layers = std::min(key.layers, view->GetNumLayers()); return true; }; for (std::size_t index = 0; index < std::size(color_attachments); ++index) { - if (MarkAsModifiedAndPush(color_attachments[index])) { + if (try_push(color_attachments[index])) { texture_cache.MarkColorBufferInUse(index); } } - if (MarkAsModifiedAndPush(zeta_attachment)) { + if (try_push(zeta_attachment)) { texture_cache.MarkDepthBufferInUse(); } const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key); auto& framebuffer = fbentry->second; if (is_cache_miss) { - const vk::FramebufferCreateInfo framebuffer_ci({}, key.renderpass, - static_cast<u32>(key.views.size()), - key.views.data(), key.width, key.height, 1); + const vk::FramebufferCreateInfo framebuffer_ci( + {}, key.renderpass, static_cast<u32>(key.views.size()), key.views.data(), key.width, + key.height, key.layers); const auto dev = device.GetLogical(); const auto& dld = device.GetDispatchLoader(); framebuffer = dev.createFramebufferUnique(framebuffer_ci, nullptr, dld); diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 138903d60..4dc8af6e8 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -56,6 +56,7 @@ struct FramebufferCacheKey { vk::RenderPass renderpass{}; u32 width = 0; u32 height = 0; + u32 layers = 0; ImageViewsPack views; std::size_t Hash() const noexcept { @@ -66,12 +67,17 @@ struct FramebufferCacheKey { } boost::hash_combine(hash, width); boost::hash_combine(hash, height); + boost::hash_combine(hash, layers); return hash; } bool operator==(const FramebufferCacheKey& rhs) const noexcept { - return std::tie(renderpass, views, width, height) == - std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height); + return std::tie(renderpass, views, width, height, layers) == + std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height, rhs.layers); + } + + bool operator!=(const FramebufferCacheKey& rhs) const noexcept { + return !operator==(rhs); } }; diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp index 0a8ec8398..204b7c39c 100644 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp @@ -23,7 +23,14 @@ static std::optional<vk::BorderColor> TryConvertBorderColor(std::array<float, 4> } else if (color == std::array<float, 4>{1, 1, 1, 1}) { return vk::BorderColor::eFloatOpaqueWhite; } else { - return {}; + if (color[0] + color[1] + color[2] > 1.35f) { + // If color elements are brighter than roughly 0.5 average, use white border + return vk::BorderColor::eFloatOpaqueWhite; + } + if (color[3] > 0.5f) { + return vk::BorderColor::eFloatOpaqueBlack; + } + return vk::BorderColor::eFloatTransparentBlack; } } @@ -37,8 +44,6 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const auto border_color{tsc.GetBorderColor()}; const auto vk_border_color{TryConvertBorderColor(border_color)}; - UNIMPLEMENTED_IF_MSG(!vk_border_color, "Unimplemented border color {} {} {} {}", - border_color[0], border_color[1], border_color[2], border_color[3]); constexpr bool unnormalized_coords{false}; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index f64f5da28..2da622d15 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -86,6 +86,7 @@ struct AttributeType { struct VertexIndices { std::optional<u32> position; + std::optional<u32> layer; std::optional<u32> viewport; std::optional<u32> point_size; std::optional<u32> clip_distances; @@ -284,14 +285,20 @@ public: AddExtension("SPV_KHR_variable_pointers"); AddExtension("SPV_KHR_shader_draw_parameters"); - if (ir.UsesViewportIndex()) { - AddCapability(spv::Capability::MultiViewport); - if (device.IsExtShaderViewportIndexLayerSupported()) { + if (ir.UsesLayer() || ir.UsesViewportIndex()) { + if (ir.UsesViewportIndex()) { + AddCapability(spv::Capability::MultiViewport); + } + if (stage != ShaderType::Geometry && device.IsExtShaderViewportIndexLayerSupported()) { AddExtension("SPV_EXT_shader_viewport_index_layer"); AddCapability(spv::Capability::ShaderViewportIndexLayerEXT); } } + if (device.IsShaderStorageImageReadWithoutFormatSupported()) { + AddCapability(spv::Capability::StorageImageReadWithoutFormat); + } + if (device.IsFloat16Supported()) { AddCapability(spv::Capability::Float16); } @@ -924,13 +931,22 @@ private: VertexIndices indices; indices.position = AddBuiltIn(t_float4, spv::BuiltIn::Position, "position"); + if (ir.UsesLayer()) { + if (stage != ShaderType::Vertex || device.IsExtShaderViewportIndexLayerSupported()) { + indices.layer = AddBuiltIn(t_int, spv::BuiltIn::Layer, "layer"); + } else { + LOG_ERROR( + Render_Vulkan, + "Shader requires Layer but it's not supported on this stage with this device."); + } + } + if (ir.UsesViewportIndex()) { if (stage != ShaderType::Vertex || device.IsExtShaderViewportIndexLayerSupported()) { indices.viewport = AddBuiltIn(t_int, spv::BuiltIn::ViewportIndex, "viewport_index"); } else { - LOG_ERROR(Render_Vulkan, - "Shader requires ViewportIndex but it's not supported on this " - "stage with this device."); + LOG_ERROR(Render_Vulkan, "Shader requires ViewportIndex but it's not supported on " + "this stage with this device."); } } @@ -1292,6 +1308,13 @@ private: } case Attribute::Index::LayerViewportPointSize: switch (element) { + case 1: { + if (!out_indices.layer) { + return {}; + } + const u32 index = out_indices.layer.value(); + return {AccessElement(t_out_int, out_vertex, index), Type::Int}; + } case 2: { if (!out_indices.viewport) { return {}; @@ -1362,6 +1385,11 @@ private: UNIMPLEMENTED(); } + if (!target.id) { + // On failure we return a nullptr target.id, skip these stores. + return {}; + } + OpStore(target.id, As(Visit(src), target.type)); return {}; } @@ -1755,8 +1783,16 @@ private: } Expression ImageLoad(Operation operation) { - UNIMPLEMENTED(); - return {}; + if (!device.IsShaderStorageImageReadWithoutFormatSupported()) { + return {v_float_zero, Type::Float}; + } + + const auto& meta{std::get<MetaImage>(operation.GetMeta())}; + + const Id coords = GetCoordinates(operation, Type::Int); + const Id texel = OpImageRead(t_uint4, GetImage(operation), coords); + + return {OpCompositeExtract(t_uint, texel, meta.element), Type::Uint}; } Expression ImageStore(Operation operation) { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index d3edbe80c..22e3d34de 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -151,6 +151,10 @@ public: return params.GetMipHeight(base_level); } + u32 GetNumLayers() const { + return num_layers; + } + bool IsBufferView() const { return buffer_view; } diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index 542636430..bee7d8cad 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -527,7 +527,7 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, const bool is_bindless = bindless_reg.has_value(); UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); - ASSERT_MSG(texture_type != TextureType::Texture3D || is_array || is_shadow, + ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow, "Illegal texture type"); const SamplerInfo info{texture_type, is_array, is_shadow, false}; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 1655ccf16..9707c353d 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -155,6 +155,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) return PixelFormat::R16I; case Tegra::RenderTargetFormat::R32_FLOAT: return PixelFormat::R32F; + case Tegra::RenderTargetFormat::R32_SINT: + return PixelFormat::R32I; case Tegra::RenderTargetFormat::R32_UINT: return PixelFormat::R32UI; case Tegra::RenderTargetFormat::RG32_UINT: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 0d17a93ed..d88109e5a 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -59,47 +59,48 @@ enum class PixelFormat { RG32UI = 41, RGBX16F = 42, R32UI = 43, - ASTC_2D_8X8 = 44, - ASTC_2D_8X5 = 45, - ASTC_2D_5X4 = 46, - BGRA8_SRGB = 47, - DXT1_SRGB = 48, - DXT23_SRGB = 49, - DXT45_SRGB = 50, - BC7U_SRGB = 51, - R4G4B4A4U = 52, - ASTC_2D_4X4_SRGB = 53, - ASTC_2D_8X8_SRGB = 54, - ASTC_2D_8X5_SRGB = 55, - ASTC_2D_5X4_SRGB = 56, - ASTC_2D_5X5 = 57, - ASTC_2D_5X5_SRGB = 58, - ASTC_2D_10X8 = 59, - ASTC_2D_10X8_SRGB = 60, - ASTC_2D_6X6 = 61, - ASTC_2D_6X6_SRGB = 62, - ASTC_2D_10X10 = 63, - ASTC_2D_10X10_SRGB = 64, - ASTC_2D_12X12 = 65, - ASTC_2D_12X12_SRGB = 66, - ASTC_2D_8X6 = 67, - ASTC_2D_8X6_SRGB = 68, - ASTC_2D_6X5 = 69, - ASTC_2D_6X5_SRGB = 70, - E5B9G9R9F = 71, + R32I = 44, + ASTC_2D_8X8 = 45, + ASTC_2D_8X5 = 46, + ASTC_2D_5X4 = 47, + BGRA8_SRGB = 48, + DXT1_SRGB = 49, + DXT23_SRGB = 50, + DXT45_SRGB = 51, + BC7U_SRGB = 52, + R4G4B4A4U = 53, + ASTC_2D_4X4_SRGB = 54, + ASTC_2D_8X8_SRGB = 55, + ASTC_2D_8X5_SRGB = 56, + ASTC_2D_5X4_SRGB = 57, + ASTC_2D_5X5 = 58, + ASTC_2D_5X5_SRGB = 59, + ASTC_2D_10X8 = 60, + ASTC_2D_10X8_SRGB = 61, + ASTC_2D_6X6 = 62, + ASTC_2D_6X6_SRGB = 63, + ASTC_2D_10X10 = 64, + ASTC_2D_10X10_SRGB = 65, + ASTC_2D_12X12 = 66, + ASTC_2D_12X12_SRGB = 67, + ASTC_2D_8X6 = 68, + ASTC_2D_8X6_SRGB = 69, + ASTC_2D_6X5 = 70, + ASTC_2D_6X5_SRGB = 71, + E5B9G9R9F = 72, MaxColorFormat, // Depth formats - Z32F = 72, - Z16 = 73, + Z32F = 73, + Z16 = 74, MaxDepthFormat, // DepthStencil formats - Z24S8 = 74, - S8Z24 = 75, - Z32FS8 = 76, + Z24S8 = 75, + S8Z24 = 76, + Z32FS8 = 77, MaxDepthStencilFormat, @@ -171,6 +172,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ 0, // RG32UI 0, // RGBX16F 0, // R32UI + 0, // R32I 2, // ASTC_2D_8X8 2, // ASTC_2D_8X5 2, // ASTC_2D_5X4 @@ -267,6 +269,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // RG32UI 1, // RGBX16F 1, // R32UI + 1, // R32I 8, // ASTC_2D_8X8 8, // ASTC_2D_8X5 5, // ASTC_2D_5X4 @@ -355,6 +358,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // RG32UI 1, // RGBX16F 1, // R32UI + 1, // R32I 8, // ASTC_2D_8X8 5, // ASTC_2D_8X5 4, // ASTC_2D_5X4 @@ -443,6 +447,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 64, // RG32UI 64, // RGBX16F 32, // R32UI + 32, // R32I 128, // ASTC_2D_8X8 128, // ASTC_2D_8X5 128, // ASTC_2D_5X4 @@ -546,6 +551,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table SurfaceCompression::None, // RG32UI SurfaceCompression::None, // RGBX16F SurfaceCompression::None, // R32UI + SurfaceCompression::None, // R32I SurfaceCompression::Converted, // ASTC_2D_8X8 SurfaceCompression::Converted, // ASTC_2D_8X5 SurfaceCompression::Converted, // ASTC_2D_5X4 diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 81fb9f633..cc3ad8417 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -41,7 +41,7 @@ struct Table { ComponentType alpha_component; bool is_srgb; }; -constexpr std::array<Table, 74> DefinitionTable = {{ +constexpr std::array<Table, 75> DefinitionTable = {{ {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, @@ -89,6 +89,7 @@ constexpr std::array<Table, 74> DefinitionTable = {{ {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32F}, {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32UI}, + {TextureFormat::R32, C, SINT, SINT, SINT, SINT, PixelFormat::R32I}, {TextureFormat::E5B9G9R9_SHAREDEXP, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9F}, diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp index 84469b7ba..002df414f 100644 --- a/src/video_core/texture_cache/surface_base.cpp +++ b/src/video_core/texture_cache/surface_base.cpp @@ -277,6 +277,10 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, SwizzleFunc(MortonSwizzleMode::LinearToMorton, host_ptr, params, staging_buffer.data() + host_offset, level); } + } else if (params.IsBuffer()) { + // Buffers don't have pitch or any fancy layout property. We can just memcpy them to guest + // memory. + std::memcpy(host_ptr, staging_buffer.data(), guest_memory_size); } else { ASSERT(params.target == SurfaceTarget::Texture2D); ASSERT(params.num_levels == 1); diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp index 38b3a4ba8..f00839313 100644 --- a/src/video_core/texture_cache/surface_params.cpp +++ b/src/video_core/texture_cache/surface_params.cpp @@ -84,19 +84,16 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta if (entry.IsShadow() && params.type == SurfaceType::ColorTexture) { switch (params.pixel_format) { case PixelFormat::R16U: - case PixelFormat::R16F: { + case PixelFormat::R16F: params.pixel_format = PixelFormat::Z16; break; - } - case PixelFormat::R32F: { + case PixelFormat::R32F: params.pixel_format = PixelFormat::Z32F; break; - } - default: { + default: UNIMPLEMENTED_MSG("Unimplemented shadow convert format: {}", static_cast<u32>(params.pixel_format)); } - } params.type = GetFormatType(params.pixel_format); } params.type = GetFormatType(params.pixel_format); @@ -168,27 +165,29 @@ SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_tabl return params; } -SurfaceParams SurfaceParams::CreateForDepthBuffer( - Core::System& system, u32 zeta_width, u32 zeta_height, Tegra::DepthFormat format, - u32 block_width, u32 block_height, u32 block_depth, - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type) { +SurfaceParams SurfaceParams::CreateForDepthBuffer(Core::System& system) { + const auto& regs = system.GPU().Maxwell3D().regs; + regs.zeta_width, regs.zeta_height, regs.zeta.format, regs.zeta.memory_layout.type; SurfaceParams params; - params.is_tiled = type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; + params.is_tiled = regs.zeta.memory_layout.type == + Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; params.srgb_conversion = false; - params.block_width = std::min(block_width, 5U); - params.block_height = std::min(block_height, 5U); - params.block_depth = std::min(block_depth, 5U); + params.block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U); + params.block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U); + params.block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U); params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromDepthFormat(format); + params.pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); params.type = GetFormatType(params.pixel_format); - params.width = zeta_width; - params.height = zeta_height; - params.target = SurfaceTarget::Texture2D; - params.depth = 1; + params.width = regs.zeta_width; + params.height = regs.zeta_height; params.pitch = 0; params.num_levels = 1; params.emulated_levels = 1; - params.is_layered = false; + + const bool is_layered = regs.zeta_layers > 1 && params.block_depth == 0; + params.is_layered = is_layered; + params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; + params.depth = is_layered ? regs.zeta_layers.Value() : 1U; return params; } @@ -214,11 +213,13 @@ SurfaceParams SurfaceParams::CreateForFramebuffer(Core::System& system, std::siz params.width = params.pitch / bpp; } params.height = config.height; - params.depth = 1; - params.target = SurfaceTarget::Texture2D; params.num_levels = 1; params.emulated_levels = 1; - params.is_layered = false; + + const bool is_layered = config.layers > 1 && params.block_depth == 0; + params.is_layered = is_layered; + params.depth = is_layered ? config.layers.Value() : 1; + params.target = is_layered ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; return params; } diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h index 9256fd6d9..995cc3818 100644 --- a/src/video_core/texture_cache/surface_params.h +++ b/src/video_core/texture_cache/surface_params.h @@ -35,10 +35,7 @@ public: const VideoCommon::Shader::Image& entry); /// Creates SurfaceCachedParams for a depth buffer configuration. - static SurfaceParams CreateForDepthBuffer( - Core::System& system, u32 zeta_width, u32 zeta_height, Tegra::DepthFormat format, - u32 block_width, u32 block_height, u32 block_depth, - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout type); + static SurfaceParams CreateForDepthBuffer(Core::System& system); /// Creates SurfaceCachedParams from a framebuffer configuration. static SurfaceParams CreateForFramebuffer(Core::System& system, std::size_t index); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index f4c015635..c70e4aec2 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -160,10 +160,7 @@ public: SetEmptyDepthBuffer(); return {}; } - const auto depth_params{SurfaceParams::CreateForDepthBuffer( - system, regs.zeta_width, regs.zeta_height, regs.zeta.format, - regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height, - regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)}; + const auto depth_params{SurfaceParams::CreateForDepthBuffer(system)}; auto surface_view = GetSurface(gpu_addr, cache_addr, depth_params, preserve_contents, true); if (depth_buffer.target) depth_buffer.target->MarkAsRenderTarget(false, NO_RT); @@ -721,7 +718,6 @@ private: std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const CacheAddr cache_addr, const SurfaceParams& params, bool preserve_contents, bool is_render) { - // Step 1 // Check Level 1 Cache for a fast structural match. If candidate surface // matches at certain level we are pretty much done. @@ -733,14 +729,18 @@ private: return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, topological_result); } + const auto struct_result = current_surface->MatchesStructure(params); - if (struct_result != MatchStructureResult::None && - (params.target != SurfaceTarget::Texture3D || - current_surface->MatchTarget(params.target))) { - if (struct_result == MatchStructureResult::FullMatch) { - return ManageStructuralMatch(current_surface, params, is_render); - } else { - return RebuildSurface(current_surface, params, is_render); + if (struct_result != MatchStructureResult::None) { + const auto& old_params = current_surface->GetSurfaceParams(); + const bool not_3d = params.target != SurfaceTarget::Texture3D && + old_params.target != SurfaceTarget::Texture3D; + if (not_3d || current_surface->MatchTarget(params.target)) { + if (struct_result == MatchStructureResult::FullMatch) { + return ManageStructuralMatch(current_surface, params, is_render); + } else { + return RebuildSurface(current_surface, params, is_render); + } } } } |