From d4f871cb6a616f6b1a8a76049e042118571f3dd3 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 1 Jan 2021 23:28:55 +0100 Subject: X86/NativeClock: Improve performance of clock calculations on hot path. --- src/common/x64/native_clock.cpp | 69 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 5 deletions(-) (limited to 'src/common/x64/native_clock.cpp') diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index eb8a7782f..e246432d0 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -2,12 +2,17 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include #include +#include #include #include #ifdef _MSC_VER #include + +#pragma intrinsic(__umulh) +#pragma intrinsic(_udiv128) #else #include #endif @@ -15,6 +20,55 @@ #include "common/uint128.h" #include "common/x64/native_clock.h" +namespace { + +[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) { +#ifdef __SIZEOF_INT128__ + const auto base = static_cast(numerator) << 64ULL; + return static_cast(base / divisor); +#elif defined(_M_X64) || defined(_M_ARM64) + std::array r = {0, numerator}; + u64 remainder; +#if _MSC_VER < 1923 + return udiv128(r[1], r[0], divisor, &remainder); +#else + return _udiv128(r[1], r[0], divisor, &remainder); +#endif +#else + // This one is bit more inaccurate. + return MultiplyAndDivide64(std::numeric_limits::max(), numerator, divisor); +#endif +} + +[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) { +#ifdef __SIZEOF_INT128__ + return (static_cast(a) * static_cast(b)) >> 64; +#elif defined(_M_X64) || defined(_M_ARM64) + return __umulh(a, b); // MSVC +#else + // Generic fallback + const u64 a_lo = u32(a); + const u64 a_hi = a >> 32; + const u64 b_lo = u32(b); + const u64 b_hi = b >> 32; + + const u64 a_x_b_hi = a_hi * b_hi; + const u64 a_x_b_mid = a_hi * b_lo; + const u64 b_x_a_mid = b_hi * a_lo; + const u64 a_x_b_lo = a_lo * b_lo; + + const u64 carry_bit = (static_cast(static_cast(a_x_b_mid)) + + static_cast(static_cast(b_x_a_mid)) + (a_x_b_lo >> 32)) >> + 32; + + const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit; + + return multhi; +#endif +} + +} // namespace + namespace Common { u64 EstimateRDTSCFrequency() { @@ -50,6 +104,11 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen _mm_mfence(); last_measure = __rdtsc(); accumulated_ticks = 0U; + ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency); + us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency); + ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency); + clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency); + cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency); } u64 NativeClock::GetRTSC() { @@ -75,27 +134,27 @@ void NativeClock::Pause(bool is_paused) { std::chrono::nanoseconds NativeClock::GetTimeNS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)}; + return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)}; } std::chrono::microseconds NativeClock::GetTimeUS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)}; + return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)}; } std::chrono::milliseconds NativeClock::GetTimeMS() { const u64 rtsc_value = GetRTSC(); - return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)}; + return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)}; } u64 NativeClock::GetClockCycles() { const u64 rtsc_value = GetRTSC(); - return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency); + return MultiplyHigh(rtsc_value, clock_rtsc_factor); } u64 NativeClock::GetCPUCycles() { const u64 rtsc_value = GetRTSC(); - return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency); + return MultiplyHigh(rtsc_value, cpu_rtsc_factor); } } // namespace X64 -- cgit v1.2.3 From 53d92318b82cd4a9e08f814fcb8aab624d795c6c Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 2 Jan 2021 02:24:49 +0100 Subject: X86/NativeClock: Reimplement RTDSC access to be lock free. --- src/common/x64/native_clock.cpp | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) (limited to 'src/common/x64/native_clock.cpp') diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp index e246432d0..a65f6b832 100644 --- a/src/common/x64/native_clock.cpp +++ b/src/common/x64/native_clock.cpp @@ -17,6 +17,7 @@ #include #endif +#include "common/atomic_ops.h" #include "common/uint128.h" #include "common/x64/native_clock.h" @@ -102,8 +103,8 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{ rtsc_frequency_} { _mm_mfence(); - last_measure = __rdtsc(); - accumulated_ticks = 0U; + time_point.inner.last_measure = __rdtsc(); + time_point.inner.accumulated_ticks = 0U; ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency); us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency); ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency); @@ -112,23 +113,35 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen } u64 NativeClock::GetRTSC() { - std::scoped_lock scope{rtsc_serialize}; - _mm_mfence(); - const u64 current_measure = __rdtsc(); - u64 diff = current_measure - last_measure; - diff = diff & ~static_cast(static_cast(diff) >> 63); // max(diff, 0) - if (current_measure > last_measure) { - last_measure = current_measure; - } - accumulated_ticks += diff; + TimePoint new_time_point{}; + TimePoint current_time_point{}; + do { + current_time_point.pack = time_point.pack; + _mm_mfence(); + const u64 current_measure = __rdtsc(); + u64 diff = current_measure - current_time_point.inner.last_measure; + diff = diff & ~static_cast(static_cast(diff) >> 63); // max(diff, 0) + new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure + ? current_measure + : current_time_point.inner.last_measure; + new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff; + } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, + current_time_point.pack)); /// The clock cannot be more precise than the guest timer, remove the lower bits - return accumulated_ticks & inaccuracy_mask; + return new_time_point.inner.accumulated_ticks & inaccuracy_mask; } void NativeClock::Pause(bool is_paused) { if (!is_paused) { - _mm_mfence(); - last_measure = __rdtsc(); + TimePoint current_time_point{}; + TimePoint new_time_point{}; + do { + current_time_point.pack = time_point.pack; + new_time_point.pack = current_time_point.pack; + _mm_mfence(); + new_time_point.inner.last_measure = __rdtsc(); + } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack, + current_time_point.pack)); } } -- cgit v1.2.3