From c78c8190d5b964c9421b7d0c0c3a96ba599b3c2f Mon Sep 17 00:00:00 2001 From: Markus Wick Date: Mon, 7 Mar 2022 19:13:05 +0100 Subject: shader_recompiler/LOP3: Use brute force python results within switch/case. Thanks to @asLody for optimizing this function. This raised the focus that this function should be optimized more. The current table assumes that the host GPU is able to invert for free, so only AND,OR,XOR are accumulated in the performance metrik. Performance results: Instructions 0: 8 1: 30 2: 114 3: 80 4: 24 Latency 0: 8 1: 30 2: 194 3: 24 --- .../impl/logic_operation_three_input_lut3.py | 92 ++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 src/shader_recompiler/frontend/maxwell/translate/impl/logic_operation_three_input_lut3.py (limited to 'src/shader_recompiler/frontend/maxwell/translate/impl/logic_operation_three_input_lut3.py') diff --git a/src/shader_recompiler/frontend/maxwell/translate/impl/logic_operation_three_input_lut3.py b/src/shader_recompiler/frontend/maxwell/translate/impl/logic_operation_three_input_lut3.py new file mode 100644 index 000000000..8f547c266 --- /dev/null +++ b/src/shader_recompiler/frontend/maxwell/translate/impl/logic_operation_three_input_lut3.py @@ -0,0 +1,92 @@ +# Copyright © 2022 degasus +# This work is free. You can redistribute it and/or modify it under the +# terms of the Do What The Fuck You Want To Public License, Version 2, +# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details. + +from itertools import product + +# The primitive instructions +OPS = { + 'ir.BitwiseAnd({}, {})' : (2, 1, lambda a,b: a&b), + 'ir.BitwiseOr({}, {})' : (2, 1, lambda a,b: a|b), + 'ir.BitwiseXor({}, {})' : (2, 1, lambda a,b: a^b), + 'ir.BitwiseNot({})' : (1, 0.1, lambda a: (~a) & 255), # Only tiny cost, as this can often inlined in other instructions +} + +# Our database of combination of instructions +optimized_calls = {} +def cmp(lhs, rhs): + if lhs is None: # new entry + return True + if lhs[3] > rhs[3]: # costs + return True + if lhs[3] < rhs[3]: # costs + return False + if len(lhs[0]) > len(rhs[0]): # string len + return True + if len(lhs[0]) < len(rhs[0]): # string len + return False + if lhs[0] > rhs[0]: # string sorting + return True + if lhs[0] < rhs[0]: # string sorting + return False + assert lhs == rhs, "redundant instruction, bug in brute force" + return False +def register(imm, instruction, count, latency): + # Use the sum of instruction count and latency as costs to evaluate which combination is best + costs = count + latency + + old = optimized_calls.get(imm, None) + new = (instruction, count, latency, costs) + + # Update if new or better + if cmp(old, new): + optimized_calls[imm] = new + return True + + return False + +# Constants: 0, 1 (for free) +register(0, 'ir.Imm32(0)', 0, 0) +register(255, 'ir.Imm32(0xFFFFFFFF)', 0, 0) + +# Inputs: a, b, c (for free) +ta = 0xF0 +tb = 0xCC +tc = 0xAA +inputs = { + ta : 'a', + tb : 'b', + tc : 'c', +} +for imm, instruction in inputs.items(): + register(imm, instruction, 0, 0) + register((~imm) & 255, 'ir.BitwiseNot({})'.format(instruction), 0.099, 0.099) # slightly cheaper NEG on inputs + +# Try to combine two values from the db with an instruction. +# If it is better than the old method, update it. +while True: + registered = 0 + calls_copy = optimized_calls.copy() + for OP, (argc, cost, f) in OPS.items(): + for args in product(calls_copy.items(), repeat=argc): + # unpack(transponse) the arrays + imm = [arg[0] for arg in args] + value = [arg[1][0] for arg in args] + count = [arg[1][1] for arg in args] + latency = [arg[1][2] for arg in args] + + registered += register( + f(*imm), + OP.format(*value), + sum(count) + cost, + max(latency) + cost) + if registered == 0: + # No update at all? So terminate + break + +# Hacky output. Please improve me to output valid C++ instead. +s = """ case {imm}: + return {op};""" +for imm in range(256): + print(s.format(imm=imm, op=optimized_calls[imm][0])) -- cgit v1.2.3