From a518a133c3859bc9da8fbb9cc75a96d07c3baf87 Mon Sep 17 00:00:00 2001 From: Michael Adler Date: Thu, 19 Dec 2024 14:56:43 -0500 Subject: [PATCH] pci_device: improved reset sequence - Wait for data link active after triggering a bridge's secondary device reset. - Move reset to a separate command that unloads drivers, then devices, triggers warm reset on FPGAs, and finally rescans and clears errors. Fixes: c97c3f2863f0 ("pci_device: Add bridge reset when a device is unplugged (#3143)") Signed-off-by: Michael Adler --- python/opae.admin/opae/admin/sysfs.py | 43 ++++++++++++-- .../opae.admin/opae/admin/tools/pci_device.py | 58 +++++++++++++++++-- 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/python/opae.admin/opae/admin/sysfs.py b/python/opae.admin/opae/admin/sysfs.py index a97ea29f3569..b898260375a2 100644 --- a/python/opae.admin/opae/admin/sysfs.py +++ b/python/opae.admin/opae/admin/sysfs.py @@ -232,6 +232,8 @@ def __init__(self, pci_address, parent=None, **kwargs): pci_address['pci_address']) self._bridge_ctrl_cmd = 'setpci -s {} BRIDGE_CONTROL'.format( pci_address['pci_address']) + self._link_status_cmd = 'setpci -s {} CAP_EXP+12.W'.format( + pci_address['pci_address']) if self.have_node('driver'): self._driver = os.readlink(self.node('driver').sysfs_path) @@ -599,8 +601,39 @@ def aer(self, values): except CalledProcessError as err: self.log.warn('error setting aer: %s', err) + def data_link_layer_active(self): + """data_link_layer_active Return true if PCIe data link is active. + + Returns: + Bit 13 of the PCIe link status register. + + Notes: + This relies on calling 'setpci' and will log an exception if an + error was encountered while calling 'setpci'. + """ + try: + ls = int(call_process(self._link_status_cmd), 16) + return (ls >> 13) & 1 + except CalledProcessError as err: + self.log.warn('error reading link status: %s', err) + + def wait_for_data_link(self, target_state): + """wait_for_data_link Wait until data link active state matches + target_state. + """ + # Bridges are not required to implement the link layer active bit. + # Time out after a generous 5 seconds, assuming the target state + # is reached. + for i in range(5): + if self.data_link_layer_active() == target_state: + break + time.sleep(1) + + # The spec requires at least 100ms delay after the link change. + time.sleep(0.25) + def reset_bridge(self): - """reset_bridge Reset the devices under root port bridge object. + """reset_bridge Reset secondary devices under root port bridge object. Notes: This relies on calling 'setpci' and will log an exception if an @@ -609,13 +642,15 @@ def reset_bridge(self): try: # Get current bridge control value bc = int(call_process(self._bridge_ctrl_cmd), 16) - # Enable bus reset + + # Enable secondary bus reset bc_reset = bc | 0x40 call_process(f"{self._bridge_ctrl_cmd}=0x{bc_reset:x}") - time.sleep(0.1) + self.wait_for_data_link(0) + # Clear bus reset call_process(f"{self._bridge_ctrl_cmd}=0x{bc:x}") - time.sleep(0.25) + self.wait_for_data_link(1) except CalledProcessError as err: self.log.warn('error resetting bridge: %s', err) diff --git a/python/opae.admin/opae/admin/tools/pci_device.py b/python/opae.admin/opae/admin/tools/pci_device.py index 51d35e811ecb..b0f5504960a7 100644 --- a/python/opae.admin/opae/admin/tools/pci_device.py +++ b/python/opae.admin/opae/admin/tools/pci_device.py @@ -260,10 +260,6 @@ def unplug_node(root, debug, exclude_node): remove = False if remove: - if debug: - print(f'Resetting the root bridge {root.pci_address}') - root.reset_bridge() - if debug: print(f'Removing the root device {root.pci_address}') root.remove() @@ -314,6 +310,54 @@ def clear_correctable_errors(self, device, debug): call_process(f'{cmd}=FFFFFFFF') +class reset(object): + @staticmethod + def add_subparser(subparser): + reset = subparser.add_parser('reset') + reset.add_argument('-d', '--debug', action='store_true', + default=False, help='enable debug output') + + def __call__(self, device, args): + debug = args.debug + + root = device.pci_node.root + + v0, v1 = None, None + if root.supports_ecap('aer'): + if debug: + print('ECAP_AER is supported') + v0, v1 = root.aer + root.aer = (0xFFFFFFFF, 0xFFFFFFFF) + + reset.reset_node(root, debug) + + if v0: + root.aer = (v0, v1) + + @staticmethod + def reset_node(root, debug): + if debug: + print('Unbinding drivers for leaf devices') + + for e in root.endpoints: + if debug: + print(f' unbind {e.pci_address}') + e.unbind() + + for e in root.endpoints: + if debug: + print(f' remove {e.pci_address}') + try: + e.remove() + except NameError: + None # Ignore endpoints with no sysfs remove entry + + if debug: + print(f'Resetting secondary devices under {root.pci_address}') + root.reset_bridge() + root.rescan() + + def add_unbind_subparser(subparser): subparser.add_parser('unbind') @@ -350,7 +394,8 @@ def main(): 'aer': (aer.add_subparser, aer()), 'topology': (add_topology_subparser, topology), 'unplug': (unplug.add_subparser, unplug()), - 'plug': (plug.add_subparser, plug())} + 'plug': (plug.add_subparser, plug()), + 'reset': (reset.add_subparser, reset())} parser = ArgumentParser() @@ -397,6 +442,9 @@ def main(): actions['vf'][1](dev, args, args.numvfs) else: actions[args.which][1](dev, args) + # Clear AER after reset + if args.which == 'reset': + actions['plug'][1](dev, args) else: actions['topology'][1](dev, args)