From 5f7697bbc1a41d4799797204137be85121063f65 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 23 May 2024 17:58:47 +0800 Subject: drm/amdgpu: trigger mode1 reset for RAS RMA status Check RMA status in bad page retirement flow. v2: fix coding bugs in v1. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 540e0f066b26..20e0e522fb51 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -195,7 +195,8 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); amdgpu_umc_handle_bad_pages(adev, ras_error_status); - if (err_data->ue_count && reset) { + if ((err_data->ue_count || err_data->de_count) && + (reset || (con && con->is_rma))) { con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -211,6 +212,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, .block = AMDGPU_RAS_BLOCK__UMC, }; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); uint32_t timeout = timeout_ms; memset(&err_data, 0, sizeof(err_data)); @@ -243,9 +245,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (reset) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - + if (reset || (err_data.err_addr_cnt && con && con->is_rma)) { con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } -- cgit v1.2.3