diff options
author | Tao Zhou <tao.zhou1@amd.com> | 2022-10-17 18:31:20 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2022-10-27 15:12:08 -0400 |
commit | ae45a18b80d9d0d29f0ecfc52fb4e7831671b299 (patch) | |
tree | 5e4b92907103f8773af3f400ef26cb073608f5b1 /drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | |
parent | 24b822928b5139b85ee9a818a65e343b7e3bb4fe (diff) | |
download | linux-ae45a18b80d9d0d29f0ecfc52fb4e7831671b299.tar.gz linux-ae45a18b80d9d0d29f0ecfc52fb4e7831671b299.tar.bz2 linux-ae45a18b80d9d0d29f0ecfc52fb4e7831671b299.zip |
drm/amdgpu: add RAS poison handling for MCA
For MCA poison, if unmapping the queue fails, only a GPU reset should be
triggered, without page retirement handling; the MCA notifier will do that.
v2: handle MCA poison consumption in umc_poison_handler directly.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 31 |
1 files changed, 20 insertions, 11 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 3c83129f4090..758942150c09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -169,19 +169,28 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
 		void *ras_error_status,
 		bool reset)
 {
-	int ret;
-	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
-	struct ras_common_if head = {
-		.block = AMDGPU_RAS_BLOCK__UMC,
-	};
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+	int ret = AMDGPU_RAS_SUCCESS;
 
-	ret =
-		amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+	if (!adev->gmc.xgmi.connected_to_cpu) {
+		struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+		struct ras_common_if head = {
+			.block = AMDGPU_RAS_BLOCK__UMC,
+		};
+		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-	if (ret == AMDGPU_RAS_SUCCESS && obj) {
-		obj->err_data.ue_count += err_data->ue_count;
-		obj->err_data.ce_count += err_data->ce_count;
+		ret =
+			amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+
+		if (ret == AMDGPU_RAS_SUCCESS && obj) {
+			obj->err_data.ue_count += err_data->ue_count;
+			obj->err_data.ce_count += err_data->ce_count;
+		}
+	} else if (reset) {
+		/* MCA poison handler is only responsible for GPU reset,
+		 * let MCA notifier do page retirement.
+		 */
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);
 	}
 
 	return ret;