| author | Rodrigo Vivi <rodrigo.vivi@intel.com> | 2022-12-30 04:09:09 -0500 |
|---|---|---|
| committer | Rodrigo Vivi <rodrigo.vivi@intel.com> | 2022-12-30 04:18:36 -0500 |
| commit | b501d4dc83aa3940189b68045cadc8b3eac73988 (patch) | |
| tree | e5310a4fafd59505c3db86bb05baf964cb8da944 /drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | |
| parent | 4071d98b296a5bc5fd4b15ec651bd05800ec9510 (diff) | |
| parent | 1b929c02afd37871d5afb9d498426f83432e71c2 (diff) | |
Merge drm/drm-next into drm-intel-gt-next
Sync after v6.2-rc1 landed in drm-next.
We need to get some dependencies in place before we can merge
the fixes series from Gwan-gyeong and Chris.
References: https://lore.kernel.org/all/Y6x5JCDnh2rvh4lA@intel.com/
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 87 |
1 file changed, 73 insertions, 14 deletions
```diff
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index aad3c8b4c810..f76c19fc0392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -22,6 +22,59 @@
  */
 #include "amdgpu.h"
+#include "umc_v6_7.h"
+
+static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
+				struct ras_err_data *err_data, uint64_t err_addr,
+				uint32_t ch_inst, uint32_t umc_inst)
+{
+	switch (adev->ip_versions[UMC_HWIP][0]) {
+	case IP_VERSION(6, 7, 0):
+		umc_v6_7_convert_error_address(adev,
+				err_data, err_addr, ch_inst, umc_inst);
+		break;
+	default:
+		dev_warn(adev->dev,
+			"UMC address to Physical address translation is not supported\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	return AMDGPU_RAS_SUCCESS;
+}
+
+int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
+			uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst)
+{
+	struct ras_err_data err_data = {0, 0, 0, NULL};
+	int ret = AMDGPU_RAS_FAIL;
+
+	err_data.err_addr =
+		kcalloc(adev->umc.max_ras_err_cnt_per_query,
+			sizeof(struct eeprom_table_record), GFP_KERNEL);
+	if (!err_data.err_addr) {
+		dev_warn(adev->dev,
+			"Failed to alloc memory for umc error record in MCA notifier!\n");
+		return AMDGPU_RAS_FAIL;
+	}
+
+	/*
+	 * Translate UMC channel address to Physical address
+	 */
+	ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr,
+					ch_inst, umc_inst);
+	if (ret)
+		goto out;
+
+	if (amdgpu_bad_page_threshold != 0) {
+		amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
+						err_data.err_addr_cnt);
+		amdgpu_ras_save_bad_pages(adev);
+	}
+
+out:
+	kfree(err_data.err_addr);
+	return ret;
+}
 
 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 		void *ras_error_status,
@@ -112,23 +165,29 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
 	return AMDGPU_RAS_SUCCESS;
 }
 
-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-		void *ras_error_status,
-		bool reset)
+int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
 {
-	int ret;
-	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
-	struct ras_common_if head = {
-		.block = AMDGPU_RAS_BLOCK__UMC,
-	};
-	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+	int ret = AMDGPU_RAS_SUCCESS;
 
-	ret =
-		amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
+	if (!adev->gmc.xgmi.connected_to_cpu) {
+		struct ras_err_data err_data = {0, 0, 0, NULL};
+		struct ras_common_if head = {
+			.block = AMDGPU_RAS_BLOCK__UMC,
+		};
+		struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
 
-	if (ret == AMDGPU_RAS_SUCCESS && obj) {
-		obj->err_data.ue_count += err_data->ue_count;
-		obj->err_data.ce_count += err_data->ce_count;
+		ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
+
+		if (ret == AMDGPU_RAS_SUCCESS && obj) {
+			obj->err_data.ue_count += err_data.ue_count;
+			obj->err_data.ce_count += err_data.ce_count;
+		}
+	} else if (reset) {
+		/* MCA poison handler is only responsible for GPU reset,
+		 * let MCA notifier do page retirement.
+		 */
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		amdgpu_ras_reset_gpu(adev);
 	}
 
 	return ret;
```
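For orientation, below is a minimal caller-side sketch of how the two entry points touched by this diff are meant to be used: the new `amdgpu_umc_page_retirement_mca()` for address translation plus page retirement driven from an MCA notifier, and the slimmed-down `amdgpu_umc_poison_handler(adev, reset)` which no longer receives a `ras_error_status` pointer. Only the two prototypes come from the diff above; the wrapper function names and the headers pulled in are assumptions for illustration, not part of this commit.

```c
/*
 * Illustrative sketch only, not part of this commit. The wrapper names
 * below are hypothetical; only the amdgpu_umc_* prototypes come from the
 * diff above. Headers are assumed from the amdgpu tree.
 */
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"

/*
 * Hypothetical MCA notifier path: translate the MCA-reported UMC channel
 * address to a physical address and retire the bad page(s), without going
 * through the interrupt-handler poison path.
 */
static int example_mca_notifier_path(struct amdgpu_device *adev,
				     uint64_t err_addr, uint32_t ch_inst,
				     uint32_t umc_inst)
{
	return amdgpu_umc_page_retirement_mca(adev, err_addr, ch_inst, umc_inst);
}

/*
 * Hypothetical poison-consumption path: the handler now takes only the
 * reset request and allocates its own ras_err_data internally.
 */
static int example_poison_consumption_path(struct amdgpu_device *adev,
					   bool reset)
{
	return amdgpu_umc_poison_handler(adev, reset);
}
```

The split visible in the diff is that on parts where `adev->gmc.xgmi.connected_to_cpu` is set, `amdgpu_umc_poison_handler()` only flags the SRAM ECC error and requests a GPU reset, leaving page retirement to the MCA notifier path.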