diff options
author | Joonas Lahtinen <joonas.lahtinen@linux.intel.com> | 2023-04-11 15:43:45 +0300 |
---|---|---|
committer | Joonas Lahtinen <joonas.lahtinen@linux.intel.com> | 2023-04-11 15:43:45 +0300 |
commit | ea68a3e9d14e9e0bf017d178fb4bd53b6deb1482 (patch) | |
tree | 4ca0e37218ecc0844d330cb37b956d5219bb1d32 /drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | |
parent | 16fc9c08f0ec7b1c95f1ea4a16097acdb3fc943d (diff) | |
parent | 55bf14961db9da61220e6f04bc9919c94b1a6585 (diff) | |
download | linux-ea68a3e9d14e9e0bf017d178fb4bd53b6deb1482.tar.gz linux-ea68a3e9d14e9e0bf017d178fb4bd53b6deb1482.tar.bz2 linux-ea68a3e9d14e9e0bf017d178fb4bd53b6deb1482.zip |
Merge drm/drm-next into drm-intel-gt-next
Need to pull in commit from drm-next (earlier in drm-intel-next):
1eca0778f4b3 ("drm/i915: add struct i915_dsm to wrap dsm members together")
In order to merge following patch to drm-intel-gt-next:
https://patchwork.freedesktop.org/patch/530942/?series=114925&rev=6
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/umc_v8_10.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 202 |
1 files changed, 171 insertions, 31 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c index da394bc06bba..fb55e8cb9967 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c @@ -209,6 +209,45 @@ static int umc_v8_10_swizzle_mode_na_to_pa(struct amdgpu_device *adev, return 0; } +static void umc_v8_10_convert_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, uint64_t err_addr, + uint32_t ch_inst, uint32_t umc_inst, + uint32_t node_inst, uint64_t mc_umc_status) +{ + uint64_t na_err_addr_base; + uint64_t na_err_addr, retired_page_addr; + uint32_t channel_index, addr_lsb, col = 0; + int ret = 0; + + channel_index = + adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num * + adev->umc.channel_inst_num + + umc_inst * adev->umc.channel_inst_num + + ch_inst]; + + /* the lowest lsb bits should be ignored */ + addr_lsb = REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrLsb); + err_addr &= ~((0x1ULL << addr_lsb) - 1); + na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT); + + /* loop for all possibilities of [C6 C5] in normal address. */ + for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; col++) { + na_err_addr = na_err_addr_base | (col << UMC_V8_10_NA_C5_BIT); + + /* Mapping normal error address to retired soc physical address. */ + ret = umc_v8_10_swizzle_mode_na_to_pa(adev, channel_index, + na_err_addr, &retired_page_addr); + if (ret) { + dev_err(adev->dev, "Failed to map pa from umc na.\n"); + break; + } + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", + retired_page_addr); + amdgpu_umc_fill_error_record(err_data, na_err_addr, + retired_page_addr, channel_index, umc_inst); + } +} + static void umc_v8_10_query_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, uint32_t umc_reg_offset, @@ -218,10 +257,7 @@ static void umc_v8_10_query_error_address(struct amdgpu_device *adev, { uint64_t mc_umc_status_addr; uint64_t mc_umc_status, err_addr; - uint64_t mc_umc_addrt0, na_err_addr_base; - uint64_t na_err_addr, retired_page_addr; - uint32_t channel_index, addr_lsb, col = 0; - int ret = 0; + uint64_t mc_umc_addrt0; mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0); @@ -236,12 +272,6 @@ static void umc_v8_10_query_error_address(struct amdgpu_device *adev, return; } - channel_index = - adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num * - adev->umc.channel_inst_num + - umc_inst * adev->umc.channel_inst_num + - ch_inst]; - /* calculate error address if ue error is detected */ if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 && @@ -251,27 +281,8 @@ static void umc_v8_10_query_error_address(struct amdgpu_device *adev, err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4); err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - /* the lowest lsb bits should be ignored */ - addr_lsb = REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrLsb); - err_addr &= ~((0x1ULL << addr_lsb) - 1); - na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT); - - /* loop for all possibilities of [C6 C5] in normal address. */ - for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; col++) { - na_err_addr = na_err_addr_base | (col << UMC_V8_10_NA_C5_BIT); - - /* Mapping normal error address to retired soc physical address. */ - ret = umc_v8_10_swizzle_mode_na_to_pa(adev, channel_index, - na_err_addr, &retired_page_addr); - if (ret) { - dev_err(adev->dev, "Failed to map pa from umc na.\n"); - break; - } - dev_info(adev->dev, "Error Address(PA): 0x%llx\n", - retired_page_addr); - amdgpu_umc_fill_error_record(err_data, na_err_addr, - retired_page_addr, channel_index, umc_inst); - } + umc_v8_10_convert_error_address(adev, err_data, err_addr, + ch_inst, umc_inst, node_inst, mc_umc_status); } /* clear umc status */ @@ -349,6 +360,133 @@ static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev) return true; } +static void umc_v8_10_ecc_info_query_correctable_error_count(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst, + unsigned long *error_count) +{ + uint64_t mc_umc_status; + uint32_t eccinfo_table_idx; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * + adev->umc.channel_inst_num + + umc_inst * adev->umc.channel_inst_num + + ch_inst; + + /* check the MCUMC_STATUS */ + mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; + if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) { + *error_count += 1; + } +} + +static void umc_v8_10_ecc_info_query_uncorrectable_error_count(struct amdgpu_device *adev, + uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst, + unsigned long *error_count) +{ + uint64_t mc_umc_status; + uint32_t eccinfo_table_idx; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * + adev->umc.channel_inst_num + + umc_inst * adev->umc.channel_inst_num + + ch_inst; + + /* check the MCUMC_STATUS */ + mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; + if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) { + *error_count += 1; + } +} + +static void umc_v8_10_ecc_info_query_ras_error_count(struct amdgpu_device *adev, + void *ras_error_status) +{ + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + + uint32_t node_inst = 0; + uint32_t umc_inst = 0; + uint32_t ch_inst = 0; + + /* TODO: driver needs to toggle DF Cstate to ensure + * safe access of UMC registers. Will add the protection + */ + LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { + umc_v8_10_ecc_info_query_correctable_error_count(adev, + node_inst, umc_inst, ch_inst, + &(err_data->ce_count)); + umc_v8_10_ecc_info_query_uncorrectable_error_count(adev, + node_inst, umc_inst, ch_inst, + &(err_data->ue_count)); + } +} + +static void umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev, + struct ras_err_data *err_data, + uint32_t ch_inst, + uint32_t umc_inst, + uint32_t node_inst) +{ + uint32_t eccinfo_table_idx; + uint64_t mc_umc_status, err_addr; + + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * + adev->umc.channel_inst_num + + umc_inst * adev->umc.channel_inst_num + + ch_inst; + + mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; + + if (mc_umc_status == 0) + return; + + if (!err_data->err_addr) + return; + + /* calculate error address if ue error is detected */ + if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 && + (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1)) { + + err_addr = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_addr; + err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); + + umc_v8_10_convert_error_address(adev, err_data, err_addr, + ch_inst, umc_inst, node_inst, mc_umc_status); + } +} + +static void umc_v8_10_ecc_info_query_ras_error_address(struct amdgpu_device *adev, + void *ras_error_status) +{ + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + + uint32_t node_inst = 0; + uint32_t umc_inst = 0; + uint32_t ch_inst = 0; + + /* TODO: driver needs to toggle DF Cstate to ensure + * safe access of UMC resgisters. Will add the protection + * when firmware interface is ready + */ + LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) { + umc_v8_10_ecc_info_query_error_address(adev, + err_data, + ch_inst, + umc_inst, + node_inst); + } +} + const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = { .query_ras_error_count = umc_v8_10_query_ras_error_count, .query_ras_error_address = umc_v8_10_query_ras_error_address, @@ -360,4 +498,6 @@ struct amdgpu_umc_ras umc_v8_10_ras = { }, .err_cnt_init = umc_v8_10_err_cnt_init, .query_ras_poison_mode = umc_v8_10_query_ras_poison_mode, + .ecc_info_query_ras_error_count = umc_v8_10_ecc_info_query_ras_error_count, + .ecc_info_query_ras_error_address = umc_v8_10_ecc_info_query_ras_error_address, }; |