diff options
author | Tao Zhou <tao.zhou1@amd.com> | 2024-05-23 11:23:20 +0800 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-06-05 11:25:14 -0400 |
commit | b95fa494d6b74c30eeb4a50481aa1041c631754e (patch) | |
tree | b5f6a3167531f119d94551f4fe6f328702fc8be8 /drivers/gpu/drm/amd | |
parent | 15c2990e0f0108b9c3752d7072a97d45d4283aea (diff) | |
download | linux-b95fa494d6b74c30eeb4a50481aa1041c631754e.tar.gz linux-b95fa494d6b74c30eeb4a50481aa1041c631754e.tar.bz2 linux-b95fa494d6b74c30eeb4a50481aa1041c631754e.zip |
drm/amdgpu: add RAS is_rma flag
Set the flag to true if the number of bad pages reaches the threshold.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 |
4 files changed, 12 insertions, 11 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8dbfdb767f94..b3d11703df04 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2926,7 +2926,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; u32 max_eeprom_records_count = 0; - bool exc_err_limit = false; int ret; if (!con || amdgpu_sriov_vf(adev)) @@ -2963,12 +2962,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) */ if (adev->gmc.xgmi.pending_reset) return 0; - ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit); + ret = amdgpu_ras_eeprom_init(&con->eeprom_control); /* - * This calling fails when exc_err_limit is true or + * This calling fails when is_rma is true or * ret != 0. */ - if (exc_err_limit || ret) + if (con->is_rma || ret) goto free; if (con->eeprom_control.ras_num_recs) { @@ -3016,7 +3015,7 @@ out: * Except error threshold exceeding case, other failure cases in this * function would not fail amdgpu driver init. 
*/ - if (!exc_err_limit) + if (!con->is_rma) ret = 0; else ret = -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 56b9bf63b67f..e70c45712ddb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -522,6 +522,7 @@ struct amdgpu_ras { bool update_channel_flag; /* Record status of smu mca debug mode */ bool is_aca_debug_mode; + bool is_rma; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 9b789dcc2bd1..eae0a555df3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) control->tbl_rai.health_percent = 0; } + if (amdgpu_bad_page_threshold != -1) + ras->is_rma = true; + /* ignore the -ENOTSUPP return value */ amdgpu_dpm_send_rma_reason(adev); } @@ -1321,8 +1324,7 @@ Out: return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) { struct amdgpu_device *adev = to_amdgpu_device(control); unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; @@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); int res; - *exceed_err_limit = false; + ras->is_rma = false; if (!__is_ras_eeprom_supported(adev)) return 0; @@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1."); res = 0; } else { - *exceed_err_limit = true; + ras->is_rma = true; dev_err(adev->dev, "RAS records:%d exceed threshold:%d, " "GPU will not be initialized. 
Replace this GPU or increase the threshold", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 6dfd667f3013..b9ebda577797 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -129,8 +129,7 @@ struct eeprom_table_record { unsigned char mcumc_id; } __packed; -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, - bool *exceed_err_limit); +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control); int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); |