summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd
diff options
context:
space:
mode:
authorTao Zhou <tao.zhou1@amd.com>2024-05-23 11:23:20 +0800
committerAlex Deucher <alexander.deucher@amd.com>2024-06-05 11:25:14 -0400
commitb95fa494d6b74c30eeb4a50481aa1041c631754e (patch)
treeb5f6a3167531f119d94551f4fe6f328702fc8be8 /drivers/gpu/drm/amd
parent15c2990e0f0108b9c3752d7072a97d45d4283aea (diff)
downloadlinux-b95fa494d6b74c30eeb4a50481aa1041c631754e.tar.gz
linux-b95fa494d6b74c30eeb4a50481aa1041c631754e.tar.bz2
linux-b95fa494d6b74c30eeb4a50481aa1041c631754e.zip
drm/amdgpu: add RAS is_rma flag
Set the flag to true if bad page number reaches threshold. Signed-off-by: Tao Zhou <tao.zhou1@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c9
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c10
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h3
4 files changed, 12 insertions, 11 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 8dbfdb767f94..b3d11703df04 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2926,7 +2926,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
u32 max_eeprom_records_count = 0;
- bool exc_err_limit = false;
int ret;
if (!con || amdgpu_sriov_vf(adev))
@@ -2963,12 +2962,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
*/
if (adev->gmc.xgmi.pending_reset)
return 0;
- ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+ ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
/*
- * This calling fails when exc_err_limit is true or
+ * This calling fails when is_rma is true or
* ret != 0.
*/
- if (exc_err_limit || ret)
+ if (con->is_rma || ret)
goto free;
if (con->eeprom_control.ras_num_recs) {
@@ -3016,7 +3015,7 @@ out:
* Except error threshold exceeding case, other failure cases in this
* function would not fail amdgpu driver init.
*/
- if (!exc_err_limit)
+ if (!con->is_rma)
ret = 0;
else
ret = -EINVAL;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 56b9bf63b67f..e70c45712ddb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -522,6 +522,7 @@ struct amdgpu_ras {
bool update_channel_flag;
/* Record status of smu mca debug mode */
bool is_aca_debug_mode;
+ bool is_rma;
/* Record special requirements of gpu reset caller */
uint32_t gpu_reset_flags;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 9b789dcc2bd1..eae0a555df3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -750,6 +750,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
control->tbl_rai.health_percent = 0;
}
+ if (amdgpu_bad_page_threshold != -1)
+ ras->is_rma = true;
+
/* ignore the -ENOTSUPP return value */
amdgpu_dpm_send_rma_reason(adev);
}
@@ -1321,8 +1324,7 @@ Out:
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
}
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
- bool *exceed_err_limit)
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
{
struct amdgpu_device *adev = to_amdgpu_device(control);
unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
@@ -1330,7 +1332,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int res;
- *exceed_err_limit = false;
+ ras->is_rma = false;
if (!__is_ras_eeprom_supported(adev))
return 0;
@@ -1422,7 +1424,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
res = 0;
} else {
- *exceed_err_limit = true;
+ ras->is_rma = true;
dev_err(adev->dev,
"RAS records:%d exceed threshold:%d, "
"GPU will not be initialized. Replace this GPU or increase the threshold",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 6dfd667f3013..b9ebda577797 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -129,8 +129,7 @@ struct eeprom_table_record {
unsigned char mcumc_id;
} __packed;
-int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
- bool *exceed_err_limit);
+int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control);
int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control);