From 126bbd4ab524160e63725d04e838c0f18c917e11 Mon Sep 17 00:00:00 2001 From: Alex Sierra Date: Mon, 12 Apr 2021 13:34:57 -0500 Subject: drm/amdgpu: extend xnack limit page fault timeout Extending this timeout will prevent IH from storm interrupts coming from SDMA while a page fault is active. Currently, on Aldebaran, handling that many interrupts can take a lot of CPU time (up to 4 seconds). This eventually causes timeouts in other tasks. Signed-off-by: Alex Sierra Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c') diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 5715be6770ec..823a367990bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1109,6 +1109,8 @@ static void sdma_v4_0_ctx_switch_enable(struct amdgpu_device *adev, bool enable) if (adev->asic_type == CHIP_ARCTURUS && adev->sdma.instance[i].fw_version >= 14) WREG32_SDMA(i, mmSDMA0_PUB_DUMMY_REG2, enable); + /* Extend page fault timeout to avoid interrupt storm */ + WREG32_SDMA(i, mmSDMA0_UTCL1_TIMEOUT, 0x00800080); } } -- cgit v1.2.3 From 3d2bee9188f24211b56874c7753a304d43278fad Mon Sep 17 00:00:00 2001 From: Feifei Xu Date: Sun, 25 Apr 2021 15:26:15 +0800 Subject: drm/amdgpu: Change the sdma interrupt print level Change the print level into debug. Signed-off-by: Feifei Xu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c') diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 823a367990bf..85ab9bbcd551 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -2229,7 +2229,7 @@ static int sdma_v4_0_print_iv_entry(struct amdgpu_device *adev, memset(&task_info, 0, sizeof(struct amdgpu_task_info)); amdgpu_vm_get_task_info(adev, entry->pasid, &task_info); - dev_info(adev->dev, + dev_dbg_ratelimited(adev->dev, "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u " "pasid:%u, for process %s pid %d thread %s pid %d\n", instance, addr, entry->src_id, entry->ring_id, entry->vmid, @@ -2242,7 +2242,7 @@ static int sdma_v4_0_process_vm_hole_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - dev_err(adev->dev, "MC or SEM address in VM hole\n"); + dev_dbg_ratelimited(adev->dev, "MC or SEM address in VM hole\n"); sdma_v4_0_print_iv_entry(adev, entry); return 0; } @@ -2251,7 +2251,7 @@ static int sdma_v4_0_process_doorbell_invalid_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - dev_err(adev->dev, "SDMA received a doorbell from BIF with byte_enable !=0xff\n"); + dev_dbg_ratelimited(adev->dev, "SDMA received a doorbell from BIF with byte_enable !=0xff\n"); sdma_v4_0_print_iv_entry(adev, entry); return 0; } @@ -2260,7 +2260,7 @@ static int sdma_v4_0_process_pool_timeout_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - dev_err(adev->dev, + dev_dbg_ratelimited(adev->dev, "Polling register/memory timeout executing POLL_REG/MEM with finite timer\n"); sdma_v4_0_print_iv_entry(adev, entry); return 0; @@ -2270,7 +2270,7 @@ static int sdma_v4_0_process_srbm_write_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) { - dev_err(adev->dev, + dev_dbg_ratelimited(adev->dev, "SDMA gets an Register Write SRBM_WRITE command in non-privilege command buffer\n"); sdma_v4_0_print_iv_entry(adev, entry); return 0; -- cgit v1.2.3 From 5d11699914b9993d7c56ff74450f815c7a54e99f Mon Sep 17 00:00:00 2001 From: Feifei Xu Date: Sun, 25 Apr 2021 15:49:23 +0800 Subject: drm/amdgpu: Correct and simplify sdma 4.x irq.num_types Correct and init the sdma4.x irq.num_types. v2: squash in fix (Alex) Signed-off-by: Feifei Xu Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c') diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 85ab9bbcd551..d197185f7789 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -2599,27 +2599,18 @@ static const struct amdgpu_irq_src_funcs sdma_v4_0_srbm_write_irq_funcs = { static void sdma_v4_0_set_irq_funcs(struct amdgpu_device *adev) { + adev->sdma.trap_irq.num_types = adev->sdma.num_instances; + adev->sdma.ecc_irq.num_types = adev->sdma.num_instances; + /*For Arcturus and Aldebaran, add another 4 irq handler*/ switch (adev->sdma.num_instances) { - case 1: - adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE1; - adev->sdma.ecc_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE1; - break; case 5: - adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE5; - adev->sdma.ecc_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE5; - break; case 8: - adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_LAST; - adev->sdma.ecc_irq.num_types = AMDGPU_SDMA_IRQ_LAST; - adev->sdma.vm_hole_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE5; - adev->sdma.doorbell_invalid_irq.num_types = AMDGPU_SDMA_IRQ_LAST; - adev->sdma.pool_timeout_irq.num_types = AMDGPU_SDMA_IRQ_LAST; - adev->sdma.srbm_write_irq.num_types = AMDGPU_SDMA_IRQ_LAST; + adev->sdma.vm_hole_irq.num_types = adev->sdma.num_instances; + adev->sdma.doorbell_invalid_irq.num_types = adev->sdma.num_instances; + adev->sdma.pool_timeout_irq.num_types = adev->sdma.num_instances; + adev->sdma.srbm_write_irq.num_types = adev->sdma.num_instances; break; - case 2: default: - adev->sdma.trap_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE2; - adev->sdma.ecc_irq.num_types = AMDGPU_SDMA_IRQ_INSTANCE2; break; } adev->sdma.trap_irq.funcs = &sdma_v4_0_trap_irq_funcs; -- cgit v1.2.3