From c5093cddf56baceb1545028e8a5971d94cf59d25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 28 Sep 2022 13:21:05 +0200 Subject: drm/amdgpu: drop the fence argument from amdgpu_vmid_grab MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is always the job anyway. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-5-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 46c99331d7f1..5aa053acc0b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -256,9 +256,7 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job, } while (fence == NULL && vm && !job->vmid) { - r = amdgpu_vmid_grab(vm, ring, &job->sync, - &job->base.s_fence->finished, - job); + r = amdgpu_vmid_grab(vm, ring, &job->sync, job); if (r) DRM_ERROR("Error getting VM ID (%d)\n", r); -- cgit v1.2.3 From 940ca22b7ea9db6857ba7c6adb961b84d8cc28ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 28 Sep 2022 14:00:57 +0200 Subject: drm/amdgpu: drop amdgpu_sync from amdgpu_vmid_grab v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead return the fence directly. Avoids memory allocation to store the fence. v2: cleanup coding style as well Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-6-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 5aa053acc0b4..384149a81978 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -239,12 +239,12 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, return 0; } -static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job, - struct drm_sched_entity *s_entity) +static struct dma_fence * +amdgpu_job_dependency(struct drm_sched_job *sched_job, + struct drm_sched_entity *s_entity) { struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); struct amdgpu_job *job = to_amdgpu_job(sched_job); - struct amdgpu_vm *vm = job->vm; struct dma_fence *fence; int r; @@ -255,12 +255,10 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job, DRM_ERROR("Error adding fence (%d)\n", r); } - while (fence == NULL && vm && !job->vmid) { - r = amdgpu_vmid_grab(vm, ring, &job->sync, job); + while (!fence && job->vm && !job->vmid) { + r = amdgpu_vmid_grab(job->vm, ring, job, &fence); if (r) DRM_ERROR("Error getting VM ID (%d)\n", r); - - fence = amdgpu_sync_get_fence(&job->sync); } if (!fence && job->gang_submit) -- cgit v1.2.3 From f7d66fb2ea43a3016e78a700a2ca6c77a74579f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 28 Sep 2022 20:31:38 +0200 Subject: drm/amdgpu: cleanup scheduler job initialization v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Init the DRM scheduler base class while allocating the job. This makes the whole handling much more cleaner. v2: fix coding style Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-7-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 44 +++++++++++++++++---------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 384149a81978..fa60d25dfbd1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -89,8 +89,9 @@ exit: return DRM_GPU_SCHED_STAT_NOMINAL; } -int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, - struct amdgpu_job **job, struct amdgpu_vm *vm) +int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, + struct drm_sched_entity *entity, void *owner, + unsigned int num_ibs, struct amdgpu_job **job) { if (num_ibs == 0) return -EINVAL; @@ -111,23 +112,30 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; - return 0; + if (!entity) + return 0; + + return drm_sched_job_init(&(*job)->base, entity, owner); } -int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size, - enum amdgpu_ib_pool_type pool_type, - struct amdgpu_job **job) +int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, + struct drm_sched_entity *entity, void *owner, + size_t size, enum amdgpu_ib_pool_type pool_type, + struct amdgpu_job **job) { int r; - r = amdgpu_job_alloc(adev, 1, job, NULL); + r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job); if (r) return r; (*job)->num_ibs = 1; r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]); - if (r) + if (r) { + if (entity) + drm_sched_job_cleanup(&(*job)->base); kfree(*job); + } return r; } @@ -191,6 +199,9 @@ void amdgpu_job_set_gang_leader(struct amdgpu_job *job, void amdgpu_job_free(struct amdgpu_job *job) { + if (job->base.entity) + drm_sched_job_cleanup(&job->base); + amdgpu_job_free_resources(job); amdgpu_sync_free(&job->sync); amdgpu_sync_free(&job->sched_sync); @@ -203,25 +214,16 @@ void amdgpu_job_free(struct amdgpu_job *job) dma_fence_put(&job->hw_fence); } -int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity, - void *owner, struct dma_fence **f) +struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job) { - int r; - - if (!f) - return -EINVAL; - - r = drm_sched_job_init(&job->base, entity, owner); - if (r) - return r; + struct dma_fence *f; drm_sched_job_arm(&job->base); - - *f = dma_fence_get(&job->base.s_fence->finished); + f = dma_fence_get(&job->base.s_fence->finished); amdgpu_job_free_resources(job); drm_sched_entity_push_job(&job->base); - return 0; + return f; } int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, -- cgit v1.2.3 From 1b2d5eda5ad785d0dd13484141b78d2ac366c169 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 29 Sep 2022 13:05:56 +0200 Subject: drm/amdgpu: move explicit sync check into the CS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves the memory allocation out of the critical code path. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-8-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index fa60d25dfbd1..6c126797d6a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -108,7 +108,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, (*job)->vm = vm; amdgpu_sync_create(&(*job)->sync); - amdgpu_sync_create(&(*job)->sched_sync); + amdgpu_sync_create(&(*job)->explicit_sync); (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; @@ -176,7 +176,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) drm_sched_job_cleanup(s_job); amdgpu_sync_free(&job->sync); - amdgpu_sync_free(&job->sched_sync); + amdgpu_sync_free(&job->explicit_sync); dma_fence_put(&job->hw_fence); } @@ -204,7 +204,7 @@ void amdgpu_job_free(struct amdgpu_job *job) amdgpu_job_free_resources(job); amdgpu_sync_free(&job->sync); - amdgpu_sync_free(&job->sched_sync); + amdgpu_sync_free(&job->explicit_sync); if (job->gang_submit != &job->base.s_fence->scheduled) dma_fence_put(job->gang_submit); @@ -251,12 +251,6 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job, int r; fence = amdgpu_sync_get_fence(&job->sync); - if (fence && drm_sched_dependency_optimized(fence, s_entity)) { - r = amdgpu_sync_fence(&job->sched_sync, fence); - if (r) - DRM_ERROR("Error adding fence (%d)\n", r); - } - while (!fence && job->vm && !job->vmid) { r = amdgpu_vmid_grab(job->vm, ring, job, &fence); if (r) -- cgit v1.2.3 From 1728baa7e4e60054bf13dd9b1212d133cbd53b3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 29 Sep 2022 14:04:01 +0200 Subject: drm/amdgpu: use scheduler dependencies for CS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Entirely remove the sync obj in the job. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-11-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 6c126797d6a4..16f7bcc9c654 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -107,7 +107,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, (*job)->base.sched = &adev->rings[0]->sched; (*job)->vm = vm; - amdgpu_sync_create(&(*job)->sync); amdgpu_sync_create(&(*job)->explicit_sync); (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; @@ -175,9 +174,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) drm_sched_job_cleanup(s_job); - amdgpu_sync_free(&job->sync); amdgpu_sync_free(&job->explicit_sync); - dma_fence_put(&job->hw_fence); } @@ -203,7 +200,6 @@ void amdgpu_job_free(struct amdgpu_job *job) drm_sched_job_cleanup(&job->base); amdgpu_job_free_resources(job); - amdgpu_sync_free(&job->sync); amdgpu_sync_free(&job->explicit_sync); if (job->gang_submit != &job->base.s_fence->scheduled) dma_fence_put(job->gang_submit); @@ -247,10 +243,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job, { struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); struct amdgpu_job *job = to_amdgpu_job(sched_job); - struct dma_fence *fence; + struct dma_fence *fence = NULL; int r; - fence = amdgpu_sync_get_fence(&job->sync); while (!fence && job->vm && !job->vmid) { r = amdgpu_vmid_grab(job->vm, ring, job, &fence); if (r) @@ -274,8 +269,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) job = to_amdgpu_job(sched_job); finished = &job->base.s_fence->finished; - BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); - trace_amdgpu_sched_run_job(job); /* Skip job if VRAM is lost and never resubmit gangs */ -- cgit v1.2.3 From a82f30b04c6aaefe62cbbfd297e1bb23435b6b3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Thu, 29 Sep 2022 15:01:57 +0200 Subject: drm/scheduler: rename dependency callback into prepare_job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This now matches much better what this is doing. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-14-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 16f7bcc9c654..172572cfed36 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -238,7 +238,7 @@ int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, } static struct dma_fence * -amdgpu_job_dependency(struct drm_sched_job *sched_job, +amdgpu_job_prepare_job(struct drm_sched_job *sched_job, struct drm_sched_entity *s_entity) { struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); @@ -327,7 +327,7 @@ void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) } const struct drm_sched_backend_ops amdgpu_sched_ops = { - .dependency = amdgpu_job_dependency, + .prepare_job = amdgpu_job_prepare_job, .run_job = amdgpu_job_run, .timedout_job = amdgpu_job_timedout, .free_job = amdgpu_job_free_cb -- cgit v1.2.3 From b09d6acba1d9a23963fedf96b4191502a4fec25d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Fri, 18 Nov 2022 13:32:30 +0100 Subject: drm/amdgpu: handle gang submit before VMID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise it can happen that not all gang members can get a VMID assigned and we deadlock. Signed-off-by: Christian König Tested-by: Timur Kristóf Acked-by: Timur Kristóf Fixes: 68ce8b242242 ("drm/amdgpu: add gang submit backend v2") Reviewed-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20221118153023.312582-1-christian.koenig@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index cd968e781077..abb99cff8b4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -254,6 +254,9 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job, DRM_ERROR("Error adding fence (%d)\n", r); } + if (!fence && job->gang_submit) + fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit); + while (fence == NULL && vm && !job->vmid) { r = amdgpu_vmid_grab(vm, ring, &job->sync, &job->base.s_fence->finished, @@ -264,9 +267,6 @@ static struct dma_fence *amdgpu_job_dependency(struct drm_sched_job *sched_job, fence = amdgpu_sync_get_fence(&job->sync); } - if (!fence && job->gang_submit) - fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit); - return fence; } -- cgit v1.2.3 From 3cb93f390453cde4d6afda1587aaa00e75e09617 Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Wed, 16 Nov 2022 17:08:22 +0800 Subject: drm/amdgpu: fix use-after-free during gpu recovery [Why] [ 754.862560] refcount_t: underflow; use-after-free. [ 754.862898] Call Trace: [ 754.862903] [ 754.862913] amdgpu_job_free_cb+0xc2/0xe1 [amdgpu] [ 754.863543] drm_sched_main.cold+0x34/0x39 [amd_sched] [How] The fw_fence may be not init, check whether dma_fence_init is performed before job free Signed-off-by: Stanley.Yang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index cd968e781077..f4a3122352de 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -169,7 +169,11 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) amdgpu_sync_free(&job->sync); amdgpu_sync_free(&job->sched_sync); - dma_fence_put(&job->hw_fence); + /* only put the hw fence if has embedded fence */ + if (!job->hw_fence.ops) + kfree(job); + else + dma_fence_put(&job->hw_fence); } void amdgpu_job_set_gang_leader(struct amdgpu_job *job, -- cgit v1.2.3 From 0317d73954850c48268f3db00a49e676d12b10cf Mon Sep 17 00:00:00 2001 From: "Stanley.Yang" Date: Wed, 16 Nov 2022 17:08:22 +0800 Subject: drm/amdgpu: fix use-after-free during gpu recovery [Why] [ 754.862560] refcount_t: underflow; use-after-free. [ 754.862898] Call Trace: [ 754.862903] [ 754.862913] amdgpu_job_free_cb+0xc2/0xe1 [amdgpu] [ 754.863543] drm_sched_main.cold+0x34/0x39 [amd_sched] [How] The fw_fence may be not init, check whether dma_fence_init is performed before job free Signed-off-by: Stanley.Yang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 032651a655f0..7f1ca90a552c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -174,7 +174,12 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) drm_sched_job_cleanup(s_job); amdgpu_sync_free(&job->explicit_sync); - dma_fence_put(&job->hw_fence); + + /* only put the hw fence if has embedded fence */ + if (!job->hw_fence.ops) + kfree(job); + else + dma_fence_put(&job->hw_fence); } void amdgpu_job_set_gang_leader(struct amdgpu_job *job, -- cgit v1.2.3