summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm
diff options
context:
space:
mode:
authorAlex Deucher <alexander.deucher@amd.com>2024-10-14 11:58:34 -0400
committerAlex Deucher <alexander.deucher@amd.com>2024-11-04 11:27:04 -0500
commitefe6a8774375ddcbdd46fb920be55cc2d0120836 (patch)
treeed8164297fdda4c6bced3c0d3d5434873274f9ff /drivers/gpu/drm
parent5fd95dab6094ba0b851767fc460c2806eaafe8bd (diff)
downloadlinux-efe6a8774375ddcbdd46fb920be55cc2d0120836.tar.gz
linux-efe6a8774375ddcbdd46fb920be55cc2d0120836.tar.bz2
linux-efe6a8774375ddcbdd46fb920be55cc2d0120836.zip
drm/amdgpu: fix fairness in enforce isolation handling
Make sure KFD gets a turn when serializing access to the GC IP. Currently non-KFD jobs can starve KFD if they submit often enough. This patch prevents that by stalling non-KFD if its time period has elapsed. v2: fix units v3: check enablement properly Acked-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c53
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h2
3 files changed, 54 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 607998d8b1d5..7645e498faa4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -118,7 +118,7 @@
#define MAX_GPU_INSTANCE 64
-#define GFX_SLICE_PERIOD msecs_to_jiffies(250)
+#define GFX_SLICE_PERIOD_MS 250
struct amdgpu_gpu_instance {
struct amdgpu_device *adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index e96984c53e72..b8cc4b146bdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1752,7 +1752,7 @@ static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx,
if (adev->gfx.kfd_sch_req_count[idx] == 0 &&
adev->gfx.kfd_sch_inactive[idx]) {
schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
- GFX_SLICE_PERIOD);
+ msecs_to_jiffies(adev->gfx.enforce_isolation_time[idx]));
}
} else {
if (adev->gfx.kfd_sch_req_count[idx] == 0) {
@@ -1807,8 +1807,9 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]);
}
if (fences) {
+ /* we've already had our timeslice, so let's wrap this up */
schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work,
- GFX_SLICE_PERIOD);
+ msecs_to_jiffies(1));
} else {
/* Tell KFD to resume the runqueue */
if (adev->kfd.init_complete) {
@@ -1821,6 +1822,51 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work)
mutex_unlock(&adev->enforce_isolation_mutex);
}
+static void
+amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev,
+ u32 idx)
+{
+ unsigned long cjiffies;
+ bool wait = false;
+
+ mutex_lock(&adev->enforce_isolation_mutex);
+ if (adev->enforce_isolation[idx]) {
+ /* set the initial values if nothing is set */
+ if (!adev->gfx.enforce_isolation_jiffies[idx]) {
+ adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
+ adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
+ }
+ /* Make sure KFD gets a chance to run */
+ if (amdgpu_amdkfd_compute_active(adev, idx)) {
+ cjiffies = jiffies;
+ if (time_after(cjiffies, adev->gfx.enforce_isolation_jiffies[idx])) {
+ cjiffies -= adev->gfx.enforce_isolation_jiffies[idx];
+ if ((jiffies_to_msecs(cjiffies) >= GFX_SLICE_PERIOD_MS)) {
+ /* if our time is up, let KGD work drain before scheduling more */
+ wait = true;
+ /* reset the timer period */
+ adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
+ } else {
+ /* set the timer period to what's left in our time slice */
+ adev->gfx.enforce_isolation_time[idx] =
+ GFX_SLICE_PERIOD_MS - jiffies_to_msecs(cjiffies);
+ }
+ } else {
+ /* if jiffies wrap around we will just wait a little longer */
+ adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
+ }
+ } else {
+ /* if there is no KFD work, then set the full slice period */
+ adev->gfx.enforce_isolation_jiffies[idx] = jiffies;
+ adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS;
+ }
+ }
+ mutex_unlock(&adev->enforce_isolation_mutex);
+
+ if (wait)
+ msleep(GFX_SLICE_PERIOD_MS);
+}
+
void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
{
struct amdgpu_device *adev = ring->adev;
@@ -1837,6 +1883,9 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring)
if (idx >= MAX_XCP)
return;
+ /* Don't submit more work until KFD has had some time */
+ amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx);
+
mutex_lock(&adev->enforce_isolation_mutex);
if (adev->enforce_isolation[idx]) {
if (adev->kfd.init_complete)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index f710178a21bc..af9dbd760fee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -472,6 +472,8 @@ struct amdgpu_gfx {
struct mutex kfd_sch_mutex;
u64 kfd_sch_req_count[MAX_XCP];
bool kfd_sch_inactive[MAX_XCP];
+ unsigned long enforce_isolation_jiffies[MAX_XCP];
+ unsigned long enforce_isolation_time[MAX_XCP];
};
struct amdgpu_gfx_ras_reg_entry {