summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c211
1 files changed, 79 insertions, 132 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index e37fa52f22f7..53fdafd85cb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -66,6 +66,7 @@
#include "amdgpu_pmu.h"
#include <linux/suspend.h>
+#include <drm/task_barrier.h>
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -215,8 +216,8 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
{
uint32_t ret;
- if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
- return amdgpu_virt_kiq_rreg(adev, reg);
+ if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+ return amdgpu_kiq_rreg(adev, reg);
if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
@@ -293,8 +294,8 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
adev->last_mm_index = v;
}
- if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
- return amdgpu_virt_kiq_wreg(adev, reg, v);
+ if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+ return amdgpu_kiq_wreg(adev, reg, v);
if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
@@ -984,7 +985,7 @@ static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
{
struct sysinfo si;
- bool is_os_64 = (sizeof(void *) == 8) ? true : false;
+ bool is_os_64 = (sizeof(void *) == 8);
uint64_t total_memory;
uint64_t dram_size_seven_GB = 0x1B8000000;
uint64_t dram_size_three_GB = 0xB8000000;
@@ -1031,8 +1032,6 @@ def_value:
*/
static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
{
- int ret = 0;
-
if (amdgpu_sched_jobs < 4) {
dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
amdgpu_sched_jobs);
@@ -1072,7 +1071,7 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
- return ret;
+ return 0;
}
/**
@@ -1810,7 +1809,8 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
}
}
- r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
+ if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
+ r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
return r;
}
@@ -2345,14 +2345,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].status.hw = false;
/* handle putting the SMC in the appropriate state */
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
- if (is_support_sw_smu(adev)) {
- r = smu_set_mp1_state(&adev->smu, adev->mp1_state);
- } else if (adev->powerplay.pp_funcs &&
- adev->powerplay.pp_funcs->set_mp1_state) {
- r = adev->powerplay.pp_funcs->set_mp1_state(
- adev->powerplay.pp_handle,
- adev->mp1_state);
- }
+ r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
if (r) {
DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
adev->mp1_state, r);
@@ -2439,7 +2432,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
AMD_IP_BLOCK_TYPE_GFX,
AMD_IP_BLOCK_TYPE_SDMA,
AMD_IP_BLOCK_TYPE_UVD,
- AMD_IP_BLOCK_TYPE_VCE
+ AMD_IP_BLOCK_TYPE_VCE,
+ AMD_IP_BLOCK_TYPE_VCN
};
for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
@@ -2454,7 +2448,11 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
block->status.hw)
continue;
- r = block->version->funcs->hw_init(adev);
+ if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
+ r = block->version->funcs->resume(adev);
+ else
+ r = block->version->funcs->hw_init(adev);
+
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
if (r)
return r;
@@ -2663,14 +2661,38 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
{
struct amdgpu_device *adev =
container_of(__work, struct amdgpu_device, xgmi_reset_work);
+ struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
- if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
- adev->asic_reset_res = (adev->in_baco == false) ?
- amdgpu_device_baco_enter(adev->ddev) :
- amdgpu_device_baco_exit(adev->ddev);
- else
- adev->asic_reset_res = amdgpu_asic_reset(adev);
+ /* It's a bug to not have a hive within this function */
+ if (WARN_ON(!hive))
+ return;
+
+ /*
+ * Use task barrier to synchronize all xgmi reset works across the
+ * hive. task_barrier_enter and task_barrier_exit will block
+ * until all the threads running the xgmi reset works reach
+ * those points. task_barrier_full will do both blocks.
+ */
+ if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
+
+ task_barrier_enter(&hive->tb);
+ adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+
+ task_barrier_exit(&hive->tb);
+ adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
+
+ if (adev->asic_reset_res)
+ goto fail;
+ } else {
+ task_barrier_full(&hive->tb);
+ adev->asic_reset_res = amdgpu_asic_reset(adev);
+ }
+
+fail:
if (adev->asic_reset_res)
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
adev->asic_reset_res, adev->ddev->unique);
@@ -2785,7 +2807,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->mman.buffer_funcs = NULL;
adev->mman.buffer_funcs_ring = NULL;
adev->vm_manager.vm_pte_funcs = NULL;
- adev->vm_manager.vm_pte_num_rqs = 0;
+ adev->vm_manager.vm_pte_num_scheds = 0;
adev->gmc.gmc_funcs = NULL;
adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
@@ -2826,6 +2848,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
hash_init(adev->mn_hash);
mutex_init(&adev->lock_reset);
mutex_init(&adev->psp.mutex);
+ mutex_init(&adev->notifier_lock);
r = amdgpu_device_check_arguments(adev);
if (r)
@@ -3029,6 +3052,14 @@ fence_driver_init:
goto failed;
}
+ DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
+ adev->gfx.config.max_shader_engines,
+ adev->gfx.config.max_sh_per_se,
+ adev->gfx.config.max_cu_per_sh,
+ adev->gfx.cu_info.number);
+
+ amdgpu_ctx_init_sched(adev);
+
adev->accel_working = true;
amdgpu_vm_check_compute_bug(adev);
@@ -3660,8 +3691,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
if (r)
return r;
- amdgpu_amdkfd_pre_reset(adev);
-
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)
@@ -3730,6 +3759,11 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
case CHIP_VEGA10:
case CHIP_VEGA12:
case CHIP_RAVEN:
+ case CHIP_ARCTURUS:
+ case CHIP_RENOIR:
+ case CHIP_NAVI10:
+ case CHIP_NAVI14:
+ case CHIP_NAVI12:
break;
default:
goto disabled;
@@ -3790,18 +3824,13 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
return r;
}
-static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
- struct amdgpu_hive_info *hive,
+static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
struct list_head *device_list_handle,
bool *need_full_reset_arg)
{
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset = *need_full_reset_arg, vram_lost = false;
int r = 0;
- int cpu = smp_processor_id();
- bool use_baco =
- (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
- true : false;
/*
* ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3809,62 +3838,22 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
*/
if (need_full_reset) {
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- /*
- * For XGMI run all resets in parallel to speed up the
- * process by scheduling the highpri wq on different
- * cpus. For XGMI with baco reset, all nodes must enter
- * baco within close proximity before anyone exit.
- */
+ /* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu, system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
+ if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
- cpu = cpumask_next(cpu, cpu_online_mask);
} else
r = amdgpu_asic_reset(tmp_adev);
- if (r)
- break;
- }
-
- /* For XGMI wait for all work to complete before proceed */
- if (!r) {
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- flush_work(&tmp_adev->xgmi_reset_work);
- r = tmp_adev->asic_reset_res;
- if (r)
- break;
- if (use_baco)
- tmp_adev->in_baco = true;
- }
- }
- }
-
- /*
- * For XGMI with baco reset, need exit baco phase by scheduling
- * xgmi_reset_work one more time. PSP reset and sGPU skips this
- * phase. Not assume the situation that PSP reset and baco reset
- * coexist within an XGMI hive.
- */
- if (!r && use_baco) {
- cpu = smp_processor_id();
- list_for_each_entry(tmp_adev, device_list_handle,
- gmc.xgmi.head) {
- if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
- if (!queue_work_on(cpu,
- system_highpri_wq,
- &tmp_adev->xgmi_reset_work))
- r = -EALREADY;
- if (r)
- break;
- cpu = cpumask_next(cpu, cpu_online_mask);
- }
+ if (r) {
+ DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+ r, tmp_adev->ddev->unique);
+ break;
}
}
- if (!r && use_baco) {
+ /* For XGMI wait for all resets to complete before proceed */
+ if (!r) {
list_for_each_entry(tmp_adev, device_list_handle,
gmc.xgmi.head) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
@@ -3872,16 +3861,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_device *adev,
r = tmp_adev->asic_reset_res;
if (r)
break;
- tmp_adev->in_baco = false;
}
}
}
-
- if (r) {
- DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
- r, tmp_adev->ddev->unique);
- goto end;
- }
}
if (!r && amdgpu_ras_intr_triggered())
@@ -3974,7 +3956,7 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
mutex_lock(&adev->lock_reset);
atomic_inc(&adev->gpu_reset_counter);
- adev->in_gpu_reset = 1;
+ adev->in_gpu_reset = true;
switch (amdgpu_asic_reset_method(adev)) {
case AMD_RESET_METHOD_MODE1:
adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
@@ -3994,7 +3976,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
- adev->in_gpu_reset = 0;
+ adev->in_gpu_reset = false;
mutex_unlock(&adev->lock_reset);
}
@@ -4175,8 +4157,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
if (r)
adev->asic_reset_res = r;
} else {
- r = amdgpu_do_asic_reset(adev, hive, device_list_handle,
- &need_full_reset);
+ r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
if (r && r == -EAGAIN)
goto retry;
}
@@ -4377,55 +4358,21 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
if (ras && ras->supported)
adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
- if (is_support_sw_smu(adev)) {
- struct smu_context *smu = &adev->smu;
- int ret;
-
- ret = smu_baco_enter(smu);
- if (ret)
- return ret;
- } else {
- void *pp_handle = adev->powerplay.pp_handle;
- const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
-
- if (!pp_funcs ||!pp_funcs->get_asic_baco_state ||!pp_funcs->set_asic_baco_state)
- return -ENOENT;
-
- /* enter BACO state */
- if (pp_funcs->set_asic_baco_state(pp_handle, 1))
- return -EIO;
- }
-
- return 0;
+ return amdgpu_dpm_baco_enter(adev);
}
int amdgpu_device_baco_exit(struct drm_device *dev)
{
struct amdgpu_device *adev = dev->dev_private;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+ int ret = 0;
if (!amdgpu_device_supports_baco(adev->ddev))
return -ENOTSUPP;
- if (is_support_sw_smu(adev)) {
- struct smu_context *smu = &adev->smu;
- int ret;
-
- ret = smu_baco_exit(smu);
- if (ret)
- return ret;
-
- } else {
- void *pp_handle = adev->powerplay.pp_handle;
- const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
-
- if (!pp_funcs ||!pp_funcs->get_asic_baco_state ||!pp_funcs->set_asic_baco_state)
- return -ENOENT;
-
- /* exit BACO state */
- if (pp_funcs->set_asic_baco_state(pp_handle, 0))
- return -EIO;
- }
+ ret = amdgpu_dpm_baco_exit(adev);
+ if (ret)
+ return ret;
if (ras && ras->supported)
adev->nbio.funcs->enable_doorbell_interrupt(adev, true);