linux/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
Jonathan Kim 12976e6a5a drm/amdkfd: add debug device snapshot operation
Similar to queue snapshot, return an array of device information using
an entry_size check and return.
Unlike queue snapshots, the debugger needs to pass the correct number of
devices that exist.  If it fails to do so, the KFD will return the
number of actual devices so that the debugger can make a subsequent
successful call.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2023-06-09 12:37:00 -04:00

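The two-call sizing pattern the commit message describes can be exercised from the debugger side roughly as follows. This is a minimal, hypothetical sketch: it assumes the uapi names below (AMDKFD_IOC_DBG_TRAP, KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT, struct kfd_ioctl_dbg_trap_args and its device_snapshot member) match the installed <linux/kfd_ioctl.h>, and that the target is already debug enabled; it illustrates the contract, it is not verified tooling.

/*
 * Hypothetical debugger-side sketch: a first call with num_devices == 0
 * learns the real device count and the kernel's entry size, a second
 * call fetches the entries.  All uapi names are assumed to match the
 * installed <linux/kfd_ioctl.h>.
 */
#include <linux/kfd_ioctl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static struct kfd_dbg_device_info_entry *
get_device_snapshot(int kfd_fd, uint64_t target_pid, uint32_t *num_devices)
{
	struct kfd_ioctl_dbg_trap_args args;
	struct kfd_dbg_device_info_entry *entries;

	memset(&args, 0, sizeof(args));
	args.pid = target_pid;
	args.op = KFD_IOC_DBG_TRAP_GET_DEVICE_SNAPSHOT;
	args.device_snapshot.entry_size = sizeof(*entries);
	args.device_snapshot.num_devices = 0;	/* deliberately too small */

	/* First call: KFD clamps entry_size and reports the device count. */
	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) == -1)
		return NULL;

	entries = calloc(args.device_snapshot.num_devices, sizeof(*entries));
	if (!entries)
		return NULL;

	/* Second call: the correct count and a buffer sized to match. */
	args.device_snapshot.snapshot_buf_ptr = (uint64_t)(uintptr_t)entries;
	if (ioctl(kfd_fd, AMDKFD_IOC_DBG_TRAP, &args) == -1) {
		free(entries);
		return NULL;
	}

	*num_devices = args.device_snapshot.num_devices;
	return entries;
}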

/*
 * Copyright 2023 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include "kfd_debug.h"
#include "kfd_device_queue_manager.h"
#include "kfd_topology.h"
#include <linux/file.h>
#include <uapi/linux/kfd_ioctl.h>
#define MAX_WATCH_ADDRESSES 4
int kfd_dbg_ev_query_debug_event(struct kfd_process *process,
		      unsigned int *queue_id,
		      unsigned int *gpu_id,
		      uint64_t exception_clear_mask,
		      uint64_t *event_status)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	if (!(process && process->debug_trap_enabled))
		return -ENODATA;

	mutex_lock(&process->event_mutex);
	*event_status = 0;
	*queue_id = 0;
	*gpu_id = 0;

	/* find and report queue events */
	pqm = &process->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		uint64_t tmp = process->exception_enable_mask;

		if (!pqn->q)
			continue;

		tmp &= pqn->q->properties.exception_status;

		if (!tmp)
			continue;

		*event_status = pqn->q->properties.exception_status;
		*queue_id = pqn->q->properties.queue_id;
		*gpu_id = pqn->q->device->id;
		pqn->q->properties.exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* find and report device events */
	for (i = 0; i < process->n_pdds; i++) {
		struct kfd_process_device *pdd = process->pdds[i];
		uint64_t tmp = process->exception_enable_mask
						& pdd->exception_status;

		if (!tmp)
			continue;

		*event_status = pdd->exception_status;
		*gpu_id = pdd->dev->id;
		pdd->exception_status &= ~exception_clear_mask;
		goto out;
	}

	/* report process events */
	if (process->exception_enable_mask & process->exception_status) {
		*event_status = process->exception_status;
		process->exception_status &= ~exception_clear_mask;
	}

out:
	mutex_unlock(&process->event_mutex);
	return *event_status ? 0 : -EAGAIN;
}

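/*
 * Poke the debugger's polled event descriptor from a workqueue.
 * Scheduled by kfd_dbg_ev_raise() when the caller requests deferred
 * signalling (use_worker), e.g. from interrupt handling paths.
 */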
void debug_event_write_work_handler(struct work_struct *work)
{
	struct kfd_process *process;
	static const char write_data = '.';
	loff_t pos = 0;

	process = container_of(work,
			struct kfd_process,
			debug_event_workarea);

	kernel_write(process->dbg_ev_file, &write_data, 1, &pos);
}

/* update process/device/queue exception status, write to descriptor
 * only if exception_status is enabled.
 */
bool kfd_dbg_ev_raise(uint64_t event_mask,
			struct kfd_process *process, struct kfd_node *dev,
			unsigned int source_id, bool use_worker,
			void *exception_data, size_t exception_data_size)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;
	static const char write_data = '.';
	loff_t pos = 0;
	bool is_subscribed = true;

	if (!(process && process->debug_trap_enabled))
		return false;

	mutex_lock(&process->event_mutex);

	if (event_mask & KFD_EC_MASK_DEVICE) {
		for (i = 0; i < process->n_pdds; i++) {
			struct kfd_process_device *pdd = process->pdds[i];

			if (pdd->dev != dev)
				continue;

			pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE;

			if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
				if (!pdd->vm_fault_exc_data) {
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
							DUMP_PREFIX_OFFSET,
							exception_data,
							exception_data_size);
				}
			}
			break;
		}
	} else if (event_mask & KFD_EC_MASK_PROCESS) {
		process->exception_status |= event_mask & KFD_EC_MASK_PROCESS;
	} else {
		pqm = &process->pqm;
		list_for_each_entry(pqn, &pqm->queues,
				process_queue_list) {
			int target_id;

			if (!pqn->q)
				continue;

			target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ?
					pqn->q->properties.queue_id :
					pqn->q->doorbell_id;

			if (pqn->q->device != dev || target_id != source_id)
				continue;

			pqn->q->properties.exception_status |= event_mask;
			break;
		}
	}

	if (process->exception_enable_mask & event_mask) {
		if (use_worker)
			schedule_work(&process->debug_event_workarea);
		else
			kernel_write(process->dbg_ev_file,
					&write_data,
					1,
					&pos);
	} else {
		is_subscribed = false;
	}

	mutex_unlock(&process->event_mutex);

	return is_subscribed;
}

/* set pending event queue entry from ring entry */
bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev,
				   unsigned int pasid,
				   uint32_t doorbell_id,
				   uint64_t trap_mask,
				   void *exception_data,
				   size_t exception_data_size)
{
	struct kfd_process *p;
	bool signaled_to_debugger_or_runtime = false;

	p = kfd_lookup_process_by_pasid(pasid);

	if (!p)
		return false;

	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
			      exception_data, exception_data_size)) {
		struct process_queue_manager *pqm;
		struct process_queue_node *pqn;

		if (!!(trap_mask & KFD_EC_MASK_QUEUE) &&
		    p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) {
			mutex_lock(&p->mutex);

			pqm = &p->pqm;
			list_for_each_entry(pqn, &pqm->queues,
					process_queue_list) {
				if (!(pqn->q && pqn->q->device == dev &&
				      pqn->q->doorbell_id == doorbell_id))
					continue;

				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);

				signaled_to_debugger_or_runtime = true;
				break;
			}

			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
						  exception_data);

			signaled_to_debugger_or_runtime = true;
		}
	} else {
		signaled_to_debugger_or_runtime = true;
	}

	kfd_unref_process(p);

	return signaled_to_debugger_or_runtime;
}

int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
				      unsigned int dev_id,
				      unsigned int queue_id,
				      uint64_t error_reason)
{
	if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
		struct kfd_process_device *pdd = NULL;
		struct kfd_hsa_memory_exception_data *data;
		int i;

		for (i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->id == dev_id) {
				pdd = p->pdds[i];
				break;
			}
		}

		if (!pdd)
			return -ENODEV;

		data = (struct kfd_hsa_memory_exception_data *)
						pdd->vm_fault_exc_data;

		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
		error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
	}

	if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
		/*
		 * block should only happen after the debugger receives runtime
		 * enable notice.
		 */
		up(&p->runtime_enable_sema);
		error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
	}

	if (error_reason)
		return kfd_send_exception_to_runtime(p, queue_id, error_reason);

	return 0;
}

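/*
 * The MQD debug workaround below applies only to GC 11.x devices and
 * cannot coexist with a user-set CU mask, hence the -EBUSY.
 */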
static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
{
	struct mqd_update_info minfo = {0};
	int err;

	if (!q)
		return 0;

	if (KFD_GC_VERSION(q->device) < IP_VERSION(11, 0, 0) ||
	    KFD_GC_VERSION(q->device) >= IP_VERSION(12, 0, 0))
		return 0;

	if (enable && q->properties.is_user_cu_masked)
		return -EBUSY;

	minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE;

	q->properties.is_dbg_wa = enable;
	err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo);
	if (err)
		q->properties.is_dbg_wa = false;

	return err;
}

static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable)
{
	struct process_queue_manager *pqm = &target->pqm;
	struct process_queue_node *pqn;
	int r = 0;

	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
		if (enable && r)
			goto unwind;
	}

	return 0;

unwind:
	list_for_each_entry(pqn, &pqm->queues, process_queue_list)
		kfd_dbg_set_queue_workaround(pqn->q, false);

	if (enable)
		target->runtime_info.runtime_state = r == -EBUSY ?
				DEBUG_RUNTIME_STATE_ENABLED_BUSY :
				DEBUG_RUNTIME_STATE_ENABLED_ERROR;

	return r;
}

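/*
 * Push the per-VMID SPI debug state (trap override, launch mode, watch
 * points and debug flags) to the MES scheduler; a no-op on devices
 * without per-VMID debug support.
 */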
int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
{
	uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
	uint32_t flags = pdd->process->dbg_flags;

	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
		return 0;

	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags);
}

#define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1
static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id)
{
	int i;

	*watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return -ENOMEM;
}

static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
}

static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id)
{
	bool owns_watch_id = false;

	spin_lock(&pdd->dev->kfd->watch_points_lock);

	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);

	return owns_watch_id;
}

int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd,
					 uint32_t watch_id)
{
	int r;

	if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id))
		return -EINVAL;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r)
			return r;
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
							pdd->dev->adev,
							watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd);

	kfd_dbg_clear_dev_watch_id(pdd, watch_id);

	return r;
}

int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd,
				       uint64_t watch_address,
				       uint32_t watch_address_mask,
				       uint32_t *watch_id,
				       uint32_t watch_mode)
{
	int r = kfd_dbg_get_dev_watch_id(pdd, watch_id);

	if (r)
		return r;

	if (!pdd->dev->kfd->shared_resources.enable_mes) {
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
			return r;
		}
	}

	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch(
				pdd->dev->adev,
				watch_address,
				watch_address_mask,
				*watch_id,
				watch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);

	return 0;
}

static void kfd_dbg_clear_process_address_watch(struct kfd_process *target)
{
	int i, j;

	for (i = 0; i < target->n_pdds; i++)
		for (j = 0; j < MAX_WATCH_ADDRESSES; j++)
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
}

int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags)
{
	uint32_t prev_flags = target->dbg_flags;
	int i, r = 0, rewind_count = 0;

	for (i = 0; i < target->n_pdds; i++) {
		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
			(*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) {
			*flags = prev_flags;
			return -EACCES;
		}
	}

	target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP;
	*flags = prev_flags;
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->dbg_flags = prev_flags;
			break;
		}

		rewind_count++;
	}

	/* Rewind flags */
	if (r) {
		target->dbg_flags = prev_flags;

		for (i = 0; i < rewind_count; i++) {
			struct kfd_process_device *pdd = target->pdds[i];

			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd);
		}
	}

	return r;
}

/* kfd_dbg_trap_deactivate:
 *	target: target process
 *	unwind: If this is unwinding a failed kfd_dbg_trap_enable()
 *	unwind_count:
 *		If unwind == true, how far down the pdd list we need
 *		to unwind
 *		else: ignored
 */
void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
{
	int i;

	if (!unwind) {
		uint32_t flags = 0;

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		/* If this is an unwind, and we have unwound the required
		 * enable calls on the pdd list, we need to stop now
		 * otherwise we may mess up another debugger session.
		 */
		if (unwind && i == unwind_count)
			break;

		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
				release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd);
	}

	kfd_dbg_set_workaround(target, false);

	if (!unwind) {
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);
	}
}

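/*
 * Drain pending interrupts and clear all device, queue and process
 * exception status when the debug session is torn down.
 */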
static void kfd_dbg_clean_exception_status(struct kfd_process *target)
{
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	int i;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		kfd_process_drain_interrupts(pdd);

		pdd->exception_status = 0;
	}

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		pqn->q->properties.exception_status = 0;
	}

	target->exception_status = 0;
}

int kfd_dbg_trap_disable(struct kfd_process *target)
{
	if (!target->debug_trap_enabled)
		return 0;

	/*
	 * Defer deactivation to runtime if runtime not enabled otherwise reset
	 * attached running target runtime state to enable for re-attach.
	 */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_deactivate(target, false, 0);
	else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
		target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;

	fput(target->dbg_ev_file);
	target->dbg_ev_file = NULL;

	if (target->debugger_process) {
		atomic_dec(&target->debugger_process->debugged_process_count);
		target->debugger_process = NULL;
	}

	target->debug_trap_enabled = false;
	kfd_dbg_clean_exception_status(target);
	kfd_unref_process(target);

	return 0;
}

int kfd_dbg_trap_activate(struct kfd_process *target)
{
	int i, r = 0;

	r = kfd_dbg_set_workaround(target, true);
	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);

			if (r) {
				target->runtime_info.runtime_state = (r == -EBUSY) ?
							DEBUG_RUNTIME_STATE_ENABLED_BUSY :
							DEBUG_RUNTIME_STATE_ENABLED_ERROR;

				goto unwind_err;
			}
		}

		/* Disable GFX OFF to prevent garbage read/writes to debug registers.
		 * If RLC restore of debug registers is not supported and runtime enable
		 * hasn't done so already on ttmp setup request, restore the trap config registers.
		 *
		 * If RLC restore of debug registers is not supported, keep gfx off disabled for
		 * the debug session.
		 */
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
								pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r) {
			target->runtime_info.runtime_state =
					DEBUG_RUNTIME_STATE_ENABLED_ERROR;
			goto unwind_err;
		}
	}

	return 0;

unwind_err:
	/* Enabling debug failed, we need to disable on
	 * all GPUs so the enable is all or nothing.
	 */
	kfd_dbg_trap_deactivate(target, true, i);
	return r;
}

int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
			void __user *runtime_info, uint32_t *runtime_size)
{
	struct file *f;
	uint32_t copy_size;
	int i, r = 0;

	if (target->debug_trap_enabled)
		return -EALREADY;

	/* Enable pre-checks */
	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		if (!KFD_IS_SOC15(pdd->dev))
			return -ENODEV;

		if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
			return -EBUSY;
	}

	copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));

	f = fget(fd);
	if (!f) {
		pr_err("Failed to get file for (%i)\n", fd);
		return -EBADF;
	}

	target->dbg_ev_file = f;

	/* defer activation to runtime if not runtime enabled */
	if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
		kfd_dbg_trap_activate(target);

	/* We already hold the process reference but hold another one for the
	 * debug session.
	 */
	kref_get(&target->ref);
	target->debug_trap_enabled = true;

	if (target->debugger_process)
		atomic_inc(&target->debugger_process->debugged_process_count);

	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
		r = -EFAULT;
	}

	*runtime_size = sizeof(target->runtime_info);

	return r;
}

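/*
 * A trap override request is rejected if any GPU attached to the process
 * cannot honour the requested override mode or mask bits.
 */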
static int kfd_dbg_validate_trap_override_request(struct kfd_process *p,
						  uint32_t trap_override,
						  uint32_t trap_mask_request,
						  uint32_t *trap_mask_supported)
{
	int i = 0;

	*trap_mask_supported = 0xffffffff;

	for (i = 0; i < p->n_pdds; i++) {
		struct kfd_process_device *pdd = p->pdds[i];
		int err = pdd->dev->kfd2kgd->validate_trap_override_request(
				pdd->dev->adev,
				trap_override,
				trap_mask_supported);

		if (err)
			return err;
	}

	if (trap_mask_request & ~*trap_mask_supported)
		return -EACCES;

	return 0;
}

int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target,
					  uint32_t trap_override,
					  uint32_t trap_mask_bits,
					  uint32_t trap_mask_request,
					  uint32_t *trap_mask_prev,
					  uint32_t *trap_mask_supported)
{
	int r = 0, i;

	r = kfd_dbg_validate_trap_override_request(target,
						   trap_override,
						   trap_mask_request,
						   trap_mask_supported);

	if (r)
		return r;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target,
				      uint8_t wave_launch_mode)
{
	int r = 0, i;

	if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT &&
			wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG)
		return -EINVAL;

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd);

		if (r)
			break;
	}

	return r;
}

int kfd_dbg_trap_query_exception_info(struct kfd_process *target,
				      uint32_t source_id,
				      uint32_t exception_code,
				      bool clear_exception,
				      void __user *info,
				      uint32_t *info_size)
{
	bool found = false;
	int r = 0;
	uint32_t copy_size, actual_info_size = 0;
	uint64_t *exception_status_ptr = NULL;

	if (!target)
		return -EINVAL;

	if (!info || !info_size)
		return -EINVAL;

	mutex_lock(&target->event_mutex);

	if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) {
		/* Per queue exceptions */
		struct queue *queue = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			struct kfd_process_device *pdd = target->pdds[i];
			struct qcm_process_device *qpd = &pdd->qpd;

			list_for_each_entry(queue, &qpd->queues_list, list) {
				if (!found && queue->properties.queue_id == source_id) {
					found = true;
					break;
				}
			}
			if (found)
				break;
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}
		exception_status_ptr = &queue->properties.exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) {
		/* Per device exceptions */
		struct kfd_process_device *pdd = NULL;
		int i;

		for (i = 0; i < target->n_pdds; i++) {
			pdd = target->pdds[i];
			if (pdd->dev->id == source_id) {
				found = true;
				break;
			}
		}

		if (!found) {
			r = -EINVAL;
			goto out;
		}

		if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_DEVICE_MEMORY_VIOLATION) {
			copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size);

			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
				r = -EFAULT;
				goto out;
			}
			actual_info_size = pdd->vm_fault_exc_data_size;
			if (clear_exception) {
				kfree(pdd->vm_fault_exc_data);
				pdd->vm_fault_exc_data = NULL;
				pdd->vm_fault_exc_data_size = 0;
			}
		}
		exception_status_ptr = &pdd->exception_status;
	} else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) {
		/* Per process exceptions */
		if (!(target->exception_status & KFD_EC_MASK(exception_code))) {
			r = -ENODATA;
			goto out;
		}

		if (exception_code == EC_PROCESS_RUNTIME) {
			copy_size = min((size_t)(*info_size), sizeof(target->runtime_info));

			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
				r = -EFAULT;
				goto out;
			}

			actual_info_size = sizeof(target->runtime_info);
		}

		exception_status_ptr = &target->exception_status;
	} else {
		pr_debug("Bad exception type [%i]\n", exception_code);
		r = -EINVAL;
		goto out;
	}

	*info_size = actual_info_size;
	if (clear_exception)
		*exception_status_ptr &= ~KFD_EC_MASK(exception_code);
out:
	mutex_unlock(&target->event_mutex);
	return r;
}

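/*
 * Snapshot contract: *entry_size is clamped to the kernel's entry size
 * while the destination pointer advances by the caller's original
 * entry_size, and *number_of_device_infos always reports the actual
 * device count so that an undersized request can be retried.
 */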
int kfd_dbg_trap_device_snapshot(struct kfd_process *target,
		uint64_t exception_clear_mask,
		void __user *user_info,
		uint32_t *number_of_device_infos,
		uint32_t *entry_size)
{
	struct kfd_dbg_device_info_entry device_info;
	uint32_t tmp_entry_size = *entry_size, tmp_num_devices;
	int i, r = 0;

	if (!(target && user_info && number_of_device_infos && entry_size))
		return -EINVAL;

	tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds);
	*number_of_device_infos = target->n_pdds;
	*entry_size = min_t(size_t, *entry_size, sizeof(device_info));

	if (!tmp_num_devices)
		return 0;

	memset(&device_info, 0, sizeof(device_info));

	mutex_lock(&target->event_mutex);

	/* Run over all pdd of the process */
	for (i = 0; i < tmp_num_devices; i++) {
		struct kfd_process_device *pdd = target->pdds[i];
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);

		device_info.gpu_id = pdd->dev->id;
		device_info.exception_status = pdd->exception_status;
		device_info.lds_base = pdd->lds_base;
		device_info.lds_limit = pdd->lds_limit;
		device_info.scratch_base = pdd->scratch_base;
		device_info.scratch_limit = pdd->scratch_limit;
		device_info.gpuvm_base = pdd->gpuvm_base;
		device_info.gpuvm_limit = pdd->gpuvm_limit;
		device_info.location_id = topo_dev->node_props.location_id;
		device_info.vendor_id = topo_dev->node_props.vendor_id;
		device_info.device_id = topo_dev->node_props.device_id;
		device_info.revision_id = pdd->dev->adev->pdev->revision;
		device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor;
		device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device;
		device_info.fw_version = pdd->dev->kfd->mec_fw_version;
		device_info.gfx_target_version =
			topo_dev->node_props.gfx_target_version;
		device_info.simd_count = topo_dev->node_props.simd_count;
		device_info.max_waves_per_simd =
			topo_dev->node_props.max_waves_per_simd;
		device_info.array_count = topo_dev->node_props.array_count;
		device_info.simd_arrays_per_engine =
			topo_dev->node_props.simd_arrays_per_engine;
		device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask);
		device_info.capability = topo_dev->node_props.capability;
		device_info.debug_prop = topo_dev->node_props.debug_prop;

		if (exception_clear_mask)
			pdd->exception_status &= ~exception_clear_mask;

		if (copy_to_user(user_info, &device_info, *entry_size)) {
			r = -EFAULT;
			break;
		}

		user_info += tmp_entry_size;
	}

	mutex_unlock(&target->event_mutex);

	return r;
}

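/*
 * Update the subscribed exception mask, writing to the event descriptor
 * immediately if an already-raised exception matches the new mask.
 */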
void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target,
					uint64_t exception_set_mask)
{
	uint64_t found_mask = 0;
	struct process_queue_manager *pqm;
	struct process_queue_node *pqn;
	static const char write_data = '.';
	loff_t pos = 0;
	int i;

	mutex_lock(&target->event_mutex);

	found_mask |= target->exception_status;

	pqm = &target->pqm;
	list_for_each_entry(pqn, &pqm->queues, process_queue_list) {
		if (!pqn->q)
			continue;

		found_mask |= pqn->q->properties.exception_status;
	}

	for (i = 0; i < target->n_pdds; i++) {
		struct kfd_process_device *pdd = target->pdds[i];

		found_mask |= pdd->exception_status;
	}

	if (exception_set_mask & found_mask)
		kernel_write(target->dbg_ev_file, &write_data, 1, &pos);

	target->exception_enable_mask = exception_set_mask;

	mutex_unlock(&target->event_mutex);
}