Add ring-timeout-related information to the amdgpu devcoredump file for debugging purposes. During the GPU recovery process the registered call is triggered and adds the debug information to the data file created by the devcoredump framework under the directory /sys/class/devcoredump/devcdx/. Signed-off-by: Sunil Khatri <sunil.khatri@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
151 lines
4.8 KiB
C
151 lines
4.8 KiB
C
/*
 * Copyright 2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 */

#ifndef __AMDGPU_RESET_H__
#define __AMDGPU_RESET_H__

#include "amdgpu.h"

/*
 * Number of slots in the handler table that
 * amdgpu_reset_control::reset_handlers points to; also the upper bound of the
 * for_each_handler() walk.
 */
#define AMDGPU_RESET_MAX_HANDLERS 5

/*
 * Reset flag identifiers.
 *
 * NOTE(review): presumably these are bit positions within
 * amdgpu_reset_context::flags rather than mask values - confirm against the
 * users of that field.
 */
enum AMDGPU_RESET_FLAGS {

	AMDGPU_NEED_FULL_RESET = 0,
	AMDGPU_SKIP_HW_RESET = 1,
};

struct amdgpu_reset_context {
|
|
enum amd_reset_method method;
|
|
struct amdgpu_device *reset_req_dev;
|
|
struct amdgpu_job *job;
|
|
struct amdgpu_hive_info *hive;
|
|
struct list_head *reset_device_list;
|
|
unsigned long flags;
|
|
};
|
|
|
|
struct amdgpu_reset_handler {
|
|
enum amd_reset_method reset_method;
|
|
int (*prepare_env)(struct amdgpu_reset_control *reset_ctl,
|
|
struct amdgpu_reset_context *context);
|
|
int (*prepare_hwcontext)(struct amdgpu_reset_control *reset_ctl,
|
|
struct amdgpu_reset_context *context);
|
|
int (*perform_reset)(struct amdgpu_reset_control *reset_ctl,
|
|
struct amdgpu_reset_context *context);
|
|
int (*restore_hwcontext)(struct amdgpu_reset_control *reset_ctl,
|
|
struct amdgpu_reset_context *context);
|
|
int (*restore_env)(struct amdgpu_reset_control *reset_ctl,
|
|
struct amdgpu_reset_context *context);
|
|
|
|
int (*do_reset)(struct amdgpu_device *adev);
|
|
};
|
|
|
|
struct amdgpu_reset_control {
|
|
void *handle;
|
|
struct work_struct reset_work;
|
|
struct mutex reset_lock;
|
|
struct amdgpu_reset_handler *(
|
|
*reset_handlers)[AMDGPU_RESET_MAX_HANDLERS];
|
|
atomic_t in_reset;
|
|
enum amd_reset_method active_reset;
|
|
struct amdgpu_reset_handler *(*get_reset_handler)(
|
|
struct amdgpu_reset_control *reset_ctl,
|
|
struct amdgpu_reset_context *context);
|
|
void (*async_reset)(struct work_struct *work);
|
|
};
|
|
|
|
|
|
/*
 * Kind of reset domain, stored in amdgpu_reset_domain::type and passed to
 * amdgpu_reset_create_reset_domain().
 */
enum amdgpu_reset_domain_type {
	SINGLE_DEVICE,
	XGMI_HIVE
};

struct amdgpu_reset_domain {
|
|
struct kref refcount;
|
|
struct workqueue_struct *wq;
|
|
enum amdgpu_reset_domain_type type;
|
|
struct rw_semaphore sem;
|
|
atomic_t in_gpu_reset;
|
|
atomic_t reset_res;
|
|
};
|
|
|
|
#ifdef CONFIG_DEV_COREDUMP

/* Version string for the coredump data (a string literal, not a number). */
#define AMDGPU_COREDUMP_VERSION "1"

/*
 * Information recorded for an amdgpu device coredump. Only available when
 * CONFIG_DEV_COREDUMP is enabled.
 */
struct amdgpu_coredump_info {
	struct amdgpu_device		*adev;
	struct amdgpu_task_info		reset_task_info;
	struct timespec64		reset_time;
	bool				reset_vram_lost;
	/* NOTE(review): presumably the ring that timed out, if any - confirm. */
	struct amdgpu_ring		*ring;
};
#endif

/* Per-device set-up and tear-down; both return an int status. */
int amdgpu_reset_init(struct amdgpu_device *adev);
int amdgpu_reset_fini(struct amdgpu_device *adev);

/*
 * Reset entry points. Each takes the device and the reset context and
 * returns an int status.
 */
int amdgpu_reset_prepare_hwcontext(struct amdgpu_device *adev,
				   struct amdgpu_reset_context *reset_context);

int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
			       struct amdgpu_reset_context *reset_context);

int amdgpu_reset_prepare_env(struct amdgpu_device *adev,
			     struct amdgpu_reset_context *reset_context);
int amdgpu_reset_restore_env(struct amdgpu_device *adev,
			     struct amdgpu_reset_context *reset_context);

/* Returns a reset domain of the given @type; @wq_name is passed as a string. */
struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
							     char *wq_name);

/*
 * kref release callback for amdgpu_reset_domain::refcount; used by
 * amdgpu_reset_put_reset_domain().
 */
void amdgpu_reset_destroy_reset_domain(struct kref *ref);

static inline bool amdgpu_reset_get_reset_domain(struct amdgpu_reset_domain *domain)
|
|
{
|
|
return kref_get_unless_zero(&domain->refcount) != 0;
|
|
}
|
|
|
|
static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *domain)
|
|
{
|
|
if (domain)
|
|
kref_put(&domain->refcount, amdgpu_reset_destroy_reset_domain);
|
|
}
|
|
|
|
static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
|
|
struct work_struct *work)
|
|
{
|
|
return queue_work(domain->wq, work);
|
|
}
|
|
|
|
/* Lock / unlock a reset domain; the locking primitive is not visible here. */
void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);

void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);

/*
 * Coredump entry point. Declared unconditionally, i.e. also when
 * CONFIG_DEV_COREDUMP is not set, although struct amdgpu_coredump_info is only
 * defined under that option.
 */
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
		     struct amdgpu_reset_context *reset_context);

/*
 * for_each_handler - iterate over the handler table of a reset control
 * @i:         integer lvalue used as the loop index
 * @handler:   lvalue that receives each table entry in turn
 * @reset_ctl: pointer expression whose ->reset_handlers points to an array of
 *             AMDGPU_RESET_MAX_HANDLERS handler pointers
 *
 * The walk ends at the first NULL entry or after AMDGPU_RESET_MAX_HANDLERS
 * entries, whichever comes first. When it ends on the size bound, @handler
 * keeps the last entry that was visited.
 *
 * Every argument is parenthesized so that expressions such as "&ctl" expand
 * correctly. The arguments are evaluated more than once, so they must be free
 * of side effects.
 */
#define for_each_handler(i, handler, reset_ctl)				\
	for ((i) = 0; ((i) < AMDGPU_RESET_MAX_HANDLERS) &&		\
		      ((handler) = (*(reset_ctl)->reset_handlers)[(i)]);	\
	     ++(i))
#endif /* __AMDGPU_RESET_H__ */