drm/amdkfd: CRIU implement gpu_id remapping
When doing a restore on a different node, the gpu_ids on the restore node may differ from those on the checkpoint node, but the user-space application will still use the original gpu_ids in its ioctl calls. Add code to create a gpu_id mapping so that KFD can determine the actual gpu_id during user ioctls.

Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 40e8a766a7
commit bef153b70c
5 changed files with 414 additions and 160 deletions
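The core of the patch is mechanical: every ioctl that used to resolve args->gpu_id globally with kfd_device_by_id() now resolves it against the calling process, so a restored process can keep using the gpu_id it saw on the checkpoint node. A condensed sketch of the recurring before/after pattern (error labels simplified for illustration):

	/* Before: gpu_id is treated as a node-global device id. */
	dev = kfd_device_by_id(args->gpu_id);
	if (!dev)
		return -EINVAL;

	/* After: gpu_id is a per-process (user) id, looked up under the
	 * process mutex; pdd->dev is the actual device on this node.
	 */
	mutex_lock(&p->mutex);
	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
	if (!pdd) {
		err = -EINVAL;
		goto err_pdd;		/* unlock path */
	}
	dev = pdd->dev;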
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -293,14 +293,17 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 		return err;
 
 	pr_debug("Looking for gpu id 0x%x\n", args->gpu_id);
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev) {
-		pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
-		return -EINVAL;
-	}
 
 	mutex_lock(&p->mutex);
 
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	if (!pdd) {
+		pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
+		err = -EINVAL;
+		goto err_pdd;
+	}
+	dev = pdd->dev;
+
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
 		err = -ESRCH;
@@ -345,6 +348,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 
 err_create_queue:
 err_bind_process:
+err_pdd:
 	mutex_unlock(&p->mutex);
 	return err;
 }
@@ -491,7 +495,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
 					struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_set_memory_policy_args *args = data;
-	struct kfd_dev *dev;
 	int err = 0;
 	struct kfd_process_device *pdd;
 	enum cache_policy default_policy, alternate_policy;
@@ -506,13 +509,15 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
 		return -EINVAL;
 	}
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
-		return -EINVAL;
-
 	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	if (!pdd) {
+		pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
+		err = -EINVAL;
+		goto err_pdd;
+	}
 
-	pdd = kfd_bind_process_to_device(dev, p);
+	pdd = kfd_bind_process_to_device(pdd->dev, p);
 	if (IS_ERR(pdd)) {
 		err = -ESRCH;
 		goto out;
@@ -525,7 +530,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
 		(args->alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT)
 		   ? cache_policy_coherent : cache_policy_noncoherent;
 
-	if (!dev->dqm->ops.set_cache_memory_policy(dev->dqm,
+	if (!pdd->dev->dqm->ops.set_cache_memory_policy(pdd->dev->dqm,
 				&pdd->qpd,
 				default_policy,
 				alternate_policy,
@@ -534,6 +539,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
 		err = -EINVAL;
 
 out:
+err_pdd:
 	mutex_unlock(&p->mutex);
 
 	return err;
@@ -543,17 +549,18 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
 					struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_set_trap_handler_args *args = data;
-	struct kfd_dev *dev;
 	int err = 0;
 	struct kfd_process_device *pdd;
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
-		return -EINVAL;
-
 	mutex_lock(&p->mutex);
 
-	pdd = kfd_bind_process_to_device(dev, p);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	if (!pdd) {
+		err = -EINVAL;
+		goto err_pdd;
+	}
+
+	pdd = kfd_bind_process_to_device(pdd->dev, p);
 	if (IS_ERR(pdd)) {
 		err = -ESRCH;
 		goto out;
@@ -562,6 +569,7 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
 	kfd_process_set_trap_handler(&pdd->qpd, args->tba_addr, args->tma_addr);
 
 out:
+err_pdd:
 	mutex_unlock(&p->mutex);
 
 	return err;
@@ -577,16 +585,20 @@ static int kfd_ioctl_dbg_register(struct file *filep,
 	bool create_ok;
 	long status = 0;
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
-		return -EINVAL;
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	if (!pdd) {
+		status = -EINVAL;
+		goto err_pdd;
+	}
+	dev = pdd->dev;
 
 	if (dev->adev->asic_type == CHIP_CARRIZO) {
 		pr_debug("kfd_ioctl_dbg_register not supported on CZ\n");
-		return -EINVAL;
+		status = -EINVAL;
+		goto err_chip_unsupp;
 	}
 
-	mutex_lock(&p->mutex);
 	mutex_lock(kfd_get_dbgmgr_mutex());
 
 	/*
@@ -616,6 +628,8 @@ static int kfd_ioctl_dbg_register(struct file *filep,
 
 out:
 	mutex_unlock(kfd_get_dbgmgr_mutex());
+err_pdd:
+err_chip_unsupp:
 	mutex_unlock(&p->mutex);
 
 	return status;
@@ -625,13 +639,18 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
 				struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_dbg_unregister_args *args = data;
+	struct kfd_process_device *pdd;
 	struct kfd_dev *dev;
 	long status;
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev || !dev->dbgmgr)
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	mutex_unlock(&p->mutex);
+	if (!pdd || !pdd->dev->dbgmgr)
 		return -EINVAL;
 
+	dev = pdd->dev;
+
 	if (dev->adev->asic_type == CHIP_CARRIZO) {
 		pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n");
 		return -EINVAL;
@@ -664,6 +683,7 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep,
 {
 	struct kfd_ioctl_dbg_address_watch_args *args = data;
 	struct kfd_dev *dev;
+	struct kfd_process_device *pdd;
 	struct dbg_address_watch_info aw_info;
 	unsigned char *args_buff;
 	long status;
@@ -673,15 +693,19 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep,
 
 	memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info));
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	mutex_unlock(&p->mutex);
+	if (!pdd) {
+		pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
 		return -EINVAL;
+	}
+	dev = pdd->dev;
 
 	if (dev->adev->asic_type == CHIP_CARRIZO) {
 		pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
 		return -EINVAL;
 	}
 
 	cmd_from_user = (void __user *) args->content_ptr;
 
 	/* Validate arguments */
@@ -764,6 +788,7 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
 {
 	struct kfd_ioctl_dbg_wave_control_args *args = data;
 	struct kfd_dev *dev;
+	struct kfd_process_device *pdd;
 	struct dbg_wave_control_info wac_info;
 	unsigned char *args_buff;
 	uint32_t computed_buff_size;
@@ -781,9 +806,14 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
 		sizeof(wac_info.dbgWave_msg.MemoryVA) +
 		sizeof(wac_info.trapId);
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	mutex_unlock(&p->mutex);
+	if (!pdd) {
+		pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
 		return -EINVAL;
+	}
+	dev = pdd->dev;
 
 	if (dev->adev->asic_type == CHIP_CARRIZO) {
 		pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
@@ -847,12 +877,14 @@ static int kfd_ioctl_get_clock_counters(struct file *filep,
 				struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_get_clock_counters_args *args = data;
-	struct kfd_dev *dev;
+	struct kfd_process_device *pdd;
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (dev)
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	mutex_unlock(&p->mutex);
+	if (pdd)
 		/* Reading GPU clock counter from KGD */
-		args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(dev->adev);
+		args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(pdd->dev->adev);
 	else
 		/* Node without GPU resource */
 		args->gpu_clock_counter = 0;
@@ -1070,11 +1102,13 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
 	struct kfd_dev *dev;
 	long err;
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
-		return -EINVAL;
-
 	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	if (!pdd) {
+		err = -EINVAL;
+		goto err_pdd;
+	}
+	dev = pdd->dev;
 
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
@@ -1094,6 +1128,7 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
 	return 0;
 
 bind_process_to_device_fail:
+err_pdd:
 	mutex_unlock(&p->mutex);
 	return err;
 }
@@ -1102,15 +1137,17 @@ static int kfd_ioctl_get_tile_config(struct file *filep,
 		struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_get_tile_config_args *args = data;
-	struct kfd_dev *dev;
+	struct kfd_process_device *pdd;
 	struct tile_config config;
 	int err = 0;
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	mutex_unlock(&p->mutex);
+	if (!pdd)
 		return -EINVAL;
 
-	amdgpu_amdkfd_get_tile_config(dev->adev, &config);
+	amdgpu_amdkfd_get_tile_config(pdd->dev->adev, &config);
 
 	args->gb_addr_config = config.gb_addr_config;
 	args->num_banks = config.num_banks;
@@ -1145,40 +1182,37 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
 {
 	struct kfd_ioctl_acquire_vm_args *args = data;
 	struct kfd_process_device *pdd;
-	struct kfd_dev *dev;
 	struct file *drm_file;
 	int ret;
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
-		return -EINVAL;
-
 	drm_file = fget(args->drm_fd);
 	if (!drm_file)
 		return -EINVAL;
 
 	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
 
-	pdd = kfd_get_process_device_data(dev, p);
 	if (!pdd) {
 		ret = -EINVAL;
-		goto err_unlock;
+		goto err_pdd;
 	}
 
 	if (pdd->drm_file) {
 		ret = pdd->drm_file == drm_file ? 0 : -EBUSY;
-		goto err_unlock;
+		goto err_drm_file;
 	}
 
 	ret = kfd_process_device_init_vm(pdd, drm_file);
 	if (ret)
 		goto err_unlock;
 
 	/* On success, the PDD keeps the drm_file reference */
 	mutex_unlock(&p->mutex);
 
 	return 0;
 
 err_unlock:
+err_pdd:
+err_drm_file:
 	mutex_unlock(&p->mutex);
 	fput(drm_file);
 	return ret;
@@ -1235,19 +1269,23 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 	}
 	mutex_unlock(&p->svms.lock);
 #endif
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
-		return -EINVAL;
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	if (!pdd) {
+		err = -EINVAL;
+		goto err_pdd;
+	}
+
+	dev = pdd->dev;
 
 	if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) &&
 		(flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) &&
 		!kfd_dev_is_large_bar(dev)) {
 		pr_err("Alloc host visible vram on small bar is not allowed\n");
-		return -EINVAL;
+		err = -EINVAL;
+		goto err_large_bar;
 	}
 
-	mutex_lock(&p->mutex);
-
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
 		err = PTR_ERR(pdd);
@@ -1308,6 +1346,8 @@ err_free:
 	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, (struct kgd_mem *)mem,
 					       pdd->drm_priv, NULL);
 err_unlock:
+err_pdd:
+err_large_bar:
 	mutex_unlock(&p->mutex);
 	return err;
 }
@@ -1318,14 +1358,9 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
 	struct kfd_ioctl_free_memory_of_gpu_args *args = data;
 	struct kfd_process_device *pdd;
 	void *mem;
-	struct kfd_dev *dev;
 	int ret;
 	uint64_t size = 0;
 
-	dev = kfd_device_by_id(GET_GPU_ID(args->handle));
-	if (!dev)
-		return -EINVAL;
-
 	mutex_lock(&p->mutex);
 	/*
 	 * Safeguard to prevent user space from freeing signal BO.
@@ -1337,11 +1372,11 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
 		goto err_unlock;
 	}
 
-	pdd = kfd_get_process_device_data(dev, p);
+	pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle));
 	if (!pdd) {
 		pr_err("Process device data doesn't exist\n");
 		ret = -EINVAL;
-		goto err_unlock;
+		goto err_pdd;
 	}
 
 	mem = kfd_process_device_translate_handle(
@@ -1351,7 +1386,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
 		goto err_unlock;
 	}
 
-	ret = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev,
+	ret = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev,
 				(struct kgd_mem *)mem, pdd->drm_priv, &size);
 
 	/* If freeing the buffer failed, leave the handle in place for
@@ -1364,6 +1399,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
 		WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size);
 
 err_unlock:
+err_pdd:
 	mutex_unlock(&p->mutex);
 	return ret;
 }
@@ -1381,16 +1417,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 	struct kfd_ioctl_map_memory_to_gpu_args *args = data;
 	struct kfd_process_device *pdd, *peer_pdd;
 	void *mem;
-	struct kfd_dev *dev, *peer;
+	struct kfd_dev *dev;
 	long err = 0;
 	int i;
 	uint32_t *devices_arr = NULL;
 	bool table_freed = false;
 
-	dev = kfd_device_by_id(GET_GPU_ID(args->handle));
-	if (!dev)
-		return -EINVAL;
-
 	if (!args->n_devices) {
 		pr_debug("Device IDs array empty\n");
 		return -EINVAL;
@@ -1414,6 +1446,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 	}
 
 	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle));
+	if (!pdd) {
+		err = -EINVAL;
+		goto get_process_device_data_failed;
+	}
+	dev = pdd->dev;
 
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
@@ -1429,21 +1467,22 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 	}
 
 	for (i = args->n_success; i < args->n_devices; i++) {
-		peer = kfd_device_by_id(devices_arr[i]);
-		if (!peer) {
+		peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
+		if (!peer_pdd) {
 			pr_debug("Getting device by id failed for 0x%x\n",
 				 devices_arr[i]);
 			err = -EINVAL;
 			goto get_mem_obj_from_handle_failed;
 		}
 
-		peer_pdd = kfd_bind_process_to_device(peer, p);
+		peer_pdd = kfd_bind_process_to_device(peer_pdd->dev, p);
 		if (IS_ERR(peer_pdd)) {
 			err = PTR_ERR(peer_pdd);
 			goto get_mem_obj_from_handle_failed;
 		}
 
 		err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
-			peer->adev, (struct kgd_mem *)mem,
+			peer_pdd->dev->adev, (struct kgd_mem *)mem,
 			peer_pdd->drm_priv, &table_freed);
 		if (err) {
 			pr_err("Failed to map to gpu %d/%d\n",
@@ -1464,10 +1503,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 	/* Flush TLBs after waiting for the page table updates to complete */
 	if (table_freed || !kfd_flush_tlb_after_unmap(dev)) {
 		for (i = 0; i < args->n_devices; i++) {
-			peer = kfd_device_by_id(devices_arr[i]);
-			if (WARN_ON_ONCE(!peer))
-				continue;
-			peer_pdd = kfd_get_process_device_data(peer, p);
+			peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
 			if (WARN_ON_ONCE(!peer_pdd))
 				continue;
 			kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
@@ -1477,6 +1513,7 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 
 	return err;
 
+get_process_device_data_failed:
 bind_process_to_device_failed:
 get_mem_obj_from_handle_failed:
 map_memory_to_gpu_failed:
@@ -1494,14 +1531,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
 	struct kfd_ioctl_unmap_memory_from_gpu_args *args = data;
 	struct kfd_process_device *pdd, *peer_pdd;
 	void *mem;
-	struct kfd_dev *dev, *peer;
 	long err = 0;
 	uint32_t *devices_arr = NULL, i;
 
-	dev = kfd_device_by_id(GET_GPU_ID(args->handle));
-	if (!dev)
-		return -EINVAL;
-
 	if (!args->n_devices) {
 		pr_debug("Device IDs array empty\n");
 		return -EINVAL;
@@ -1525,8 +1557,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
 	}
 
 	mutex_lock(&p->mutex);
-
-	pdd = kfd_get_process_device_data(dev, p);
+	pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle));
 	if (!pdd) {
 		err = -EINVAL;
 		goto bind_process_to_device_failed;
@@ -1540,19 +1571,13 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
 	}
 
 	for (i = args->n_success; i < args->n_devices; i++) {
-		peer = kfd_device_by_id(devices_arr[i]);
-		if (!peer) {
+		peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
+		if (!peer_pdd) {
 			err = -EINVAL;
 			goto get_mem_obj_from_handle_failed;
 		}
-
-		peer_pdd = kfd_get_process_device_data(peer, p);
-		if (!peer_pdd) {
-			err = -ENODEV;
-			goto get_mem_obj_from_handle_failed;
-		}
 		err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
-			peer->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv);
+			peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv);
 		if (err) {
 			pr_err("Failed to unmap from gpu %d/%d\n",
 			       i, args->n_devices);
@@ -1562,8 +1587,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
 	}
 	mutex_unlock(&p->mutex);
 
-	if (kfd_flush_tlb_after_unmap(dev)) {
-		err = amdgpu_amdkfd_gpuvm_sync_memory(dev->adev,
+	if (kfd_flush_tlb_after_unmap(pdd->dev)) {
+		err = amdgpu_amdkfd_gpuvm_sync_memory(pdd->dev->adev,
 				(struct kgd_mem *) mem, true);
 		if (err) {
 			pr_debug("Sync memory failed, wait interrupted by user signal\n");
@@ -1572,10 +1597,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
 
 		/* Flush TLBs after waiting for the page table updates to complete */
 		for (i = 0; i < args->n_devices; i++) {
-			peer = kfd_device_by_id(devices_arr[i]);
-			if (WARN_ON_ONCE(!peer))
-				continue;
-			peer_pdd = kfd_get_process_device_data(peer, p);
+			peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
 			if (WARN_ON_ONCE(!peer_pdd))
 				continue;
 			kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
@@ -1695,29 +1717,29 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
 	struct kfd_ioctl_import_dmabuf_args *args = data;
 	struct kfd_process_device *pdd;
 	struct dma_buf *dmabuf;
-	struct kfd_dev *dev;
 	int idr_handle;
 	uint64_t size;
 	void *mem;
 	int r;
 
-	dev = kfd_device_by_id(args->gpu_id);
-	if (!dev)
-		return -EINVAL;
-
 	dmabuf = dma_buf_get(args->dmabuf_fd);
 	if (IS_ERR(dmabuf))
 		return PTR_ERR(dmabuf);
 
 	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+	if (!pdd) {
+		r = -EINVAL;
+		goto err_unlock;
+	}
 
-	pdd = kfd_bind_process_to_device(dev, p);
+	pdd = kfd_bind_process_to_device(pdd->dev, p);
 	if (IS_ERR(pdd)) {
 		r = PTR_ERR(pdd);
 		goto err_unlock;
 	}
 
-	r = amdgpu_amdkfd_gpuvm_import_dmabuf(dev->adev, dmabuf,
+	r = amdgpu_amdkfd_gpuvm_import_dmabuf(pdd->dev->adev, dmabuf,
 					      args->va_addr, pdd->drm_priv,
 					      (struct kgd_mem **)&mem, &size,
 					      NULL);
@@ -1738,7 +1760,7 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
 	return 0;
 
 err_free:
-	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, (struct kgd_mem *)mem,
+	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, (struct kgd_mem *)mem,
 					       pdd->drm_priv, NULL);
 err_unlock:
 	mutex_unlock(&p->mutex);
@@ -1751,13 +1773,16 @@ static int kfd_ioctl_smi_events(struct file *filep,
 				struct kfd_process *p, void *data)
 {
 	struct kfd_ioctl_smi_events_args *args = data;
-	struct kfd_dev *dev;
+	struct kfd_process_device *pdd;
 
-	dev = kfd_device_by_id(args->gpuid);
-	if (!dev)
+	mutex_lock(&p->mutex);
+	pdd = kfd_process_device_data_by_id(p, args->gpuid);
+	mutex_unlock(&p->mutex);
+	if (!pdd)
 		return -EINVAL;
 
-	return kfd_smi_event_open(dev, &args->anon_fd);
+	return kfd_smi_event_open(pdd->dev, &args->anon_fd);
 }
 
 static int kfd_ioctl_set_xnack_mode(struct file *filep,
@@ -1834,6 +1859,62 @@ static int criu_checkpoint_process(struct kfd_process *p,
 	return ret;
 }
 
+static int criu_checkpoint_devices(struct kfd_process *p,
+			     uint32_t num_devices,
+			     uint8_t __user *user_addr,
+			     uint8_t __user *user_priv_data,
+			     uint64_t *priv_offset)
+{
+	struct kfd_criu_device_priv_data *device_priv = NULL;
+	struct kfd_criu_device_bucket *device_buckets = NULL;
+	int ret = 0, i;
+
+	device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL);
+	if (!device_buckets) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL);
+	if (!device_priv) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	for (i = 0; i < num_devices; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		device_buckets[i].user_gpu_id = pdd->user_gpu_id;
+		device_buckets[i].actual_gpu_id = pdd->dev->id;
+
+		/*
+		 * priv_data does not contain useful information for now and is reserved for
+		 * future use, so we do not set its contents.
+		 */
+	}
+
+	ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets));
+	if (ret) {
+		pr_err("Failed to copy device information to user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	ret = copy_to_user(user_priv_data + *priv_offset,
+			   device_priv,
+			   num_devices * sizeof(*device_priv));
+	if (ret) {
+		pr_err("Failed to copy device information to user\n");
+		ret = -EFAULT;
+	}
+	*priv_offset += num_devices * sizeof(*device_priv);
+
+exit:
+	kvfree(device_buckets);
+	kvfree(device_priv);
+	return ret;
+}
+
 uint32_t get_process_num_bos(struct kfd_process *p)
 {
 	uint32_t num_of_bos = 0;
@@ -1902,9 +1983,9 @@ static int criu_checkpoint_bos(struct kfd_process *p,
 		bo_bucket = &bo_buckets[bo_index];
 		bo_priv = &bo_privs[bo_index];
 
+		bo_bucket->gpu_id = pdd->user_gpu_id;
 		bo_bucket->addr = (uint64_t)kgd_mem->va;
 		bo_bucket->size = amdgpu_bo_size(dumper_bo);
-		bo_bucket->gpu_id = pdd->dev->id;
 		bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags;
 		bo_priv->idr_handle = id;
 
@@ -1928,7 +2009,7 @@ static int criu_checkpoint_bos(struct kfd_process *p,
 
 			for (i = 0; i < p->n_pdds; i++) {
 				if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
-					bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->dev->id;
+					bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id;
 			}
 
 		pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
@@ -1967,6 +2048,7 @@ exit:
 }
 
 static int criu_get_process_object_info(struct kfd_process *p,
+					uint32_t *num_devices,
 					uint32_t *num_bos,
 					uint32_t *num_objects,
 					uint64_t *objs_priv_size)
@@ -1976,6 +2058,7 @@ static int criu_get_process_object_info(struct kfd_process *p,
 	uint32_t num_queues, num_events, num_svm_ranges;
 	uint64_t queues_priv_data_size;
 
+	*num_devices = p->n_pdds;
 	*num_bos = get_process_num_bos(p);
 
 	ret = kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size);
@@ -1989,6 +2072,7 @@ static int criu_get_process_object_info(struct kfd_process *p,
 
 	if (objs_priv_size) {
 		priv_size = sizeof(struct kfd_criu_process_priv_data);
+		priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data);
 		priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
 		priv_size += queues_priv_data_size;
 		priv_size += num_events * sizeof(struct kfd_criu_event_priv_data);
@@ -2003,10 +2087,10 @@ static int criu_checkpoint(struct file *filep,
 			   struct kfd_ioctl_criu_args *args)
 {
 	int ret;
-	uint32_t num_bos, num_objects;
+	uint32_t num_devices, num_bos, num_objects;
 	uint64_t priv_size, priv_offset = 0;
 
-	if (!args->bos || !args->priv_data)
+	if (!args->devices || !args->bos || !args->priv_data)
 		return -EINVAL;
 
 	mutex_lock(&p->mutex);
@@ -2025,11 +2109,12 @@ static int criu_checkpoint(struct file *filep,
 		goto exit_unlock;
 	}
 
-	ret = criu_get_process_object_info(p, &num_bos, &num_objects, &priv_size);
+	ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size);
 	if (ret)
 		goto exit_unlock;
 
-	if (num_bos != args->num_bos ||
+	if (num_devices != args->num_devices ||
+	    num_bos != args->num_bos ||
 	    num_objects != args->num_objects ||
 	    priv_size != args->priv_data_size) {
 
@@ -2042,6 +2127,11 @@ static int criu_checkpoint(struct file *filep,
 	if (ret)
 		goto exit_unlock;
 
+	ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices,
+				(uint8_t __user *)args->priv_data, &priv_offset);
+	if (ret)
+		goto exit_unlock;
+
 	ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
 			    (uint8_t __user *)args->priv_data, &priv_offset);
 	if (ret)
@@ -2102,6 +2192,108 @@ exit:
 	return ret;
 }
 
+static int criu_restore_devices(struct kfd_process *p,
+				struct kfd_ioctl_criu_args *args,
+				uint64_t *priv_offset,
+				uint64_t max_priv_data_size)
+{
+	struct kfd_criu_device_bucket *device_buckets;
+	struct kfd_criu_device_priv_data *device_privs;
+	int ret = 0;
+	uint32_t i;
+
+	if (args->num_devices != p->n_pdds)
+		return -EINVAL;
+
+	if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size)
+		return -EINVAL;
+
+	device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL);
+	if (!device_buckets)
+		return -ENOMEM;
+
+	ret = copy_from_user(device_buckets, (void __user *)args->devices,
+				args->num_devices * sizeof(*device_buckets));
+	if (ret) {
+		pr_err("Failed to copy devices buckets from user\n");
+		ret = -EFAULT;
+		goto exit;
+	}
+
+	for (i = 0; i < args->num_devices; i++) {
+		struct kfd_dev *dev;
+		struct kfd_process_device *pdd;
+		struct file *drm_file;
+
+		/* device private data is not currently used */
+
+		if (!device_buckets[i].user_gpu_id) {
+			pr_err("Invalid user gpu_id\n");
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		dev = kfd_device_by_id(device_buckets[i].actual_gpu_id);
+		if (!dev) {
+			pr_err("Failed to find device with gpu_id = %x\n",
+				device_buckets[i].actual_gpu_id);
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		pdd = kfd_get_process_device_data(dev, p);
+		if (!pdd) {
+			pr_err("Failed to get pdd for gpu_id = %x\n",
+				device_buckets[i].actual_gpu_id);
+			ret = -EINVAL;
+			goto exit;
+		}
+		pdd->user_gpu_id = device_buckets[i].user_gpu_id;
+
+		drm_file = fget(device_buckets[i].drm_fd);
+		if (!drm_file) {
+			pr_err("Invalid render node file descriptor sent from plugin (%d)\n",
+				device_buckets[i].drm_fd);
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		if (pdd->drm_file) {
+			ret = -EINVAL;
+			goto exit;
+		}
+
+		/* create the vm using render nodes for kfd pdd */
+		if (kfd_process_device_init_vm(pdd, drm_file)) {
+			pr_err("could not init vm for given pdd\n");
+			/* On success, the PDD keeps the drm_file reference */
+			fput(drm_file);
+			ret = -EINVAL;
+			goto exit;
+		}
+		/*
+		 * pdd now already has the vm bound to render node so below api won't create a
+		 * new exclusive kfd mapping but use existing one with renderDXXX but is still
+		 * needed for iommu v2 binding and runtime pm.
+		 */
+		pdd = kfd_bind_process_to_device(dev, p);
+		if (IS_ERR(pdd)) {
+			ret = PTR_ERR(pdd);
+			goto exit;
+		}
+	}
+
+	/*
+	 * We are not copying device private data from user as we are not using the data for now,
+	 * but we still adjust for its private data.
+	 */
+	*priv_offset += args->num_devices * sizeof(*device_privs);
+
+exit:
+	kfree(device_buckets);
+	return ret;
+}
+
 static int criu_restore_bos(struct kfd_process *p,
 			    struct kfd_ioctl_criu_args *args,
 			    uint64_t *priv_offset,
@@ -2160,19 +2352,6 @@ static int criu_restore_bos(struct kfd_process *p,
 		bo_bucket = &bo_buckets[i];
 		bo_priv = &bo_privs[i];
 
-		dev = kfd_device_by_id(bo_bucket->gpu_id);
-		if (!dev) {
-			ret = -EINVAL;
-			pr_err("Failed to get pdd\n");
-			goto exit;
-		}
-		pdd = kfd_get_process_device_data(dev, p);
-		if (!pdd) {
-			ret = -EINVAL;
-			pr_err("Failed to get pdd\n");
-			goto exit;
-		}
-
 		pr_debug("kfd restore ioctl - bo_bucket[%d]:\n", i);
 		pr_debug("size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
 			"gpu_id = 0x%x alloc_flags = 0x%x\n"
@@ -2184,6 +2363,14 @@ static int criu_restore_bos(struct kfd_process *p,
 			bo_bucket->alloc_flags,
 			bo_priv->idr_handle);
 
+		pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id);
+		if (!pdd) {
+			pr_err("Failed to get pdd\n");
+			ret = -ENODEV;
+			goto exit;
+		}
+		dev = pdd->dev;
+
 		if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
 			pr_debug("restore ioctl: KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL\n");
 			if (bo_bucket->size != kfd_doorbell_process_slice(dev)) {
@@ -2265,12 +2452,12 @@ static int criu_restore_bos(struct kfd_process *p,
 			if (!bo_priv->mapped_gpuids[j])
 				break;
 
-			peer = kfd_device_by_id(bo_priv->mapped_gpuids[j]);
-			if (!peer) {
-				pr_debug("Getting device by id failed for 0x%x\n", pdd->dev->id);
+			peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]);
+			if (IS_ERR(peer_pdd)) {
 				ret = -EINVAL;
 				goto exit;
 			}
+			peer = peer_pdd->dev;
 
 			peer_pdd = kfd_bind_process_to_device(peer, p);
 			if (IS_ERR(peer_pdd)) {
@@ -2414,6 +2601,10 @@ static int criu_restore(struct file *filep,
 	if (ret)
 		goto exit_unlock;
 
+	ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size);
+	if (ret)
+		goto exit_unlock;
+
 	ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
 	if (ret)
 		goto exit_unlock;
@@ -2519,13 +2710,14 @@ static int criu_process_info(struct file *filep,
 	args->pid = task_pid_nr_ns(p->lead_thread,
 					task_active_pid_ns(p->lead_thread));
 
-	ret = criu_get_process_object_info(p, &args->num_bos, &args->num_objects,
-					   &args->priv_data_size);
+	ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos,
+					   &args->num_objects, &args->priv_data_size);
 	if (ret)
 		goto err_unlock;
 
-	dev_dbg(kfd_device, "Num of bos:%u objects:%u priv_data_size:%lld\n",
-		args->num_bos, args->num_objects, args->priv_data_size);
+	dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n",
+		args->num_devices, args->num_bos, args->num_objects,
+		args->priv_data_size);
 
 err_unlock:
 	if (ret) {
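For orientation: criu_checkpoint_devices() above emits one bucket per device, pairing the id the application used (user_gpu_id) with the device's real id on the current node (actual_gpu_id). On restore, the CRIU plugin fills in the actual_gpu_id and drm_fd that are valid on the new node, and criu_restore_devices() stores the old user_gpu_id in the pdd. The bucket layout lives in the companion kfd_ioctl.h uapi change, which is not part of this diff, so the sketch below is an assumption inferred from how the fields are used here:

	/* Assumed layout, from the companion uapi change (not in this diff). */
	struct kfd_criu_device_bucket {
		__u32 user_gpu_id;	/* id the app used at checkpoint time */
		__u32 actual_gpu_id;	/* id of the device on the restore node */
		__u32 drm_fd;		/* render node fd opened by the plugin */
		__u32 pad;
	};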
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -342,11 +342,12 @@ int kfd_kmap_event_page(struct kfd_process *p, uint64_t event_page_offset)
 		return -EINVAL;
 	}
 
-	kfd = kfd_device_by_id(GET_GPU_ID(event_page_offset));
-	if (!kfd) {
+	pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(event_page_offset));
+	if (!pdd) {
 		pr_err("Getting device by id failed in %s\n", __func__);
 		return -EINVAL;
 	}
+	kfd = pdd->dev;
 
 	pdd = kfd_bind_process_to_device(kfd, p);
 	if (IS_ERR(pdd))
@@ -1094,6 +1095,7 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
 {
 	struct kfd_hsa_memory_exception_data memory_exception_data;
 	struct vm_area_struct *vma;
+	int user_gpu_id;
 
 	/*
 	 * Because we are called from arbitrary context (workqueue) as opposed
@@ -1115,12 +1117,17 @@ void kfd_signal_iommu_event(struct kfd_dev *dev, u32 pasid,
 		return; /* Process is exiting */
 	}
 
+	user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+	if (unlikely(user_gpu_id == -EINVAL)) {
+		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+		return;
+	}
 	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
 
 	mmap_read_lock(mm);
 	vma = find_vma(mm, address);
 
-	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.gpu_id = user_gpu_id;
 	memory_exception_data.va = address;
 	/* Set failure reason */
 	memory_exception_data.failure.NotPresent = 1;
@@ -1196,11 +1203,19 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, u32 pasid,
 	uint32_t id;
 	struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
 	struct kfd_hsa_memory_exception_data memory_exception_data;
+	int user_gpu_id;
 
 	if (!p)
 		return; /* Presumably process exited. */
+
+	user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+	if (unlikely(user_gpu_id == -EINVAL)) {
+		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+		return;
+	}
+
 	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
-	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.gpu_id = user_gpu_id;
 	memory_exception_data.failure.imprecise = true;
 	/* Set failure reason */
 	if (info) {
@@ -1240,27 +1255,34 @@ void kfd_signal_reset_event(struct kfd_dev *dev)
 
 	/* Whole gpu reset caused by GPU hang and memory is lost */
 	memset(&hw_exception_data, 0, sizeof(hw_exception_data));
-	hw_exception_data.gpu_id = dev->id;
 	hw_exception_data.memory_lost = 1;
 	hw_exception_data.reset_cause = reset_cause;
 
 	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
 	memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
-	memory_exception_data.gpu_id = dev->id;
 	memory_exception_data.failure.imprecise = true;
 
 	idx = srcu_read_lock(&kfd_processes_srcu);
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+		int user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+
+		if (unlikely(user_gpu_id == -EINVAL)) {
+			WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+			continue;
+		}
+
 		mutex_lock(&p->event_mutex);
 		id = KFD_FIRST_NONSIGNAL_EVENT_ID;
 		idr_for_each_entry_continue(&p->event_idr, ev, id) {
 			if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
 				ev->hw_exception_data = hw_exception_data;
+				ev->hw_exception_data.gpu_id = user_gpu_id;
 				set_event(ev);
 			}
 			if (ev->type == KFD_EVENT_TYPE_MEMORY &&
 			    reset_cause == KFD_HW_EXCEPTION_ECC) {
 				ev->memory_exception_data = memory_exception_data;
+				ev->memory_exception_data.gpu_id = user_gpu_id;
 				set_event(ev);
 			}
 		}
@@ -1276,18 +1298,25 @@ void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 pasid)
 	struct kfd_hsa_hw_exception_data hw_exception_data;
 	struct kfd_event *ev;
 	uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+	int user_gpu_id;
 
 	if (!p)
 		return; /* Presumably process exited. */
 
+	user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
+	if (unlikely(user_gpu_id == -EINVAL)) {
+		WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", dev->id);
+		return;
+	}
+
 	memset(&hw_exception_data, 0, sizeof(hw_exception_data));
-	hw_exception_data.gpu_id = dev->id;
+	hw_exception_data.gpu_id = user_gpu_id;
 	hw_exception_data.memory_lost = 1;
 	hw_exception_data.reset_cause = KFD_HW_EXCEPTION_ECC;
 
 	memset(&memory_exception_data, 0, sizeof(memory_exception_data));
 	memory_exception_data.ErrorType = KFD_MEM_ERR_POISON_CONSUMED;
-	memory_exception_data.gpu_id = dev->id;
+	memory_exception_data.gpu_id = user_gpu_id;
 	memory_exception_data.failure.imprecise = true;
 
 	mutex_lock(&p->event_mutex);
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -774,6 +774,12 @@ struct kfd_process_device {
 	uint64_t faults;
 	uint64_t page_in;
 	uint64_t page_out;
+	/*
+	 * If this process has been checkpointed before, then the user
+	 * application will use the original gpu_id on the
+	 * checkpointed node to refer to this device.
+	 */
+	uint32_t user_gpu_id;
 };
 
 #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
@@ -933,6 +939,11 @@ int kfd_process_restore_queues(struct kfd_process *p);
 void kfd_suspend_all_processes(void);
 int kfd_resume_all_processes(void);
 
+struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *process,
+							 uint32_t gpu_id);
+
+int kfd_process_get_user_gpu_id(struct kfd_process *p, uint32_t actual_gpu_id);
+
 int kfd_process_device_init_vm(struct kfd_process_device *pdd,
 			       struct file *drm_file);
 struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev,
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1526,6 +1526,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
 	pdd->runtime_inuse = false;
 	pdd->vram_usage = 0;
 	pdd->sdma_past_activity_counter = 0;
+	pdd->user_gpu_id = dev->id;
 	atomic64_set(&pdd->evict_duration_counter, 0);
 	p->pdds[p->n_pdds++] = pdd;
 
@@ -1981,6 +1982,37 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
 	}
 }
 
+struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
+{
+	int i;
+
+	if (gpu_id) {
+		for (i = 0; i < p->n_pdds; i++) {
+			struct kfd_process_device *pdd = p->pdds[i];
+
+			if (pdd->user_gpu_id == gpu_id)
+				return pdd;
+		}
+	}
+	return NULL;
+}
+
+int kfd_process_get_user_gpu_id(struct kfd_process *p, uint32_t actual_gpu_id)
+{
+	int i;
+
+	if (!actual_gpu_id)
+		return 0;
+
+	for (i = 0; i < p->n_pdds; i++) {
+		struct kfd_process_device *pdd = p->pdds[i];
+
+		if (pdd->dev->id == actual_gpu_id)
+			return pdd->user_gpu_id;
+	}
+	return -EINVAL;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
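These two helpers are the whole mapping: kfd_process_device_data_by_id() translates a user-visible gpu_id into the process device data (the ioctl direction), and kfd_process_get_user_gpu_id() translates an actual device id back into the id the application expects (the event direction). A minimal usage sketch, with a hypothetical id value:

	/* ioctl direction: user gpu_id -> device on this node
	 * (0x3f6d is a hypothetical id for illustration only)
	 */
	pdd = kfd_process_device_data_by_id(p, 0x3f6d);
	if (pdd)
		dev = pdd->dev;

	/* event direction: actual id -> id the application expects */
	user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
	if (user_gpu_id != -EINVAL)
		memory_exception_data.gpu_id = user_gpu_id;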
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -610,7 +610,7 @@ static int criu_checkpoint_queue(struct kfd_process_device *pdd,
 	mqd = (void *)(q_data + 1);
 	ctl_stack = mqd + q_data->mqd_size;
 
-	q_data->gpu_id = pdd->dev->id;
+	q_data->gpu_id = pdd->user_gpu_id;
 	q_data->type = q->properties.type;
 	q_data->format = q->properties.format;
 	q_data->q_id =  q->properties.queue_id;
@@ -769,7 +769,6 @@ int kfd_criu_restore_queue(struct kfd_process *p,
 	uint64_t q_extra_data_size;
 	struct queue_properties qp;
 	unsigned int queue_id;
-	struct kfd_dev *dev;
 	int ret = 0;
 
 	if (*priv_data_offset + sizeof(*q_data) > max_priv_data_size)
@@ -807,20 +806,11 @@ int kfd_criu_restore_queue(struct kfd_process *p,
 
 	*priv_data_offset += q_extra_data_size;
 
-	dev = kfd_device_by_id(q_data->gpu_id);
-	if (!dev) {
-		pr_err("Could not get kfd_dev from gpu_id = 0x%x\n",
-		       q_data->gpu_id);
-
-		ret = -EINVAL;
-		goto exit;
-	}
-
-	pdd = kfd_get_process_device_data(dev, p);
+	pdd = kfd_process_device_data_by_id(p, q_data->gpu_id);
 	if (!pdd) {
 		pr_err("Failed to get pdd\n");
-		ret = -EFAULT;
-		return ret;
+		ret = -EINVAL;
+		goto exit;
 	}
 	/* data stored in this order: mqd, ctl_stack */
 	mqd = q_extra_data;