diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 1fe6913242d7..261160d3de9e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -45,7 +45,8 @@ static const struct mmu_interval_notifier_ops svm_range_mn_ops = {
  * svm_range_unlink - unlink svm_range from lists and interval tree
  * @prange: svm range structure to be removed
  *
- * Remove the svm range from svms interval tree and link list
+ * Remove the svm_range from the svms and svm_bo linked lists and from
+ * the svms interval tree.
  *
  * Context: The caller must hold svms->lock
  */
@@ -54,6 +55,12 @@ static void svm_range_unlink(struct svm_range *prange)
 	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms,
 		 prange, prange->start, prange->last);
 
+	if (prange->svm_bo) {
+		spin_lock(&prange->svm_bo->list_lock);
+		list_del(&prange->svm_bo_list);
+		spin_unlock(&prange->svm_bo->list_lock);
+	}
+
 	list_del(&prange->list);
 	if (prange->it_node.start != 0 && prange->it_node.last != 0)
 		interval_tree_remove(&prange->it_node, &prange->svms->objects);
@@ -218,6 +225,7 @@ static void svm_range_free(struct svm_range *prange)
 	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n", prange->svms, prange,
 		 prange->start, prange->last);
 
+	svm_range_vram_node_free(prange);
 	svm_range_free_dma_mappings(prange);
 	mutex_destroy(&prange->lock);
 	kfree(prange);
@@ -252,6 +260,7 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 	INIT_LIST_HEAD(&prange->update_list);
 	INIT_LIST_HEAD(&prange->remove_list);
 	INIT_LIST_HEAD(&prange->insert_list);
+	INIT_LIST_HEAD(&prange->svm_bo_list);
 	INIT_LIST_HEAD(&prange->deferred_list);
 	INIT_LIST_HEAD(&prange->child_list);
 	atomic_set(&prange->invalid, 0);
@@ -265,6 +274,210 @@ svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start,
 	return prange;
 }
 
+static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo)
+{
+	if (!svm_bo || !kref_get_unless_zero(&svm_bo->kref))
+		return false;
+
+	return true;
+}
+
+static struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo *svm_bo)
+{
+	if (svm_bo)
+		kref_get(&svm_bo->kref);
+
+	return svm_bo;
+}
+
+static void svm_range_bo_release(struct kref *kref)
+{
+	struct svm_range_bo *svm_bo;
+
+	svm_bo = container_of(kref, struct svm_range_bo, kref);
+	spin_lock(&svm_bo->list_lock);
+	while (!list_empty(&svm_bo->range_list)) {
+		struct svm_range *prange =
+				list_first_entry(&svm_bo->range_list,
+						struct svm_range, svm_bo_list);
+		/* list_del_init tells a concurrent svm_range_vram_node_new when
+		 * it's safe to reuse the svm_bo pointer and svm_bo_list head.
+		 */
+		list_del_init(&prange->svm_bo_list);
+		spin_unlock(&svm_bo->list_lock);
+
+		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
+			 prange->start, prange->last);
+		mutex_lock(&prange->lock);
+		prange->svm_bo = NULL;
+		mutex_unlock(&prange->lock);
+
+		spin_lock(&svm_bo->list_lock);
+	}
+	spin_unlock(&svm_bo->list_lock);
+
+	amdgpu_bo_unref(&svm_bo->bo);
+	kfree(svm_bo);
+}
+
+static void svm_range_bo_unref(struct svm_range_bo *svm_bo)
+{
+	if (!svm_bo)
+		return;
+
+	kref_put(&svm_bo->kref, svm_range_bo_release);
+}
+
+static struct svm_range_bo *svm_range_bo_new(void)
+{
+	struct svm_range_bo *svm_bo;
+
+	svm_bo = kzalloc(sizeof(*svm_bo), GFP_KERNEL);
+	if (!svm_bo)
+		return NULL;
+
+	kref_init(&svm_bo->kref);
+	INIT_LIST_HEAD(&svm_bo->range_list);
+	spin_lock_init(&svm_bo->list_lock);
+
+	return svm_bo;
+}
+
+int
+svm_range_vram_node_new(struct amdgpu_device *adev, struct svm_range *prange,
+			bool clear)
+{
+	struct amdkfd_process_info *process_info;
+	struct amdgpu_bo_param bp;
+	struct svm_range_bo *svm_bo;
+	struct amdgpu_bo_user *ubo;
+	struct amdgpu_bo *bo;
+	struct kfd_process *p;
+	int r;
+
+	pr_debug("[0x%lx 0x%lx]\n", prange->start, prange->last);
+	mutex_lock(&prange->lock);
+	if (prange->svm_bo) {
+		if (prange->ttm_res) {
+			/* We still have a reference, all is well */
+			mutex_unlock(&prange->lock);
+			return 0;
+		}
+		if (svm_bo_ref_unless_zero(prange->svm_bo)) {
+			/* The BO was still around and we got
+			 * a new reference to it
+			 */
+			mutex_unlock(&prange->lock);
+			pr_debug("reuse old bo [0x%lx 0x%lx]\n",
+				 prange->start, prange->last);
+
+			prange->ttm_res = &prange->svm_bo->bo->tbo.mem;
+			return 0;
+		}
+
+		mutex_unlock(&prange->lock);
+
+		/* We need a new svm_bo. Spin-loop to wait for concurrent
+		 * svm_range_bo_release to finish removing this range from
+		 * its range list. After this, it is safe to reuse the
+		 * svm_bo pointer and svm_bo_list head.
+		 */
+		while (!list_empty_careful(&prange->svm_bo_list))
+			;
+
+	} else {
+		mutex_unlock(&prange->lock);
+	}
+
+	svm_bo = svm_range_bo_new();
+	if (!svm_bo) {
+		pr_debug("failed to alloc svm bo\n");
+		return -ENOMEM;
+	}
+
+	memset(&bp, 0, sizeof(bp));
+	bp.size = prange->npages * PAGE_SIZE;
+	bp.byte_align = PAGE_SIZE;
+	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+	bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
+	bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0;
+	bp.type = ttm_bo_type_device;
+	bp.resv = NULL;
+
+	r = amdgpu_bo_create_user(adev, &bp, &ubo);
+	if (r) {
+		pr_debug("failed %d to create bo\n", r);
+		kfree(svm_bo);
+		return r;
+	}
+	bo = &ubo->bo;
+
+	p = container_of(prange->svms, struct kfd_process, svms);
+	r = amdgpu_bo_reserve(bo, true);
+	if (r) {
+		pr_debug("failed %d to reserve bo\n", r);
+		goto reserve_bo_failed;
+	}
+
+	r = dma_resv_reserve_shared(bo->tbo.base.resv, 1);
+	if (r) {
+		pr_debug("failed %d to reserve bo\n", r);
+		amdgpu_bo_unreserve(bo);
+		goto reserve_bo_failed;
+	}
+	process_info = p->kgd_process_info;
+	amdgpu_bo_fence(bo, &process_info->eviction_fence->base, true);
+
+	amdgpu_bo_unreserve(bo);
+
+	svm_bo->bo = bo;
+	prange->svm_bo = svm_bo;
+	prange->ttm_res = &bo->tbo.mem;
+	prange->offset = 0;
+
+	spin_lock(&svm_bo->list_lock);
+	list_add(&prange->svm_bo_list, &svm_bo->range_list);
+	spin_unlock(&svm_bo->list_lock);
+
+	return 0;
+
+reserve_bo_failed:
+	kfree(svm_bo);
+	amdgpu_bo_unref(&bo);
+	prange->ttm_res = NULL;
+
+	return r;
+}
+
+void svm_range_vram_node_free(struct svm_range *prange)
+{
+	svm_range_bo_unref(prange->svm_bo);
+	prange->ttm_res = NULL;
+}
+
+struct amdgpu_device *
+svm_range_get_adev_by_id(struct svm_range *prange, uint32_t gpu_id)
+{
+	struct kfd_process_device *pdd;
+	struct kfd_process *p;
+	int32_t gpu_idx;
+
+	p = container_of(prange->svms, struct kfd_process, svms);
+
+	gpu_idx = kfd_process_gpuidx_from_gpuid(p, gpu_id);
+	if (gpu_idx < 0) {
+		pr_debug("failed to get device by id 0x%x\n", gpu_id);
+		return NULL;
+	}
+	pdd = kfd_process_device_from_gpuidx(p, gpu_idx);
+	if (!pdd) {
+		pr_debug("failed to get device by idx 0x%x\n", gpu_idx);
+		return NULL;
+	}
+
+	return (struct amdgpu_device *)pdd->dev->kgd;
+}
+
 static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo)
 {
 	struct ttm_operation_ctx ctx = { false, false };
@@ -471,6 +684,32 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old,
 	return 0;
 }
 
+static int
+svm_range_split_nodes(struct svm_range *new, struct svm_range *old,
+		      uint64_t start, uint64_t last)
+{
+	uint64_t npages = last - start + 1;
+
+	pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n",
+		 new->svms, new, new->start, start, last);
+
+	if (new->start == old->start) {
+		new->offset = old->offset;
+		old->offset += new->npages;
+	} else {
+		new->offset = old->offset + npages;
+	}
+
+	new->svm_bo = svm_range_bo_ref(old->svm_bo);
+	new->ttm_res = old->ttm_res;
+
+	spin_lock(&new->svm_bo->list_lock);
+	list_add(&new->svm_bo_list, &new->svm_bo->range_list);
+	spin_unlock(&new->svm_bo->list_lock);
+
+	return 0;
+}
+
 /**
  * svm_range_split_adjust - split range and adjust
  *
@@ -479,7 +718,7 @@ svm_range_split_pages(struct svm_range *new, struct svm_range *old,
  * @start: the old range adjust to start address in pages
  * @last: the old range adjust to last address in pages
  *
- * Copy system memory dma_addr in old range to new
+ * Copy system memory dma_addr or vram ttm_res in old range to new
  * range from new_start up to size new->npages, the remaining old range is from
  * start to last
  *
@@ -505,6 +744,12 @@ svm_range_split_adjust(struct svm_range *new, struct svm_range *old,
 	if (r)
 		return r;
 
+	if (old->actual_loc && old->ttm_res) {
+		r = svm_range_split_nodes(new, old, start, last);
+		if (r)
+			return r;
+	}
+
 	old->npages = last - start + 1;
 	old->start = start;
 	old->last = last;
@@ -619,7 +864,8 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange)
 	uint64_t pte_flags;
 
 	pte_flags = AMDGPU_PTE_VALID;
-	pte_flags |= AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED;
+	if (!prange->ttm_res)
+		pte_flags |= AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED;
 
 	mapping_flags = AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE;
 
@@ -639,7 +885,9 @@ svm_range_get_pte_flags(struct amdgpu_device *adev, struct svm_range *prange)
 	/* Apply ASIC specific mapping flags */
 	amdgpu_gmc_get_vm_pte(adev, &prange->mapping, &pte_flags);
 
-	pr_debug("PTE flags 0x%llx\n", pte_flags);
+	pr_debug("svms 0x%p [0x%lx 0x%lx] vram %d PTE flags 0x%llx\n",
+		 prange->svms, prange->start, prange->last,
+		 prange->ttm_res ? 1:0, pte_flags);
 
 	return pte_flags;
 }
@@ -715,13 +963,15 @@ svm_range_map_to_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 
 	prange->mapping.start = prange->start;
 	prange->mapping.last = prange->last;
-	prange->mapping.offset = 0;
+	prange->mapping.offset = prange->offset;
 	pte_flags = svm_range_get_pte_flags(adev, prange);
 
 	r = amdgpu_vm_bo_update_mapping(adev, adev, vm, false, false, NULL,
 					prange->mapping.start,
 					prange->mapping.last, pte_flags,
-					prange->mapping.offset, NULL,
+					prange->mapping.offset,
+					prange->ttm_res ?
+						prange->ttm_res->mm_node : NULL,
 					dma_addr, &vm->last_update);
 	if (r) {
 		pr_debug("failed %d to map to gpu 0x%lx\n", r, prange->start);
@@ -817,6 +1067,11 @@ static int svm_range_reserve_bos(struct svm_validate_context *ctx)
 		ctx->tv[gpuidx].num_shared = 4;
 		list_add(&ctx->tv[gpuidx].head, &ctx->validate_list);
 	}
+	if (ctx->prange->svm_bo && ctx->prange->ttm_res) {
+		ctx->tv[MAX_GPU_INSTANCE].bo = &ctx->prange->svm_bo->bo->tbo;
+		ctx->tv[MAX_GPU_INSTANCE].num_shared = 1;
+		list_add(&ctx->tv[MAX_GPU_INSTANCE].head, &ctx->validate_list);
+	}
 
 	r = ttm_eu_reserve_buffers(&ctx->ticket, &ctx->validate_list,
 				   ctx->intr, NULL);
@@ -901,6 +1156,14 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
 	if (bitmap_empty(ctx.bitmap, MAX_GPU_INSTANCE))
 		return 0;
 
+	if (prange->actual_loc && !prange->ttm_res) {
+		/* This should never happen. actual_loc gets set by
+		 * svm_migrate_ram_to_vram after allocating a BO.
+		 */
+		WARN(1, "VRAM BO missing during validation\n");
+		return -EINVAL;
+	}
+
 	svm_range_reserve_bos(&ctx);
 
 	if (!prange->actual_loc) {
@@ -1098,6 +1361,14 @@ static struct svm_range *svm_range_clone(struct svm_range *old)
 	if (!new)
 		return NULL;
 
+	if (old->svm_bo) {
+		new->ttm_res = old->ttm_res;
+		new->offset = old->offset;
+		new->svm_bo = svm_range_bo_ref(old->svm_bo);
+		spin_lock(&new->svm_bo->list_lock);
+		list_add(&new->svm_bo_list, &new->svm_bo->range_list);
+		spin_unlock(&new->svm_bo->list_lock);
+	}
 	new->flags = old->flags;
 	new->preferred_loc = old->preferred_loc;
 	new->prefetch_loc = old->prefetch_loc;
@@ -1507,12 +1778,23 @@ svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
 
 void svm_range_list_fini(struct kfd_process *p)
 {
-	mutex_destroy(&p->svms.lock);
+	struct svm_range *prange;
+	struct svm_range *next;
 
 	pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
 
 	/* Ensure list work is finished before process is destroyed */
 	flush_work(&p->svms.deferred_list_work);
+
+	list_for_each_entry_safe(prange, next, &p->svms.list, list) {
+		svm_range_unlink(prange);
+		svm_range_remove_notifier(prange);
+		svm_range_free(prange);
+	}
+
+	mutex_destroy(&p->svms.lock);
+
+	pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
 }
 
 int svm_range_list_init(struct kfd_process *p)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 3c94899c5c40..0aab88c71855 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -33,6 +33,13 @@
 #include "amdgpu.h"
 #include "kfd_priv.h"
 
+struct svm_range_bo {
+	struct amdgpu_bo		*bo;
+	struct kref			kref;
+	struct list_head		range_list; /* all svm ranges sharing this bo */
+	spinlock_t			list_lock;
+};
+
 enum svm_work_list_ops {
 	SVM_OP_NULL,
 	SVM_OP_UNMAP_RANGE,
@@ -60,6 +67,10 @@ struct svm_work_list_item {
  * @mapping: bo_va mapping structure to create and update GPU page table
  * @npages: number of pages
  * @dma_addr: dma mapping address on each GPU for system memory physical page
+ * @ttm_res: vram ttm resource map
+ * @offset: range start offset within mm_nodes
+ * @svm_bo: struct to manage the split amdgpu_bo
+ * @svm_bo_list: link list node, to scan all ranges which share the same svm_bo
 * @lock: protect prange start, last, child_list, svm_bo_list
 * @saved_flags:save/restore current PF_MEMALLOC flags
 * @flags: flags defined as KFD_IOCTL_SVM_FLAG_*
@@ -91,6 +102,10 @@ struct svm_range {
 	struct amdgpu_bo_va_mapping mapping;
 	uint64_t			npages;
 	dma_addr_t			*dma_addr[MAX_GPU_INSTANCE];
+	struct ttm_resource		*ttm_res;
+	uint64_t			offset;
+	struct svm_range_bo		*svm_bo;
+	struct list_head		svm_bo_list;
 	struct mutex			lock;
 	unsigned int			saved_flags;
 	uint32_t			flags;
@@ -124,5 +139,10 @@ void svm_range_list_fini(struct kfd_process *p);
 int svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start,
 	      uint64_t size, uint32_t nattrs,
 	      struct kfd_ioctl_svm_attribute *attrs);
+struct amdgpu_device *svm_range_get_adev_by_id(struct svm_range *prange,
+					       uint32_t id);
+int svm_range_vram_node_new(struct amdgpu_device *adev,
+			    struct svm_range *prange, bool clear);
+void svm_range_vram_node_free(struct svm_range *prange);
 
 #endif /* KFD_SVM_H_ */
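
Two notes on the lifetime scheme above, each with a small stand-alone sketch.

The core of svm_range_vram_node_new is its race against svm_range_bo_release: revive the shared svm_bo with a kref_get_unless_zero-style lookup if it is still alive, otherwise spin until the releaser has unlinked the range (the list_del_init handshake) before allocating a fresh BO. The following user-space C model is a minimal sketch of that handshake, assuming only C11 atomics; every name in it (range_bo, bo_get_unless_zero, range_get_bo) is invented for illustration and none of it is driver code. It also simplifies to one range per BO, where the real svm_bo keeps a whole range_list, and it compresses the release ordering into clear-pointer-then-publish-unlink.

/* Hypothetical stand-alone model of the svm_bo reuse/release handshake.
 * Not driver code. Build with: cc -std=c11 -o model model.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct range_bo {
	atomic_int refcount;		/* stands in for svm_bo->kref */
};

struct range {
	struct range_bo *bo;		/* stands in for prange->svm_bo */
	atomic_bool on_bo_list;		/* stands in for !list_empty(&prange->svm_bo_list) */
};

/* kref_get_unless_zero analogue: take a reference only while the object
 * is not already being torn down.
 */
static bool bo_get_unless_zero(struct range_bo *bo)
{
	int old = atomic_load(&bo->refcount);

	while (old > 0)
		if (atomic_compare_exchange_weak(&bo->refcount, &old, old + 1))
			return true;
	return false;
}

/* Release side: on the final put, clear the back-pointer, then publish the
 * unlink (the model's list_del_init) so a spinning allocator may proceed,
 * then free the object through the pointer captured earlier.
 */
static void bo_put(struct range *r)
{
	struct range_bo *bo = r->bo;

	if (atomic_fetch_sub(&bo->refcount, 1) == 1) {
		r->bo = NULL;
		atomic_store(&r->on_bo_list, false);
		free(bo);
	}
}

/* Allocation side, mirroring the top of svm_range_vram_node_new: revive the
 * old bo if possible, otherwise wait out a concurrent release and allocate.
 */
static void range_get_bo(struct range *r)
{
	if (r->bo && bo_get_unless_zero(r->bo))
		return;			/* the "reuse old bo" path */

	while (atomic_load(&r->on_bo_list))
		;			/* the list_empty_careful() spin-wait */

	r->bo = calloc(1, sizeof(*r->bo));	/* allocation failure unchecked in this sketch */
	atomic_init(&r->bo->refcount, 1);
	atomic_store(&r->on_bo_list, true);
}

int main(void)
{
	struct range r = { 0 };

	range_get_bo(&r);		/* creates the bo, refcount 1 */
	range_get_bo(&r);		/* revives it, refcount 2 */
	bo_put(&r);
	bo_put(&r);			/* final put unlinks and frees */
	printf("bo %s\n", r.bo ? "still alive" : "released");
	return 0;
}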
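The offset arithmetic in svm_range_split_nodes is also worth a worked example: when a range backed by one VRAM BO is split, both halves keep the same svm_bo and ttm_res, and only @offset distinguishes which slice of the BO each half maps (svm_range_map_to_gpu then feeds it through prange->mapping.offset). Below is a hedged sketch of just that bookkeeping; toy_range and split_offsets are invented names, and start/last are the old range's post-split bounds, matching how svm_range_split_adjust calls it.

/* Hypothetical model of the svm_range_split_nodes offset bookkeeping. */
#include <assert.h>
#include <stdint.h>

struct toy_range {
	uint64_t start, last;	/* page range covered, inclusive */
	uint64_t npages;	/* pages in this range after the split */
	uint64_t offset;	/* first page within the shared vram bo */
};

/* Mirror of the offset logic: 'old' keeps [start, last], 'new' takes the
 * rest. If 'new' is the head piece it inherits old->offset and pushes
 * old->offset up; otherwise 'new' begins right after old's remaining pages.
 */
static void split_offsets(struct toy_range *new, struct toy_range *old,
			  uint64_t start, uint64_t last)
{
	uint64_t npages = last - start + 1;	/* old's size after the split */

	if (new->start == old->start) {
		new->offset = old->offset;
		old->offset += new->npages;
	} else {
		new->offset = old->offset + npages;
	}
}

int main(void)
{
	/* old covered pages [0x100, 0x1ff] at bo offset 0; the tail
	 * [0x180, 0x1ff] is split off and old keeps [0x100, 0x17f].
	 */
	struct toy_range old = { 0x100, 0x17f, 0x80, 0 };
	struct toy_range new = { 0x180, 0x1ff, 0x80, 0 };

	split_offsets(&new, &old, 0x100, 0x17f);
	assert(old.offset == 0 && new.offset == 0x80);
	return 0;
}

With these numbers the head keeps offset 0 and the tail starts 0x80 pages into the BO; in the head-split case the new head would inherit offset 0 and the remaining old range would be pushed up by new->npages instead.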