habanalabs: add device memory scrub ability through debugfs
Add the ability to scrub the device memory with a given value. Add a debugfs file 'memory_scrub_val' to set the value and a file 'memory_scrub' to scrub the dram. This is very helpful during automated tests, when you want the CI system to randomize the memory before training certain DL topologies. Signed-off-by: Dafna Hirschfeld <dhirschfeld@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
parent
829ec038c9
commit
0688474eda
6 changed files with 93 additions and 8 deletions
|
@ -170,6 +170,20 @@ KernelVersion: 5.1
|
||||||
Contact: ogabbay@kernel.org
|
Contact: ogabbay@kernel.org
|
||||||
Description: Sets the state of the third S/W led on the device
|
Description: Sets the state of the third S/W led on the device
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/habanalabs/hl<n>/memory_scrub
|
||||||
|
Date: May 2022
|
||||||
|
KernelVersion: 5.19
|
||||||
|
Contact: dhirschfeld@habana.ai
|
||||||
|
Description: Allows the root user to scrub the dram memory. The scrubbing
|
||||||
|
value can be set using the debugfs file memory_scrub_val.
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/habanalabs/hl<n>/memory_scrub_val
|
||||||
|
Date: May 2022
|
||||||
|
KernelVersion: 5.19
|
||||||
|
Contact: dhirschfeld@habana.ai
|
||||||
|
Description: The value to which the dram will be set when the user
|
||||||
|
scrubs the dram using 'memory_scrub' debugfs file
|
||||||
|
|
||||||
What: /sys/kernel/debug/habanalabs/hl<n>/mmu
|
What: /sys/kernel/debug/habanalabs/hl<n>/mmu
|
||||||
Date: Jan 2019
|
Date: Jan 2019
|
||||||
KernelVersion: 5.1
|
KernelVersion: 5.1
|
||||||
|
|
|
@ -538,6 +538,39 @@ static int engines_show(struct seq_file *s, void *data)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static ssize_t hl_memory_scrub(struct file *f, const char __user *buf,
|
||||||
|
size_t count, loff_t *ppos)
|
||||||
|
{
|
||||||
|
struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
|
||||||
|
struct hl_device *hdev = entry->hdev;
|
||||||
|
u64 val = entry->memory_scrub_val;
|
||||||
|
int rc;
|
||||||
|
|
||||||
|
if (!hl_device_operational(hdev, NULL)) {
|
||||||
|
dev_warn_ratelimited(hdev->dev, "Can't scrub memory, device is not operational\n");
|
||||||
|
return -EIO;
|
||||||
|
}
|
||||||
|
|
||||||
|
mutex_lock(&hdev->fpriv_list_lock);
|
||||||
|
if (hdev->is_compute_ctx_active) {
|
||||||
|
mutex_unlock(&hdev->fpriv_list_lock);
|
||||||
|
dev_err(hdev->dev, "can't scrub dram, context exist\n");
|
||||||
|
return -EBUSY;
|
||||||
|
}
|
||||||
|
hdev->is_in_dram_scrub = true;
|
||||||
|
mutex_unlock(&hdev->fpriv_list_lock);
|
||||||
|
|
||||||
|
rc = hdev->asic_funcs->scrub_device_dram(hdev, val);
|
||||||
|
|
||||||
|
mutex_lock(&hdev->fpriv_list_lock);
|
||||||
|
hdev->is_in_dram_scrub = false;
|
||||||
|
mutex_unlock(&hdev->fpriv_list_lock);
|
||||||
|
|
||||||
|
if (rc)
|
||||||
|
return rc;
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
|
static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
|
||||||
{
|
{
|
||||||
struct asic_fixed_properties *prop = &hdev->asic_prop;
|
struct asic_fixed_properties *prop = &hdev->asic_prop;
|
||||||
|
@ -1316,6 +1349,11 @@ static ssize_t hl_timeout_locked_write(struct file *f, const char __user *buf,
|
||||||
return count;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const struct file_operations hl_mem_scrub_fops = {
|
||||||
|
.owner = THIS_MODULE,
|
||||||
|
.write = hl_memory_scrub,
|
||||||
|
};
|
||||||
|
|
||||||
static const struct file_operations hl_data32b_fops = {
|
static const struct file_operations hl_data32b_fops = {
|
||||||
.owner = THIS_MODULE,
|
.owner = THIS_MODULE,
|
||||||
.read = hl_data_read32,
|
.read = hl_data_read32,
|
||||||
|
@ -1475,6 +1513,17 @@ void hl_debugfs_add_device(struct hl_device *hdev)
|
||||||
dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
|
dev_entry->root = debugfs_create_dir(dev_name(hdev->dev),
|
||||||
hl_debug_root);
|
hl_debug_root);
|
||||||
|
|
||||||
|
debugfs_create_x64("memory_scrub_val",
|
||||||
|
0644,
|
||||||
|
dev_entry->root,
|
||||||
|
&dev_entry->memory_scrub_val);
|
||||||
|
|
||||||
|
debugfs_create_file("memory_scrub",
|
||||||
|
0200,
|
||||||
|
dev_entry->root,
|
||||||
|
dev_entry,
|
||||||
|
&hl_mem_scrub_fops);
|
||||||
|
|
||||||
debugfs_create_x64("addr",
|
debugfs_create_x64("addr",
|
||||||
0644,
|
0644,
|
||||||
dev_entry->root,
|
dev_entry->root,
|
||||||
|
|
|
@ -1246,6 +1246,7 @@ struct fw_load_mgr {
|
||||||
* its implementation is not trivial when the driver
|
* its implementation is not trivial when the driver
|
||||||
* is loaded in simulation mode (not upstreamed).
|
* is loaded in simulation mode (not upstreamed).
|
||||||
* @scrub_device_mem: Scrub device memory given an address and size
|
* @scrub_device_mem: Scrub device memory given an address and size
|
||||||
|
* @scrub_device_dram: Scrub the dram memory of the device.
|
||||||
* @get_int_queue_base: get the internal queue base address.
|
* @get_int_queue_base: get the internal queue base address.
|
||||||
* @test_queues: run simple test on all queues for sanity check.
|
* @test_queues: run simple test on all queues for sanity check.
|
||||||
* @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
|
* @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
|
||||||
|
@ -1357,6 +1358,7 @@ struct hl_asic_funcs {
|
||||||
void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size,
|
void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size,
|
||||||
void *cpu_addr, dma_addr_t dma_handle);
|
void *cpu_addr, dma_addr_t dma_handle);
|
||||||
int (*scrub_device_mem)(struct hl_device *hdev, u64 addr, u64 size);
|
int (*scrub_device_mem)(struct hl_device *hdev, u64 addr, u64 size);
|
||||||
|
int (*scrub_device_dram)(struct hl_device *hdev, u64 val);
|
||||||
void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
|
void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
|
||||||
dma_addr_t *dma_handle, u16 *queue_len);
|
dma_addr_t *dma_handle, u16 *queue_len);
|
||||||
int (*test_queues)(struct hl_device *hdev);
|
int (*test_queues)(struct hl_device *hdev);
|
||||||
|
@ -2011,6 +2013,7 @@ struct hl_debugfs_entry {
|
||||||
* @addr: next address to read/write from/to in read/write32.
|
* @addr: next address to read/write from/to in read/write32.
|
||||||
* @mmu_addr: next virtual address to translate to physical address in mmu_show.
|
* @mmu_addr: next virtual address to translate to physical address in mmu_show.
|
||||||
* @userptr_lookup: the target user ptr to look up for on demand.
|
* @userptr_lookup: the target user ptr to look up for on demand.
|
||||||
|
* @memory_scrub_val: the value to which the dram will be scrubbed by the scrub_device_dram callback
|
||||||
* @mmu_asid: ASID to use while translating in mmu_show.
|
* @mmu_asid: ASID to use while translating in mmu_show.
|
||||||
* @state_dump_head: index of the latest state dump
|
* @state_dump_head: index of the latest state dump
|
||||||
* @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
|
* @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
|
||||||
|
@ -2041,6 +2044,7 @@ struct hl_dbg_device_entry {
|
||||||
u64 addr;
|
u64 addr;
|
||||||
u64 mmu_addr;
|
u64 mmu_addr;
|
||||||
u64 userptr_lookup;
|
u64 userptr_lookup;
|
||||||
|
u64 memory_scrub_val;
|
||||||
u32 mmu_asid;
|
u32 mmu_asid;
|
||||||
u32 state_dump_head;
|
u32 state_dump_head;
|
||||||
u8 i2c_bus;
|
u8 i2c_bus;
|
||||||
|
@ -2704,6 +2708,7 @@ struct hl_reset_info {
|
||||||
* @id_control: minor of the control device
|
* @id_control: minor of the control device
|
||||||
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
|
* @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
|
||||||
* addresses.
|
* addresses.
|
||||||
|
* @is_in_dram_scrub: true if dram scrub operation is ongoing.
|
||||||
* @disabled: is device disabled.
|
* @disabled: is device disabled.
|
||||||
* @late_init_done: is late init stage was done during initialization.
|
* @late_init_done: is late init stage was done during initialization.
|
||||||
* @hwmon_initialized: is H/W monitor sensors was initialized.
|
* @hwmon_initialized: is H/W monitor sensors was initialized.
|
||||||
|
@ -2834,6 +2839,7 @@ struct hl_device {
|
||||||
u16 id;
|
u16 id;
|
||||||
u16 id_control;
|
u16 id_control;
|
||||||
u16 cpu_pci_msb_addr;
|
u16 cpu_pci_msb_addr;
|
||||||
|
u8 is_in_dram_scrub;
|
||||||
u8 disabled;
|
u8 disabled;
|
||||||
u8 late_init_done;
|
u8 late_init_done;
|
||||||
u8 hwmon_initialized;
|
u8 hwmon_initialized;
|
||||||
|
|
|
@ -158,6 +158,14 @@ int hl_device_open(struct inode *inode, struct file *filp)
|
||||||
goto out_err;
|
goto out_err;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (hdev->is_in_dram_scrub) {
|
||||||
|
dev_dbg_ratelimited(hdev->dev,
|
||||||
|
"Can't open %s during dram scrub\n",
|
||||||
|
dev_name(hdev->dev));
|
||||||
|
rc = -EAGAIN;
|
||||||
|
goto out_err;
|
||||||
|
}
|
||||||
|
|
||||||
if (hdev->compute_ctx_in_release) {
|
if (hdev->compute_ctx_in_release) {
|
||||||
dev_dbg_ratelimited(hdev->dev,
|
dev_dbg_ratelimited(hdev->dev,
|
||||||
"Can't open %s because another user is still releasing it\n",
|
"Can't open %s because another user is still releasing it\n",
|
||||||
|
|
|
@ -4740,12 +4740,11 @@ static void gaudi_dma_free_coherent(struct hl_device *hdev, size_t size,
|
||||||
dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, fixed_dma_handle);
|
dma_free_coherent(&hdev->pdev->dev, size, cpu_addr, fixed_dma_handle);
|
||||||
}
|
}
|
||||||
|
|
||||||
static int gaudi_hbm_scrubbing(struct hl_device *hdev)
|
static int gaudi_scrub_device_dram(struct hl_device *hdev, u64 val)
|
||||||
{
|
{
|
||||||
struct asic_fixed_properties *prop = &hdev->asic_prop;
|
struct asic_fixed_properties *prop = &hdev->asic_prop;
|
||||||
u64 cur_addr = DRAM_BASE_ADDR_USER;
|
u64 cur_addr = DRAM_BASE_ADDR_USER;
|
||||||
u32 val;
|
u32 chunk_size, busy;
|
||||||
u32 chunk_size;
|
|
||||||
int rc, dma_id;
|
int rc, dma_id;
|
||||||
|
|
||||||
while (cur_addr < prop->dram_end_address) {
|
while (cur_addr < prop->dram_end_address) {
|
||||||
|
@ -4759,8 +4758,10 @@ static int gaudi_hbm_scrubbing(struct hl_device *hdev)
|
||||||
"Doing HBM scrubbing for 0x%09llx - 0x%09llx\n",
|
"Doing HBM scrubbing for 0x%09llx - 0x%09llx\n",
|
||||||
cur_addr, cur_addr + chunk_size);
|
cur_addr, cur_addr + chunk_size);
|
||||||
|
|
||||||
WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset, 0xdeadbeaf);
|
WREG32(mmDMA0_CORE_SRC_BASE_LO + dma_offset,
|
||||||
WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset, 0xdeadbeaf);
|
lower_32_bits(val));
|
||||||
|
WREG32(mmDMA0_CORE_SRC_BASE_HI + dma_offset,
|
||||||
|
upper_32_bits(val));
|
||||||
WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset,
|
WREG32(mmDMA0_CORE_DST_BASE_LO + dma_offset,
|
||||||
lower_32_bits(cur_addr));
|
lower_32_bits(cur_addr));
|
||||||
WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset,
|
WREG32(mmDMA0_CORE_DST_BASE_HI + dma_offset,
|
||||||
|
@ -4783,8 +4784,8 @@ static int gaudi_hbm_scrubbing(struct hl_device *hdev)
|
||||||
rc = hl_poll_timeout(
|
rc = hl_poll_timeout(
|
||||||
hdev,
|
hdev,
|
||||||
mmDMA0_CORE_STS0 + dma_offset,
|
mmDMA0_CORE_STS0 + dma_offset,
|
||||||
val,
|
busy,
|
||||||
((val & DMA0_CORE_STS0_BUSY_MASK) == 0),
|
((busy & DMA0_CORE_STS0_BUSY_MASK) == 0),
|
||||||
1000,
|
1000,
|
||||||
HBM_SCRUBBING_TIMEOUT_US);
|
HBM_SCRUBBING_TIMEOUT_US);
|
||||||
|
|
||||||
|
@ -4838,7 +4839,7 @@ static int gaudi_scrub_device_mem(struct hl_device *hdev, u64 addr, u64 size)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Scrub HBM using all DMA channels in parallel */
|
/* Scrub HBM using all DMA channels in parallel */
|
||||||
rc = gaudi_hbm_scrubbing(hdev);
|
rc = gaudi_scrub_device_dram(hdev, 0xdeadbeaf);
|
||||||
if (rc)
|
if (rc)
|
||||||
dev_err(hdev->dev,
|
dev_err(hdev->dev,
|
||||||
"Failed to clear HBM in mem scrub all\n");
|
"Failed to clear HBM in mem scrub all\n");
|
||||||
|
@ -9208,6 +9209,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
|
||||||
.asic_dma_alloc_coherent = gaudi_dma_alloc_coherent,
|
.asic_dma_alloc_coherent = gaudi_dma_alloc_coherent,
|
||||||
.asic_dma_free_coherent = gaudi_dma_free_coherent,
|
.asic_dma_free_coherent = gaudi_dma_free_coherent,
|
||||||
.scrub_device_mem = gaudi_scrub_device_mem,
|
.scrub_device_mem = gaudi_scrub_device_mem,
|
||||||
|
.scrub_device_dram = gaudi_scrub_device_dram,
|
||||||
.get_int_queue_base = gaudi_get_int_queue_base,
|
.get_int_queue_base = gaudi_get_int_queue_base,
|
||||||
.test_queues = gaudi_test_queues,
|
.test_queues = gaudi_test_queues,
|
||||||
.asic_dma_pool_zalloc = gaudi_dma_pool_zalloc,
|
.asic_dma_pool_zalloc = gaudi_dma_pool_zalloc,
|
||||||
|
|
|
@ -5434,6 +5434,11 @@ static int goya_mmu_prefetch_cache_range(struct hl_device *hdev, u32 flags, u32
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int goya_scrub_device_dram(struct hl_device *hdev, u64 val)
|
||||||
|
{
|
||||||
|
return -EOPNOTSUPP;
|
||||||
|
}
|
||||||
|
|
||||||
static const struct hl_asic_funcs goya_funcs = {
|
static const struct hl_asic_funcs goya_funcs = {
|
||||||
.early_init = goya_early_init,
|
.early_init = goya_early_init,
|
||||||
.early_fini = goya_early_fini,
|
.early_fini = goya_early_fini,
|
||||||
|
@ -5452,6 +5457,7 @@ static const struct hl_asic_funcs goya_funcs = {
|
||||||
.asic_dma_alloc_coherent = goya_dma_alloc_coherent,
|
.asic_dma_alloc_coherent = goya_dma_alloc_coherent,
|
||||||
.asic_dma_free_coherent = goya_dma_free_coherent,
|
.asic_dma_free_coherent = goya_dma_free_coherent,
|
||||||
.scrub_device_mem = goya_scrub_device_mem,
|
.scrub_device_mem = goya_scrub_device_mem,
|
||||||
|
.scrub_device_dram = goya_scrub_device_dram,
|
||||||
.get_int_queue_base = goya_get_int_queue_base,
|
.get_int_queue_base = goya_get_int_queue_base,
|
||||||
.test_queues = goya_test_queues,
|
.test_queues = goya_test_queues,
|
||||||
.asic_dma_pool_zalloc = goya_dma_pool_zalloc,
|
.asic_dma_pool_zalloc = goya_dma_pool_zalloc,
|
||||||
|
|
Loading…
Add table
Reference in a new issue