Engine busyness samples taken over a 10ms period fail with busyness
ranging from approximately 87% to 115%, as shown below. The expected
range is +/- 5% of the sample period. The test fails about 10% of the
time.
rcs0: reported 11716042ns [91%] busyness while spinning [for 12805719ns]
When determining the busyness of an active engine, the GuC-based engine
busyness implementation relies on a 64-bit timestamp register read. The
latency incurred by this register read causes the failure.
On DG1, when the test fails, the observed latencies range from 900us to
1.5ms.
Optimizing the 2x32 read by acquiring the lock and forcewake prior to
all register reads reduces the failure rate to around 2%, but does not
eliminate it.
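For illustration only, a minimal sketch of such an optimized 2x32 read,
assuming the stock uncore forcewake/lock API; the helper name and exact
structure here are hypothetical, not the code from this patch:

static u64 read_timestamp_2x32(struct intel_uncore *uncore,
			       i915_reg_t lo_reg, i915_reg_t hi_reg)
{
	unsigned long flags;
	u32 lo, hi, old_hi;

	/*
	 * Take forcewake and the uncore lock once, up front, so that
	 * neither 32-bit read pays the wake latency individually.
	 * (Hypothetical helper; illustrates the pattern only.)
	 */
	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
	spin_lock_irqsave(&uncore->lock, flags);

	hi = intel_uncore_read_fw(uncore, hi_reg);
	do {
		old_hi = hi;
		lo = intel_uncore_read_fw(uncore, lo_reg);
		hi = intel_uncore_read_fw(uncore, hi_reg);
	} while (hi != old_hi); /* retry if the low dword wrapped */

	spin_unlock_irqrestore(&uncore->lock, flags);
	intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL);

	return ((u64)hi << 32) | lo;
}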
In order to make the selftest more robust and always account for such
latencies, increase the sample period to 100ms. At a 10ms sample period
a worst-case 1.5ms read latency alone amounts to a 15% error,
consistent with the observed 87% to 115% spread; at 100ms the same
latency contributes only 1.5%. This eliminates the issue as seen in
1000 runs.
v2: (Ashutosh)
- Add error to commit msg
- Include gitlab bug
- Update commit for inclusion of 2x32 optimized read
Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4418
Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20221110171913.670286-3-umesh.nerlige.ramappa@intel.com
(cherry picked from commit 529d95a606)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright © 2018 Intel Corporation
 */

#include <linux/sort.h>

#include "i915_selftest.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt_clock_utils.h"
#include "selftest_engine.h"
#include "selftest_engine_heartbeat.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_spinner.h"

#define COUNT 5
/* Compare explicitly: truncating the u64 difference to int can flip the sign. */
static int cmp_u64(const void *A, const void *B)
{
	const u64 *a = A, *b = B;

	if (*a < *b)
		return -1;
	else if (*a > *b)
		return 1;
	else
		return 0;
}

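/*
 * With COUNT == 5 samples: sort, drop the extremes and return a weighted
 * average biased towards the median, (a[1] + 2*a[2] + a[3]) / 4.
 */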
static u64 trifilter(u64 *a)
{
	sort(a, COUNT, sizeof(*a), cmp_u64, NULL);
	return (a[1] + 2 * a[2] + a[3]) >> 2;
}

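/*
 * Ring emission helpers: a MI_SEMAPHORE_WAIT poll on a GGTT dword, an
 * immediate dword store, and a register snapshot (SRM) into the GGTT.
 */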
static u32 *emit_wait(u32 *cs, u32 offset, int op, u32 value)
{
	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		op;
	*cs++ = value;
	*cs++ = offset;
	*cs++ = 0;

	return cs;
}

static u32 *emit_store(u32 *cs, u32 offset, u32 value)
{
	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = offset;
	*cs++ = 0;
	*cs++ = value;

	return cs;
}

static u32 *emit_srm(u32 *cs, i915_reg_t reg, u32 offset)
{
	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = i915_mmio_reg_offset(reg);
	*cs++ = offset;
	*cs++ = 0;

	return cs;
}

static void write_semaphore(u32 *x, u32 value)
{
	WRITE_ONCE(*x, value);
	wmb();
}

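/*
 * __measure_timestamps() uses five dwords of scratch in the status page:
 * sema[0..4] alias the dwords the batch accesses at GGTT offset + 4000..4016:
 * [0] RING_TIMESTAMP at start, [1] CTX_TIMESTAMP at start,
 * [2] CPU<->GPU handshake semaphore, [3] CTX_TIMESTAMP at end,
 * [4] RING_TIMESTAMP at end.
 */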
static int __measure_timestamps(struct intel_context *ce,
				u64 *dt, u64 *d_ring, u64 *d_ctx)
{
	struct intel_engine_cs *engine = ce->engine;
	u32 *sema = memset32(engine->status_page.addr + 1000, 0, 5);
	u32 offset = i915_ggtt_offset(engine->status_page.vma);
	struct i915_request *rq;
	u32 *cs;

	rq = intel_context_create_request(ce);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	cs = intel_ring_begin(rq, 28);
	if (IS_ERR(cs)) {
		i915_request_add(rq);
		return PTR_ERR(cs);
	}

	/* Signal & wait for start */
	cs = emit_store(cs, offset + 4008, 1);
	cs = emit_wait(cs, offset + 4008, MI_SEMAPHORE_SAD_NEQ_SDD, 1);

	cs = emit_srm(cs, RING_TIMESTAMP(engine->mmio_base), offset + 4000);
	cs = emit_srm(cs, RING_CTX_TIMESTAMP(engine->mmio_base), offset + 4004);

	/* Busy wait */
	cs = emit_wait(cs, offset + 4008, MI_SEMAPHORE_SAD_EQ_SDD, 1);

	cs = emit_srm(cs, RING_TIMESTAMP(engine->mmio_base), offset + 4016);
	cs = emit_srm(cs, RING_CTX_TIMESTAMP(engine->mmio_base), offset + 4012);

	intel_ring_advance(rq, cs);
	i915_request_get(rq);
	i915_request_add(rq);
	intel_engine_flush_submission(engine);

	/* Wait for the request to start executing, that then waits for us */
	while (READ_ONCE(sema[2]) == 0)
		cpu_relax();

	/* Run the request for ~100us, sampling timestamps before/after */
	local_irq_disable();
	write_semaphore(&sema[2], 0);
	while (READ_ONCE(sema[1]) == 0) /* wait for the gpu to catch up */
		cpu_relax();
	*dt = local_clock();
	udelay(100);
	*dt = local_clock() - *dt;
	write_semaphore(&sema[2], 1);
	local_irq_enable();

	if (i915_request_wait(rq, 0, HZ / 2) < 0) {
		i915_request_put(rq);
		return -ETIME;
	}
	i915_request_put(rq);

	pr_debug("%s CTX_TIMESTAMP: [%x, %x], RING_TIMESTAMP: [%x, %x]\n",
		 engine->name, sema[1], sema[3], sema[0], sema[4]);

	*d_ctx = sema[3] - sema[1];
	*d_ring = sema[4] - sema[0];
	return 0;
}

static int __live_engine_timestamps(struct intel_engine_cs *engine)
{
	u64 s_ring[COUNT], s_ctx[COUNT], st[COUNT], d_ring, d_ctx, dt;
	struct intel_context *ce;
	int i, err = 0;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	for (i = 0; i < COUNT; i++) {
		err = __measure_timestamps(ce, &st[i], &s_ring[i], &s_ctx[i]);
		if (err)
			break;
	}
	intel_context_put(ce);
	if (err)
		return err;

	dt = trifilter(st);
	d_ring = trifilter(s_ring);
	d_ctx = trifilter(s_ctx);

	pr_info("%s elapsed:%lldns, CTX_TIMESTAMP:%lldns, RING_TIMESTAMP:%lldns\n",
		engine->name, dt,
		intel_gt_clock_interval_to_ns(engine->gt, d_ctx),
		intel_gt_clock_interval_to_ns(engine->gt, d_ring));

	d_ring = intel_gt_clock_interval_to_ns(engine->gt, d_ring);
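	/* Fail if walltime and ring time disagree by more than a 4:3 ratio. */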
	if (3 * dt > 4 * d_ring || 4 * dt < 3 * d_ring) {
		pr_err("%s Mismatch between ring timestamp and walltime!\n",
		       engine->name);
		return -EINVAL;
	}

	d_ring = trifilter(s_ring);
	d_ctx = trifilter(s_ctx);

	d_ctx *= engine->gt->clock_frequency;
	if (GRAPHICS_VER(engine->i915) == 11)
		d_ring *= 12500000; /* Fixed 80ns for GEN11 ctx timestamp? */
	else
		d_ring *= engine->gt->clock_frequency;

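	/* Scaled to a common unit, ctx and ring time must also agree to within 4:3. */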
	if (3 * d_ctx > 4 * d_ring || 4 * d_ctx < 3 * d_ring) {
		pr_err("%s Mismatch between ring and context timestamps!\n",
		       engine->name);
		return -EINVAL;
	}

	return 0;
}

static int live_engine_timestamps(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/*
	 * Check that CS_TIMESTAMP / CTX_TIMESTAMP are in sync, i.e. share
	 * the same CS clock.
	 */

	if (GRAPHICS_VER(gt->i915) < 8)
		return 0;

	for_each_engine(engine, gt, id) {
		int err;

		st_engine_heartbeat_disable(engine);
		err = __live_engine_timestamps(engine);
		st_engine_heartbeat_enable(engine);
		if (err)
			return err;
	}

	return 0;
}

static int __spin_until_busier(struct intel_engine_cs *engine, ktime_t busyness)
{
	ktime_t start, unused, dt;

	if (!intel_engine_uses_guc(engine))
		return 0;

	/*
	 * In GuC mode of submission, the busyness stats may get updated after
	 * the batch starts running. Poll for a change in busyness and time out
	 * after 10ms.
	 */
	start = ktime_get();
	while (intel_engine_get_busy_time(engine, &unused) == busyness) {
		dt = ktime_get() - start;
		if (dt > 10000000) {
			pr_err("active wait timed out %lld\n", dt);
			ENGINE_TRACE(engine, "active wait timed out %lld\n", dt);
			return -ETIME;
		}
	}

	return 0;
}

static int live_engine_busy_stats(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct igt_spinner spin;
	int err = 0;

	/*
	 * Check that if an engine supports busy-stats, they tell the truth.
	 */

	if (igt_spinner_init(&spin, gt))
		return -ENOMEM;

	GEM_BUG_ON(intel_gt_pm_is_awake(gt));
	for_each_engine(engine, gt, id) {
		struct i915_request *rq;
		ktime_t busyness, dummy;
		ktime_t de, dt;
		ktime_t t[2];

		if (!intel_engine_supports_stats(engine))
			continue;

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (intel_gt_pm_wait_for_idle(gt)) {
			err = -EBUSY;
			break;
		}

		st_engine_heartbeat_disable(engine);

		ENGINE_TRACE(engine, "measuring idle time\n");
		preempt_disable();
		de = intel_engine_get_busy_time(engine, &t[0]);
		udelay(100);
		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
		preempt_enable();
		dt = ktime_sub(t[1], t[0]);
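		/* An idle engine should report (almost) no busyness over the sample. */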
		if (de < 0 || de > 10) {
			pr_err("%s: reported %lldns [%d%%] busyness while sleeping [for %lldns]\n",
			       engine->name,
			       de, (int)div64_u64(100 * de, dt), dt);
			GEM_TRACE_DUMP();
			err = -EINVAL;
			goto end;
		}

		/* 100% busy */
		rq = igt_spinner_create_request(&spin,
						engine->kernel_context,
						MI_NOOP);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto end;
		}
		i915_request_add(rq);

		busyness = intel_engine_get_busy_time(engine, &dummy);
		if (!igt_wait_for_spinner(&spin, rq)) {
			intel_gt_set_wedged(engine->gt);
			err = -ETIME;
			goto end;
		}

		err = __spin_until_busier(engine, busyness);
		if (err) {
			GEM_TRACE_DUMP();
			goto end;
		}

		ENGINE_TRACE(engine, "measuring busy time\n");
		preempt_disable();
		de = intel_engine_get_busy_time(engine, &t[0]);
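		/*
		 * Sample busyness over 100ms: per the commit message, a 10ms
		 * window is vulnerable to the latency of the GuC 64-bit
		 * timestamp register read (900us-1.5ms observed on DG1).
		 */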
		mdelay(100);
		de = ktime_sub(intel_engine_get_busy_time(engine, &t[1]), de);
		preempt_enable();
		dt = ktime_sub(t[1], t[0]);
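		/* While spinning, busyness must match walltime to within ~5%. */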
		if (100 * de < 95 * dt || 95 * de > 100 * dt) {
			pr_err("%s: reported %lldns [%d%%] busyness while spinning [for %lldns]\n",
			       engine->name,
			       de, (int)div64_u64(100 * de, dt), dt);
			GEM_TRACE_DUMP();
			err = -EINVAL;
			goto end;
		}

end:
		st_engine_heartbeat_enable(engine);
		igt_spinner_end(&spin);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			break;
	}

	igt_spinner_fini(&spin);
	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int live_engine_pm(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/*
	 * Check we can call intel_engine_pm_put from any context. No
	 * failures are reported directly, but if we mess up lockdep should
	 * tell us.
	 */
	if (intel_gt_pm_wait_for_idle(gt)) {
		pr_err("Unable to flush GT pm before test\n");
		return -EBUSY;
	}

	GEM_BUG_ON(intel_gt_pm_is_awake(gt));
	for_each_engine(engine, gt, id) {
		const typeof(*igt_atomic_phases) *p;

		for (p = igt_atomic_phases; p->name; p++) {
			/*
			 * Acquisition is always synchronous, except if we
			 * know that the engine is already awake, in which
			 * case we should use intel_engine_pm_get_if_awake()
			 * to atomically grab the wakeref.
			 *
			 * In practice,
			 *    intel_engine_pm_get();
			 *    intel_engine_pm_put();
			 * occurs in one thread, while simultaneously
			 *    intel_engine_pm_get_if_awake();
			 *    intel_engine_pm_put();
			 * occurs from atomic context in another.
			 */
			GEM_BUG_ON(intel_engine_pm_is_awake(engine));
			intel_engine_pm_get(engine);

			p->critical_section_begin();
			if (!intel_engine_pm_get_if_awake(engine))
				pr_err("intel_engine_pm_get_if_awake(%s) failed under %s\n",
				       engine->name, p->name);
			else
				intel_engine_pm_put_async(engine);
			intel_engine_pm_put_async(engine);
			p->critical_section_end();

			intel_engine_pm_flush(engine);

			if (intel_engine_pm_is_awake(engine)) {
				pr_err("%s is still awake after flushing pm\n",
				       engine->name);
				return -EINVAL;
			}

			/* gt wakeref is async (deferred to workqueue) */
			if (intel_gt_pm_wait_for_idle(gt)) {
				pr_err("GT failed to idle\n");
				return -EINVAL;
			}
		}
	}

	return 0;
}

int live_engine_pm_selftests(struct intel_gt *gt)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_engine_timestamps),
		SUBTEST(live_engine_busy_stats),
		SUBTEST(live_engine_pm),
	};

	return intel_gt_live_subtests(tests, gt);
}