drm/amdgpu: Add a new flag to AMDGPU_CTX_OP_QUERY_STATE2
Add AMDGPU_CTX_QUERY2_FLAGS_RAS_CE/UE, which indicate whether any error occurred between the previous query and this query. Signed-off-by: xinhui pan <xinhui.pan@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
791c47694f
commit
ae363a212b
3 changed files with 22 additions and 0 deletions
|
@ -26,6 +26,7 @@
|
||||||
#include <drm/drm_auth.h>
|
#include <drm/drm_auth.h>
|
||||||
#include "amdgpu.h"
|
#include "amdgpu.h"
|
||||||
#include "amdgpu_sched.h"
|
#include "amdgpu_sched.h"
|
||||||
|
#include "amdgpu_ras.h"
|
||||||
|
|
||||||
#define to_amdgpu_ctx_entity(e) \
|
#define to_amdgpu_ctx_entity(e) \
|
||||||
container_of((e), struct amdgpu_ctx_entity, entity)
|
container_of((e), struct amdgpu_ctx_entity, entity)
|
||||||
|
@ -344,6 +345,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
|
||||||
{
|
{
|
||||||
struct amdgpu_ctx *ctx;
|
struct amdgpu_ctx *ctx;
|
||||||
struct amdgpu_ctx_mgr *mgr;
|
struct amdgpu_ctx_mgr *mgr;
|
||||||
|
uint32_t ras_counter;
|
||||||
|
|
||||||
if (!fpriv)
|
if (!fpriv)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
|
@ -368,6 +370,21 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
|
||||||
if (atomic_read(&ctx->guilty))
|
if (atomic_read(&ctx->guilty))
|
||||||
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
|
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
|
||||||
|
|
||||||
|
/*query ue count*/
|
||||||
|
ras_counter = amdgpu_ras_query_error_count(adev, false);
|
||||||
|
/* the ras counter is monotonically increasing */
|
||||||
|
if (ras_counter != ctx->ras_counter_ue) {
|
||||||
|
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
|
||||||
|
ctx->ras_counter_ue = ras_counter;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*query ce count*/
|
||||||
|
ras_counter = amdgpu_ras_query_error_count(adev, true);
|
||||||
|
if (ras_counter != ctx->ras_counter_ce) {
|
||||||
|
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
|
||||||
|
ctx->ras_counter_ce = ras_counter;
|
||||||
|
}
|
||||||
|
|
||||||
mutex_unlock(&mgr->lock);
|
mutex_unlock(&mgr->lock);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,6 +49,8 @@ struct amdgpu_ctx {
|
||||||
enum drm_sched_priority override_priority;
|
enum drm_sched_priority override_priority;
|
||||||
struct mutex lock;
|
struct mutex lock;
|
||||||
atomic_t guilty;
|
atomic_t guilty;
|
||||||
|
uint32_t ras_counter_ce;
|
||||||
|
uint32_t ras_counter_ue;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct amdgpu_ctx_mgr {
|
struct amdgpu_ctx_mgr {
|
||||||
|
|
|
@ -210,6 +210,9 @@ union drm_amdgpu_bo_list {
|
||||||
#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1)
|
#define AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST (1<<1)
|
||||||
/* indicates that some job from this context once caused a GPU hang */
|
/* indicates that some job from this context once caused a GPU hang */
|
||||||
#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY (1<<2)
|
#define AMDGPU_CTX_QUERY2_FLAGS_GUILTY (1<<2)
|
||||||
|
/* indicates that some errors have been detected by RAS */
|
||||||
|
#define AMDGPU_CTX_QUERY2_FLAGS_RAS_CE (1<<3)
|
||||||
|
#define AMDGPU_CTX_QUERY2_FLAGS_RAS_UE (1<<4)
|
||||||
|
|
||||||
/* Context priority level */
|
/* Context priority level */
|
||||||
#define AMDGPU_CTX_PRIORITY_UNSET -2048
|
#define AMDGPU_CTX_PRIORITY_UNSET -2048
|
||||||
|
|
Loading…
Add table
Reference in a new issue