Skip to content

Commit

Permalink
drm/vc4: Expose performance counters to userspace
Browse files Browse the repository at this point in the history
The V3D engine has various hardware counters which might be interesting
to userspace performance analysis tools.

Expose new ioctls to create/destroy a performance monitor object and
query the counter values of this performance monitor.

Note that a performance monitor is given an ID that is only valid on the
file descriptor it has been allocated from. A performance monitor can be
attached to a CL submission and the driver will enable HW counters for
this request and update the performance monitor values at the end of the
job.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Eric Anholt <eric@anholt.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20180112090926.12538-1-boris.brezillon@free-electrons.com
  • Loading branch information
Boris Brezillon authored and anholt committed Feb 10, 2018
1 parent 9c950e4 commit 65101d8
Show file tree
Hide file tree
Showing 9 changed files with 474 additions and 72 deletions.
1 change: 1 addition & 0 deletions drivers/gpu/drm/vc4/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ vc4-y := \
vc4_vec.o \
vc4_hvs.o \
vc4_irq.o \
vc4_perfmon.o \
vc4_plane.o \
vc4_render_cl.o \
vc4_trace_points.o \
Expand Down
26 changes: 26 additions & 0 deletions drivers/gpu/drm/vc4/vc4_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
case DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER:
case DRM_VC4_PARAM_SUPPORTS_MADVISE:
case DRM_VC4_PARAM_SUPPORTS_PERFMON:
args->value = true;
break;
default:
Expand All @@ -111,6 +112,26 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
return 0;
}

static int vc4_open(struct drm_device *dev, struct drm_file *file)
{
struct vc4_file *vc4file;

vc4file = kzalloc(sizeof(*vc4file), GFP_KERNEL);
if (!vc4file)
return -ENOMEM;

vc4_perfmon_open_file(vc4file);
file->driver_priv = vc4file;
return 0;
}

/* drm_driver.postclose hook: tears down the perfmon state attached to the DRM
 * file and releases the per-file private data allocated by vc4_open().
 */
static void vc4_close(struct drm_device *dev, struct drm_file *file)
{
	struct vc4_file *vc4file = file->driver_priv;

	vc4_perfmon_close_file(vc4file);

	/* vc4_open() kzalloc()ed this structure and nothing else owns it, so
	 * it has to be freed here or every open/close cycle leaks it.
	 */
	kfree(vc4file);
}

static const struct vm_operations_struct vc4_vm_ops = {
.fault = vc4_fault,
.open = drm_gem_vm_open,
Expand Down Expand Up @@ -143,6 +164,9 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VC4_PERFMON_CREATE, vc4_perfmon_create_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VC4_PERFMON_DESTROY, vc4_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(VC4_PERFMON_GET_VALUES, vc4_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
};

static struct drm_driver vc4_drm_driver = {
Expand All @@ -153,6 +177,8 @@ static struct drm_driver vc4_drm_driver = {
DRIVER_RENDER |
DRIVER_PRIME),
.lastclose = drm_fb_helper_lastclose,
.open = vc4_open,
.postclose = vc4_close,
.irq_handler = vc4_irq,
.irq_preinstall = vc4_irq_preinstall,
.irq_postinstall = vc4_irq_postinstall,
Expand Down
68 changes: 68 additions & 0 deletions drivers/gpu/drm/vc4/vc4_drv.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <drm/drm_encoder.h>
#include <drm/drm_gem_cma_helper.h>

#include "uapi/drm/vc4_drm.h"

/* Don't forget to update vc4_bo.c: bo_type_names[] when adding to
* this.
*/
Expand All @@ -29,6 +31,36 @@ enum vc4_kernel_bo_type {
VC4_BO_TYPE_COUNT
};

/* Performance monitor object. The perfmon lifetime is controlled by userspace
* using perfmon related ioctls. A perfmon can be attached to a submit_cl
* request, and when this is the case, HW perf counters will be activated just
* before the submit_cl is submitted to the GPU and disabled when the job is
* done. This way, only events related to a specific job will be counted.
*/
struct vc4_perfmon {
	/* Tracks the number of users of the perfmon, when this counter reaches
	 * zero the perfmon is destroyed.
	 */
	refcount_t refcnt;

	/* Number of counters activated in this perfmon instance. Must not
	 * exceed DRM_VC4_MAX_PERF_COUNTERS, which is the capacity of events[].
	 */
	u8 ncounters;

	/* Events counted by the HW perf counters. */
	u8 events[DRM_VC4_MAX_PERF_COUNTERS];

	/* Storage for counter values. Counters are incremented by the HW
	 * perf counter values every time the perfmon is attached to a GPU job.
	 * This way, perfmon users don't have to retrieve the results after
	 * each job if they want to track events covering several submissions.
	 * Note that counter values can't be reset, but you can fake a reset by
	 * destroying the perfmon and creating a new one.
	 *
	 * Declared as a C99 flexible array member (rather than a zero-length
	 * array) so the storage is explicitly the trailing, separately sized
	 * part of the allocation. NOTE(review): presumably sized for
	 * ncounters entries by the allocator in vc4_perfmon.c -- confirm.
	 */
	u64 counters[];
};

struct vc4_dev {
struct drm_device *dev;

Expand Down Expand Up @@ -121,6 +153,11 @@ struct vc4_dev {
wait_queue_head_t job_wait_queue;
struct work_struct job_done_work;

/* Used to track the active perfmon if any. Access to this field is
* protected by job_lock.
*/
struct vc4_perfmon *active_perfmon;

/* List of struct vc4_seqno_cb for callbacks to be made from a
* workqueue when the given seqno is passed.
*/
Expand Down Expand Up @@ -406,6 +443,21 @@ struct vc4_exec_info {
void *uniforms_v;
uint32_t uniforms_p;
uint32_t uniforms_size;

/* Pointer to a performance monitor object if the user requested it,
* NULL otherwise.
*/
struct vc4_perfmon *perfmon;
};

/* Per-open file private data. Any driver-specific resource that has to be
* released when the DRM file is closed should be placed here.
*/
struct vc4_file {
/* Performance monitor bookkeeping private to this open file. */
struct {
/* NOTE(review): presumably maps the perfmon IDs handed out to
 * userspace to struct vc4_perfmon objects (vc4_perfmon_find() takes
 * a vc4_file and an id) -- confirm in vc4_perfmon.c.
 */
struct idr idr;
/* NOTE(review): presumably serializes accesses to idr -- confirm in
 * vc4_perfmon.c.
 */
struct mutex lock;
} perfmon;
};

static inline struct vc4_exec_info *
Expand Down Expand Up @@ -646,3 +698,19 @@ bool vc4_check_tex_size(struct vc4_exec_info *exec,
/* vc4_validate_shader.c */
struct vc4_validated_shader_info *
vc4_validate_shader(struct drm_gem_cma_object *shader_obj);

/* vc4_perfmon.c */
/* Reference counting on a perfmon (see struct vc4_perfmon.refcnt).
 * vc4_complete_exec() calls vc4_perfmon_put() on exec->perfmon without
 * checking it, and exec->perfmon is NULL when no perfmon was requested.
 * NOTE(review): put() therefore presumably tolerates NULL -- confirm.
 */
void vc4_perfmon_get(struct vc4_perfmon *perfmon);
void vc4_perfmon_put(struct vc4_perfmon *perfmon);
/* Start/stop HW counting for a perfmon. vc4_submit_next_bin_job() calls
 * start() when the job's perfmon differs from vc4->active_perfmon. stop() is
 * called with capture=true when the last render job using the perfmon
 * finishes and with capture=false when a bin job is cancelled.
 * NOTE(review): capture presumably selects whether the HW counter values are
 * accumulated into perfmon->counters -- confirm.
 */
void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon);
void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
bool capture);
/* Looks up the perfmon registered under id for this file. The submit_cl ioctl
 * treats a NULL return as -ENOENT. NOTE(review): the returned perfmon is
 * presumably referenced on behalf of the caller (it is released with
 * vc4_perfmon_put() in vc4_complete_exec()) -- confirm.
 */
struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id);
/* Per-file perfmon setup and teardown, called from vc4_open() and vc4_close()
 * respectively.
 */
void vc4_perfmon_open_file(struct vc4_file *vc4file);
void vc4_perfmon_close_file(struct vc4_file *vc4file);
/* Handlers for the VC4_PERFMON_CREATE, VC4_PERFMON_DESTROY and
 * VC4_PERFMON_GET_VALUES ioctls registered in vc4_drm_ioctls[].
 */
int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
48 changes: 43 additions & 5 deletions drivers/gpu/drm/vc4/vc4_gem.c
Original file line number Diff line number Diff line change
Expand Up @@ -454,14 +454,30 @@ vc4_submit_next_bin_job(struct drm_device *dev)

vc4_flush_caches(dev);

/* Only start the perfmon if it was not already started by a previous
* job.
*/
if (exec->perfmon && vc4->active_perfmon != exec->perfmon)
vc4_perfmon_start(vc4, exec->perfmon);

/* Either put the job in the binner if it uses the binner, or
* immediately move it to the to-be-rendered queue.
*/
if (exec->ct0ca != exec->ct0ea) {
submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
} else {
struct vc4_exec_info *next;

vc4_move_job_to_render(dev, exec);
goto again;
next = vc4_first_bin_job(vc4);

/* We can't start the next bin job if the previous job had a
* different perfmon instance attached to it. The same goes
* if one of them had a perfmon attached to it and the other
* one doesn't.
*/
if (next && next->perfmon == exec->perfmon)
goto again;
}
}

Expand Down Expand Up @@ -621,6 +637,7 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
struct ww_acquire_ctx *acquire_ctx)
{
struct vc4_dev *vc4 = to_vc4_dev(dev);
struct vc4_exec_info *renderjob;
uint64_t seqno;
unsigned long irqflags;
struct vc4_fence *fence;
Expand All @@ -646,11 +663,14 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,

list_add_tail(&exec->head, &vc4->bin_job_list);

/* If no job was executing, kick ours off. Otherwise, it'll
* get started when the previous job's flush done interrupt
* occurs.
/* If no bin job was executing and if the render job (if any) has the
* same perfmon as our job attached to it (or if both jobs don't have
* perfmon activated), then kick ours off. Otherwise, it'll get
* started when the previous job's flush/render done interrupt occurs.
*/
if (vc4_first_bin_job(vc4) == exec) {
renderjob = vc4_first_render_job(vc4);
if (vc4_first_bin_job(vc4) == exec &&
(!renderjob || renderjob->perfmon == exec->perfmon)) {
vc4_submit_next_bin_job(dev);
vc4_queue_hangcheck(dev);
}
Expand Down Expand Up @@ -915,6 +935,9 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
vc4->bin_alloc_used &= ~exec->bin_slots;
spin_unlock_irqrestore(&vc4->job_lock, irqflags);

/* Release the reference we had on the perf monitor. */
vc4_perfmon_put(exec->perfmon);

mutex_lock(&vc4->power_lock);
if (--vc4->power_refcount == 0) {
pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
Expand Down Expand Up @@ -1067,6 +1090,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv)
{
struct vc4_dev *vc4 = to_vc4_dev(dev);
struct vc4_file *vc4file = file_priv->driver_priv;
struct drm_vc4_submit_cl *args = data;
struct vc4_exec_info *exec;
struct ww_acquire_ctx acquire_ctx;
Expand All @@ -1080,6 +1104,11 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
return -EINVAL;
}

if (args->pad2 != 0) {
DRM_DEBUG("->pad2 must be set to zero\n");
return -EINVAL;
}

exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
if (!exec) {
DRM_ERROR("malloc failure on exec struct\n");
Expand All @@ -1105,6 +1134,15 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
if (ret)
goto fail;

if (args->perfmonid) {
exec->perfmon = vc4_perfmon_find(vc4file,
args->perfmonid);
if (!exec->perfmon) {
ret = -ENOENT;
goto fail;
}
}

if (exec->args->bin_cl_size != 0) {
ret = vc4_get_bcl(dev, exec);
if (ret)
Expand Down
40 changes: 37 additions & 3 deletions drivers/gpu/drm/vc4/vc4_irq.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,20 @@ static void
vc4_irq_finish_bin_job(struct drm_device *dev)
{
	/* Hands the bin job at the head of the bin list over to the render
	 * stage and, when the perfmon constraints allow it, kicks off the
	 * next bin job. Does nothing if there is no bin job.
	 */
	struct vc4_dev *vc4 = to_vc4_dev(dev);
	struct vc4_exec_info *next, *exec = vc4_first_bin_job(vc4);

	if (!exec)
		return;

	vc4_move_job_to_render(dev, exec);

	/* Re-read the head of the bin list: after the move above it is the
	 * job queued behind the one that just finished, if any.
	 */
	next = vc4_first_bin_job(vc4);

	/* Only submit the next job in the bin list if it matches the perfmon
	 * attached to the one that just finished (or if both jobs don't have
	 * perfmon attached to them). An unconditional submit here would
	 * bypass this check.
	 */
	if (next && next->perfmon == exec->perfmon)
		vc4_submit_next_bin_job(dev);
}

static void
Expand All @@ -122,6 +129,10 @@ vc4_cancel_bin_job(struct drm_device *dev)
if (!exec)
return;

/* Stop the perfmon so that the next bin job can be started. */
if (exec->perfmon)
vc4_perfmon_stop(vc4, exec->perfmon, false);

list_move_tail(&exec->head, &vc4->bin_job_list);
vc4_submit_next_bin_job(dev);
}
Expand All @@ -131,18 +142,41 @@ vc4_irq_finish_render_job(struct drm_device *dev)
{
struct vc4_dev *vc4 = to_vc4_dev(dev);
struct vc4_exec_info *exec = vc4_first_render_job(vc4);
struct vc4_exec_info *nextbin, *nextrender;

if (!exec)
return;

vc4->finished_seqno++;
list_move_tail(&exec->head, &vc4->job_done_list);

nextbin = vc4_first_bin_job(vc4);
nextrender = vc4_first_render_job(vc4);

/* Only stop the perfmon if following jobs in the queue don't expect it
* to be enabled.
*/
if (exec->perfmon && !nextrender &&
(!nextbin || nextbin->perfmon != exec->perfmon))
vc4_perfmon_stop(vc4, exec->perfmon, true);

/* If there's a render job waiting, start it. If this is not the case
* we may have to unblock the binner if it's been stalled because of
* perfmon (this can be checked by comparing the perfmon attached to
* the finished renderjob to the one attached to the next bin job: if
* they don't match, this means the binner is stalled and should be
* restarted).
*/
if (nextrender)
vc4_submit_next_render_job(dev);
else if (nextbin && nextbin->perfmon != exec->perfmon)
vc4_submit_next_bin_job(dev);

if (exec->fence) {
dma_fence_signal_locked(exec->fence);
dma_fence_put(exec->fence);
exec->fence = NULL;
}
vc4_submit_next_render_job(dev);

wake_up_all(&vc4->job_wait_queue);
schedule_work(&vc4->job_done_work);
Expand Down
Loading

0 comments on commit 65101d8

Please sign in to comment.