From b46a33e271ed81bd765c632b972c49d5b44729c7 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 21 Nov 2017 18:18:45 +0000 Subject: drm/i915/pmu: Expose a PMU interface for perf queries From: Chris Wilson From: Tvrtko Ursulin From: Dmitry Rogozhkin The first goal is to be able to measure GPU (and invidual ring) busyness without having to poll registers from userspace. (Which not only incurs holding the forcewake lock indefinitely, perturbing the system, but also runs the risk of hanging the machine.) As an alternative we can use the perf event counter interface to sample the ring registers periodically and send those results to userspace. Functionality we are exporting to userspace is via the existing perf PMU API and can be exercised via the existing tools. For example: perf stat -a -e i915/rcs0-busy/ -I 1000 Will print the render engine busynnes once per second. All the performance counters can be enumerated (perf list) and have their unit of measure correctly reported in sysfs. v1-v2 (Chris Wilson): v2: Use a common timer for the ring sampling. v3: (Tvrtko Ursulin) * Decouple uAPI from i915 engine ids. * Complete uAPI defines. * Refactor some code to helpers for clarity. * Skip sampling disabled engines. * Expose counters in sysfs. * Pass in fake regs to avoid null ptr deref in perf core. * Convert to class/instance uAPI. * Use shared driver code for rc6 residency, power and frequency. v4: (Dmitry Rogozhkin) * Register PMU with .task_ctx_nr=perf_invalid_context * Expose cpumask for the PMU with the single CPU in the mask * Properly support pmu->stop(): it should call pmu->read() * Properly support pmu->del(): it should call stop(event, PERF_EF_UPDATE) * Introduce refcounting of event subscriptions. * Make pmu.busy_stats a refcounter to avoid busy stats going away with some deleted event. * Expose cpumask for i915 PMU to avoid multiple events creation of the same type followed by counter aggregation by perf-stat. * Track CPUs getting online/offline to migrate perf context. If (likely) cpumask will initially set CPU0, CONFIG_BOOTPARAM_HOTPLUG_CPU0 will be needed to see effect of CPU status tracking. * End result is that only global events are supported and perf stat works correctly. * Deny perf driver level sampling - it is prohibited for uncore PMU. v5: (Tvrtko Ursulin) * Don't hardcode number of engine samplers. * Rewrite event ref-counting for correctness and simplicity. * Store initial counter value when starting already enabled events to correctly report values to all listeners. * Fix RC6 residency readout. * Comments, GPL header. v6: * Add missing entry to v4 changelog. * Fix accounting in CPU hotplug case by copying the approach from arch/x86/events/intel/cstate.c. (Dmitry Rogozhkin) v7: * Log failure message only on failure. * Remove CPU hotplug notification state on unregister. v8: * Fix error unwind on failed registration. * Checkpatch cleanup. v9: * Drop the energy metric, it is available via intel_rapl_perf. (Ville Syrjälä) * Use HAS_RC6(p). (Chris Wilson) * Handle unsupported non-engine events. (Dmitry Rogozhkin) * Rebase for intel_rc6_residency_ns needing caller managed runtime pm. * Drop HAS_RC6 checks from the read callback since creating those events will be rejected at init time already. * Add counter units to sysfs so perf stat output is nicer. * Cleanup the attribute tables for brevity and readability. v10: * Fixed queued accounting. v11: * Move intel_engine_lookup_user to intel_engine_cs.c * Commit update. (Joonas Lahtinen) v12: * More accurate sampling. (Chris Wilson) * Store and report frequency in MHz for better usability from perf stat. * Removed metrics: queued, interrupts, rc6 counters. * Sample engine busyness based on seqno difference only for less MMIO (and forcewake) on all platforms. (Chris Wilson) v13: * Comment spelling, use mul_u32_u32 to work around potential GCC issue and somne code alignment changes. (Chris Wilson) v14: * Rebase. v15: * Rebase for RPS refactoring. v16: * Use the dynamic slot in the CPU hotplug state machine so that we are free to setup our state as multi-instance. Previously we were re-using the CPUHP_AP_PERF_X86_UNCORE_ONLINE slot which is neither used as multi-instance, nor owned by our driver to start with. * Register the CPU hotplug handlers after the PMU, otherwise the callback will get called before the PMU is initialized which can end up in perf_pmu_migrate_context with an un-initialized base. * Added workaround for a probable bug in cpuhp core. v17: * Remove workaround for the cpuhp bug. v18: * Rebase for drm_i915_gem_engine_class getting upstream before us. v19: * Rebase. (trivial) Signed-off-by: Chris Wilson Signed-off-by: Tvrtko Ursulin Signed-off-by: Dmitry Rogozhkin Cc: Tvrtko Ursulin Cc: Chris Wilson Cc: Dmitry Rogozhkin Cc: Peter Zijlstra Reviewed-by: Chris Wilson Signed-off-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20171121181852.16128-2-tvrtko.ursulin@linux.intel.com --- include/uapi/drm/i915_drm.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/uapi/drm') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index b57985929553..40e7b438bdaa 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -102,6 +102,45 @@ enum drm_i915_gem_engine_class { I915_ENGINE_CLASS_INVALID = -1 }; +/** + * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915 + * + */ + +enum drm_i915_pmu_engine_sample { + I915_SAMPLE_BUSY = 0, + I915_SAMPLE_WAIT = 1, + I915_SAMPLE_SEMA = 2, + I915_ENGINE_SAMPLE_MAX /* non-ABI */ +}; + +#define I915_PMU_SAMPLE_BITS (4) +#define I915_PMU_SAMPLE_MASK (0xf) +#define I915_PMU_SAMPLE_INSTANCE_BITS (8) +#define I915_PMU_CLASS_SHIFT \ + (I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS) + +#define __I915_PMU_ENGINE(class, instance, sample) \ + ((class) << I915_PMU_CLASS_SHIFT | \ + (instance) << I915_PMU_SAMPLE_BITS | \ + (sample)) + +#define I915_PMU_ENGINE_BUSY(class, instance) \ + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY) + +#define I915_PMU_ENGINE_WAIT(class, instance) \ + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT) + +#define I915_PMU_ENGINE_SEMA(class, instance) \ + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA) + +#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) + +#define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0) +#define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) + +#define I915_PMU_LAST I915_PMU_REQUESTED_FREQUENCY + /* Each region is a minimum of 16k, and there are at most 255 of them. */ #define I915_NR_TEX_REGIONS 255 /* table size 2k - maximum due to use -- cgit v1.2.3-55-g7522 From 0cd4684d6ea9a4ffec33fc19de4dd667bb90d0a5 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 21 Nov 2017 18:18:50 +0000 Subject: drm/i915/pmu: Add interrupt count metric For clients like intel-gpu-overlay it is easier to read the count via the perf API than having to parse /proc. Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20171121181852.16128-7-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_pmu.c | 23 +++++++++++++++++++++++ include/uapi/drm/i915_drm.h | 4 +++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi/drm') diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 6a428e7218d2..fef389ebf92c 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -277,6 +277,22 @@ static enum hrtimer_restart i915_sample(struct hrtimer *hrtimer) return HRTIMER_RESTART; } +static u64 count_interrupts(struct drm_i915_private *i915) +{ + /* open-coded kstat_irqs() */ + struct irq_desc *desc = irq_to_desc(i915->drm.pdev->irq); + u64 sum = 0; + int cpu; + + if (!desc || !desc->kstat_irqs) + return 0; + + for_each_possible_cpu(cpu) + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); + + return sum; +} + static void i915_pmu_event_destroy(struct perf_event *event) { WARN_ON(event->parent); @@ -343,6 +359,8 @@ static int i915_pmu_event_init(struct perf_event *event) if (INTEL_GEN(i915) < 6) ret = -ENODEV; break; + case I915_PMU_INTERRUPTS: + break; default: ret = -ENOENT; break; @@ -392,6 +410,9 @@ static u64 __i915_pmu_event_read(struct perf_event *event) div_u64(i915->pmu.sample[__I915_SAMPLE_FREQ_REQ].cur, FREQUENCY); break; + case I915_PMU_INTERRUPTS: + val = count_interrupts(i915); + break; } } @@ -654,6 +675,8 @@ static struct attribute *i915_pmu_events_attrs[] = { I915_EVENT(actual-frequency, I915_PMU_ACTUAL_FREQUENCY, "MHz"), I915_EVENT(requested-frequency, I915_PMU_REQUESTED_FREQUENCY, "MHz"), + I915_EVENT_ATTR(interrupts, I915_PMU_INTERRUPTS), + NULL, }; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 40e7b438bdaa..d840ff083520 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -139,7 +139,9 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0) #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) -#define I915_PMU_LAST I915_PMU_REQUESTED_FREQUENCY +#define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) + +#define I915_PMU_LAST I915_PMU_INTERRUPTS /* Each region is a minimum of 16k, and there are at most 255 of them. */ -- cgit v1.2.3-55-g7522 From 6060b6aec03c76f9ce0977b70c27429d39d2956e Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 21 Nov 2017 18:18:52 +0000 Subject: drm/i915/pmu: Add RC6 residency metrics For clients like intel-gpu-overlay it is easier to read the counters via the perf API than having to parse sysfs. Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20171121181852.16128-9-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_pmu.c | 31 +++++++++++++++++++++++++++++++ include/uapi/drm/i915_drm.h | 6 +++++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'include/uapi/drm') diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index fef389ebf92c..1071935bfa67 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -361,6 +361,15 @@ static int i915_pmu_event_init(struct perf_event *event) break; case I915_PMU_INTERRUPTS: break; + case I915_PMU_RC6_RESIDENCY: + if (!HAS_RC6(i915)) + ret = -ENODEV; + break; + case I915_PMU_RC6p_RESIDENCY: + case I915_PMU_RC6pp_RESIDENCY: + if (!HAS_RC6p(i915)) + ret = -ENODEV; + break; default: ret = -ENOENT; break; @@ -413,6 +422,24 @@ static u64 __i915_pmu_event_read(struct perf_event *event) case I915_PMU_INTERRUPTS: val = count_interrupts(i915); break; + case I915_PMU_RC6_RESIDENCY: + intel_runtime_pm_get(i915); + val = intel_rc6_residency_ns(i915, + IS_VALLEYVIEW(i915) ? + VLV_GT_RENDER_RC6 : + GEN6_GT_GFX_RC6); + intel_runtime_pm_put(i915); + break; + case I915_PMU_RC6p_RESIDENCY: + intel_runtime_pm_get(i915); + val = intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6p); + intel_runtime_pm_put(i915); + break; + case I915_PMU_RC6pp_RESIDENCY: + intel_runtime_pm_get(i915); + val = intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6pp); + intel_runtime_pm_put(i915); + break; } } @@ -677,6 +704,10 @@ static struct attribute *i915_pmu_events_attrs[] = { I915_EVENT_ATTR(interrupts, I915_PMU_INTERRUPTS), + I915_EVENT(rc6-residency, I915_PMU_RC6_RESIDENCY, "ns"), + I915_EVENT(rc6p-residency, I915_PMU_RC6p_RESIDENCY, "ns"), + I915_EVENT(rc6pp-residency, I915_PMU_RC6pp_RESIDENCY, "ns"), + NULL, }; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index d840ff083520..915a6e85a855 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -141,7 +141,11 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) -#define I915_PMU_LAST I915_PMU_INTERRUPTS +#define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) +#define I915_PMU_RC6p_RESIDENCY __I915_PMU_OTHER(4) +#define I915_PMU_RC6pp_RESIDENCY __I915_PMU_OTHER(5) + +#define I915_PMU_LAST I915_PMU_RC6pp_RESIDENCY /* Each region is a minimum of 16k, and there are at most 255 of them. */ -- cgit v1.2.3-55-g7522 From b552ae444e454eb3254c958e05b69820c0ef346d Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Thu, 23 Nov 2017 10:07:01 +0000 Subject: drm/i915/pmu: Drop I915_ENGINE_SAMPLE_MAX from uapi headers We have agreed during the engine classes discussion that fields marked as non-ABI are better left out altogether from uapi headers. v2: Use a local define for maintanability. (Chris Wilson) Signed-off-by: Tvrtko Ursulin Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20171123100701.18430-1-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/intel_ringbuffer.h | 1 + include/uapi/drm/i915_drm.h | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/drm') diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 43473e6d1a4f..d38d059285dc 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -358,6 +358,7 @@ struct intel_engine_cs { * * Our internal timer stores the current counters in this field. */ +#define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_SEMA + 1) struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; /** * @busy_stats: Has enablement of engine stats tracking been diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 915a6e85a855..239e8633edc9 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -110,8 +110,7 @@ enum drm_i915_gem_engine_class { enum drm_i915_pmu_engine_sample { I915_SAMPLE_BUSY = 0, I915_SAMPLE_WAIT = 1, - I915_SAMPLE_SEMA = 2, - I915_ENGINE_SAMPLE_MAX /* non-ABI */ + I915_SAMPLE_SEMA = 2 }; #define I915_PMU_SAMPLE_BITS (4) -- cgit v1.2.3-55-g7522 From 3452fa3095e91acbcb1f6290e0d70fa7d3695a3a Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 24 Nov 2017 17:13:31 +0000 Subject: drm/i915/pmu: Aggregate all RC6 states into one counter Chris has discovered that RC6, RC6p and RC6pp counters are mutually exclusive, and even that on some SNB SKUs you get RC6p increasing, and on the others RC6. Furthermore RC6p and RC6pp were only present starting from GEN6 until, GEN7, not including Haswell. All this combined makes it questionable whether we need to reserve new ABI for these counters. One idea was to just combine them all under the RC6 counter to simplify things for userspace. So that is what this patch does. Signed-off-by: Tvrtko Ursulin Suggested-by: Chris Wilson Signed-off-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20171124171331.17981-1-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_pmu.c | 23 ++++++----------------- include/uapi/drm/i915_drm.h | 6 +----- 2 files changed, 7 insertions(+), 22 deletions(-) (limited to 'include/uapi/drm') diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 39310cf13c3a..3357b690ce90 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -359,11 +359,6 @@ static int i915_pmu_event_init(struct perf_event *event) if (!HAS_RC6(i915)) ret = -ENODEV; break; - case I915_PMU_RC6p_RESIDENCY: - case I915_PMU_RC6pp_RESIDENCY: - if (!HAS_RC6p(i915)) - ret = -ENODEV; - break; default: ret = -ENOENT; break; @@ -421,16 +416,12 @@ static u64 __i915_pmu_event_read(struct perf_event *event) IS_VALLEYVIEW(i915) ? VLV_GT_RENDER_RC6 : GEN6_GT_GFX_RC6); - intel_runtime_pm_put(i915); - break; - case I915_PMU_RC6p_RESIDENCY: - intel_runtime_pm_get(i915); - val = intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6p); - intel_runtime_pm_put(i915); - break; - case I915_PMU_RC6pp_RESIDENCY: - intel_runtime_pm_get(i915); - val = intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6pp); + if (HAS_RC6p(i915)) { + val += intel_rc6_residency_ns(i915, + GEN6_GT_GFX_RC6p); + val += intel_rc6_residency_ns(i915, + GEN6_GT_GFX_RC6pp); + } intel_runtime_pm_put(i915); break; } @@ -708,8 +699,6 @@ static struct attribute *i915_pmu_events_attrs[] = { I915_EVENT_ATTR(interrupts, I915_PMU_INTERRUPTS), I915_EVENT(rc6-residency, I915_PMU_RC6_RESIDENCY, "ns"), - I915_EVENT(rc6p-residency, I915_PMU_RC6p_RESIDENCY, "ns"), - I915_EVENT(rc6pp-residency, I915_PMU_RC6pp_RESIDENCY, "ns"), NULL, }; diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 239e8633edc9..536ee4febd74 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -137,14 +137,10 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0) #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) - #define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) - #define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) -#define I915_PMU_RC6p_RESIDENCY __I915_PMU_OTHER(4) -#define I915_PMU_RC6pp_RESIDENCY __I915_PMU_OTHER(5) -#define I915_PMU_LAST I915_PMU_RC6pp_RESIDENCY +#define I915_PMU_LAST I915_PMU_RC6_RESIDENCY /* Each region is a minimum of 16k, and there are at most 255 of them. */ -- cgit v1.2.3-55-g7522