linux/drivers/gpu/drm/i915/i915_debugfs.c

1016 lines
30 KiB
C
Raw Normal View History

/*
* Copyright © 2008 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
* Keith Packard <keithp@keithp.com>
*
*/
#include <linux/sched/mm.h>
#include <linux/sort.h>
drm: Split out drm_probe_helper.h Having the probe helper stuff (which pretty much everyone needs) in the drm_crtc_helper.h file (which atomic drivers should never need) is confusing. Split them out. To make sure I actually achieved the goal here I went through all drivers. And indeed, all atomic drivers are now free of drm_crtc_helper.h includes. v2: Make it compile. There was so much compile fail on arm drivers that I figured I'll better not include any of the acks on v1. v3: Massive rebase because i915 has lost a lot of drmP.h includes, but not all: Through drm_crtc_helper.h > drm_modeset_helper.h -> drmP.h there was still one, which this patch largely removes. Which means rolling out lots more includes all over. This will also conflict with ongoing drmP.h cleanup by others I expect. v3: Rebase on top of atomic bochs. v4: Review from Laurent for bridge/rcar/omap/shmob/core bits: - (re)move some of the added includes, use the better include files in other places (all suggested from Laurent adopted unchanged). - sort alphabetically v5: Actually try to sort them, and while at it, sort all the ones I touch. v6: Rebase onto i915 changes. v7: Rebase once more. Acked-by: Harry Wentland <harry.wentland@amd.com> Acked-by: Sam Ravnborg <sam@ravnborg.org> Cc: Sam Ravnborg <sam@ravnborg.org> Cc: Jani Nikula <jani.nikula@linux.intel.com> Cc: Laurent Pinchart <laurent.pinchart@ideasonboard.com> Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com> Acked-by: Benjamin Gaignard <benjamin.gaignard@linaro.org> Acked-by: Jani Nikula <jani.nikula@intel.com> Acked-by: Neil Armstrong <narmstrong@baylibre.com> Acked-by: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> Acked-by: CK Hu <ck.hu@mediatek.com> Acked-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Sam Ravnborg <sam@ravnborg.org> Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com> Acked-by: Liviu Dudau <liviu.dudau@arm.com> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com> Cc: linux-arm-kernel@lists.infradead.org Cc: virtualization@lists.linux-foundation.org Cc: etnaviv@lists.freedesktop.org Cc: linux-samsung-soc@vger.kernel.org Cc: intel-gfx@lists.freedesktop.org Cc: linux-mediatek@lists.infradead.org Cc: linux-amlogic@lists.infradead.org Cc: linux-arm-msm@vger.kernel.org Cc: freedreno@lists.freedesktop.org Cc: nouveau@lists.freedesktop.org Cc: spice-devel@lists.freedesktop.org Cc: amd-gfx@lists.freedesktop.org Cc: linux-renesas-soc@vger.kernel.org Cc: linux-rockchip@lists.infradead.org Cc: linux-stm32@st-md-mailman.stormreply.com Cc: linux-tegra@vger.kernel.org Cc: xen-devel@lists.xen.org Link: https://patchwork.freedesktop.org/patch/msgid/20190117210334.13234-1-daniel.vetter@ffwll.ch
2019-01-18 05:03:34 +08:00
#include <drm/drm_debugfs.h>
#include "gem/i915_gem_context.h"
#include "gt/intel_gt_buffer_pool.h"
#include "gt/intel_gt_clock_utils.h"
#include "gt/intel_gt.h"
drm/i915: Defer final intel_wakeref_put to process context As we need to acquire a mutex to serialise the final intel_wakeref_put, we need to ensure that we are in process context at that time. However, we want to allow operation on the intel_wakeref from inside timer and other hardirq context, which means that need to defer that final put to a workqueue. Inside the final wakeref puts, we are safe to operate in any context, as we are simply marking up the HW and state tracking for the potential sleep. It's only the serialisation with the potential sleeping getting that requires careful wait avoidance. This allows us to retain the immediate processing as before (we only need to sleep over the same races as the current mutex_lock). v2: Add a selftest to ensure we exercise the code while lockdep watches. v3: That test was extremely loud and complained about many things! v4: Not a whale! Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=111295 References: https://bugs.freedesktop.org/show_bug.cgi?id=111245 References: https://bugs.freedesktop.org/show_bug.cgi?id=111256 Fixes: 18398904ca9e ("drm/i915: Only recover active engines") Fixes: 51fbd8de87dc ("drm/i915/pmu: Atomically acquire the gt_pm wakeref") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190808202758.10453-1-chris@chris-wilson.co.uk
2019-08-09 04:27:58 +08:00
#include "gt/intel_gt_pm.h"
#include "gt/intel_gt_requests.h"
#include "gt/intel_reset.h"
#include "gt/intel_rc6.h"
#include "gt/intel_rps.h"
#include "gt/intel_sseu_debugfs.h"
#include "i915_debugfs.h"
#include "i915_debugfs_params.h"
#include "i915_irq.h"
#include "i915_scheduler.h"
#include "i915_trace.h"
#include "intel_pm.h"
#include "intel_sideband.h"
static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
{
return to_i915(node->minor->dev);
}
static int i915_capabilities(struct seq_file *m, void *data)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct drm_printer p = drm_seq_file_printer(m);
seq_printf(m, "pch: %d\n", INTEL_PCH_TYPE(i915));
intel_device_info_print_static(INTEL_INFO(i915), &p);
intel_device_info_print_runtime(RUNTIME_INFO(i915), &p);
intel_gt_info_print(&i915->gt.info, &p);
intel_driver_caps_print(&i915->caps, &p);
kernel_param_lock(THIS_MODULE);
i915_params_dump(&i915->params, &p);
kernel_param_unlock(THIS_MODULE);
return 0;
}
static char get_tiling_flag(struct drm_i915_gem_object *obj)
{
switch (i915_gem_object_get_tiling(obj)) {
default:
case I915_TILING_NONE: return ' ';
case I915_TILING_X: return 'X';
case I915_TILING_Y: return 'Y';
}
}
static char get_global_flag(struct drm_i915_gem_object *obj)
{
return READ_ONCE(obj->userfault_count) ? 'g' : ' ';
}
static char get_pin_mapped_flag(struct drm_i915_gem_object *obj)
{
return obj->mm.mapping ? 'M' : ' ';
}
static const char *
stringify_page_sizes(unsigned int page_sizes, char *buf, size_t len)
{
size_t x = 0;
switch (page_sizes) {
case 0:
return "";
case I915_GTT_PAGE_SIZE_4K:
return "4K";
case I915_GTT_PAGE_SIZE_64K:
return "64K";
case I915_GTT_PAGE_SIZE_2M:
return "2M";
default:
if (!buf)
return "M";
if (page_sizes & I915_GTT_PAGE_SIZE_2M)
x += snprintf(buf + x, len - x, "2M, ");
if (page_sizes & I915_GTT_PAGE_SIZE_64K)
x += snprintf(buf + x, len - x, "64K, ");
if (page_sizes & I915_GTT_PAGE_SIZE_4K)
x += snprintf(buf + x, len - x, "4K, ");
buf[x-2] = '\0';
return buf;
}
}
void
i915_debugfs_describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
{
drm/i915: Implement inter-engine read-read optimisations Currently, we only track the last request globally across all engines. This prevents us from issuing concurrent read requests on e.g. the RCS and BCS engines (or more likely the render and media engines). Without semaphores, we incur costly stalls as we synchronise between rings - greatly impacting the current performance of Broadwell versus Haswell in certain workloads (like video decode). With the introduction of reference counted requests, it is much easier to track the last request per ring, as well as the last global write request so that we can optimise inter-engine read read requests (as well as better optimise certain CPU waits). v2: Fix inverted readonly condition for nonblocking waits. v3: Handle non-continguous engine array after waits v4: Rebase, tidy, rewrite ring list debugging v5: Use obj->active as a bitfield, it looks cool v6: Micro-optimise, mostly involving moving code around v7: Fix retire-requests-upto for execlists (and multiple rq->ringbuf) v8: Rebase v9: Refactor i915_gem_object_sync() to allow the compiler to better optimise it. Benchmark: igt/gem_read_read_speed hsw:gt3e (with semaphores): Before: Time to read-read 1024k: 275.794µs After: Time to read-read 1024k: 123.260µs hsw:gt3e (w/o semaphores): Before: Time to read-read 1024k: 230.433µs After: Time to read-read 1024k: 124.593µs bdw-u (w/o semaphores): Before After Time to read-read 1x1: 26.274µs 10.350µs Time to read-read 128x128: 40.097µs 21.366µs Time to read-read 256x256: 77.087µs 42.608µs Time to read-read 512x512: 281.999µs 181.155µs Time to read-read 1024x1024: 1196.141µs 1118.223µs Time to read-read 2048x2048: 5639.072µs 5225.837µs Time to read-read 4096x4096: 22401.662µs 21137.067µs Time to read-read 8192x8192: 89617.735µs 85637.681µs Testcase: igt/gem_concurrent_blit (read-read and friends) Cc: Lionel Landwerlin <lionel.g.landwerlin@linux.intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> [v8] [danvet: s/\<rq\>/req/g] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2015-04-27 20:41:17 +08:00
struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
struct intel_engine_cs *engine;
struct i915_vma *vma;
int pin_count = 0;
drm/i915: Replace obj->pin_global with obj->frontbuffer obj->pin_global was originally used as a means to keep the shrinker off the active scanout, but we use the vma->pin_count itself for that and the obj->frontbuffer to delay shrinking active framebuffers. The other role that obj->pin_global gained was for spotting display objects inside GEM and working harder to keep those coherent; for which we can again simply inspect obj->frontbuffer directly. Coming up next, we will want to manipulate the pin_global counter outside of the principle locks, so would need to make pin_global atomic. However, since obj->frontbuffer is already managed atomically, it makes sense to use that the primary key for display objects instead of having pin_global. Ville pointed out the principle difference is that obj->frontbuffer is set for as long as an intel_framebuffer is attached to an object, but obj->pin_global was only raised for as long as the object was active. In practice, this means that we consider the object as being on the scanout for longer than is strictly required, causing us to be more proactive in flushing -- though it should be true that we would have flushed eventually when the back became the front, except that on the flip path that flush is async but when hit from another ioctl it will be synchronous. v2: i915_gem_object_is_framebuffer() Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190902040303.14195-5-chris@chris-wilson.co.uk
2019-09-02 12:02:47 +08:00
seq_printf(m, "%pK: %c%c%c %8zdKiB %02x %02x %s%s%s",
&obj->base,
get_tiling_flag(obj),
get_global_flag(obj),
get_pin_mapped_flag(obj),
obj->base.size / 1024,
obj->read_domains,
obj->write_domain,
i915_cache_level_str(dev_priv, obj->cache_level),
obj->mm.dirty ? " dirty" : "",
obj->mm.madv == I915_MADV_DONTNEED ? " purgeable" : "");
if (obj->base.name)
seq_printf(m, " (name: %d)", obj->base.name);
spin_lock(&obj->vma.lock);
list_for_each_entry(vma, &obj->vma.list, obj_link) {
if (!drm_mm_node_allocated(&vma->node))
continue;
spin_unlock(&obj->vma.lock);
if (i915_vma_is_pinned(vma))
pin_count++;
seq_printf(m, " (%sgtt offset: %08llx, size: %08llx, pages: %s",
i915_vma_is_ggtt(vma) ? "g" : "pp",
vma->node.start, vma->node.size,
stringify_page_sizes(vma->page_sizes.gtt, NULL, 0));
if (i915_vma_is_ggtt(vma)) {
switch (vma->ggtt_view.type) {
case I915_GGTT_VIEW_NORMAL:
seq_puts(m, ", normal");
break;
case I915_GGTT_VIEW_PARTIAL:
seq_printf(m, ", partial [%08llx+%x]",
vma->ggtt_view.partial.offset << PAGE_SHIFT,
vma->ggtt_view.partial.size << PAGE_SHIFT);
break;
case I915_GGTT_VIEW_ROTATED:
seq_printf(m, ", rotated [(%ux%u, stride=%u, offset=%u), (%ux%u, stride=%u, offset=%u)]",
vma->ggtt_view.rotated.plane[0].width,
vma->ggtt_view.rotated.plane[0].height,
vma->ggtt_view.rotated.plane[0].stride,
vma->ggtt_view.rotated.plane[0].offset,
vma->ggtt_view.rotated.plane[1].width,
vma->ggtt_view.rotated.plane[1].height,
vma->ggtt_view.rotated.plane[1].stride,
vma->ggtt_view.rotated.plane[1].offset);
break;
case I915_GGTT_VIEW_REMAPPED:
seq_printf(m, ", remapped [(%ux%u, stride=%u, offset=%u), (%ux%u, stride=%u, offset=%u)]",
vma->ggtt_view.remapped.plane[0].width,
vma->ggtt_view.remapped.plane[0].height,
vma->ggtt_view.remapped.plane[0].stride,
vma->ggtt_view.remapped.plane[0].offset,
vma->ggtt_view.remapped.plane[1].width,
vma->ggtt_view.remapped.plane[1].height,
vma->ggtt_view.remapped.plane[1].stride,
vma->ggtt_view.remapped.plane[1].offset);
break;
default:
MISSING_CASE(vma->ggtt_view.type);
break;
}
}
if (vma->fence)
seq_printf(m, " , fence: %d", vma->fence->id);
seq_puts(m, ")");
spin_lock(&obj->vma.lock);
}
spin_unlock(&obj->vma.lock);
seq_printf(m, " (pinned x %d)", pin_count);
if (i915_gem_object_is_stolen(obj))
seq_printf(m, " (stolen: %08llx)", obj->stolen->start);
drm/i915: Replace obj->pin_global with obj->frontbuffer obj->pin_global was originally used as a means to keep the shrinker off the active scanout, but we use the vma->pin_count itself for that and the obj->frontbuffer to delay shrinking active framebuffers. The other role that obj->pin_global gained was for spotting display objects inside GEM and working harder to keep those coherent; for which we can again simply inspect obj->frontbuffer directly. Coming up next, we will want to manipulate the pin_global counter outside of the principle locks, so would need to make pin_global atomic. However, since obj->frontbuffer is already managed atomically, it makes sense to use that the primary key for display objects instead of having pin_global. Ville pointed out the principle difference is that obj->frontbuffer is set for as long as an intel_framebuffer is attached to an object, but obj->pin_global was only raised for as long as the object was active. In practice, this means that we consider the object as being on the scanout for longer than is strictly required, causing us to be more proactive in flushing -- though it should be true that we would have flushed eventually when the back became the front, except that on the flip path that flush is async but when hit from another ioctl it will be synchronous. v2: i915_gem_object_is_framebuffer() Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190902040303.14195-5-chris@chris-wilson.co.uk
2019-09-02 12:02:47 +08:00
if (i915_gem_object_is_framebuffer(obj))
seq_printf(m, " (fb)");
drm/i915: Move GEM activity tracking into a common struct reservation_object In preparation to support many distinct timelines, we need to expand the activity tracking on the GEM object to handle more than just a request per engine. We already use the struct reservation_object on the dma-buf to handle many fence contexts, so integrating that into the GEM object itself is the preferred solution. (For example, we can now share the same reservation_object between every consumer/producer using this buffer and skip the manual import/export via dma-buf.) v2: Reimplement busy-ioctl (by walking the reservation object), postpone the ABI change for another day. Similarly use the reservation object to find the last_write request (if active and from i915) for choosing display CS flips. Caveats: * busy-ioctl: busy-ioctl only reports on the native fences, it will not warn of stalls (in set-domain-ioctl, pread/pwrite etc) if the object is being rendered to by external fences. It also will not report the same busy state as wait-ioctl (or polling on the dma-buf) in the same circumstances. On the plus side, it does retain reporting of which *i915* engines are engaged with this object. * non-blocking atomic modesets take a step backwards as the wait for render completion blocks the ioctl. This is fixed in a subsequent patch to use a fence instead for awaiting on the rendering, see "drm/i915: Restore nonblocking awaits for modesetting" * dynamic array manipulation for shared-fences in reservation is slower than the previous lockless static assignment (e.g. gem_exec_lut_handle runtime on ivb goes from 42s to 66s), mainly due to atomic operations (maintaining the fence refcounts). * loss of object-level retirement callbacks, emulated by VMA retirement tracking. * minor loss of object-level last activity information from debugfs, could be replaced with per-vma information if desired Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-21-chris@chris-wilson.co.uk
2016-10-28 20:58:44 +08:00
engine = i915_gem_object_last_write_engine(obj);
if (engine)
seq_printf(m, " (%s)", engine->name);
}
static int i915_gem_object_info(struct seq_file *m, void *data)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_memory_region *mr;
enum intel_region_id id;
seq_printf(m, "%u shrinkable [%u free] objects, %llu bytes\n",
i915->mm.shrink_count,
atomic_read(&i915->mm.free_count),
i915->mm.shrink_memory);
for_each_memory_region(mr, i915, id)
seq_printf(m, "%s: total:%pa, available:%pa bytes\n",
mr->name, &mr->total, &mr->avail);
return 0;
}
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
static ssize_t gpu_state_read(struct file *file, char __user *ubuf,
size_t count, loff_t *pos)
{
struct i915_gpu_coredump *error;
ssize_t ret;
void *buf;
error = file->private_data;
if (!error)
return 0;
/* Bounce buffer required because of kernfs __user API convenience. */
buf = kmalloc(count, GFP_KERNEL);
if (!buf)
return -ENOMEM;
ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count);
if (ret <= 0)
goto out;
if (!copy_to_user(ubuf, buf, ret))
*pos += ret;
else
ret = -EFAULT;
out:
kfree(buf);
return ret;
}
static int gpu_state_release(struct inode *inode, struct file *file)
{
i915_gpu_coredump_put(file->private_data);
return 0;
}
static int i915_gpu_info_open(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
struct i915_gpu_coredump *gpu;
intel_wakeref_t wakeref;
gpu = NULL;
with_intel_runtime_pm(&i915->runtime_pm, wakeref)
gpu = i915_gpu_coredump(&i915->gt, ALL_ENGINES);
if (IS_ERR(gpu))
return PTR_ERR(gpu);
file->private_data = gpu;
return 0;
}
static const struct file_operations i915_gpu_info_fops = {
.owner = THIS_MODULE,
.open = i915_gpu_info_open,
.read = gpu_state_read,
.llseek = default_llseek,
.release = gpu_state_release,
};
static ssize_t
i915_error_state_write(struct file *filp,
const char __user *ubuf,
size_t cnt,
loff_t *ppos)
{
struct i915_gpu_coredump *error = filp->private_data;
if (!error)
return 0;
drm_dbg(&error->i915->drm, "Resetting error state\n");
i915_reset_error_state(error->i915);
return cnt;
}
static int i915_error_state_open(struct inode *inode, struct file *file)
{
struct i915_gpu_coredump *error;
error = i915_first_error_state(inode->i_private);
if (IS_ERR(error))
return PTR_ERR(error);
file->private_data = error;
return 0;
}
static const struct file_operations i915_error_state_fops = {
.owner = THIS_MODULE,
.open = i915_error_state_open,
.read = gpu_state_read,
.write = i915_error_state_write,
.llseek = default_llseek,
.release = gpu_state_release,
};
#endif
static int i915_frequency_info(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_uncore *uncore = &dev_priv->uncore;
struct intel_rps *rps = &dev_priv->gt.rps;
intel_wakeref_t wakeref;
wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
if (IS_GEN(dev_priv, 5)) {
u16 rgvswctl = intel_uncore_read16(uncore, MEMSWCTL);
u16 rgvstat = intel_uncore_read16(uncore, MEMSTAT_ILK);
seq_printf(m, "Requested P-state: %d\n", (rgvswctl >> 8) & 0xf);
seq_printf(m, "Requested VID: %d\n", rgvswctl & 0x3f);
seq_printf(m, "Current VID: %d\n", (rgvstat & MEMSTAT_VID_MASK) >>
MEMSTAT_VID_SHIFT);
seq_printf(m, "Current P-state: %d\n",
(rgvstat & MEMSTAT_PSTATE_MASK) >> MEMSTAT_PSTATE_SHIFT);
} else if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) {
u32 rpmodectl, freq_sts;
rpmodectl = intel_uncore_read(&dev_priv->uncore, GEN6_RP_CONTROL);
seq_printf(m, "Video Turbo Mode: %s\n",
yesno(rpmodectl & GEN6_RP_MEDIA_TURBO));
seq_printf(m, "HW control enabled: %s\n",
yesno(rpmodectl & GEN6_RP_ENABLE));
seq_printf(m, "SW control enabled: %s\n",
yesno((rpmodectl & GEN6_RP_MEDIA_MODE_MASK) ==
GEN6_RP_MEDIA_SW_MODE));
vlv_punit_get(dev_priv);
freq_sts = vlv_punit_read(dev_priv, PUNIT_REG_GPU_FREQ_STS);
vlv_punit_put(dev_priv);
seq_printf(m, "PUNIT_REG_GPU_FREQ_STS: 0x%08x\n", freq_sts);
seq_printf(m, "DDR freq: %d MHz\n", dev_priv->mem_freq);
seq_printf(m, "actual GPU freq: %d MHz\n",
intel_gpu_freq(rps, (freq_sts >> 8) & 0xff));
seq_printf(m, "current GPU freq: %d MHz\n",
intel_gpu_freq(rps, rps->cur_freq));
seq_printf(m, "max GPU freq: %d MHz\n",
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m, "min GPU freq: %d MHz\n",
intel_gpu_freq(rps, rps->min_freq));
seq_printf(m, "idle GPU freq: %d MHz\n",
intel_gpu_freq(rps, rps->idle_freq));
seq_printf(m,
"efficient (RPe) frequency: %d MHz\n",
intel_gpu_freq(rps, rps->efficient_freq));
} else if (INTEL_GEN(dev_priv) >= 6) {
u32 rp_state_limits;
u32 gt_perf_status;
u32 rp_state_cap;
u32 rpmodectl, rpinclimit, rpdeclimit;
u32 rpstat, cagf, reqf;
u32 rpupei, rpcurup, rpprevup;
u32 rpdownei, rpcurdown, rpprevdown;
u32 pm_ier, pm_imr, pm_isr, pm_iir, pm_mask;
int max_freq;
rp_state_limits = intel_uncore_read(&dev_priv->uncore, GEN6_RP_STATE_LIMITS);
if (IS_GEN9_LP(dev_priv)) {
rp_state_cap = intel_uncore_read(&dev_priv->uncore, BXT_RP_STATE_CAP);
gt_perf_status = intel_uncore_read(&dev_priv->uncore, BXT_GT_PERF_STATUS);
} else {
rp_state_cap = intel_uncore_read(&dev_priv->uncore, GEN6_RP_STATE_CAP);
gt_perf_status = intel_uncore_read(&dev_priv->uncore, GEN6_GT_PERF_STATUS);
}
/* RPSTAT1 is in the GT power well */
intel_uncore_forcewake_get(&dev_priv->uncore, FORCEWAKE_ALL);
reqf = intel_uncore_read(&dev_priv->uncore, GEN6_RPNSWREQ);
if (INTEL_GEN(dev_priv) >= 9)
reqf >>= 23;
else {
reqf &= ~GEN6_TURBO_DISABLE;
if (IS_HASWELL(dev_priv) || IS_BROADWELL(dev_priv))
reqf >>= 24;
else
reqf >>= 25;
}
reqf = intel_gpu_freq(rps, reqf);
rpmodectl = intel_uncore_read(&dev_priv->uncore, GEN6_RP_CONTROL);
rpinclimit = intel_uncore_read(&dev_priv->uncore, GEN6_RP_UP_THRESHOLD);
rpdeclimit = intel_uncore_read(&dev_priv->uncore, GEN6_RP_DOWN_THRESHOLD);
rpstat = intel_uncore_read(&dev_priv->uncore, GEN6_RPSTAT1);
rpupei = intel_uncore_read(&dev_priv->uncore, GEN6_RP_CUR_UP_EI) & GEN6_CURICONT_MASK;
rpcurup = intel_uncore_read(&dev_priv->uncore, GEN6_RP_CUR_UP) & GEN6_CURBSYTAVG_MASK;
rpprevup = intel_uncore_read(&dev_priv->uncore, GEN6_RP_PREV_UP) & GEN6_CURBSYTAVG_MASK;
rpdownei = intel_uncore_read(&dev_priv->uncore, GEN6_RP_CUR_DOWN_EI) & GEN6_CURIAVG_MASK;
rpcurdown = intel_uncore_read(&dev_priv->uncore, GEN6_RP_CUR_DOWN) & GEN6_CURBSYTAVG_MASK;
rpprevdown = intel_uncore_read(&dev_priv->uncore, GEN6_RP_PREV_DOWN) & GEN6_CURBSYTAVG_MASK;
cagf = intel_rps_read_actual_frequency(rps);
intel_uncore_forcewake_put(&dev_priv->uncore, FORCEWAKE_ALL);
if (INTEL_GEN(dev_priv) >= 11) {
pm_ier = intel_uncore_read(&dev_priv->uncore, GEN11_GPM_WGBOXPERF_INTR_ENABLE);
pm_imr = intel_uncore_read(&dev_priv->uncore, GEN11_GPM_WGBOXPERF_INTR_MASK);
/*
* The equivalent to the PM ISR & IIR cannot be read
* without affecting the current state of the system
*/
pm_isr = 0;
pm_iir = 0;
} else if (INTEL_GEN(dev_priv) >= 8) {
pm_ier = intel_uncore_read(&dev_priv->uncore, GEN8_GT_IER(2));
pm_imr = intel_uncore_read(&dev_priv->uncore, GEN8_GT_IMR(2));
pm_isr = intel_uncore_read(&dev_priv->uncore, GEN8_GT_ISR(2));
pm_iir = intel_uncore_read(&dev_priv->uncore, GEN8_GT_IIR(2));
} else {
pm_ier = intel_uncore_read(&dev_priv->uncore, GEN6_PMIER);
pm_imr = intel_uncore_read(&dev_priv->uncore, GEN6_PMIMR);
pm_isr = intel_uncore_read(&dev_priv->uncore, GEN6_PMISR);
pm_iir = intel_uncore_read(&dev_priv->uncore, GEN6_PMIIR);
}
pm_mask = intel_uncore_read(&dev_priv->uncore, GEN6_PMINTRMSK);
seq_printf(m, "Video Turbo Mode: %s\n",
yesno(rpmodectl & GEN6_RP_MEDIA_TURBO));
seq_printf(m, "HW control enabled: %s\n",
yesno(rpmodectl & GEN6_RP_ENABLE));
seq_printf(m, "SW control enabled: %s\n",
yesno((rpmodectl & GEN6_RP_MEDIA_MODE_MASK) ==
GEN6_RP_MEDIA_SW_MODE));
seq_printf(m, "PM IER=0x%08x IMR=0x%08x, MASK=0x%08x\n",
pm_ier, pm_imr, pm_mask);
if (INTEL_GEN(dev_priv) <= 10)
seq_printf(m, "PM ISR=0x%08x IIR=0x%08x\n",
pm_isr, pm_iir);
seq_printf(m, "pm_intrmsk_mbz: 0x%08x\n",
rps->pm_intrmsk_mbz);
seq_printf(m, "GT_PERF_STATUS: 0x%08x\n", gt_perf_status);
seq_printf(m, "Render p-state ratio: %d\n",
(gt_perf_status & (INTEL_GEN(dev_priv) >= 9 ? 0x1ff00 : 0xff00)) >> 8);
seq_printf(m, "Render p-state VID: %d\n",
gt_perf_status & 0xff);
seq_printf(m, "Render p-state limit: %d\n",
rp_state_limits & 0xff);
seq_printf(m, "RPSTAT1: 0x%08x\n", rpstat);
seq_printf(m, "RPMODECTL: 0x%08x\n", rpmodectl);
seq_printf(m, "RPINCLIMIT: 0x%08x\n", rpinclimit);
seq_printf(m, "RPDECLIMIT: 0x%08x\n", rpdeclimit);
seq_printf(m, "RPNSWREQ: %dMHz\n", reqf);
seq_printf(m, "CAGF: %dMHz\n", cagf);
seq_printf(m, "RP CUR UP EI: %d (%lldns)\n",
rpupei,
intel_gt_pm_interval_to_ns(&dev_priv->gt, rpupei));
seq_printf(m, "RP CUR UP: %d (%lldun)\n",
rpcurup,
intel_gt_pm_interval_to_ns(&dev_priv->gt, rpcurup));
seq_printf(m, "RP PREV UP: %d (%lldns)\n",
rpprevup,
intel_gt_pm_interval_to_ns(&dev_priv->gt, rpprevup));
drm/i915: Interactive RPS mode RPS provides a feedback loop where we use the load during the previous evaluation interval to decide whether to up or down clock the GPU frequency. Our responsiveness is split into 3 regimes, a high and low plateau with the intent to keep the gpu clocked high to cover occasional stalls under high load, and low despite occasional glitches under steady low load, and inbetween. However, we run into situations like kodi where we want to stay at low power (video decoding is done efficiently inside the fixed function HW and doesn't need high clocks even for high bitrate streams), but just occasionally the pipeline is more complex than a video decode and we need a smidgen of extra GPU power to present on time. In the high power regime, we sample at sub frame intervals with a bias to upclocking, and conversely at low power we sample over a few frames worth to provide what we consider to be the right levels of responsiveness respectively. At low power, we more or less expect to be kicked out to high power at the start of a busy sequence by waitboosting. Prior to commit e9af4ea2b9e7 ("drm/i915: Avoid waitboosting on the active request") whenever we missed the frame or stalled, we would immediate go full throttle and upclock the GPU to max. But in commit e9af4ea2b9e7, we relaxed the waitboosting to only apply if the pipeline was deep to avoid over-committing resources for a near miss. Sadly though, a near miss is still a miss, and perceptible as jitter in the frame delivery. To try and prevent the near miss before having to resort to boosting after the fact, we use the pageflip queue as an indication that we are in an "interactive" regime and so should sample the load more frequently to provide power before the frame misses it vblank. This will make us more favorable to providing a small power increase (one or two bins) as required rather than going all the way to maximum and then having to work back down again. (We still keep the waitboosting mechanism around just in case a dramatic change in system load requires urgent uplocking, faster than we can provide in a few evaluation intervals.) v2: Reduce rps_set_interactive to a boolean parameter to avoid the confusion of what if they wanted a new power mode after pinning to a different mode (which to choose?) v3: Only reprogram RPS while the GT is awake, it will be set when we wake the GT, and while off warns about being used outside of rpm. v4: Fix deferred application of interactive mode v5: s/state/interactive/ v6: Group the mutex with its principle in a substruct Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107111 Fixes: e9af4ea2b9e7 ("drm/i915: Avoid waitboosting on the active request") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Radoslaw Szwichtenberg <radoslaw.szwichtenberg@intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20180731132629.3381-1-chris@chris-wilson.co.uk
2018-07-31 21:26:29 +08:00
seq_printf(m, "Up threshold: %d%%\n",
rps->power.up_threshold);
seq_printf(m, "RP CUR DOWN EI: %d (%lldns)\n",
rpdownei,
intel_gt_pm_interval_to_ns(&dev_priv->gt,
rpdownei));
seq_printf(m, "RP CUR DOWN: %d (%lldns)\n",
rpcurdown,
intel_gt_pm_interval_to_ns(&dev_priv->gt,
rpcurdown));
seq_printf(m, "RP PREV DOWN: %d (%lldns)\n",
rpprevdown,
intel_gt_pm_interval_to_ns(&dev_priv->gt,
rpprevdown));
drm/i915: Interactive RPS mode RPS provides a feedback loop where we use the load during the previous evaluation interval to decide whether to up or down clock the GPU frequency. Our responsiveness is split into 3 regimes, a high and low plateau with the intent to keep the gpu clocked high to cover occasional stalls under high load, and low despite occasional glitches under steady low load, and inbetween. However, we run into situations like kodi where we want to stay at low power (video decoding is done efficiently inside the fixed function HW and doesn't need high clocks even for high bitrate streams), but just occasionally the pipeline is more complex than a video decode and we need a smidgen of extra GPU power to present on time. In the high power regime, we sample at sub frame intervals with a bias to upclocking, and conversely at low power we sample over a few frames worth to provide what we consider to be the right levels of responsiveness respectively. At low power, we more or less expect to be kicked out to high power at the start of a busy sequence by waitboosting. Prior to commit e9af4ea2b9e7 ("drm/i915: Avoid waitboosting on the active request") whenever we missed the frame or stalled, we would immediate go full throttle and upclock the GPU to max. But in commit e9af4ea2b9e7, we relaxed the waitboosting to only apply if the pipeline was deep to avoid over-committing resources for a near miss. Sadly though, a near miss is still a miss, and perceptible as jitter in the frame delivery. To try and prevent the near miss before having to resort to boosting after the fact, we use the pageflip queue as an indication that we are in an "interactive" regime and so should sample the load more frequently to provide power before the frame misses it vblank. This will make us more favorable to providing a small power increase (one or two bins) as required rather than going all the way to maximum and then having to work back down again. (We still keep the waitboosting mechanism around just in case a dramatic change in system load requires urgent uplocking, faster than we can provide in a few evaluation intervals.) v2: Reduce rps_set_interactive to a boolean parameter to avoid the confusion of what if they wanted a new power mode after pinning to a different mode (which to choose?) v3: Only reprogram RPS while the GT is awake, it will be set when we wake the GT, and while off warns about being used outside of rpm. v4: Fix deferred application of interactive mode v5: s/state/interactive/ v6: Group the mutex with its principle in a substruct Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107111 Fixes: e9af4ea2b9e7 ("drm/i915: Avoid waitboosting on the active request") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Radoslaw Szwichtenberg <radoslaw.szwichtenberg@intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20180731132629.3381-1-chris@chris-wilson.co.uk
2018-07-31 21:26:29 +08:00
seq_printf(m, "Down threshold: %d%%\n",
rps->power.down_threshold);
max_freq = (IS_GEN9_LP(dev_priv) ? rp_state_cap >> 0 :
rp_state_cap >> 16) & 0xff;
max_freq *= (IS_GEN9_BC(dev_priv) ||
INTEL_GEN(dev_priv) >= 10 ? GEN9_FREQ_SCALER : 1);
seq_printf(m, "Lowest (RPN) frequency: %dMHz\n",
intel_gpu_freq(rps, max_freq));
max_freq = (rp_state_cap & 0xff00) >> 8;
max_freq *= (IS_GEN9_BC(dev_priv) ||
INTEL_GEN(dev_priv) >= 10 ? GEN9_FREQ_SCALER : 1);
seq_printf(m, "Nominal (RP1) frequency: %dMHz\n",
intel_gpu_freq(rps, max_freq));
max_freq = (IS_GEN9_LP(dev_priv) ? rp_state_cap >> 16 :
rp_state_cap >> 0) & 0xff;
max_freq *= (IS_GEN9_BC(dev_priv) ||
INTEL_GEN(dev_priv) >= 10 ? GEN9_FREQ_SCALER : 1);
seq_printf(m, "Max non-overclocked (RP0) frequency: %dMHz\n",
intel_gpu_freq(rps, max_freq));
seq_printf(m, "Max overclocked frequency: %dMHz\n",
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m, "Current freq: %d MHz\n",
intel_gpu_freq(rps, rps->cur_freq));
seq_printf(m, "Actual freq: %d MHz\n", cagf);
seq_printf(m, "Idle freq: %d MHz\n",
intel_gpu_freq(rps, rps->idle_freq));
seq_printf(m, "Min freq: %d MHz\n",
intel_gpu_freq(rps, rps->min_freq));
seq_printf(m, "Boost freq: %d MHz\n",
intel_gpu_freq(rps, rps->boost_freq));
seq_printf(m, "Max freq: %d MHz\n",
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m,
"efficient (RPe) frequency: %d MHz\n",
intel_gpu_freq(rps, rps->efficient_freq));
} else {
seq_puts(m, "no P-state info available\n");
}
seq_printf(m, "Current CD clock frequency: %d kHz\n", dev_priv->cdclk.hw.cdclk);
seq_printf(m, "Max CD clock frequency: %d kHz\n", dev_priv->max_cdclk_freq);
seq_printf(m, "Max pixel clock frequency: %d kHz\n", dev_priv->max_dotclk_freq);
intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref);
return 0;
}
static const char *swizzle_string(unsigned swizzle)
{
switch (swizzle) {
case I915_BIT_6_SWIZZLE_NONE:
return "none";
case I915_BIT_6_SWIZZLE_9:
return "bit9";
case I915_BIT_6_SWIZZLE_9_10:
return "bit9/bit10";
case I915_BIT_6_SWIZZLE_9_11:
return "bit9/bit11";
case I915_BIT_6_SWIZZLE_9_10_11:
return "bit9/bit10/bit11";
case I915_BIT_6_SWIZZLE_9_17:
return "bit9/bit17";
case I915_BIT_6_SWIZZLE_9_10_17:
return "bit9/bit10/bit17";
case I915_BIT_6_SWIZZLE_UNKNOWN:
return "unknown";
}
return "bug";
}
static int i915_swizzle_info(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_uncore *uncore = &dev_priv->uncore;
intel_wakeref_t wakeref;
seq_printf(m, "bit6 swizzle for X-tiling = %s\n",
swizzle_string(dev_priv->ggtt.bit_6_swizzle_x));
seq_printf(m, "bit6 swizzle for Y-tiling = %s\n",
swizzle_string(dev_priv->ggtt.bit_6_swizzle_y));
if (dev_priv->quirks & QUIRK_PIN_SWIZZLED_PAGES)
seq_puts(m, "L-shaped memory detected\n");
/* On BDW+, swizzling is not used. See detect_bit_6_swizzle() */
if (INTEL_GEN(dev_priv) >= 8 || IS_VALLEYVIEW(dev_priv))
return 0;
wakeref = intel_runtime_pm_get(&dev_priv->runtime_pm);
if (IS_GEN_RANGE(dev_priv, 3, 4)) {
seq_printf(m, "DDC = 0x%08x\n",
intel_uncore_read(uncore, DCC));
seq_printf(m, "DDC2 = 0x%08x\n",
intel_uncore_read(uncore, DCC2));
seq_printf(m, "C0DRB3 = 0x%04x\n",
intel_uncore_read16(uncore, C0DRB3));
seq_printf(m, "C1DRB3 = 0x%04x\n",
intel_uncore_read16(uncore, C1DRB3));
} else if (INTEL_GEN(dev_priv) >= 6) {
seq_printf(m, "MAD_DIMM_C0 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C0));
seq_printf(m, "MAD_DIMM_C1 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C1));
seq_printf(m, "MAD_DIMM_C2 = 0x%08x\n",
intel_uncore_read(uncore, MAD_DIMM_C2));
seq_printf(m, "TILECTL = 0x%08x\n",
intel_uncore_read(uncore, TILECTL));
if (INTEL_GEN(dev_priv) >= 8)
seq_printf(m, "GAMTARBMODE = 0x%08x\n",
intel_uncore_read(uncore, GAMTARBMODE));
else
seq_printf(m, "ARB_MODE = 0x%08x\n",
intel_uncore_read(uncore, ARB_MODE));
seq_printf(m, "DISP_ARB_CTL = 0x%08x\n",
intel_uncore_read(uncore, DISP_ARB_CTL));
}
intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref);
return 0;
}
static int i915_rps_boost_info(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct intel_rps *rps = &dev_priv->gt.rps;
seq_printf(m, "RPS enabled? %s\n", yesno(intel_rps_is_enabled(rps)));
seq_printf(m, "RPS active? %s\n", yesno(intel_rps_is_active(rps)));
drm/i915: Invert the GEM wakeref hierarchy In the current scheme, on submitting a request we take a single global GEM wakeref, which trickles down to wake up all GT power domains. This is undesirable as we would like to be able to localise our power management to the available power domains and to remove the global GEM operations from the heart of the driver. (The intent there is to push global GEM decisions to the boundary as used by the GEM user interface.) Now during request construction, each request is responsible via its logical context to acquire a wakeref on each power domain it intends to utilize. Currently, each request takes a wakeref on the engine(s) and the engines themselves take a chipset wakeref. This gives us a transition on each engine which we can extend if we want to insert more powermangement control (such as soft rc6). The global GEM operations that currently require a struct_mutex are reduced to listening to pm events from the chipset GT wakeref. As we reduce the struct_mutex requirement, these listeners should evaporate. Perhaps the biggest immediate change is that this removes the struct_mutex requirement around GT power management, allowing us greater flexibility in request construction. Another important knock-on effect, is that by tracking engine usage, we can insert a switch back to the kernel context on that engine immediately, avoiding any extra delay or inserting global synchronisation barriers. This makes tracking when an engine and its associated contexts are idle much easier -- important for when we forgo our assumed execution ordering and need idle barriers to unpin used contexts. In the process, it means we remove a large chunk of code whose only purpose was to switch back to the kernel context. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Imre Deak <imre.deak@intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190424200717.1686-5-chris@chris-wilson.co.uk
2019-04-25 04:07:17 +08:00
seq_printf(m, "GPU busy? %s\n", yesno(dev_priv->gt.awake));
drm/i915: Avoid keeping waitboost active for signaling threads Once a client has requested a waitboost, we keep that waitboost active until all clients are no longer waiting. This is because we don't distinguish which waiter deserves the boost. However, with the advent of fence signaling, the signaler threads appear as waiters to the RPS interrupt handler. So instead of using a single boolean to track when to keep the waitboost active, use a counter of all outstanding waitboosted requests. At this point, I have removed all vestiges of the rate limiting on clients. Whilst this means that compositors should remain more fluid, it also means that boosts are more prevalent. See commit b29c19b64528 ("drm/i915: Boost RPS frequency for CPU stalls") for a longer discussion on the pros and cons of both approaches. A drawback of this implementation is that it requires constant request submission to keep the waitboost trimmed (as it is now cancelled when the request is completed). This will be fine for a busy system, but near idle the boosts may be kept for longer than desired (effectively tens of vblanks worstcase) and there is a reliance on rc6 instead. v2: Remove defunct rps.client_lock Reported-by: Michał Winiarski <michal.winiarski@intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Michał Winiarski <michal.winiarski@intel.com> Reviewed-by: Michał Winiarski <michal.winiarski@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170628123548.9236-1-chris@chris-wilson.co.uk
2017-06-28 20:35:48 +08:00
seq_printf(m, "Boosts outstanding? %d\n",
atomic_read(&rps->num_waiters));
drm/i915: Interactive RPS mode RPS provides a feedback loop where we use the load during the previous evaluation interval to decide whether to up or down clock the GPU frequency. Our responsiveness is split into 3 regimes, a high and low plateau with the intent to keep the gpu clocked high to cover occasional stalls under high load, and low despite occasional glitches under steady low load, and inbetween. However, we run into situations like kodi where we want to stay at low power (video decoding is done efficiently inside the fixed function HW and doesn't need high clocks even for high bitrate streams), but just occasionally the pipeline is more complex than a video decode and we need a smidgen of extra GPU power to present on time. In the high power regime, we sample at sub frame intervals with a bias to upclocking, and conversely at low power we sample over a few frames worth to provide what we consider to be the right levels of responsiveness respectively. At low power, we more or less expect to be kicked out to high power at the start of a busy sequence by waitboosting. Prior to commit e9af4ea2b9e7 ("drm/i915: Avoid waitboosting on the active request") whenever we missed the frame or stalled, we would immediate go full throttle and upclock the GPU to max. But in commit e9af4ea2b9e7, we relaxed the waitboosting to only apply if the pipeline was deep to avoid over-committing resources for a near miss. Sadly though, a near miss is still a miss, and perceptible as jitter in the frame delivery. To try and prevent the near miss before having to resort to boosting after the fact, we use the pageflip queue as an indication that we are in an "interactive" regime and so should sample the load more frequently to provide power before the frame misses it vblank. This will make us more favorable to providing a small power increase (one or two bins) as required rather than going all the way to maximum and then having to work back down again. (We still keep the waitboosting mechanism around just in case a dramatic change in system load requires urgent uplocking, faster than we can provide in a few evaluation intervals.) v2: Reduce rps_set_interactive to a boolean parameter to avoid the confusion of what if they wanted a new power mode after pinning to a different mode (which to choose?) v3: Only reprogram RPS while the GT is awake, it will be set when we wake the GT, and while off warns about being used outside of rpm. v4: Fix deferred application of interactive mode v5: s/state/interactive/ v6: Group the mutex with its principle in a substruct Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107111 Fixes: e9af4ea2b9e7 ("drm/i915: Avoid waitboosting on the active request") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Radoslaw Szwichtenberg <radoslaw.szwichtenberg@intel.com> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20180731132629.3381-1-chris@chris-wilson.co.uk
2018-07-31 21:26:29 +08:00
seq_printf(m, "Interactive? %d\n", READ_ONCE(rps->power.interactive));
seq_printf(m, "Frequency requested %d, actual %d\n",
intel_gpu_freq(rps, rps->cur_freq),
intel_rps_read_actual_frequency(rps));
seq_printf(m, " min hard:%d, soft:%d; max soft:%d, hard:%d\n",
intel_gpu_freq(rps, rps->min_freq),
intel_gpu_freq(rps, rps->min_freq_softlimit),
intel_gpu_freq(rps, rps->max_freq_softlimit),
intel_gpu_freq(rps, rps->max_freq));
seq_printf(m, " idle:%d, efficient:%d, boost:%d\n",
intel_gpu_freq(rps, rps->idle_freq),
intel_gpu_freq(rps, rps->efficient_freq),
intel_gpu_freq(rps, rps->boost_freq));
seq_printf(m, "Wait boosts: %d\n", READ_ONCE(rps->boosts));
return 0;
}
static int i915_runtime_pm_status(struct seq_file *m, void *unused)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct pci_dev *pdev = dev_priv->drm.pdev;
if (!HAS_RUNTIME_PM(dev_priv))
seq_puts(m, "Runtime power management not supported\n");
seq_printf(m, "Runtime power status: %s\n",
enableddisabled(!dev_priv->power_domains.init_wakeref));
seq_printf(m, "GPU idle: %s\n", yesno(!dev_priv->gt.awake));
seq_printf(m, "IRQs disabled: %s\n",
yesno(!intel_irqs_enabled(dev_priv)));
#ifdef CONFIG_PM
seq_printf(m, "Usage count: %d\n",
atomic_read(&dev_priv->drm.dev->power.usage_count));
#else
seq_printf(m, "Device Power Management (CONFIG_PM) disabled\n");
#endif
seq_printf(m, "PCI device power state: %s [%d]\n",
pci_power_name(pdev->current_state),
pdev->current_state);
if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_RUNTIME_PM)) {
struct drm_printer p = drm_seq_file_printer(m);
print_intel_runtime_pm_wakeref(&dev_priv->runtime_pm, &p);
}
return 0;
}
static int i915_engine_info(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_engine_cs *engine;
intel_wakeref_t wakeref;
struct drm_printer p;
wakeref = intel_runtime_pm_get(&i915->runtime_pm);
seq_printf(m, "GT awake? %s [%d], %llums\n",
yesno(i915->gt.awake),
atomic_read(&i915->gt.wakeref.count),
ktime_to_ms(intel_gt_get_awake_time(&i915->gt)));
seq_printf(m, "CS timestamp frequency: %u Hz, %d ns\n",
i915->gt.clock_frequency,
i915->gt.clock_period_ns);
p = drm_seq_file_printer(m);
for_each_uabi_engine(engine, i915)
intel_engine_dump(engine, &p, "%s\n", engine->name);
intel_gt_show_timelines(&i915->gt, &p, i915_request_show_with_schedule);
intel_runtime_pm_put(&i915->runtime_pm, wakeref);
return 0;
}
static int i915_wa_registers(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_engine_cs *engine;
for_each_uabi_engine(engine, i915) {
const struct i915_wa_list *wal = &engine->ctx_wa_list;
const struct i915_wa *wa;
unsigned int count;
count = wal->count;
if (!count)
continue;
seq_printf(m, "%s: Workarounds applied: %u\n",
engine->name, count);
for (wa = wal->list; count--; wa++)
seq_printf(m, "0x%X: 0x%08X, mask: 0x%08X\n",
i915_mmio_reg_offset(wa->reg),
wa->set, wa->clr);
seq_printf(m, "\n");
}
return 0;
}
static int
i915_wedged_get(void *data, u64 *val)
{
struct drm_i915_private *i915 = data;
int ret = intel_gt_terminally_wedged(&i915->gt);
switch (ret) {
case -EIO:
*val = 1;
return 0;
case 0:
*val = 0;
return 0;
default:
return ret;
}
}
static int
i915_wedged_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
/* Flush any previous reset before applying for a new one */
wait_event(i915->gt.reset.queue,
!test_bit(I915_RESET_BACKOFF, &i915->gt.reset.flags));
intel_gt_handle_error(&i915->gt, val, I915_ERROR_CAPTURE,
"Manually set wedged engine mask = %llx", val);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
i915_wedged_get, i915_wedged_set,
"%llu\n");
static int
i915_perf_noa_delay_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
/*
* This would lead to infinite waits as we're doing timestamp
* difference on the CS with only 32bits.
*/
if (intel_gt_ns_to_clock_interval(&i915->gt, val) > U32_MAX)
return -EINVAL;
atomic64_set(&i915->perf.noa_programming_delay, val);
return 0;
}
static int
i915_perf_noa_delay_get(void *data, u64 *val)
{
struct drm_i915_private *i915 = data;
*val = atomic64_read(&i915->perf.noa_programming_delay);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_perf_noa_delay_fops,
i915_perf_noa_delay_get,
i915_perf_noa_delay_set,
"%llu\n");
#define DROP_UNBOUND BIT(0)
#define DROP_BOUND BIT(1)
#define DROP_RETIRE BIT(2)
#define DROP_ACTIVE BIT(3)
#define DROP_FREED BIT(4)
#define DROP_SHRINK_ALL BIT(5)
#define DROP_IDLE BIT(6)
#define DROP_RESET_ACTIVE BIT(7)
#define DROP_RESET_SEQNO BIT(8)
#define DROP_RCU BIT(9)
#define DROP_ALL (DROP_UNBOUND | \
DROP_BOUND | \
DROP_RETIRE | \
DROP_ACTIVE | \
DROP_FREED | \
DROP_SHRINK_ALL |\
DROP_IDLE | \
DROP_RESET_ACTIVE | \
DROP_RESET_SEQNO | \
DROP_RCU)
static int
i915_drop_caches_get(void *data, u64 *val)
{
*val = DROP_ALL;
return 0;
}
static int
gt_drop_caches(struct intel_gt *gt, u64 val)
{
int ret;
if (val & DROP_RESET_ACTIVE &&
wait_for(intel_engines_are_idle(gt), I915_IDLE_ENGINES_TIMEOUT))
intel_gt_set_wedged(gt);
if (val & DROP_RETIRE)
intel_gt_retire_requests(gt);
if (val & (DROP_IDLE | DROP_ACTIVE)) {
ret = intel_gt_wait_for_idle(gt, MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
}
if (val & DROP_IDLE) {
ret = intel_gt_pm_wait_for_idle(gt);
if (ret)
return ret;
}
if (val & DROP_RESET_ACTIVE && intel_gt_terminally_wedged(gt))
intel_gt_handle_error(gt, ALL_ENGINES, 0, NULL);
if (val & DROP_FREED)
intel_gt_flush_buffer_pool(gt);
return 0;
}
static int
i915_drop_caches_set(void *data, u64 val)
{
struct drm_i915_private *i915 = data;
int ret;
DRM_DEBUG("Dropping caches: 0x%08llx [0x%08llx]\n",
val, val & DROP_ALL);
ret = gt_drop_caches(&i915->gt, val);
if (ret)
return ret;
fs_reclaim_acquire(GFP_KERNEL);
if (val & DROP_BOUND)
i915_gem_shrink(i915, LONG_MAX, NULL, I915_SHRINK_BOUND);
if (val & DROP_UNBOUND)
i915_gem_shrink(i915, LONG_MAX, NULL, I915_SHRINK_UNBOUND);
if (val & DROP_SHRINK_ALL)
i915_gem_shrink_all(i915);
fs_reclaim_release(GFP_KERNEL);
if (val & DROP_RCU)
rcu_barrier();
if (val & DROP_FREED)
i915_gem_drain_freed_objects(i915);
return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(i915_drop_caches_fops,
i915_drop_caches_get, i915_drop_caches_set,
"0x%08llx\n");
static int i915_sseu_status(struct seq_file *m, void *unused)
{
struct drm_i915_private *i915 = node_to_i915(m->private);
struct intel_gt *gt = &i915->gt;
return intel_sseu_status(m, gt);
}
static int i915_forcewake_open(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
struct intel_gt *gt = &i915->gt;
atomic_inc(&gt->user_wakeref);
intel_gt_pm_get(gt);
if (INTEL_GEN(i915) >= 6)
intel_uncore_forcewake_user_get(gt->uncore);
return 0;
}
static int i915_forcewake_release(struct inode *inode, struct file *file)
{
struct drm_i915_private *i915 = inode->i_private;
struct intel_gt *gt = &i915->gt;
if (INTEL_GEN(i915) >= 6)
intel_uncore_forcewake_user_put(&i915->uncore);
intel_gt_pm_put(gt);
atomic_dec(&gt->user_wakeref);
return 0;
}
static const struct file_operations i915_forcewake_fops = {
.owner = THIS_MODULE,
.open = i915_forcewake_open,
.release = i915_forcewake_release,
};
static const struct drm_info_list i915_debugfs_list[] = {
{"i915_capabilities", i915_capabilities, 0},
{"i915_gem_objects", i915_gem_object_info, 0},
{"i915_frequency_info", i915_frequency_info, 0},
{"i915_swizzle_info", i915_swizzle_info, 0},
{"i915_runtime_pm_status", i915_runtime_pm_status, 0},
{"i915_engine_info", i915_engine_info, 0},
{"i915_wa_registers", i915_wa_registers, 0},
{"i915_sseu_status", i915_sseu_status, 0},
{"i915_rps_boost_info", i915_rps_boost_info, 0},
};
#define I915_DEBUGFS_ENTRIES ARRAY_SIZE(i915_debugfs_list)
static const struct i915_debugfs_files {
const char *name;
const struct file_operations *fops;
} i915_debugfs_files[] = {
{"i915_perf_noa_delay", &i915_perf_noa_delay_fops},
{"i915_wedged", &i915_wedged_fops},
{"i915_gem_drop_caches", &i915_drop_caches_fops},
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
{"i915_error_state", &i915_error_state_fops},
{"i915_gpu_info", &i915_gpu_info_fops},
#endif
};
void i915_debugfs_register(struct drm_i915_private *dev_priv)
{
struct drm_minor *minor = dev_priv->drm.primary;
int i;
i915_debugfs_params(dev_priv);
debugfs_create_file("i915_forcewake_user", S_IRUSR, minor->debugfs_root,
to_i915(minor->dev), &i915_forcewake_fops);
for (i = 0; i < ARRAY_SIZE(i915_debugfs_files); i++) {
debugfs_create_file(i915_debugfs_files[i].name,
S_IRUGO | S_IWUSR,
minor->debugfs_root,
to_i915(minor->dev),
i915_debugfs_files[i].fops);
}
drm_debugfs_create_files(i915_debugfs_list,
I915_DEBUGFS_ENTRIES,
minor->debugfs_root, minor);
}