diff options
author | Dave Airlie <airlied@redhat.com> | 2021-10-22 06:30:33 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2021-10-22 06:30:34 +1000 |
commit | 6f2f7c83303d2227f47551423e507d77d9ea01c7 (patch) | |
tree | 05dff35a0b96494a2419ff5747f80c4648c10cc4 /drivers/gpu/drm/i915/gem | |
parent | 94ff371eb8497d02b8cb9d0c2c7370193c98e9f7 (diff) | |
parent | ab5d964c001b9efffcbfa4d67a30186b67d79771 (diff) | |
download | linux-6f2f7c83303d2227f47551423e507d77d9ea01c7.tar.gz linux-6f2f7c83303d2227f47551423e507d77d9ea01c7.tar.bz2 linux-6f2f7c83303d2227f47551423e507d77d9ea01c7.zip |
Merge tag 'drm-intel-gt-next-2021-10-21' of git://anongit.freedesktop.org/drm/drm-intel into drm-next
UAPI Changes:
- Expose multi-LRC submission interface
Similar to the bonded submission interface but simplified.
Comes with GuC only implementation for now. See kerneldoc
for more details.
Userspace changes: https://github.com/intel/media-driver/pull/1252
- Expose logical engine instance to user
Needed by the multi-LRC submission interface for GuC
Userspace changes: https://github.com/intel/media-driver/pull/1252
Driver Changes:
- Fix blank screen booting crashes when CONFIG_CC_OPTIMIZE_FOR_SIZE=y (Hugh)
- Add support for multi-LRC submission in the GuC backend (Matt B)
- Add extra cache flushing before making pages userspace visible (Matt A, Thomas)
- Mark internal GPU object pages dirty so they will be flushed properly (Matt A)
- Move remaining debugfs interfaces i915_wedged/i915_forcewake_user into gt (Andi)
- Replace the unconditional clflushes with drm_clflush_virt_range() (Ville)
- Remove IS_ACTIVE macro completely (Lucas)
- Improve kerneldocs for cache_dirty (Matt A)
- Add missing includes (Lucas)
- Selftest improvements (Matt R, Ran, Matt A)
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/YXFmLKoq8Fg9JxSd@jlahtine-mobl.ger.corp.intel.com
Diffstat (limited to 'drivers/gpu/drm/i915/gem')
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_busy.c | 57 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_context.c | 227 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_context_types.h | 16 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 9 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c | 797 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_object.c | 26 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_object.h | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_object_types.h | 27 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 29 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 8 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/selftests/huge_pages.c | 7 | ||||
-rw-r--r-- | drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c | 29 |
13 files changed, 947 insertions, 288 deletions
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_busy.c b/drivers/gpu/drm/i915/gem/i915_gem_busy.c index 6234e17259c1..7358bebef15c 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_busy.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_busy.c @@ -4,6 +4,8 @@ * Copyright © 2014-2016 Intel Corporation */ +#include <linux/dma-fence-array.h> + #include "gt/intel_engine.h" #include "i915_gem_ioctls.h" @@ -36,7 +38,7 @@ static __always_inline u32 __busy_write_id(u16 id) } static __always_inline unsigned int -__busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id)) +__busy_set_if_active(struct dma_fence *fence, u32 (*flag)(u16 id)) { const struct i915_request *rq; @@ -46,29 +48,60 @@ __busy_set_if_active(const struct dma_fence *fence, u32 (*flag)(u16 id)) * to eventually flush us, but to minimise latency just ask the * hardware. * - * Note we only report on the status of native fences. + * Note we only report on the status of native fences and we currently + * have two native fences: + * + * 1. A composite fence (dma_fence_array) constructed of i915 requests + * created during a parallel submission. In this case we deconstruct the + * composite fence into individual i915 requests and check the status of + * each request. + * + * 2. A single i915 request. */ - if (!dma_fence_is_i915(fence)) + if (dma_fence_is_array(fence)) { + struct dma_fence_array *array = to_dma_fence_array(fence); + struct dma_fence **child = array->fences; + unsigned int nchild = array->num_fences; + + do { + struct dma_fence *current_fence = *child++; + + /* Not an i915 fence, can't be busy per above */ + if (!dma_fence_is_i915(current_fence) || + !test_bit(I915_FENCE_FLAG_COMPOSITE, + ¤t_fence->flags)) { + return 0; + } + + rq = to_request(current_fence); + if (!i915_request_completed(rq)) + return flag(rq->engine->uabi_class); + } while (--nchild); + + /* All requests in array complete, not busy */ return 0; + } else { + if (!dma_fence_is_i915(fence)) + return 0; - /* opencode to_request() in order to avoid const warnings */ - rq = container_of(fence, const struct i915_request, fence); - if (i915_request_completed(rq)) - return 0; + rq = to_request(fence); + if (i915_request_completed(rq)) + return 0; - /* Beware type-expansion follies! */ - BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class)); - return flag(rq->engine->uabi_class); + /* Beware type-expansion follies! */ + BUILD_BUG_ON(!typecheck(u16, rq->engine->uabi_class)); + return flag(rq->engine->uabi_class); + } } static __always_inline unsigned int -busy_check_reader(const struct dma_fence *fence) +busy_check_reader(struct dma_fence *fence) { return __busy_set_if_active(fence, __busy_read_flag); } static __always_inline unsigned int -busy_check_writer(const struct dma_fence *fence) +busy_check_writer(struct dma_fence *fence) { if (!fence) return 0; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c index d225d3dd0b40..fb33d0322960 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c @@ -556,9 +556,147 @@ set_proto_ctx_engines_bond(struct i915_user_extension __user *base, void *data) return 0; } +static int +set_proto_ctx_engines_parallel_submit(struct i915_user_extension __user *base, + void *data) +{ + struct i915_context_engines_parallel_submit __user *ext = + container_of_user(base, typeof(*ext), base); + const struct set_proto_ctx_engines *set = data; + struct drm_i915_private *i915 = set->i915; + u64 flags; + int err = 0, n, i, j; + u16 slot, width, num_siblings; + struct intel_engine_cs **siblings = NULL; + intel_engine_mask_t prev_mask; + + /* FIXME: This is NIY for execlists */ + if (!(intel_uc_uses_guc_submission(&i915->gt.uc))) + return -ENODEV; + + if (get_user(slot, &ext->engine_index)) + return -EFAULT; + + if (get_user(width, &ext->width)) + return -EFAULT; + + if (get_user(num_siblings, &ext->num_siblings)) + return -EFAULT; + + if (slot >= set->num_engines) { + drm_dbg(&i915->drm, "Invalid placement value, %d >= %d\n", + slot, set->num_engines); + return -EINVAL; + } + + if (set->engines[slot].type != I915_GEM_ENGINE_TYPE_INVALID) { + drm_dbg(&i915->drm, + "Invalid placement[%d], already occupied\n", slot); + return -EINVAL; + } + + if (get_user(flags, &ext->flags)) + return -EFAULT; + + if (flags) { + drm_dbg(&i915->drm, "Unknown flags 0x%02llx", flags); + return -EINVAL; + } + + for (n = 0; n < ARRAY_SIZE(ext->mbz64); n++) { + err = check_user_mbz(&ext->mbz64[n]); + if (err) + return err; + } + + if (width < 2) { + drm_dbg(&i915->drm, "Width (%d) < 2\n", width); + return -EINVAL; + } + + if (num_siblings < 1) { + drm_dbg(&i915->drm, "Number siblings (%d) < 1\n", + num_siblings); + return -EINVAL; + } + + siblings = kmalloc_array(num_siblings * width, + sizeof(*siblings), + GFP_KERNEL); + if (!siblings) + return -ENOMEM; + + /* Create contexts / engines */ + for (i = 0; i < width; ++i) { + intel_engine_mask_t current_mask = 0; + struct i915_engine_class_instance prev_engine; + + for (j = 0; j < num_siblings; ++j) { + struct i915_engine_class_instance ci; + + n = i * num_siblings + j; + if (copy_from_user(&ci, &ext->engines[n], sizeof(ci))) { + err = -EFAULT; + goto out_err; + } + + siblings[n] = + intel_engine_lookup_user(i915, ci.engine_class, + ci.engine_instance); + if (!siblings[n]) { + drm_dbg(&i915->drm, + "Invalid sibling[%d]: { class:%d, inst:%d }\n", + n, ci.engine_class, ci.engine_instance); + err = -EINVAL; + goto out_err; + } + + if (n) { + if (prev_engine.engine_class != + ci.engine_class) { + drm_dbg(&i915->drm, + "Mismatched class %d, %d\n", + prev_engine.engine_class, + ci.engine_class); + err = -EINVAL; + goto out_err; + } + } + + prev_engine = ci; + current_mask |= siblings[n]->logical_mask; + } + + if (i > 0) { + if (current_mask != prev_mask << 1) { + drm_dbg(&i915->drm, + "Non contiguous logical mask 0x%x, 0x%x\n", + prev_mask, current_mask); + err = -EINVAL; + goto out_err; + } + } + prev_mask = current_mask; + } + + set->engines[slot].type = I915_GEM_ENGINE_TYPE_PARALLEL; + set->engines[slot].num_siblings = num_siblings; + set->engines[slot].width = width; + set->engines[slot].siblings = siblings; + + return 0; + +out_err: + kfree(siblings); + + return err; +} + static const i915_user_extension_fn set_proto_ctx_engines_extensions[] = { [I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_proto_ctx_engines_balance, [I915_CONTEXT_ENGINES_EXT_BOND] = set_proto_ctx_engines_bond, + [I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT] = + set_proto_ctx_engines_parallel_submit, }; static int set_proto_ctx_engines(struct drm_i915_file_private *fpriv, @@ -794,6 +932,7 @@ static int intel_context_set_gem(struct intel_context *ce, GEM_BUG_ON(rcu_access_pointer(ce->gem_context)); RCU_INIT_POINTER(ce->gem_context, ctx); + GEM_BUG_ON(intel_context_is_pinned(ce)); ce->ring_size = SZ_16K; i915_vm_put(ce->vm); @@ -818,6 +957,25 @@ static int intel_context_set_gem(struct intel_context *ce, return ret; } +static void __unpin_engines(struct i915_gem_engines *e, unsigned int count) +{ + while (count--) { + struct intel_context *ce = e->engines[count], *child; + + if (!ce || !test_bit(CONTEXT_PERMA_PIN, &ce->flags)) + continue; + + for_each_child(ce, child) + intel_context_unpin(child); + intel_context_unpin(ce); + } +} + +static void unpin_engines(struct i915_gem_engines *e) +{ + __unpin_engines(e, e->num_engines); +} + static void __free_engines(struct i915_gem_engines *e, unsigned int count) { while (count--) { @@ -933,6 +1091,40 @@ free_engines: return err; } +static int perma_pin_contexts(struct intel_context *ce) +{ + struct intel_context *child; + int i = 0, j = 0, ret; + + GEM_BUG_ON(!intel_context_is_parent(ce)); + + ret = intel_context_pin(ce); + if (unlikely(ret)) + return ret; + + for_each_child(ce, child) { + ret = intel_context_pin(child); + if (unlikely(ret)) + goto unwind; + ++i; + } + + set_bit(CONTEXT_PERMA_PIN, &ce->flags); + + return 0; + +unwind: + intel_context_unpin(ce); + for_each_child(ce, child) { + if (j++ < i) + intel_context_unpin(child); + else + break; + } + + return ret; +} + static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, unsigned int num_engines, struct i915_gem_proto_engine *pe) @@ -946,7 +1138,7 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, e->num_engines = num_engines; for (n = 0; n < num_engines; n++) { - struct intel_context *ce; + struct intel_context *ce, *child; int ret; switch (pe[n].type) { @@ -956,7 +1148,13 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, case I915_GEM_ENGINE_TYPE_BALANCED: ce = intel_engine_create_virtual(pe[n].siblings, - pe[n].num_siblings); + pe[n].num_siblings, 0); + break; + + case I915_GEM_ENGINE_TYPE_PARALLEL: + ce = intel_engine_create_parallel(pe[n].siblings, + pe[n].num_siblings, + pe[n].width); break; case I915_GEM_ENGINE_TYPE_INVALID: @@ -977,6 +1175,30 @@ static struct i915_gem_engines *user_engines(struct i915_gem_context *ctx, err = ERR_PTR(ret); goto free_engines; } + for_each_child(ce, child) { + ret = intel_context_set_gem(child, ctx, pe->sseu); + if (ret) { + err = ERR_PTR(ret); + goto free_engines; + } + } + + /* + * XXX: Must be done after calling intel_context_set_gem as that + * function changes the ring size. The ring is allocated when + * the context is pinned. If the ring size is changed after + * allocation we have a mismatch of the ring size and will cause + * the context to hang. Presumably with a bit of reordering we + * could move the perma-pin step to the backend function + * intel_engine_create_parallel. + */ + if (pe[n].type == I915_GEM_ENGINE_TYPE_PARALLEL) { + ret = perma_pin_contexts(ce); + if (ret) { + err = ERR_PTR(ret); + goto free_engines; + } + } } return e; @@ -1219,6 +1441,7 @@ static void context_close(struct i915_gem_context *ctx) /* Flush any concurrent set_engines() */ mutex_lock(&ctx->engines_mutex); + unpin_engines(__context_engines_static(ctx)); engines_idle_release(ctx, rcu_replace_pointer(ctx->engines, NULL, 1)); i915_gem_context_set_closed(ctx); mutex_unlock(&ctx->engines_mutex); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h index a627b09c4680..282cdb8a5c5a 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h @@ -78,13 +78,16 @@ enum i915_gem_engine_type { /** @I915_GEM_ENGINE_TYPE_BALANCED: A load-balanced engine set */ I915_GEM_ENGINE_TYPE_BALANCED, + + /** @I915_GEM_ENGINE_TYPE_PARALLEL: A parallel engine set */ + I915_GEM_ENGINE_TYPE_PARALLEL, }; /** * struct i915_gem_proto_engine - prototype engine * * This struct describes an engine that a context may contain. Engines - * have three types: + * have four types: * * - I915_GEM_ENGINE_TYPE_INVALID: Invalid engines can be created but they * show up as a NULL in i915_gem_engines::engines[i] and any attempt to @@ -97,6 +100,10 @@ enum i915_gem_engine_type { * * - I915_GEM_ENGINE_TYPE_BALANCED: A load-balanced engine set, described * i915_gem_proto_engine::num_siblings and i915_gem_proto_engine::siblings. + * + * - I915_GEM_ENGINE_TYPE_PARALLEL: A parallel submission engine set, described + * i915_gem_proto_engine::width, i915_gem_proto_engine::num_siblings, and + * i915_gem_proto_engine::siblings. */ struct i915_gem_proto_engine { /** @type: Type of this engine */ @@ -105,10 +112,13 @@ struct i915_gem_proto_engine { /** @engine: Engine, for physical */ struct intel_engine_cs *engine; - /** @num_siblings: Number of balanced siblings */ + /** @num_siblings: Number of balanced or parallel siblings */ unsigned int num_siblings; - /** @siblings: Balanced siblings */ + /** @width: Width of each sibling */ + unsigned int width; + + /** @siblings: Balanced siblings or num_siblings * width for parallel */ struct intel_engine_cs **siblings; /** @sseu: Client-set SSEU parameters */ diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index afa34111de02..1adcd8e02d29 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -232,6 +232,7 @@ struct dma_buf *i915_gem_prime_export(struct drm_gem_object *gem_obj, int flags) static int i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); struct sg_table *pages; unsigned int sg_page_sizes; @@ -242,8 +243,11 @@ static int i915_gem_object_get_pages_dmabuf(struct drm_i915_gem_object *obj) if (IS_ERR(pages)) return PTR_ERR(pages); - sg_page_sizes = i915_sg_dma_sizes(pages->sgl); + /* XXX: consider doing a vmap flush or something */ + if (!HAS_LLC(i915) || i915_gem_object_can_bypass_llc(obj)) + wbinvd_on_all_cpus(); + sg_page_sizes = i915_sg_dma_sizes(pages->sgl); __i915_gem_object_set_pages(obj, pages, sg_page_sizes); return 0; @@ -301,7 +305,8 @@ struct drm_gem_object *i915_gem_prime_import(struct drm_device *dev, } drm_gem_private_object_init(dev, &obj->base, dma_buf->size); - i915_gem_object_init(obj, &i915_gem_object_dmabuf_ops, &lock_class, 0); + i915_gem_object_init(obj, &i915_gem_object_dmabuf_ops, &lock_class, + I915_BO_ALLOC_USER); obj->base.import_attach = attach; obj->base.resv = dma_buf->resv; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 8b3a25bd93e6..4d7da07442f2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -246,17 +246,25 @@ struct i915_execbuffer { struct drm_i915_gem_exec_object2 *exec; /** ioctl execobj[] */ struct eb_vma *vma; - struct intel_engine_cs *engine; /** engine to queue the request to */ + struct intel_gt *gt; /* gt for the execbuf */ struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */ - struct i915_request *request; /** our request to build */ - struct eb_vma *batch; /** identity of the batch obj/vma */ + /** our requests to build */ + struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; + /** identity of the batch obj/vma */ + struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; struct i915_vma *trampoline; /** trampoline used for chaining */ + /** used for excl fence in dma_resv objects when > 1 BB submitted */ + struct dma_fence *composite_fence; + /** actual size of execobj[] as we may extend it for the cmdparser */ unsigned int buffer_count; + /* number of batches in execbuf IOCTL */ + unsigned int num_batches; + /** list of vma not yet bound during reservation phase */ struct list_head unbound; @@ -283,7 +291,8 @@ struct i915_execbuffer { u64 invalid_flags; /** Set of execobj.flags that are invalid */ - u64 batch_len; /** Length of batch within object */ + /** Length of batch within object */ + u64 batch_len[MAX_ENGINE_INSTANCE + 1]; u32 batch_start_offset; /** Location within object of batch */ u32 batch_flags; /** Flags composed for emit_bb_start() */ struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */ @@ -301,14 +310,13 @@ struct i915_execbuffer { }; static int eb_parse(struct i915_execbuffer *eb); -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, - bool throttle); +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle); static void eb_unpin_engine(struct i915_execbuffer *eb); static inline bool eb_use_cmdparser(const struct i915_execbuffer *eb) { - return intel_engine_requires_cmd_parser(eb->engine) || - (intel_engine_using_cmd_parser(eb->engine) && + return intel_engine_requires_cmd_parser(eb->context->engine) || + (intel_engine_using_cmd_parser(eb->context->engine) && eb->args->batch_len); } @@ -535,11 +543,21 @@ eb_validate_vma(struct i915_execbuffer *eb, return 0; } -static void +static inline bool +is_batch_buffer(struct i915_execbuffer *eb, unsigned int buffer_idx) +{ + return eb->args->flags & I915_EXEC_BATCH_FIRST ? + buffer_idx < eb->num_batches : + buffer_idx >= eb->args->buffer_count - eb->num_batches; +} + +static int eb_add_vma(struct i915_execbuffer *eb, - unsigned int i, unsigned batch_idx, + unsigned int *current_batch, + unsigned int i, struct i915_vma *vma) { + struct drm_i915_private *i915 = eb->i915; struct drm_i915_gem_exec_object2 *entry = &eb->exec[i]; struct eb_vma *ev = &eb->vma[i]; @@ -566,15 +584,43 @@ eb_add_vma(struct i915_execbuffer *eb, * Note that actual hangs have only been observed on gen7, but for * paranoia do it everywhere. */ - if (i == batch_idx) { + if (is_batch_buffer(eb, i)) { if (entry->relocation_count && !(ev->flags & EXEC_OBJECT_PINNED)) ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; if (eb->reloc_cache.has_fence) ev->flags |= EXEC_OBJECT_NEEDS_FENCE; - eb->batch = ev; + eb->batches[*current_batch] = ev; + + if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) { + drm_dbg(&i915->drm, + "Attempting to use self-modifying batch buffer\n"); + return -EINVAL; + } + + if (range_overflows_t(u64, + eb->batch_start_offset, + eb->args->batch_len, + ev->vma->size)) { + drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); + return -EINVAL; + } + + if (eb->args->batch_len == 0) + eb->batch_len[*current_batch] = ev->vma->size - + eb->batch_start_offset; + else + eb->batch_len[*current_batch] = eb->args->batch_len; + if (unlikely(eb->batch_len[*current_batch] == 0)) { /* impossible! */ + drm_dbg(&i915->drm, "Invalid batch length\n"); + return -EINVAL; + } + + ++*current_batch; } + + return 0; } static inline int use_cpu_reloc(const struct reloc_cache *cache, @@ -718,14 +764,6 @@ static int eb_reserve(struct i915_execbuffer *eb) } while (1); } -static unsigned int eb_batch_index(const struct i915_execbuffer *eb) -{ - if (eb->args->flags & I915_EXEC_BATCH_FIRST) - return 0; - else - return eb->buffer_count - 1; -} - static int eb_select_context(struct i915_execbuffer *eb) { struct i915_gem_context *ctx; @@ -846,9 +884,7 @@ static struct i915_vma *eb_lookup_vma(struct i915_execbuffer *eb, u32 handle) static int eb_lookup_vmas(struct i915_execbuffer *eb) { - struct drm_i915_private *i915 = eb->i915; - unsigned int batch = eb_batch_index(eb); - unsigned int i; + unsigned int i, current_batch = 0; int err = 0; INIT_LIST_HEAD(&eb->relocs); @@ -868,7 +904,9 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) goto err; } - eb_add_vma(eb, i, batch, vma); + err = eb_add_vma(eb, ¤t_batch, i, vma); + if (err) + return err; if (i915_gem_object_is_userptr(vma->obj)) { err = i915_gem_object_userptr_submit_init(vma->obj); @@ -891,26 +929,6 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb) } } - if (unlikely(eb->batch->flags & EXEC_OBJECT_WRITE)) { - drm_dbg(&i915->drm, - "Attempting to use self-modifying batch buffer\n"); - return -EINVAL; - } - - if (range_overflows_t(u64, - eb->batch_start_offset, eb->batch_len, - eb->batch->vma->size)) { - drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); - return -EINVAL; - } - - if (eb->batch_len == 0) - eb->batch_len = eb->batch->vma->size - eb->batch_start_offset; - if (unlikely(eb->batch_len == 0)) { /* impossible! */ - drm_dbg(&i915->drm, "Invalid batch length\n"); - return -EINVAL; - } - return 0; err: @@ -1643,8 +1661,7 @@ static int eb_reinit_userptr(struct i915_execbuffer *eb) return 0; } -static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb, - struct i915_request *rq) +static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb) { bool have_copy = false; struct eb_vma *ev; @@ -1660,21 +1677,6 @@ repeat: eb_release_vmas(eb, false); i915_gem_ww_ctx_fini(&eb->ww); - if (rq) { - /* nonblocking is always false */ - if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, - MAX_SCHEDULE_TIMEOUT) < 0) { - i915_request_put(rq); - rq = NULL; - - err = -EINTR; - goto err_relock; - } - - i915_request_put(rq); - rq = NULL; - } - /* * We take 3 passes through the slowpatch. * @@ -1701,28 +1703,21 @@ repeat: if (!err) err = eb_reinit_userptr(eb); -err_relock: i915_gem_ww_ctx_init(&eb->ww, true); if (err) goto out; /* reacquire the objects */ repeat_validate: - rq = eb_pin_engine(eb, false); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - rq = NULL; + err = eb_pin_engine(eb, false); + if (err) goto err; - } - - /* We didn't throttle, should be NULL */ - GEM_WARN_ON(rq); err = eb_validate_vmas(eb); if (err) goto err; - GEM_BUG_ON(!eb->batch); + GEM_BUG_ON(!eb->batches[0]); list_for_each_entry(ev, &eb->relocs, reloc_link) { if (!have_copy) { @@ -1786,46 +1781,23 @@ out: } } - if (rq) - i915_request_put(rq); - return err; } static int eb_relocate_parse(struct i915_execbuffer *eb) { int err; - struct i915_request *rq = NULL; bool throttle = true; retry: - rq = eb_pin_engine(eb, throttle); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - rq = NULL; + err = eb_pin_engine(eb, throttle); + if (err) { if (err != -EDEADLK) return err; goto err; } - if (rq) { - bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; - - /* Need to drop all locks now for throttling, take slowpath */ - err = i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, 0); - if (err == -ETIME) { - if (nonblock) { - err = -EWOULDBLOCK; - i915_request_put(rq); - goto err; - } - goto slow; - } - i915_request_put(rq); - rq = NULL; - } - /* only throttle once, even if we didn't need to throttle */ throttle = false; @@ -1865,7 +1837,7 @@ err: return err; slow: - err = eb_relocate_parse_slow(eb, rq); + err = eb_relocate_parse_slow(eb); if (err) /* * If the user expects the execobject.offset and @@ -1879,11 +1851,40 @@ slow: return err; } +/* + * Using two helper loops for the order of which requests / batches are created + * and added the to backend. Requests are created in order from the parent to + * the last child. Requests are added in the reverse order, from the last child + * to parent. This is done for locking reasons as the timeline lock is acquired + * during request creation and released when the request is added to the + * backend. To make lockdep happy (see intel_context_timeline_lock) this must be + * the ordering. + */ +#define for_each_batch_create_order(_eb, _i) \ + for ((_i) = 0; (_i) < (_eb)->num_batches; ++(_i)) +#define for_each_batch_add_order(_eb, _i) \ + BUILD_BUG_ON(!typecheck(int, _i)); \ + for ((_i) = (_eb)->num_batches - 1; (_i) >= 0; --(_i)) + +static struct i915_request * +eb_find_first_request_added(struct i915_execbuffer *eb) +{ + int i; + + for_each_batch_add_order(eb, i) + if (eb->requests[i]) + return eb->requests[i]; + + GEM_BUG_ON("Request not found"); + + return NULL; +} + static int eb_move_to_gpu(struct i915_execbuffer *eb) { const unsigned int count = eb->buffer_count; unsigned int i = count; - int err = 0; + int err = 0, j; while (i--) { struct eb_vma *ev = &eb->vma[i]; @@ -1896,11 +1897,17 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) if (flags & EXEC_OBJECT_CAPTURE) { struct i915_capture_list *capture; - capture = kmalloc(sizeof(*capture), GFP_KERNEL); - if (capture) { - capture->next = eb->request->capture_list; - capture->vma = vma; - eb->request->capture_list = capture; + for_each_batch_create_order(eb, j) { + if (!eb->requests[j]) + break; + + capture = kmalloc(sizeof(*capture), GFP_KERNEL); + if (capture) { + capture->next = + eb->requests[j]->capture_list; + capture->vma = vma; + eb->requests[j]->capture_list = capture; + } } } @@ -1915,20 +1922,43 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) * but gcc's optimiser doesn't handle that as well and emits * two jumps instead of one. Maybe one day... + * + * FIXME: There is also sync flushing in set_pages(), which + * serves a different purpose(some of the time at least). + * + * We should consider: + * + * 1. Rip out the async flush code. + * + * 2. Or make the sync flushing use the async clflush path + * using mandatory fences underneath. Currently the below + * async flush happens after we bind the object. */ if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { if (i915_gem_clflush_object(obj, 0)) flags &= ~EXEC_OBJECT_ASYNC; } + /* We only need to await on the first request */ if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) { err = i915_request_await_object - (eb->request, obj, flags & EXEC_OBJECT_WRITE); + (eb_find_first_request_added(eb), obj, + flags & EXEC_OBJECT_WRITE); } - if (err == 0) - err = i915_vma_move_to_active(vma, eb->request, - flags | __EXEC_OBJECT_NO_RESERVE); + for_each_batch_add_order(eb, j) { + if (err) + break; + if (!eb->requests[j]) + continue; + + err = _i915_vma_move_to_active(vma, eb->requests[j], + j ? NULL : + eb->composite_fence ? + eb->composite_fence : + &eb->requests[j]->fence, + flags | __EXEC_OBJECT_NO_RESERVE); + } } #ifdef CONFIG_MMU_NOTIFIER @@ -1959,11 +1989,16 @@ static int eb_move_to_gpu(struct i915_execbuffer *eb) goto err_skip; /* Unconditionally flush any chipset caches (for streaming writes). */ - intel_gt_chipset_flush(eb->engine->gt); + intel_gt_chipset_flush(eb->gt); return 0; err_skip: - i915_request_set_error_once(eb->request, err); + for_each_batch_create_order(eb, j) { + if (!eb->requests[j]) + break; + + i915_request_set_error_once(eb->requests[j], err); + } return err; } @@ -2058,14 +2093,17 @@ static int eb_parse(struct i915_execbuffer *eb) int err; if (!eb_use_cmdparser(eb)) { - batch = eb_dispatch_secure(eb, eb->batch->vma); + batch = eb_dispatch_secure(eb, eb->batches[0]->vma); if (IS_ERR(batch)) return PTR_ERR(batch); goto secure_batch; } - len = eb->batch_len; + if (intel_context_is_parallel(eb->context)) + return -EINVAL; + + len = eb->batch_len[0]; if (!CMDPARSER_USES_GGTT(eb->i915)) { /* * ppGTT backed shadow buffers must be mapped RO, to prevent @@ -2079,11 +2117,11 @@ static int eb_parse(struct i915_execbuffer *eb) } else { len += I915_CMD_PARSER_TRAMPOLINE_SIZE; } - if (unlikely(len < eb->batch_len)) /* last paranoid check of overflow */ + if (unlikely(len < eb->batch_len[0])) /* last paranoid check of overflow */ return -EINVAL; if (!pool) { - pool = intel_gt_get_buffer_pool(eb->engine->gt, len, + pool = intel_gt_get_buffer_pool(eb->gt, len, I915_MAP_WB); if (IS_ERR(pool)) return PTR_ERR(pool); @@ -2108,7 +2146,7 @@ static int eb_parse(struct i915_execbuffer *eb) trampoline = shadow; shadow = shadow_batch_pin(eb, pool->obj, - &eb->engine->gt->ggtt->vm, + &eb->gt->ggtt->vm, PIN_GLOBAL); if (IS_ERR(shadow)) { err = PTR_ERR(shadow); @@ -2130,26 +2168,29 @@ static int eb_parse(struct i915_execbuffer *eb) if (err) goto err_trampoline; - err = intel_engine_cmd_parser(eb->engine, - eb->batch->vma, + err = intel_engine_cmd_parser(eb->context->engine, + eb->batches[0]->vma, eb->batch_start_offset, - eb->batch_len, + eb->batch_len[0], shadow, trampoline); if (err) goto err_unpin_batch; - eb->batch = &eb->vma[eb->buffer_count++]; - eb->batch->vma = i915_vma_get(shadow); - eb->batch->flags = __EXEC_OBJECT_HAS_PIN; + eb->batches[0] = &eb->vma[eb->buffer_count++]; + eb->batches[0]->vma = i915_vma_get(shadow); + eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN; eb->trampoline = trampoline; eb->batch_start_offset = 0; secure_batch: if (batch) { - eb->batch = &eb->vma[eb->buffer_count++]; - eb->batch->flags = __EXEC_OBJECT_HAS_PIN; - eb->batch->vma = i915_vma_get(batch); + if (intel_context_is_parallel(eb->context)) + return -EINVAL; + + eb->batches[0] = &eb->vma[eb->buffer_count++]; + eb->batches[0]->flags = __EXEC_OBJECT_HAS_PIN; + eb->batches[0]->vma = i915_vma_get(batch); } return 0; @@ -2165,19 +2206,18 @@ err: return err; } -static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) +static int eb_request_submit(struct i915_execbuffer *eb, + struct i915_request *rq, + struct i915_vma *batch, + u64 batch_len) { int err; - if (intel_context_nopreempt(eb->context)) - __set_bit(I915_FENCE_FLAG_NOPREEMPT, &eb->request->fence.flags); - - err = eb_move_to_gpu(eb); - if (err) - return err; + if (intel_context_nopreempt(rq->context)) + __set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags); if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) { - err = i915_reset_gen7_sol_offsets(eb->request); + err = i915_reset_gen7_sol_offsets(rq); if (err) return err; } @@ -2188,26 +2228,26 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) * allows us to determine if the batch is still waiting on the GPU * or actually running by checking the breadcrumb. */ - if (eb->engine->emit_init_breadcrumb) { - err = eb->engine->emit_init_breadcrumb(eb->request); + if (rq->context->engine->emit_init_breadcrumb) { + err = rq->context->engine->emit_init_breadcrumb(rq); if (err) return err; } - err = eb->engine->emit_bb_start(eb->request, - batch->node.start + - eb->batch_start_offset, - eb->batch_len, - eb->batch_flags); + err = rq->context->engine->emit_bb_start(rq, + batch->node.start + + eb->batch_start_offset, + batch_len, + eb->batch_flags); if (err) return err; if (eb->trampoline) { + GEM_BUG_ON(intel_context_is_parallel(rq->context)); GEM_BUG_ON(eb->batch_start_offset); - err = eb->engine->emit_bb_start(eb->request, - eb->trampoline->node.start + - eb->batch_len, - 0, 0); + err = rq->context->engine->emit_bb_start(rq, + eb->trampoline->node.start + + batch_len, 0, 0); if (err) return err; } @@ -2215,6 +2255,27 @@ static int eb_submit(struct i915_execbuffer *eb, struct i915_vma *batch) return 0; } +static int eb_submit(struct i915_execbuffer *eb) +{ + unsigned int i; + int err; + + err = eb_move_to_gpu(eb); + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + trace_i915_request_queue(eb->requests[i], eb->batch_flags); + if (!err) + err = eb_request_submit(eb, eb->requests[i], + eb->batches[i]->vma, + eb->batch_len[i]); + } + + return err; +} + static int num_vcs_engines(const struct drm_i915_private *i915) { return hweight_long(VDBOX_MASK(&i915->gt)); @@ -2280,26 +2341,11 @@ static struct i915_request *eb_throttle(struct i915_execbuffer *eb, struct intel return i915_request_get(rq); } -static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throttle) +static int eb_pin_timeline(struct i915_execbuffer *eb, struct intel_context *ce, + bool throttle) { - struct intel_context *ce = eb->context; struct intel_timeline *tl; struct i915_request *rq = NULL; - int err; - - GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); - - if (unlikely(intel_context_is_banned(ce))) - return ERR_PTR(-EIO); - - /* - * Pinning the contexts may generate requests in order to acquire - * GGTT space, so do this first before we reserve a seqno for - * ourselves. - */ - err = intel_context_pin_ww(ce, &eb->ww); - if (err) - return ERR_PTR(err); /* * Take a local wakeref for preparing to dispatch the execbuf as @@ -2310,33 +2356,108 @@ static struct i915_request *eb_pin_engine(struct i915_execbuffer *eb, bool throt * taken on the engine, and the parent device. */ tl = intel_context_timeline_lock(ce); - if (IS_ERR(tl)) { - intel_context_unpin(ce); - return ERR_CAST(tl); - } + if (IS_ERR(tl)) + return PTR_ERR(tl); intel_context_enter(ce); if (throttle) rq = eb_throttle(eb, ce); intel_context_timeline_unlock(tl); + if (rq) { + bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; + long timeout = nonblock ? 0 : MAX_SCHEDULE_TIMEOUT; + + if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE, + timeout) < 0) { + i915_request_put(rq); + + tl = intel_context_timeline_lock(ce); + intel_context_exit(ce); + intel_context_timeline_unlock(tl); + + if (nonblock) + return -EWOULDBLOCK; + else + return -EINTR; + } + i915_request_put(rq); + } + + return 0; +} + +static int eb_pin_engine(struct i915_execbuffer *eb, bool throttle) +{ + struct intel_context *ce = eb->context, *child; + int err; + int i = 0, j = 0; + + GEM_BUG_ON(eb->args->flags & __EXEC_ENGINE_PINNED); + + if (unlikely(intel_context_is_banned(ce))) + return -EIO; + + /* + * Pinning the contexts may generate requests in order to acquire + * GGTT space, so do this first before we reserve a seqno for + * ourselves. + */ + err = intel_context_pin_ww(ce, &eb->ww); + if (err) + return err; + for_each_child(ce, child) { + err = intel_context_pin_ww(child, &eb->ww); + GEM_BUG_ON(err); /* perma-pinned should incr a counter */ + } + + for_each_child(ce, child) { + err = eb_pin_timeline(eb, child, throttle); + if (err) + goto unwind; + ++i; + } + err = eb_pin_timeline(eb, ce, throttle); + if (err) + goto unwind; + eb->args->flags |= __EXEC_ENGINE_PINNED; - return rq; + return 0; + +unwind: + for_each_child(ce, child) { + if (j++ < i) { + mutex_lock(&child->timeline->mutex); + intel_context_exit(child); + mutex_unlock(&child->timeline->mutex); + } + } + for_each_child(ce, child) + intel_context_unpin(child); + intel_context_unpin(ce); + return err; } static void eb_unpin_engine(struct i915_execbuffer *eb) { - struct intel_context *ce = eb->context; - struct intel_timeline *tl = ce->timeline; + struct intel_context *ce = eb->context, *child; if (!(eb->args->flags & __EXEC_ENGINE_PINNED)) return; eb->args->flags &= ~__EXEC_ENGINE_PINNED; - mutex_lock(&tl->mutex); + for_each_child(ce, child) { + mutex_lock(&child->timeline->mutex); + intel_context_exit(child); + mutex_unlock(&child->timeline->mutex); + + intel_context_unpin(child); + } + + mutex_lock(&ce->timeline->mutex); intel_context_exit(ce); - mutex_unlock(&tl->mutex); + mutex_unlock(&ce->timeline->mutex); intel_context_unpin(ce); } @@ -2387,7 +2508,7 @@ eb_select_legacy_ring(struct i915_execbuffer *eb) static int eb_select_engine(struct i915_execbuffer *eb) { - struct intel_context *ce; + struct intel_context *ce, *child; unsigned int idx; int err; @@ -2400,6 +2521,20 @@ eb_select_engine(struct i915_execbuffer *eb) if (IS_ERR(ce)) return PTR_ERR(ce); + if (intel_context_is_parallel(ce)) { + if (eb->buffer_count < ce->parallel.number_children + 1) { + intel_context_put(ce); + return -EINVAL; + } + if (eb->batch_start_offset || eb->args->batch_len) { + intel_context_put(ce); + return -EINVAL; + } + } + eb->num_batches = ce->parallel.number_children + 1; + + for_each_child(ce, child) + intel_context_get(child); intel_gt_pm_get(ce->engine->gt); if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) { @@ -2407,6 +2542,13 @@ eb_select_engine(struct i915_execbuffer *eb) if (err) goto err; } + for_each_child(ce, child) { + if (!test_bit(CONTEXT_ALLOC_BIT, &child->flags)) { + err = intel_context_alloc_state(child); + if (err) + goto err; + } + } /* * ABI: Before userspace accesses the GPU (e.g. execbuffer), report @@ -2417,7 +2559,7 @@ eb_select_engine(struct i915_execbuffer *eb) goto err; eb->context = ce; - eb->engine = ce->engine; + eb->gt = ce->engine->gt; /* * Make sure engine pool stays alive even if we call intel_context_put @@ -2428,6 +2570,8 @@ eb_select_engine(struct i915_execbuffer *eb) err: intel_gt_pm_put(ce->engine->gt); + for_each_child(ce, child) + intel_context_put(child); intel_context_put(ce); return err; } @@ -2435,7 +2579,11 @@ err: static void eb_put_engine(struct i915_execbuffer *eb) { - intel_gt_pm_put(eb->engine->gt); + struct intel_context *child; + + intel_gt_pm_put(eb->gt); + for_each_child(eb->context, child) + intel_context_put(child); intel_context_put(eb->context); } @@ -2658,7 +2806,8 @@ static void put_fence_array(struct eb_fence *fences, int num_fences) } static int -await_fence_array(struct i915_execbuffer *eb) +await_fence_array(struct i915_execbuffer *eb, + struct i915_request *rq) { unsigned int n; int err; @@ -2672,8 +2821,7 @@ await_fence_array(struct i915_execbuffer *eb) if (!eb->fences[n].dma_fence) continue; - err = i915_request_await_dma_fence(eb->request, - eb->fences[n].dma_fence); + err = i915_request_await_dma_fence(rq, eb->fences[n].dma_fence); if (err < 0) return err; } @@ -2681,9 +2829,9 @@ await_fence_array(struct i915_execbuffer *eb) return 0; } -static void signal_fence_array(const struct i915_execbuffer *eb) +static void signal_fence_array(const struct i915_execbuffer *eb, + struct dma_fence * const fence) { - struct dma_fence * const fence = &eb->request->fence; unsigned int n; for (n = 0; n < eb->num_fences; n++) { @@ -2731,9 +2879,9 @@ static void retire_requests(struct intel_timeline *tl, struct i915_request *end) break; } -static int eb_request_add(struct i915_execbuffer *eb, int err) +static int eb_request_add(struct i915_execbuffer *eb, struct i915_request *rq, + int err, bool last_parallel) { - struct i915_request *rq = eb->request; struct intel_timeline * const tl = i915_request_timeline(rq); struct i915_sched_attr attr = {}; struct i915_request *prev; @@ -2755,6 +2903,17 @@ static int eb_request_add(struct i915_execbuffer *eb, int err) err = -ENOENT; /* override any transient errors */ } + if (intel_context_is_parallel(eb->context)) { + if (err) { + __i915_request_skip(rq); + set_bit(I915_FENCE_FLAG_SKIP_PARALLEL, + &rq->fence.flags); + } + if (last_parallel) + set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL, + &rq->fence.flags); + } + __i915_request_queue(rq, &attr); /* Try to clean up the client's timeline after submitting the request */ @@ -2766,6 +2925,25 @@ static int eb_request_add(struct i915_execbuffer *eb, int err) return err; } +static int eb_requests_add(struct i915_execbuffer *eb, int err) +{ + int i; + + /* + * We iterate in reverse order of creation to release timeline mutexes in + * same order. + */ + for_each_batch_add_order(eb, i) { + struct i915_request *rq = eb->requests[i]; + + if (!rq) + continue; + err |= eb_request_add(eb, rq, err, i == 0); + } + + return err; +} + static const i915_user_extension_fn execbuf_extensions[] = { [DRM_I915_GEM_EXECBUFFER_EXT_TIMELINE_FENCES] = parse_timeline_fences, }; @@ -2792,6 +2970,185 @@ parse_execbuf2_extensions(struct drm_i915_gem_execbuffer2 *args, eb); } +static void eb_requests_get(struct i915_execbuffer *eb) +{ + unsigned int i; + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + i915_request_get(eb->requests[i]); + } +} + +static void eb_requests_put(struct i915_execbuffer *eb) +{ + unsigned int i; + + for_each_batch_create_order(eb, i) { + if (!eb->requests[i]) + break; + + i915_request_put(eb->requests[i]); + } +} + +static struct sync_file * +eb_composite_fence_create(struct i915_execbuffer *eb, int out_fence_fd) +{ + struct sync_file *out_fence = NULL; + struct dma_fence_array *fence_array; + struct dma_fence **fences; + unsigned int i; + + GEM_BUG_ON(!intel_context_is_parent(eb->context)); + + fences = kmalloc_array(eb->num_batches, sizeof(*fences), GFP_KERNEL); + if (!fences) + return ERR_PTR(-ENOMEM); + + for_each_batch_create_order(eb, i) { + fences[i] = &eb->requests[i]->fence; + __set_bit(I915_FENCE_FLAG_COMPOSITE, + &eb->requests[i]->fence.flags); + } + + fence_array = dma_fence_array_create(eb->num_batches, + fences, + eb->context->parallel.fence_context, + eb->context->parallel.seqno, + false); + if (!fence_array) { + kfree(fences); + return ERR_PTR(-ENOMEM); + } + + /* Move ownership to the dma_fence_array created above */ + for_each_batch_create_order(eb, i) + dma_fence_get(fences[i]); + + if (out_fence_fd != -1) { + out_fence = sync_file_create(&fence_array->base); + /* sync_file now owns fence_arry, drop creation ref */ + dma_fence_put(&fence_array->base); + if (!out_fence) + return ERR_PTR(-ENOMEM); + } + + eb->composite_fence = &fence_array->base; + + return out_fence; +} + +static struct sync_file * +eb_fences_add(struct i915_execbuffer *eb, struct i915_request *rq, + struct dma_fence *in_fence, int out_fence_fd) +{ + struct sync_file *out_fence = NULL; + int err; + + if (unlikely(eb->gem_context->syncobj)) { + struct dma_fence *fence; + + fence = drm_syncobj_fence_get(eb->gem_context->syncobj); + err = i915_request_await_dma_fence(rq, fence); + dma_fence_put(fence); + if (err) + return ERR_PTR(err); + } + + if (in_fence) { + if (eb->args->flags & I915_EXEC_FENCE_SUBMIT) + err = i915_request_await_execution(rq, in_fence); + else + err = i915_request_await_dma_fence(rq, in_fence); + if (err < 0) + return ERR_PTR(err); + } + + if (eb->fences) { + err = await_fence_array(eb, rq); + if (err) + return ERR_PTR(err); + } + + if (intel_context_is_parallel(eb->context)) { + out_fence = eb_composite_fence_create(eb, out_fence_fd); + if (IS_ERR(out_fence)) + return ERR_PTR(-ENOMEM); + } else if (out_fence_fd != -1) { + out_fence = sync_file_create(&rq->fence); + if (!out_fence) + return ERR_PTR(-ENOMEM); + } + + return out_fence; +} + +static struct intel_context * +eb_find_context(struct i915_execbuffer *eb, unsigned int context_number) +{ + struct intel_context *child; + + if (likely(context_number == 0)) + return eb->context; + + for_each_child(eb->context, child) + if (!--context_number) + return child; + + GEM_BUG_ON("Context not found"); + + return NULL; +} + +static struct sync_file * +eb_requests_create(struct i915_execbuffer *eb, struct dma_fence *in_fence, + int out_fence_fd) +{ + struct sync_file *out_fence = NULL; + unsigned int i; + + for_each_batch_create_order(eb, i) { + /* Allocate a request for this batch buffer nice and early. */ + eb->requests[i] = i915_request_create(eb_find_context(eb, i)); + if (IS_ERR(eb->requests[i])) { + out_fence = ERR_PTR(PTR_ERR(eb->requests[i])); + eb->requests[i] = NULL; + return out_fence; + } + + /* + * Only the first request added (committed to backend) has to + * take the in fences into account as all subsequent requests + * will have fences inserted inbetween them. + */ + if (i + 1 == eb->num_batches) { + out_fence = eb_fences_add(eb, eb->requests[i], + in_fence, out_fence_fd); + if (IS_ERR(out_fence)) + return out_fence; + } + + /* + * Whilst this request exists, batch_obj will be on the + * active_list, and so will hold the active reference. Only when + * this request is retired will the batch_obj be moved onto + * the inactive_list and lose its active reference. Hence we do + * not need to explicitly hold another reference here. + */ + eb->requests[i]->batch = eb->batches[i]->vma; + if (eb->batch_pool) { + GEM_BUG_ON(intel_context_is_parallel(eb->context)); + intel_gt_buffer_pool_mark_active(eb->batch_pool, + eb->requests[i]); + } + } + + return out_fence; +} + static int i915_gem_do_execbuffer(struct drm_device *dev, struct drm_file *file, @@ -2802,7 +3159,6 @@ i915_gem_do_execbuffer(struct drm_device *dev, struct i915_execbuffer eb; struct dma_fence *in_fence = NULL; struct sync_file *out_fence = NULL; - struct i915_vma *batch; int out_fence_fd = -1; int err; @@ -2826,12 +3182,15 @@ i915_gem_do_execbuffer(struct drm_device *dev, eb.buffer_count = args->buffer_count; eb.batch_start_offset = args->batch_start_offset; - eb.batch_len = args->batch_len; eb.trampoline = NULL; eb.fences = NULL; eb.num_fences = 0; + memset(eb.requests, 0, sizeof(struct i915_request *) * + ARRAY_SIZE(eb.requests)); + eb.composite_fence = NULL; + eb.batch_flags = 0; if (args->flags & I915_EXEC_SECURE) { if (GRAPHICS_VER(i915) >= 11) @@ -2915,70 +3274,25 @@ i915_gem_do_execbuffer(struct drm_device *dev, ww_acquire_done(&eb.ww.ctx); - batch = eb.batch->vma; - - /* Allocate a request for this batch buffer nice and early. */ - eb.request = i915_request_create(eb.context); - if (IS_ERR(eb.request)) { - err = PTR_ERR(eb.request); - goto err_vma; - } - - if (unlikely(eb.gem_context->syncobj)) { - struct dma_fence *fence; - - fence = drm_syncobj_fence_get(eb.gem_context->syncobj); - err = i915_request_await_dma_fence(eb.request, fence); - dma_fence_put(fence); - if (err) - goto err_ext; - } - - if (in_fence) { - if (args->flags & I915_EXEC_FENCE_SUBMIT) - err = i915_request_await_execution(eb.request, - in_fence); - else - err = i915_request_await_dma_fence(eb.request, - in_fence); - if (err < 0) - goto err_request; - } - - if (eb.fences) { - err = await_fence_array(&eb); - if (err) + out_fence = eb_requests_create(&eb, in_fence, out_fence_fd); + if (IS_ERR(out_fence)) { + err = PTR_ERR(out_fence); + if (eb.requests[0]) goto err_request; + else + goto err_vma; } - if (out_fence_fd != -1) { - out_fence = sync_file_create(&eb.request->fence); - if (!out_fence) { - err = -ENOMEM; - goto err_request; - } - } - - /* - * Whilst this request exists, batch_obj will be on the - * active_list, and so will hold the active reference. Only when this - * request is retired will the the batch_obj be moved onto the - * inactive_list and lose its active reference. Hence we do not need - * to explicitly hold another reference here. - */ - eb.request->batch = batch; - if (eb.batch_pool) - intel_gt_buffer_pool_mark_active(eb.batch_pool, eb.request); - - trace_i915_request_queue(eb.request, eb.batch_flags); - err = eb_submit(&eb, batch); + err = eb_submit(&eb); err_request: - i915_request_get(eb.request); - err = eb_request_add(&eb, err); + eb_requests_get(&eb); + err = eb_requests_add(&eb, err); if (eb.fences) - signal_fence_array(&eb); + signal_fence_array(&eb, eb.composite_fence ? + eb.composite_fence : + &eb.requests[0]->fence); if (out_fence) { if (err == 0) { @@ -2993,10 +3307,15 @@ err_request: if (unlikely(eb.gem_context->syncobj)) { drm_syncobj_replace_fence(eb.gem_context->syncobj, - &eb.request->fence); + eb.composite_fence ? + eb.composite_fence : + &eb.requests[0]->fence); } - i915_request_put(eb.request); + if (!out_fence && eb.composite_fence) + dma_fence_put(eb.composite_fence); + + eb_requests_put(&eb); err_vma: eb_release_vmas(&eb, true); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c b/drivers/gpu/drm/i915/gem/i915_gem_internal.c index e5ae9c06510c..a57a6b7013c2 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c @@ -134,6 +134,8 @@ static void i915_gem_object_put_pages_internal(struct drm_i915_gem_object *obj, internal_free_pages(pages); obj->mm.dirty = false; + + __start_cpu_write(obj); } static const struct drm_i915_gem_object_ops i915_gem_object_internal_ops = { diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c b/drivers/gpu/drm/i915/gem/i915_gem_object.c index 76ce6a1500bc..1e426a42a36c 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c @@ -128,6 +128,32 @@ void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj, !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE); } +bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj) +{ + struct drm_i915_private *i915 = to_i915(obj->base.dev); + + /* + * This is purely from a security perspective, so we simply don't care + * about non-userspace objects being able to bypass the LLC. + */ + if (!(obj->flags & I915_BO_ALLOC_USER)) + return false; + + /* + * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it + * possible for userspace to bypass the GTT caching bits set by the + * kernel, as per the given object cache_level. This is troublesome + * since the heavy flush we apply when first gathering the pages is + * skipped if the kernel thinks the object is coherent with the GPU. As + * a result it might be possible to bypass the cache and read the + * contents of the page directly, which could be stale data. If it's + * just a case of userspace shooting themselves in the foot then so be + * it, but since i915 takes the stance of always zeroing memory before + * handing it to userspace, we need to prevent this. + */ + return IS_JSL_EHL(i915); +} + static void i915_gem_close_object(struct drm_gem_object *gem, struct drm_file *file) { struct drm_i915_gem_object *obj = to_intel_bo(gem); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h b/drivers/gpu/drm/i915/gem/i915_gem_object.h index 9df3ee60604e..59201801cec5 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h @@ -514,6 +514,7 @@ i915_gem_object_finish_access(struct drm_i915_gem_object *obj) void i915_gem_object_set_cache_coherency(struct drm_i915_gem_object *obj, unsigned int cache_level); +bool i915_gem_object_can_bypass_llc(struct drm_i915_gem_object *obj); void i915_gem_object_flush_if_display(struct drm_i915_gem_object *obj); void i915_gem_object_flush_if_display_locked(struct drm_i915_gem_object *obj); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h index 7c3da4e3e737..da85169006d4 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h +++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h @@ -427,6 +427,33 @@ struct drm_i915_gem_object { * can freely bypass the CPU cache when touching the pages with the GPU, * where the kernel is completely unaware. On such platform we need * apply the sledgehammer-on-acquire regardless of the @cache_coherent. + * + * Special care is taken on non-LLC platforms, to prevent potential + * information leak. The driver currently ensures: + * + * 1. All userspace objects, by default, have @cache_level set as + * I915_CACHE_NONE. The only exception is userptr objects, where we + * instead force I915_CACHE_LLC, but we also don't allow userspace to + * ever change the @cache_level for such objects. Another special case + * is dma-buf, which doesn't rely on @cache_dirty, but there we + * always do a forced flush when acquiring the pages, if there is a + * chance that the pages can be read directly from main memory with + * the GPU. + * + * 2. All I915_CACHE_NONE objects have @cache_dirty initially true. + * + * 3. All swapped-out objects(i.e shmem) have @cache_dirty set to + * true. + * + * 4. The @cache_dirty is never freely reset before the initial + * flush, even if userspace adjusts the @cache_level through the + * i915_gem_set_caching_ioctl. + * + * 5. All @cache_dirty objects(including swapped-in) are initially + * flushed with a synchronous call to drm_clflush_sg in + * __i915_gem_object_set_pages. The @cache_dirty can be freely reset + * at this point. All further asynchronous clfushes are never security + * critical, i.e userspace is free to race against itself. */ unsigned int cache_dirty:1; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c index 11f072193f3b..d77da59fae04 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c @@ -182,22 +182,7 @@ rebuild_st: if (i915_gem_object_needs_bit17_swizzle(obj)) i915_gem_object_do_bit_17_swizzle(obj, st); - /* - * EHL and JSL add the 'Bypass LLC' MOCS entry, which should make it - * possible for userspace to bypass the GTT caching bits set by the - * kernel, as per the given object cache_level. This is troublesome - * since the heavy flush we apply when first gathering the pages is - * skipped if the kernel thinks the object is coherent with the GPU. As - * a result it might be possible to bypass the cache and read the - * contents of the page directly, which could be stale data. If it's - * just a case of userspace shooting themselves in the foot then so be - * it, but since i915 takes the stance of always zeroing memory before - * handing it to userspace, we need to prevent this. - * - * By setting cache_dirty here we make the clflush in set_pages - * unconditional on such platforms. - */ - if (IS_JSL_EHL(i915) && obj->flags & I915_BO_ALLOC_USER) + if (i915_gem_object_can_bypass_llc(obj)) obj->cache_dirty = true; __i915_gem_object_set_pages(obj, st, sg_page_sizes); @@ -301,6 +286,8 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj, struct sg_table *pages, bool needs_clflush) { + struct drm_i915_private *i915 = to_i915(obj->base.dev); + GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED); if (obj->mm.madv == I915_MADV_DONTNEED) @@ -312,6 +299,16 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj, drm_clflush_sg(pages); __start_cpu_write(obj); + /* + * On non-LLC platforms, force the flush-on-acquire if this is ever + * swapped-in. Our async flush path is not trust worthy enough yet(and + * happens in the wrong order), and with some tricks it's conceivable + * for userspace to change the cache-level to I915_CACHE_NONE after the + * pages are swapped-in, and since execbuf binds the object before doing + * the async flush, we have a race window. + */ + if (!HAS_LLC(i915)) + obj->cache_dirty = true; } void i915_gem_object_put_pages_shmem(struct drm_i915_gem_object *obj, struct sg_table *pages) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 8ea0fa665e53..3173c9f9a040 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -165,8 +165,11 @@ alloc_table: goto err; } - sg_page_sizes = i915_sg_dma_sizes(st->sgl); + WARN_ON_ONCE(!(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_WRITE)); + if (i915_gem_object_can_bypass_llc(obj)) + obj->cache_dirty = true; + sg_page_sizes = i915_sg_dma_sizes(st->sgl); __i915_gem_object_set_pages(obj, st, sg_page_sizes); return 0; @@ -546,7 +549,8 @@ i915_gem_userptr_ioctl(struct drm_device *dev, return -ENOMEM; drm_gem_private_object_init(dev, &obj->base, args->user_size); - i915_gem_object_init(obj, &i915_gem_userptr_ops, &lock_class, 0); + i915_gem_object_init(obj, &i915_gem_userptr_ops, &lock_class, + I915_BO_ALLOC_USER); obj->mem_flags = I915_BO_FLAG_STRUCT_PAGE; obj->read_domains = I915_GEM_DOMAIN_CPU; obj->write_domain = I915_GEM_DOMAIN_CPU; diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c index 41d0680f3bd7..b2003133deaf 100644 --- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c +++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c @@ -136,6 +136,8 @@ static void put_huge_pages(struct drm_i915_gem_object *obj, huge_pages_free_pages(pages); obj->mm.dirty = false; + + __start_cpu_write(obj); } static const struct drm_i915_gem_object_ops huge_page_ops = { @@ -152,6 +154,7 @@ huge_pages_object(struct drm_i915_private *i915, { static struct lock_class_key lock_class; struct drm_i915_gem_object *obj; + unsigned int cache_level; GEM_BUG_ON(!size); GEM_BUG_ON(!IS_ALIGNED(size, BIT(__ffs(page_mask)))); @@ -173,7 +176,9 @@ huge_pages_object(struct drm_i915_private *i915, obj->write_domain = I915_GEM_DOMAIN_CPU; obj->read_domains = I915_GEM_DOMAIN_CPU; - obj->cache_level = I915_CACHE_NONE; + + cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE; + i915_gem_object_set_cache_coherency(obj, cache_level); obj->mm.page_mask = page_mask; diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c index ecbcbb86ae1e..8402ed925a69 100644 --- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c +++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c @@ -17,13 +17,20 @@ #include "huge_gem_object.h" #include "mock_context.h" +enum client_tiling { + CLIENT_TILING_LINEAR, + CLIENT_TILING_X, + CLIENT_TILING_Y, + CLIENT_NUM_TILING_TYPES +}; + #define WIDTH 512 #define HEIGHT 32 struct blit_buffer { struct i915_vma *vma; u32 start_val; - u32 tiling; + enum client_tiling tiling; }; struct tiled_blits { @@ -53,9 +60,9 @@ static int prepare_blit(const struct tiled_blits *t, *cs++ = MI_LOAD_REGISTER_IMM(1); *cs++ = i915_mmio_reg_offset(BCS_SWCTRL); cmd = (BCS_SRC_Y | BCS_DST_Y) << 16; - if (src->tiling == I915_TILING_Y) + if (src->tiling == CLIENT_TILING_Y) cmd |= BCS_SRC_Y; - if (dst->tiling == I915_TILING_Y) + if (dst->tiling == CLIENT_TILING_Y) cmd |= BCS_DST_Y; *cs++ = cmd; @@ -172,7 +179,7 @@ static int tiled_blits_create_buffers(struct tiled_blits *t, t->buffers[i].vma = vma; t->buffers[i].tiling = - i915_prandom_u32_max_state(I915_TILING_Y + 1, prng); + i915_prandom_u32_max_state(CLIENT_TILING_Y + 1, prng); } return 0; @@ -197,17 +204,17 @@ static u64 swizzle_bit(unsigned int bit, u64 offset) static u64 tiled_offset(const struct intel_gt *gt, u64 v, unsigned int stride, - unsigned int tiling) + enum client_tiling tiling) { unsigned int swizzle; u64 x, y; - if (tiling == I915_TILING_NONE) + if (tiling == CLIENT_TILING_LINEAR) return v; y = div64_u64_rem(v, stride, &x); - if (tiling == I915_TILING_X) { + if (tiling == CLIENT_TILING_X) { v = div64_u64_rem(y, 8, &y) * stride * 8; v += y * 512; v += div64_u64_rem(x, 512, &x) << 12; @@ -244,12 +251,12 @@ static u64 tiled_offset(const struct intel_gt *gt, return v; } -static const char *repr_tiling(int tiling) +static const char *repr_tiling(enum client_tiling tiling) { switch (tiling) { - case I915_TILING_NONE: return "linear"; - case I915_TILING_X: return "X"; - case I915_TILING_Y: return "Y"; + case CLIENT_TILING_LINEAR: return "linear"; + case CLIENT_TILING_X: return "X"; + case CLIENT_TILING_Y: return "Y"; default: return "unknown"; } } |