diff options
Diffstat (limited to 'drivers/misc')
-rw-r--r-- | drivers/misc/habanalabs/command_submission.c | 341 | ||||
-rw-r--r-- | drivers/misc/habanalabs/context.c | 8 | ||||
-rw-r--r-- | drivers/misc/habanalabs/habanalabs.h | 2 |
3 files changed, 323 insertions, 28 deletions
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c index a4211cfc752a..7da9847eb9d6 100644 --- a/drivers/misc/habanalabs/command_submission.c +++ b/drivers/misc/habanalabs/command_submission.c @@ -18,6 +18,26 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq); static void cs_do_release(struct kref *ref); +static void hl_sob_reset(struct kref *ref) +{ + struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob, + kref); + struct hl_device *hdev = hw_sob->hdev; + + hdev->asic_funcs->reset_sob(hdev, hw_sob); +} + +void hl_sob_reset_error(struct kref *ref) +{ + struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob, + kref); + struct hl_device *hdev = hw_sob->hdev; + + dev_crit(hdev->dev, + "SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n", + hw_sob->q_idx, hw_sob->sob_id); +} + static const char *hl_fence_get_driver_name(struct dma_fence *fence) { return "HabanaLabs"; @@ -40,6 +60,37 @@ static void hl_fence_release(struct dma_fence *fence) { struct hl_cs_compl *hl_cs_cmpl = container_of(fence, struct hl_cs_compl, base_fence); + struct hl_device *hdev = hl_cs_cmpl->hdev; + + if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) || + (hl_cs_cmpl->type == CS_TYPE_WAIT)) { + + dev_dbg(hdev->dev, + "CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n", + hl_cs_cmpl->cs_seq, + hl_cs_cmpl->type, + hl_cs_cmpl->hw_sob->sob_id, + hl_cs_cmpl->sob_val); + + /* + * A signal CS can get completion while the corresponding wait + * for signal CS is on its way to the PQ. The wait for signal CS + * will get stuck if the signal CS incremented the SOB to its + * max value and there are no pending (submitted) waits on this + * SOB. + * We do the following to void this situation: + * 1. The wait for signal CS must get a ref for the signal CS as + * soon as possible in cs_ioctl_signal_wait() and put it + * before being submitted to the PQ but after it incremented + * the SOB refcnt in init_signal_wait_cs(). + * 2. Signal/Wait for signal CS will decrement the SOB refcnt + * here. + * These two measures guarantee that the wait for signal CS will + * reset the SOB upon completion rather than the signal CS and + * hence the above scenario is avoided. + */ + kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset); + } kfree_rcu(hl_cs_cmpl, base_fence.rcu); } @@ -189,17 +240,6 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job) kfree(job); } -void hl_sob_reset_error(struct kref *ref) -{ - struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob, - kref); - struct hl_device *hdev = hw_sob->hdev; - - dev_crit(hdev->dev, - "SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n", - hw_sob->q_idx, hw_sob->sob_id); -} - static void cs_do_release(struct kref *ref) { struct hl_cs *cs = container_of(ref, struct hl_cs, @@ -273,6 +313,12 @@ static void cs_do_release(struct kref *ref) spin_unlock(&hdev->hw_queues_mirror_lock); } + } else if (cs->type == CS_TYPE_WAIT) { + /* + * In case the wait for signal CS was submitted, the put occurs + * in init_signal_wait_cs() right before hanging on the PQ. + */ + dma_fence_put(cs->signal_fence); } /* @@ -326,7 +372,7 @@ static void cs_timedout(struct work_struct *work) } static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, - struct hl_cs **cs_new) + enum hl_cs_type cs_type, struct hl_cs **cs_new) { struct hl_cs_compl *cs_cmpl; struct dma_fence *other = NULL; @@ -340,6 +386,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, cs->ctx = ctx; cs->submitted = false; cs->completed = false; + cs->type = cs_type; INIT_LIST_HEAD(&cs->job_list); INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout); kref_init(&cs->refcount); @@ -352,6 +399,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx, } cs_cmpl->hdev = hdev; + cs_cmpl->type = cs->type; spin_lock_init(&cs_cmpl->lock); cs->fence = &cs_cmpl->base_fence; @@ -513,8 +561,8 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, return job; } -static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks, - u32 num_chunks, u64 *cs_seq) +static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, + u32 num_chunks, u64 *cs_seq) { struct hl_device *hdev = hpriv->hdev; struct hl_cs_chunk *cs_chunk_array; @@ -552,7 +600,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks, /* increment refcnt for context */ hl_ctx_get(hdev, hpriv->ctx); - rc = allocate_cs(hdev, hpriv->ctx, &cs); + rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT, &cs); if (rc) { hl_ctx_put(hpriv->ctx); goto free_cs_chunk_array; @@ -666,12 +714,229 @@ out: return rc; } +static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, + void __user *chunks, u32 num_chunks, + u64 *cs_seq) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_ctx *ctx = hpriv->ctx; + struct hl_cs_chunk *cs_chunk_array, *chunk; + struct hw_queue_properties *hw_queue_prop; + struct dma_fence *sig_fence = NULL; + struct hl_cs_job *job; + struct hl_cs *cs; + struct hl_cb *cb; + u64 *signal_seq_arr = NULL, signal_seq; + u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size; + int rc; + + *cs_seq = ULLONG_MAX; + + if (num_chunks > HL_MAX_JOBS_PER_CS) { + dev_err(hdev->dev, + "Number of chunks can NOT be larger than %d\n", + HL_MAX_JOBS_PER_CS); + rc = -EINVAL; + goto out; + } + + cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array), + GFP_ATOMIC); + if (!cs_chunk_array) { + rc = -ENOMEM; + goto out; + } + + size_to_copy = num_chunks * sizeof(struct hl_cs_chunk); + if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) { + dev_err(hdev->dev, "Failed to copy cs chunk array from user\n"); + rc = -EFAULT; + goto free_cs_chunk_array; + } + + /* currently it is guaranteed to have only one chunk */ + chunk = &cs_chunk_array[0]; + q_idx = chunk->queue_index; + hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx]; + + if ((q_idx >= HL_MAX_QUEUES) || + (hw_queue_prop->type != QUEUE_TYPE_EXT)) { + dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx); + rc = -EINVAL; + goto free_cs_chunk_array; + } + + if (cs_type == CS_TYPE_WAIT) { + struct hl_cs_compl *sig_waitcs_cmpl; + + signal_seq_arr_len = chunk->num_signal_seq_arr; + + /* currently only one signal seq is supported */ + if (signal_seq_arr_len != 1) { + dev_err(hdev->dev, + "Wait for signal CS supports only one signal CS seq\n"); + rc = -EINVAL; + goto free_cs_chunk_array; + } + + signal_seq_arr = kmalloc_array(signal_seq_arr_len, + sizeof(*signal_seq_arr), + GFP_ATOMIC); + if (!signal_seq_arr) { + rc = -ENOMEM; + goto free_cs_chunk_array; + } + + size_to_copy = chunk->num_signal_seq_arr * + sizeof(*signal_seq_arr); + if (copy_from_user(signal_seq_arr, + (void __user *) chunk->signal_seq_arr, + size_to_copy)) { + dev_err(hdev->dev, + "Failed to copy signal seq array from user\n"); + rc = -EFAULT; + goto free_signal_seq_array; + } + + /* currently it is guaranteed to have only one signal seq */ + signal_seq = signal_seq_arr[0]; + sig_fence = hl_ctx_get_fence(ctx, signal_seq); + if (IS_ERR(sig_fence)) { + dev_err(hdev->dev, + "Failed to get signal CS with seq 0x%llx\n", + signal_seq); + rc = PTR_ERR(sig_fence); + goto free_signal_seq_array; + } + + if (!sig_fence) { + /* signal CS already finished */ + rc = 0; + goto free_signal_seq_array; + } + + sig_waitcs_cmpl = + container_of(sig_fence, struct hl_cs_compl, base_fence); + + if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL) { + dev_err(hdev->dev, + "CS seq 0x%llx is not of a signal CS\n", + signal_seq); + dma_fence_put(sig_fence); + rc = -EINVAL; + goto free_signal_seq_array; + } + + if (dma_fence_is_signaled(sig_fence)) { + /* signal CS already finished */ + dma_fence_put(sig_fence); + rc = 0; + goto free_signal_seq_array; + } + } + + /* increment refcnt for context */ + hl_ctx_get(hdev, ctx); + + rc = allocate_cs(hdev, ctx, cs_type, &cs); + if (rc) { + if (cs_type == CS_TYPE_WAIT) + dma_fence_put(sig_fence); + hl_ctx_put(ctx); + goto free_signal_seq_array; + } + + /* + * Save the signal CS fence for later initialization right before + * hanging the wait CS on the queue. + */ + if (cs->type == CS_TYPE_WAIT) + cs->signal_fence = sig_fence; + + hl_debugfs_add_cs(cs); + + *cs_seq = cs->sequence; + + job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true); + if (!job) { + dev_err(hdev->dev, "Failed to allocate a new job\n"); + rc = -ENOMEM; + goto put_cs; + } + + cb = hl_cb_kernel_create(hdev, PAGE_SIZE); + if (!cb) { + kfree(job); + rc = -EFAULT; + goto put_cs; + } + + if (cs->type == CS_TYPE_WAIT) + cb_size = hdev->asic_funcs->get_wait_cb_size(hdev); + else + cb_size = hdev->asic_funcs->get_signal_cb_size(hdev); + + job->id = 0; + job->cs = cs; + job->user_cb = cb; + job->user_cb->cs_cnt++; + job->user_cb_size = cb_size; + job->hw_queue_id = q_idx; + + /* + * No need in parsing, user CB is the patched CB. + * We call hl_cb_destroy() out of two reasons - we don't need the CB in + * the CB idr anymore and to decrement its refcount as it was + * incremented inside hl_cb_kernel_create(). + */ + job->patched_cb = job->user_cb; + job->job_cb_size = job->user_cb_size; + hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT); + + cs->jobs_in_queue_cnt[job->hw_queue_id]++; + + list_add_tail(&job->cs_node, &cs->job_list); + + /* increment refcount as for external queues we get completion */ + cs_get(cs); + + hl_debugfs_add_job(hdev, job); + + rc = hl_hw_queue_schedule_cs(cs); + if (rc) { + if (rc != -EAGAIN) + dev_err(hdev->dev, + "Failed to submit CS %d.%llu to H/W queues, error %d\n", + ctx->asid, cs->sequence, rc); + goto free_cs_object; + } + + rc = HL_CS_STATUS_SUCCESS; + goto put_cs; + +free_cs_object: + cs_rollback(hdev, cs); + *cs_seq = ULLONG_MAX; + /* The path below is both for good and erroneous exits */ +put_cs: + /* We finished with the CS in this function, so put the ref */ + cs_put(cs); +free_signal_seq_array: + if (cs_type == CS_TYPE_WAIT) + kfree(signal_seq_arr); +free_cs_chunk_array: + kfree(cs_chunk_array); +out: + return rc; +} + int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) { struct hl_device *hdev = hpriv->hdev; union hl_cs_args *args = data; struct hl_ctx *ctx = hpriv->ctx; void __user *chunks_execute, *chunks_restore; + enum hl_cs_type cs_type; u32 num_chunks_execute, num_chunks_restore, sig_wait_flags; u64 cs_seq = ULONG_MAX; int rc, do_ctx_switch; @@ -687,6 +952,14 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT; + if (unlikely(sig_wait_flags == HL_CS_FLAGS_SIG_WAIT)) { + dev_err(hdev->dev, + "Signal and wait CS flags are mutually exclusive, context %d\n", + ctx->asid); + rc = -EINVAL; + goto out; + } + if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) && (!hdev->supports_sync_stream))) { dev_err(hdev->dev, "Sync stream CS is not supported\n"); @@ -694,12 +967,27 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) goto out; } + if (args->in.cs_flags & HL_CS_FLAGS_SIGNAL) + cs_type = CS_TYPE_SIGNAL; + else if (args->in.cs_flags & HL_CS_FLAGS_WAIT) + cs_type = CS_TYPE_WAIT; + else + cs_type = CS_TYPE_DEFAULT; + chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute; num_chunks_execute = args->in.num_chunks_execute; - if (!num_chunks_execute) { + if (cs_type == CS_TYPE_DEFAULT) { + if (!num_chunks_execute) { + dev_err(hdev->dev, + "Got execute CS with 0 chunks, context %d\n", + ctx->asid); + rc = -EINVAL; + goto out; + } + } else if (num_chunks_execute != 1) { dev_err(hdev->dev, - "Got execute CS with 0 chunks, context %d\n", + "Sync stream CS mandates one chunk only, context %d\n", ctx->asid); rc = -EINVAL; goto out; @@ -745,7 +1033,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) "Need to run restore phase but restore CS is empty\n"); rc = 0; } else { - rc = _hl_cs_ioctl(hpriv, chunks_restore, + rc = cs_ioctl_default(hpriv, chunks_restore, num_chunks_restore, &cs_seq); } @@ -787,7 +1075,12 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) } } - rc = _hl_cs_ioctl(hpriv, chunks_execute, num_chunks_execute, &cs_seq); + if (cs_type == CS_TYPE_DEFAULT) + rc = cs_ioctl_default(hpriv, chunks_execute, num_chunks_execute, + &cs_seq); + else + rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks_execute, + num_chunks_execute, &cs_seq); out: if (rc != -EAGAIN) { @@ -819,6 +1112,10 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev, fence = hl_ctx_get_fence(ctx, seq); if (IS_ERR(fence)) { rc = PTR_ERR(fence); + if (rc == -EINVAL) + dev_notice_ratelimited(hdev->dev, + "Can't wait on seq %llu because current CS is at seq %llu\n", + seq, ctx->cs_sequence); } else if (fence) { rc = dma_fence_wait_timeout(fence, true, timeout); if (fence->error == -ETIMEDOUT) @@ -826,8 +1123,12 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev, else if (fence->error == -EIO) rc = -EIO; dma_fence_put(fence); - } else + } else { + dev_dbg(hdev->dev, + "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n", + seq, ctx->cs_sequence); rc = 1; + } hl_ctx_put(ctx); diff --git a/drivers/misc/habanalabs/context.c b/drivers/misc/habanalabs/context.c index 2df6fb87e7ff..ec92b3506b1f 100644 --- a/drivers/misc/habanalabs/context.c +++ b/drivers/misc/habanalabs/context.c @@ -170,24 +170,16 @@ int hl_ctx_put(struct hl_ctx *ctx) struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq) { - struct hl_device *hdev = ctx->hdev; struct dma_fence *fence; spin_lock(&ctx->cs_lock); if (seq >= ctx->cs_sequence) { - dev_notice_ratelimited(hdev->dev, - "Can't wait on seq %llu because current CS is at seq %llu\n", - seq, ctx->cs_sequence); spin_unlock(&ctx->cs_lock); return ERR_PTR(-EINVAL); } - if (seq + HL_MAX_PENDING_CS < ctx->cs_sequence) { - dev_dbg(hdev->dev, - "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n", - seq, ctx->cs_sequence); spin_unlock(&ctx->cs_lock); return NULL; } diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h index 630ed43eb410..a68df32094f1 100644 --- a/drivers/misc/habanalabs/habanalabs.h +++ b/drivers/misc/habanalabs/habanalabs.h @@ -843,6 +843,7 @@ struct hl_userptr { * @fence: pointer to the fence object of this CS. * @signal_fence: pointer to the fence object of the signal CS (used by wait * CS only). + * @finish_work: workqueue object to run when CS is completed by H/W. * @work_tdr: delayed work node for TDR. * @mirror_node : node in device mirror list of command submissions. * @debugfs_list: node in debugfs list of command submissions. @@ -863,6 +864,7 @@ struct hl_cs { struct kref refcount; struct dma_fence *fence; struct dma_fence *signal_fence; + struct work_struct finish_work; struct delayed_work work_tdr; struct list_head mirror_node; struct list_head debugfs_list; |