From da451a8e419a1615ce00b8ac3053f577c00c2e84 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 20 Apr 2026 16:56:42 -0700 Subject: [PATCH 01/36] added the retry logic --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 285 ++++++++++++------ src/Worker/Grpc/Logs.cs | 5 +- 2 files changed, 192 insertions(+), 98 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 58a6db040..7b770dc9a 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -373,13 +373,16 @@ void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationTok { try { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem?.CompletionToken ?? string.Empty); - await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = workItem?.CompletionToken, - }, - cancellationToken: cancellation); + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem?.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = workItem?.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellation); this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem?.CompletionToken ?? string.Empty); } catch (Exception abandonException) @@ -391,17 +394,20 @@ await this.client.AbandonTaskOrchestratorWorkItemAsync( { try { - this.Logger.AbandoningActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem?.CompletionToken ?? string.Empty); - await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = workItem?.CompletionToken, - }, - cancellationToken: cancellation); + this.Logger.AbandoningActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem?.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = workItem?.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); this.Logger.AbandonedActivityWorkItem( instanceId, workItem.ActivityRequest.Name, @@ -417,17 +423,20 @@ await this.client.AbandonTaskActivityWorkItemAsync( { try { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem?.CompletionToken ?? string.Empty); - await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem?.CompletionToken, - }, - cancellationToken: cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequest.InstanceId, + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem?.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem?.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequest.InstanceId, workItem?.CompletionToken ?? 
string.Empty); } catch (Exception abandonException) @@ -439,17 +448,20 @@ await this.client.AbandonTaskEntityWorkItemAsync( { try { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem?.CompletionToken ?? string.Empty); - await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem?.CompletionToken, - }, - cancellationToken: cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequestV2.InstanceId, + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem?.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem?.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequestV2.InstanceId, workItem?.CompletionToken ?? string.Empty); } catch (Exception abandonException) @@ -599,17 +611,20 @@ async Task OnRunOrchestratorAsync( cancellationToken); } - if (!filterPassed) - { - this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken); - - return; + if (!filterPassed) + { + this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + + return; } // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. 
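Every completion and abandon RPC in these hunks is funneled through a single ExecuteWithRetryAsync helper defined later in this patch. A standalone sketch of the same pattern follows for reference; the Grpc.Core types are real, but FlakySendAsync, its failure behavior, and the narrowed status-code list are illustrative stand-ins, not code from this repository.

    using System;
    using System.Threading;
    using System.Threading.Tasks;
    using Grpc.Core;

    static class RetryPatternSketch
    {
        // Same shape as the patch's helper: retry transient gRPC status codes with
        // capped exponential backoff plus jitter; rethrow everything else.
        // (The patch also treats Unknown and Internal as transient.)
        static async Task ExecuteWithRetryAsync(
            Func<Task> action, string operationName, CancellationToken cancellationToken)
        {
            const int MaxAttempts = 10;
            TimeSpan delay = TimeSpan.FromMilliseconds(200);

            for (int attempt = 1; ; attempt++)
            {
                try
                {
                    await action();
                    return;
                }
                catch (RpcException ex) when (
                    (ex.StatusCode == StatusCode.Unavailable ||
                     ex.StatusCode == StatusCode.DeadlineExceeded) &&
                    attempt < MaxAttempts)
                {
                    // Jitter up to 20% of the current delay (Random.Shared requires .NET 6+).
                    int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2));
                    Console.WriteLine($"{operationName}: attempt {attempt} failed with {ex.StatusCode}, retrying");
                    await Task.Delay(delay + TimeSpan.FromMilliseconds(jitterMs), cancellationToken);

                    // Double the delay each attempt, capped at 15 seconds, as in the patch.
                    delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15_000));
                }
            }
        }

        // Illustrative stand-in for a gRPC call that fails twice before succeeding.
        static int calls;

        static Task FlakySendAsync() => ++calls < 3
            ? Task.FromException(new RpcException(new Status(StatusCode.Unavailable, "transient")))
            : Task.CompletedTask;

        static async Task Main() =>
            await ExecuteWithRetryAsync(FlakySendAsync, nameof(FlakySendAsync), CancellationToken.None);
    }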
@@ -700,17 +715,20 @@ await this.client.AbandonTaskOrchestratorWorkItemAsync( }, }; } - else - { - this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken); - - return; + else + { + this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + + return; } } else @@ -811,15 +829,18 @@ async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, } else { - if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) - { - this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation); + if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) + { + this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); } return; @@ -850,10 +871,13 @@ await this.client.AbandonTaskActivityWorkItemAsync( CompletionToken = completionToken, }; - // Stop the trace activity here to avoid including the completion time in the latency calculation - traceActivity?.Stop(); - - await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation); + // Stop the trace activity here to avoid including the completion time in the latency calculation + traceActivity?.Stop(); + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteActivityTaskAsync), + cancellation); } async Task OnRunEntityBatchAsync( @@ -915,11 +939,14 @@ async Task OnRunEntityBatchAsync( }; } - P.EntityBatchResult response = batchResult.ToEntityBatchResult( - completionToken, - operationInfos?.Take(batchResult.Results?.Count ?? 0)); - - await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation); + P.EntityBatchResult response = batchResult.ToEntityBatchResult( + completionToken, + operationInfos?.Take(batchResult.Results?.Count ?? 
0)); + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteEntityTaskAsync), + cancellation); } /// @@ -981,10 +1008,13 @@ async Task CompleteOrchestratorTaskWithChunkingAsync( }, }; - await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken); - return; - } - + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + return; + } + // Helper to add an action to the current chunk if it fits static bool TryAddAction( Google.Protobuf.Collections.RepeatedField<P.OrchestratorAction> dest, @@ -1003,13 +1033,16 @@ static bool TryAddAction( return true; } - // Check if the entire response fits in one chunk - int totalSize = response.CalculateSize(); - if (totalSize <= maxChunkBytes) - { - // Response fits in one chunk, send it directly (isPartial defaults to false) - await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken); - return; + // Check if the entire response fits in one chunk + int totalSize = response.CalculateSize(); + if (totalSize <= maxChunkBytes) + { + // Response fits in one chunk, send it directly (isPartial defaults to false) + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + return; } // Response is too large, split into multiple chunks @@ -1065,11 +1098,69 @@ static bool TryAddAction( chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext; } - chunkIndex++; - - // Send the chunk - await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken); + chunkIndex++; + + // Send the chunk + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); } - } - } + } + + async Task ExecuteWithRetryAsync( + Func<Task> action, + string operationName, + CancellationToken cancellationToken) + { + const int maxAttempts = 10; + TimeSpan delay = TimeSpan.FromMilliseconds(200); + + for (int attempt = 1; ; attempt++) + { + try + { + await action(); + return; + } + catch (RpcException ex) when ( + (ex.StatusCode == StatusCode.Unavailable || + ex.StatusCode == StatusCode.Unknown || + ex.StatusCode == StatusCode.DeadlineExceeded || + ex.StatusCode == StatusCode.Internal) && + attempt < maxAttempts) + { + // Back off with jitter for transient transport errors +#if NET6_0_OR_GREATER + int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2)); +#else + int jitterMs = new Random().Next(0, (int)(delay.TotalMilliseconds * 0.2)); +#endif + TimeSpan backoff = delay + TimeSpan.FromMilliseconds(jitterMs); + + this.Logger.TransientGrpcRetry( + operationName, + attempt, + maxAttempts, + backoff.TotalMilliseconds, + (int)ex.StatusCode, + ex); + + try + { + await Task.Delay(backoff, cancellationToken); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // Worker is shutting down - propagate the cancellation rather than retrying + throw; + } + + // Exponential increase, capping at 15 seconds + delay =
TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000)); + continue; + } + } + } + } } diff --git a/src/Worker/Grpc/Logs.cs b/src/Worker/Grpc/Logs.cs index b7d1f957e..9226ab74f 100644 --- a/src/Worker/Grpc/Logs.cs +++ b/src/Worker/Grpc/Logs.cs @@ -78,6 +78,9 @@ static partial class Logs public static partial void AbandoningEntityWorkItem(this ILogger logger, string instanceId, string completionToken); [LoggerMessage(EventId = 65, Level = LogLevel.Information, Message = "{instanceId}: Abandoned entity work item. Completion token = '{completionToken}'")] - public static partial void AbandonedEntityWorkItem(this ILogger logger, string instanceId, string completionToken); + public static partial void AbandonedEntityWorkItem(this ILogger logger, string instanceId, string completionToken); + + [LoggerMessage(EventId = 66, Level = LogLevel.Warning, Message = "Transient gRPC error for '{OperationName}'. Attempt {Attempt} of {MaxAttempts}. Retrying in {BackoffMs} ms. StatusCode={StatusCode}")] + public static partial void TransientGrpcRetry(this ILogger logger, string operationName, int attempt, int maxAttempts, double backoffMs, int statusCode, Exception exception); } } From 4fc52003c0c27fd838949c57edf84eba18d5d8ed Mon Sep 17 00:00:00 2001 From: sophiatev <38052607+sophiatev@users.noreply.github.com> Date: Mon, 27 Apr 2026 10:10:32 -0700 Subject: [PATCH 02/36] Update src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 2162 ++++++++--------- 1 file changed, 1081 insertions(+), 1081 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 87933d72d..abcdfd854 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -1,479 +1,479 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System.Diagnostics; -using System.Linq; -using System.Text; -using DurableTask.Core; -using DurableTask.Core.Entities; -using DurableTask.Core.Entities.OperationFormat; -using DurableTask.Core.History; -using Google.Protobuf; -using Microsoft.DurableTask.Abstractions; -using Microsoft.DurableTask.Entities; -using Microsoft.DurableTask.Tracing; -using Microsoft.DurableTask.Worker.Grpc.Internal; -using Microsoft.DurableTask.Worker.Shims; -using Microsoft.Extensions.DependencyInjection; -using Microsoft.Extensions.Logging; -using static Microsoft.DurableTask.Protobuf.TaskHubSidecarService; -using ActivityStatusCode = System.Diagnostics.ActivityStatusCode; -using DTCore = DurableTask.Core; -using P = Microsoft.DurableTask.Protobuf; - -namespace Microsoft.DurableTask.Worker.Grpc; - -/// -/// The gRPC Durable Task worker. -/// -sealed partial class GrpcDurableTaskWorker -{ - class Processor - { - static readonly Google.Protobuf.WellKnownTypes.Empty EmptyMessage = new(); - - readonly GrpcDurableTaskWorker worker; - readonly TaskHubSidecarServiceClient client; - readonly DurableTaskShimFactory shimFactory; - readonly GrpcDurableTaskWorkerOptions.InternalOptions internalOptions; - readonly DTCore.IExceptionPropertiesProvider? exceptionPropertiesProvider; - [Obsolete("Experimental")] - readonly IOrchestrationFilter? orchestrationFilter; - - public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient client, IOrchestrationFilter? orchestrationFilter = null, IExceptionPropertiesProvider? 
exceptionPropertiesProvider = null) - { - this.worker = worker; - this.client = client; - this.shimFactory = new DurableTaskShimFactory(this.worker.grpcOptions, this.worker.loggerFactory); - this.internalOptions = this.worker.grpcOptions.Internal; - this.orchestrationFilter = orchestrationFilter; - this.exceptionPropertiesProvider = exceptionPropertiesProvider is not null - ? new ExceptionPropertiesProviderAdapter(exceptionPropertiesProvider) - : null; - } - - ILogger Logger => this.worker.logger; - - public async Task ExecuteAsync(CancellationToken cancellation) - { - // Tracks consecutive failures against the same channel. Reset only after the stream - // has actually delivered a message (HelloAsync alone is not proof the channel is healthy). - int consecutiveChannelFailures = 0; - - // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. - int reconnectAttempt = 0; - Random backoffRandom = ReconnectBackoff.CreateRandom(); - - while (!cancellation.IsCancellationRequested) - { - bool channelLikelyPoisoned = false; - try - { - using AsyncServerStreamingCall stream = await this.ConnectAsync(cancellation); - await this.ProcessWorkItemsAsync( - stream, - cancellation, - onFirstMessage: () => - { - consecutiveChannelFailures = 0; - reconnectAttempt = 0; - }, - onChannelLikelyPoisoned: () => channelLikelyPoisoned = true); - } - catch (RpcException) when (cancellation.IsCancellationRequested) - { - // Worker is shutting down - let the method exit gracefully - return ProcessorExitReason.Shutdown; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled) - { - // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold: - // Cancelled is ambiguous and shouldn't drive recreate storms. - this.Logger.SidecarDisconnected(); - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded) - { - // Only HelloAsync carries a deadline. Once the work-item stream is established, - // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines. - // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel. - this.Logger.HelloTimeout(this.internalOptions.HelloDeadline); - channelLikelyPoisoned = true; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable) - { - // Sidecar is down - keep retrying. - this.Logger.SidecarUnavailable(); - channelLikelyPoisoned = true; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated) - { - // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel - // recreate: a fresh channel won't fix bad credentials. Reset the consecutive-failure - // counters: a status reply is proof the transport itself is healthy, so prior - // transport failures should not combine with later ones to trip the recreate. - this.Logger.AuthenticationFailed(ex); - consecutiveChannelFailures = 0; - reconnectAttempt = 0; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound) - { - // We retry on a NotFound for several reasons: - // 1. It was the existing behavior through the UnexpectedError path. - // 2. A 404 can be returned for a missing task hub or authentication failure. Authentication takes - // time to propagate so we should retry instead of making the user restart the application. - // 3. In some cases, a task hub can be created separately from the scheduler. 
If a worker is deployed - // between the scheduler and task hub, it would need to be restarted to function. - this.Logger.TaskHubNotFound(); - } - catch (OperationCanceledException) when (cancellation.IsCancellationRequested) - { - // Shutting down, lets exit gracefully. - return ProcessorExitReason.Shutdown; - } - catch (Exception ex) - { - // Unknown failure - retry? - this.Logger.UnexpectedError(ex, string.Empty); - } - - if (channelLikelyPoisoned) - { - consecutiveChannelFailures++; - int threshold = this.internalOptions.ChannelRecreateFailureThreshold; - if (threshold > 0 && consecutiveChannelFailures >= threshold) - { - this.Logger.RecreatingChannel(consecutiveChannelFailures); - return ProcessorExitReason.ChannelRecreateRequested; - } - } - - try - { - TimeSpan delay = ReconnectBackoff.Compute( - reconnectAttempt, - this.internalOptions.ReconnectBackoffBase, - this.internalOptions.ReconnectBackoffCap, - backoffRandom); - this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds)); - reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt - await Task.Delay(delay, cancellation); - } - catch (OperationCanceledException) when (cancellation.IsCancellationRequested) - { - // Worker is shutting down - let the method exit gracefully - return ProcessorExitReason.Shutdown; - } - } - - return ProcessorExitReason.Shutdown; - } - - - static string GetActionsListForLogging(IReadOnlyList actions) - { - if (actions.Count == 0) - { - return string.Empty; - } - else if (actions.Count == 1) - { - return actions[0].OrchestratorActionTypeCase.ToString(); - } - else - { - // Returns something like "ScheduleTask x5, CreateTimer x1,..." - return string.Join(", ", actions - .GroupBy(a => a.OrchestratorActionTypeCase) - .Select(group => $"{group.Key} x{group.Count()}")); - } - } - - static P.TaskFailureDetails? EvaluateOrchestrationVersioning(DurableTaskWorkerOptions.VersioningOptions? versioning, string orchestrationVersion, out bool versionCheckFailed) - { - P.TaskFailureDetails? failureDetails = null; - versionCheckFailed = false; - if (versioning != null) - { - int versionComparison = TaskOrchestrationVersioningUtils.CompareVersions(orchestrationVersion, versioning.Version); - - switch (versioning.MatchStrategy) - { - case DurableTaskWorkerOptions.VersionMatchStrategy.None: - // No versioning, breakout. - break; - case DurableTaskWorkerOptions.VersionMatchStrategy.Strict: - // Comparison of 0 indicates equality. - if (versionComparison != 0) - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionMismatch", - ErrorMessage = $"The orchestration version '{orchestrationVersion}' does not match the worker version '{versioning.Version}'.", - IsNonRetriable = true, - }; - } - - break; - case DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder: - // Comparison > 0 indicates the orchestration version is greater than the worker version. - if (versionComparison > 0) - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionMismatch", - ErrorMessage = $"The orchestration version '{orchestrationVersion}' is greater than the worker version '{versioning.Version}'.", - IsNonRetriable = true, - }; - } - - break; - default: - // If there is a type of versioning we don't understand, it is better to treat it as a versioning failure. 
- failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionError", - ErrorMessage = $"The version match strategy '{orchestrationVersion}' is unknown.", - IsNonRetriable = true, - }; - break; - } - - versionCheckFailed = failureDetails != null; - } - - return failureDetails; - } - - async ValueTask BuildRuntimeStateAsync( - P.OrchestratorRequest orchestratorRequest, - ProtoUtils.EntityConversionState? entityConversionState, - CancellationToken cancellation) - { - Func converter = entityConversionState is null - ? ProtoUtils.ConvertHistoryEvent - : entityConversionState.ConvertFromProto; - - IEnumerable pastEvents = []; - if (orchestratorRequest.RequiresHistoryStreaming) - { - // Stream the remaining events from the remote service - P.StreamInstanceHistoryRequest streamRequest = new() - { - InstanceId = orchestratorRequest.InstanceId, - ExecutionId = orchestratorRequest.ExecutionId, - ForWorkItemProcessing = true, - }; - - using AsyncServerStreamingCall streamResponse = - this.client.StreamInstanceHistory(streamRequest, cancellationToken: cancellation); - - await foreach (P.HistoryChunk chunk in streamResponse.ResponseStream.ReadAllAsync(cancellation)) - { - pastEvents = pastEvents.Concat(chunk.Events.Select(converter)); - } - } - else - { - // The history was already provided in the work item request - pastEvents = orchestratorRequest.PastEvents.Select(converter); - } - - IEnumerable newEvents = orchestratorRequest.NewEvents.Select(converter); - - // Reconstruct the orchestration state in a way that correctly distinguishes new events from past events - var runtimeState = new OrchestrationRuntimeState(pastEvents.ToList()); - foreach (HistoryEvent e in newEvents) - { - // AddEvent() puts events into the NewEvents list. - runtimeState.AddEvent(e); - } - - if (runtimeState.ExecutionStartedEvent == null) - { - // TODO: What's the right way to handle this? Callback to the sidecar with a retriable error request? - throw new InvalidOperationException("The provided orchestration history was incomplete"); - } - - return runtimeState; - } - - async Task> ConnectAsync(CancellationToken cancellation) - { - TimeSpan helloDeadline = this.internalOptions.HelloDeadline; - DateTime? deadline = null; - - if (helloDeadline > TimeSpan.Zero) - { - // Clamp to a UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot - // throw ArgumentOutOfRangeException out of DateTime.Add and so the gRPC deadline remains - // unambiguous during internal normalization. - DateTime now = DateTime.UtcNow; - DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc); - TimeSpan maxOffset = maxDeadlineUtc - now; - deadline = helloDeadline >= maxOffset ? 
maxDeadlineUtc : now.Add(helloDeadline); - } - - await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); - this.Logger.EstablishedWorkItemConnection(); - - DurableTaskWorkerOptions workerOptions = this.worker.workerOptions; - - // Get the stream for receiving work-items - return this.client!.GetWorkItems( - new P.GetWorkItemsRequest - { - MaxConcurrentActivityWorkItems = - workerOptions.Concurrency.MaximumConcurrentActivityWorkItems, - MaxConcurrentOrchestrationWorkItems = - workerOptions.Concurrency.MaximumConcurrentOrchestrationWorkItems, - MaxConcurrentEntityWorkItems = - workerOptions.Concurrency.MaximumConcurrentEntityWorkItems, - Capabilities = { this.worker.grpcOptions.Capabilities }, - WorkItemFilters = this.worker.workItemFilters?.ToGrpcWorkItemFilters(), - }, - cancellationToken: cancellation); - } - - async Task ProcessWorkItemsAsync( - AsyncServerStreamingCall stream, - CancellationToken cancellation, - Action? onFirstMessage = null, - Action? onChannelLikelyPoisoned = null) - { - // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — - // including health pings sent periodically by the server — arrive within the configured - // window. If that fires we treat the stream as silently disconnected and reconnect. - TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; - - // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop. - // The underlying IAsyncStreamReader is single-use — once the server terminates the stream - // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext - // returns false forever and re-entering await foreach would tight-spin with no yield. - WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( - ct => stream.ResponseStream.ReadAllAsync(ct), - silentDisconnectTimeout, - workItem => this.DispatchWorkItem(workItem, cancellation), - onFirstMessage, - cancellation); - - switch (result.Outcome) - { - case WorkItemStreamOutcome.Shutdown: - return; - - case WorkItemStreamOutcome.SilentDisconnect: - // Stream stopped producing messages (including health pings) for longer than the - // configured window. Treat as a poisoned channel. - this.Logger.ConnectionTimeout(); - onChannelLikelyPoisoned?.Invoke(); - return; - - case WorkItemStreamOutcome.GracefulDrain: - // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY + - // OK trailers when a DTS instance is being replaced). Log it explicitly so - // operators can see it. Only count it toward the channel-poisoned threshold when - // the stream produced no messages: a stream that successfully delivered work and - // was then closed by the server is healthy behavior (e.g. routine rolling - // upgrade), and counting those would let a long-lived process accumulate spurious - // "poison" credits across many healthy drains. An empty drain, on the other hand, - // is a strong signal the channel is latched onto a dead/evacuated backend and - // needs to be recreated to pick up fresh DNS/routing. 
- this.Logger.StreamEndedByPeer(); - if (!result.FirstMessageObserved) - { - onChannelLikelyPoisoned?.Invoke(); - } - - return; - } - } - - void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) - { - if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunOrchestratorAsync( - workItem.OrchestratorRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunActivityAsync( - workItem.ActivityRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) - { - workItem.EntityRequestV2.ToEntityBatchRequest( - out EntityBatchRequest batchRequest, - out List operationInfos); - - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync( - batchRequest, - cancellation, - workItem.CompletionToken, - operationInfos), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) - { - // Health pings are heartbeat-only signals from the backend; the silent-disconnect - // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. - // Logging at Trace allows operators to confirm liveness without flooding info-level - // telemetry. - this.Logger.ReceivedHealthPing(); - } - else - { - this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); - } - } - - void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) - { - // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? - _ = Task.Run( - async () => - { - try - { - await handler(); - } - catch (OperationCanceledException) - { - // Shutting down - ignore - } - catch (Exception ex) - { - string instanceId = - workItem?.OrchestratorRequest?.InstanceId ?? - workItem?.ActivityRequest?.OrchestrationInstance?.InstanceId ?? - workItem?.EntityRequest?.InstanceId ?? - workItem?.EntityRequestV2?.InstanceId ?? - string.Empty; - this.Logger.UnexpectedError(ex, instanceId); - - if (workItem?.OrchestratorRequest != null) - { - try - { +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Diagnostics; +using System.Linq; +using System.Text; +using DurableTask.Core; +using DurableTask.Core.Entities; +using DurableTask.Core.Entities.OperationFormat; +using DurableTask.Core.History; +using Google.Protobuf; +using Microsoft.DurableTask.Abstractions; +using Microsoft.DurableTask.Entities; +using Microsoft.DurableTask.Tracing; +using Microsoft.DurableTask.Worker.Grpc.Internal; +using Microsoft.DurableTask.Worker.Shims; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using static Microsoft.DurableTask.Protobuf.TaskHubSidecarService; +using ActivityStatusCode = System.Diagnostics.ActivityStatusCode; +using DTCore = DurableTask.Core; +using P = Microsoft.DurableTask.Protobuf; + +namespace Microsoft.DurableTask.Worker.Grpc; + +/// +/// The gRPC Durable Task worker. 
+/// +sealed partial class GrpcDurableTaskWorker +{ + class Processor + { + static readonly Google.Protobuf.WellKnownTypes.Empty EmptyMessage = new(); + + readonly GrpcDurableTaskWorker worker; + readonly TaskHubSidecarServiceClient client; + readonly DurableTaskShimFactory shimFactory; + readonly GrpcDurableTaskWorkerOptions.InternalOptions internalOptions; + readonly DTCore.IExceptionPropertiesProvider? exceptionPropertiesProvider; + [Obsolete("Experimental")] + readonly IOrchestrationFilter? orchestrationFilter; + + public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient client, IOrchestrationFilter? orchestrationFilter = null, IExceptionPropertiesProvider? exceptionPropertiesProvider = null) + { + this.worker = worker; + this.client = client; + this.shimFactory = new DurableTaskShimFactory(this.worker.grpcOptions, this.worker.loggerFactory); + this.internalOptions = this.worker.grpcOptions.Internal; + this.orchestrationFilter = orchestrationFilter; + this.exceptionPropertiesProvider = exceptionPropertiesProvider is not null + ? new ExceptionPropertiesProviderAdapter(exceptionPropertiesProvider) + : null; + } + + ILogger Logger => this.worker.logger; + + public async Task ExecuteAsync(CancellationToken cancellation) + { + // Tracks consecutive failures against the same channel. Reset only after the stream + // has actually delivered a message (HelloAsync alone is not proof the channel is healthy). + int consecutiveChannelFailures = 0; + + // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. + int reconnectAttempt = 0; + Random backoffRandom = ReconnectBackoff.CreateRandom(); + + while (!cancellation.IsCancellationRequested) + { + bool channelLikelyPoisoned = false; + try + { + using AsyncServerStreamingCall stream = await this.ConnectAsync(cancellation); + await this.ProcessWorkItemsAsync( + stream, + cancellation, + onFirstMessage: () => + { + consecutiveChannelFailures = 0; + reconnectAttempt = 0; + }, + onChannelLikelyPoisoned: () => channelLikelyPoisoned = true); + } + catch (RpcException) when (cancellation.IsCancellationRequested) + { + // Worker is shutting down - let the method exit gracefully + return ProcessorExitReason.Shutdown; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled) + { + // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold: + // Cancelled is ambiguous and shouldn't drive recreate storms. + this.Logger.SidecarDisconnected(); + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded) + { + // Only HelloAsync carries a deadline. Once the work-item stream is established, + // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines. + // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel. + this.Logger.HelloTimeout(this.internalOptions.HelloDeadline); + channelLikelyPoisoned = true; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable) + { + // Sidecar is down - keep retrying. + this.Logger.SidecarUnavailable(); + channelLikelyPoisoned = true; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated) + { + // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel + // recreate: a fresh channel won't fix bad credentials. 
Reset the consecutive-failure + // counters: a status reply is proof the transport itself is healthy, so prior + // transport failures should not combine with later ones to trip the recreate. + this.Logger.AuthenticationFailed(ex); + consecutiveChannelFailures = 0; + reconnectAttempt = 0; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound) + { + // We retry on a NotFound for several reasons: + // 1. It was the existing behavior through the UnexpectedError path. + // 2. A 404 can be returned for a missing task hub or authentication failure. Authentication takes + // time to propagate so we should retry instead of making the user restart the application. + // 3. In some cases, a task hub can be created separately from the scheduler. If a worker is deployed + // between the scheduler and task hub, it would need to be restarted to function. + this.Logger.TaskHubNotFound(); + } + catch (OperationCanceledException) when (cancellation.IsCancellationRequested) + { + // Shutting down, let's exit gracefully. + return ProcessorExitReason.Shutdown; + } + catch (Exception ex) + { + // Unknown failure - retry? + this.Logger.UnexpectedError(ex, string.Empty); + } + + if (channelLikelyPoisoned) + { + consecutiveChannelFailures++; + int threshold = this.internalOptions.ChannelRecreateFailureThreshold; + if (threshold > 0 && consecutiveChannelFailures >= threshold) + { + this.Logger.RecreatingChannel(consecutiveChannelFailures); + return ProcessorExitReason.ChannelRecreateRequested; + } + } + + try + { + TimeSpan delay = ReconnectBackoff.Compute( + reconnectAttempt, + this.internalOptions.ReconnectBackoffBase, + this.internalOptions.ReconnectBackoffCap, + backoffRandom); + this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds)); + reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt + await Task.Delay(delay, cancellation); + } + catch (OperationCanceledException) when (cancellation.IsCancellationRequested) + { + // Worker is shutting down - let the method exit gracefully + return ProcessorExitReason.Shutdown; + } + } + + return ProcessorExitReason.Shutdown; + } + + + static string GetActionsListForLogging(IReadOnlyList<P.OrchestratorAction> actions) + { + if (actions.Count == 0) + { + return string.Empty; + } + else if (actions.Count == 1) + { + return actions[0].OrchestratorActionTypeCase.ToString(); + } + else + { + // Returns something like "ScheduleTask x5, CreateTimer x1,..." + return string.Join(", ", actions + .GroupBy(a => a.OrchestratorActionTypeCase) + .Select(group => $"{group.Key} x{group.Count()}")); + } + } + + static P.TaskFailureDetails? EvaluateOrchestrationVersioning(DurableTaskWorkerOptions.VersioningOptions? versioning, string orchestrationVersion, out bool versionCheckFailed) + { + P.TaskFailureDetails? failureDetails = null; + versionCheckFailed = false; + if (versioning != null) + { + int versionComparison = TaskOrchestrationVersioningUtils.CompareVersions(orchestrationVersion, versioning.Version); + + switch (versioning.MatchStrategy) + { + case DurableTaskWorkerOptions.VersionMatchStrategy.None: + // No versioning configured; break out. + break; + case DurableTaskWorkerOptions.VersionMatchStrategy.Strict: + // Comparison of 0 indicates equality.
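+ // (CompareVersions is assumed to follow the System.Version convention: negative = older, 0 = equal, positive = newer.)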
+ if (versionComparison != 0) + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "VersionMismatch", + ErrorMessage = $"The orchestration version '{orchestrationVersion}' does not match the worker version '{versioning.Version}'.", + IsNonRetriable = true, + }; + } + + break; + case DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder: + // Comparison > 0 indicates the orchestration version is greater than the worker version. + if (versionComparison > 0) + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "VersionMismatch", + ErrorMessage = $"The orchestration version '{orchestrationVersion}' is greater than the worker version '{versioning.Version}'.", + IsNonRetriable = true, + }; + } + + break; + default: + // If the match strategy is unrecognized, it is safest to treat it as a versioning failure. + failureDetails = new P.TaskFailureDetails + { + ErrorType = "VersionError", + ErrorMessage = $"The version match strategy '{versioning.MatchStrategy}' is unknown.", + IsNonRetriable = true, + }; + break; + } + + versionCheckFailed = failureDetails != null; + } + + return failureDetails; + } + + async ValueTask<OrchestrationRuntimeState> BuildRuntimeStateAsync( + P.OrchestratorRequest orchestratorRequest, + ProtoUtils.EntityConversionState? entityConversionState, + CancellationToken cancellation) + { + Func<P.HistoryEvent, HistoryEvent> converter = entityConversionState is null + ? ProtoUtils.ConvertHistoryEvent + : entityConversionState.ConvertFromProto; + + IEnumerable<HistoryEvent> pastEvents = []; + if (orchestratorRequest.RequiresHistoryStreaming) + { + // Stream the remaining events from the remote service + P.StreamInstanceHistoryRequest streamRequest = new() + { + InstanceId = orchestratorRequest.InstanceId, + ExecutionId = orchestratorRequest.ExecutionId, + ForWorkItemProcessing = true, + }; + + using AsyncServerStreamingCall<P.HistoryChunk> streamResponse = + this.client.StreamInstanceHistory(streamRequest, cancellationToken: cancellation); + + await foreach (P.HistoryChunk chunk in streamResponse.ResponseStream.ReadAllAsync(cancellation)) + { + pastEvents = pastEvents.Concat(chunk.Events.Select(converter)); + } + } + else + { + // The history was already provided in the work item request + pastEvents = orchestratorRequest.PastEvents.Select(converter); + } + + IEnumerable<HistoryEvent> newEvents = orchestratorRequest.NewEvents.Select(converter); + + // Reconstruct the orchestration state in a way that correctly distinguishes new events from past events + var runtimeState = new OrchestrationRuntimeState(pastEvents.ToList()); + foreach (HistoryEvent e in newEvents) + { + // AddEvent() puts events into the NewEvents list. + runtimeState.AddEvent(e); + } + + if (runtimeState.ExecutionStartedEvent == null) + { + // TODO: What's the right way to handle this? Callback to the sidecar with a retriable error request? + throw new InvalidOperationException("The provided orchestration history was incomplete"); + } + + return runtimeState; + } + + async Task<AsyncServerStreamingCall<P.WorkItem>> ConnectAsync(CancellationToken cancellation) + { + TimeSpan helloDeadline = this.internalOptions.HelloDeadline; + DateTime? deadline = null; + + if (helloDeadline > TimeSpan.Zero) + { + // Clamp to a UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot + // throw ArgumentOutOfRangeException out of DateTime.Add and so the gRPC deadline remains + // unambiguous during internal normalization.
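+ // For example, a HelloDeadline of TimeSpan.MaxValue would overflow DateTime.Add without this clamp.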
+ DateTime now = DateTime.UtcNow; + DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc); + TimeSpan maxOffset = maxDeadlineUtc - now; + deadline = helloDeadline >= maxOffset ? maxDeadlineUtc : now.Add(helloDeadline); + } + + await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); + this.Logger.EstablishedWorkItemConnection(); + + DurableTaskWorkerOptions workerOptions = this.worker.workerOptions; + + // Get the stream for receiving work-items + return this.client!.GetWorkItems( + new P.GetWorkItemsRequest + { + MaxConcurrentActivityWorkItems = + workerOptions.Concurrency.MaximumConcurrentActivityWorkItems, + MaxConcurrentOrchestrationWorkItems = + workerOptions.Concurrency.MaximumConcurrentOrchestrationWorkItems, + MaxConcurrentEntityWorkItems = + workerOptions.Concurrency.MaximumConcurrentEntityWorkItems, + Capabilities = { this.worker.grpcOptions.Capabilities }, + WorkItemFilters = this.worker.workItemFilters?.ToGrpcWorkItemFilters(), + }, + cancellationToken: cancellation); + } + + async Task ProcessWorkItemsAsync( + AsyncServerStreamingCall stream, + CancellationToken cancellation, + Action? onFirstMessage = null, + Action? onChannelLikelyPoisoned = null) + { + // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — + // including health pings sent periodically by the server — arrive within the configured + // window. If that fires we treat the stream as silently disconnected and reconnect. + TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; + + // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop. + // The underlying IAsyncStreamReader is single-use — once the server terminates the stream + // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext + // returns false forever and re-entering await foreach would tight-spin with no yield. + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + ct => stream.ResponseStream.ReadAllAsync(ct), + silentDisconnectTimeout, + workItem => this.DispatchWorkItem(workItem, cancellation), + onFirstMessage, + cancellation); + + switch (result.Outcome) + { + case WorkItemStreamOutcome.Shutdown: + return; + + case WorkItemStreamOutcome.SilentDisconnect: + // Stream stopped producing messages (including health pings) for longer than the + // configured window. Treat as a poisoned channel. + this.Logger.ConnectionTimeout(); + onChannelLikelyPoisoned?.Invoke(); + return; + + case WorkItemStreamOutcome.GracefulDrain: + // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY + + // OK trailers when a DTS instance is being replaced). Log it explicitly so + // operators can see it. Only count it toward the channel-poisoned threshold when + // the stream produced no messages: a stream that successfully delivered work and + // was then closed by the server is healthy behavior (e.g. routine rolling + // upgrade), and counting those would let a long-lived process accumulate spurious + // "poison" credits across many healthy drains. An empty drain, on the other hand, + // is a strong signal the channel is latched onto a dead/evacuated backend and + // needs to be recreated to pick up fresh DNS/routing. 
+ this.Logger.StreamEndedByPeer(); + if (!result.FirstMessageObserved) + { + onChannelLikelyPoisoned?.Invoke(); + } + + return; + } + } + + void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) + { + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunOrchestratorAsync( + workItem.OrchestratorRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunActivityAsync( + workItem.ActivityRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) + { + workItem.EntityRequestV2.ToEntityBatchRequest( + out EntityBatchRequest batchRequest, + out List operationInfos); + + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync( + batchRequest, + cancellation, + workItem.CompletionToken, + operationInfos), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) + { + // Health pings are heartbeat-only signals from the backend; the silent-disconnect + // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. + // Logging at Trace allows operators to confirm liveness without flooding info-level + // telemetry. + this.Logger.ReceivedHealthPing(); + } + else + { + this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); + } + } + + void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) + { + // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? + _ = Task.Run( + async () => + { + try + { + await handler(); + } + catch (OperationCanceledException) + { + // Shutting down - ignore + } + catch (Exception ex) + { + string instanceId = + workItem?.OrchestratorRequest?.InstanceId ?? + workItem?.ActivityRequest?.OrchestrationInstance?.InstanceId ?? + workItem?.EntityRequest?.InstanceId ?? + workItem?.EntityRequestV2?.InstanceId ?? + string.Empty; + this.Logger.UnexpectedError(ex, instanceId); + + if (workItem?.OrchestratorRequest != null) + { + try + { this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem?.CompletionToken ?? string.Empty); await this.ExecuteWithRetryAsync( async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( @@ -484,17 +484,17 @@ await this.ExecuteWithRetryAsync( cancellationToken: cancellation), nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), cancellation); - this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem?.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } - } - else if (workItem?.ActivityRequest != null) - { - try - { + this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem?.CompletionToken ?? 
string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instanceId); + } + } + else if (workItem?.ActivityRequest != null) + { + try + { this.Logger.AbandoningActivityWorkItem( instanceId, workItem.ActivityRequest.Name, @@ -509,21 +509,21 @@ await this.ExecuteWithRetryAsync( cancellationToken: cancellation), nameof(this.client.AbandonTaskActivityWorkItemAsync), cancellation); - this.Logger.AbandonedActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem?.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } - } - else if (workItem?.EntityRequest != null) - { - try - { + this.Logger.AbandonedActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem?.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instanceId); + } + } + else if (workItem?.EntityRequest != null) + { + try + { this.Logger.AbandoningEntityWorkItem( workItem.EntityRequest.InstanceId, workItem?.CompletionToken ?? string.Empty); @@ -538,17 +538,17 @@ await this.ExecuteWithRetryAsync( cancellation); this.Logger.AbandonedEntityWorkItem( workItem.EntityRequest.InstanceId, - workItem?.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); - } - } - else if (workItem?.EntityRequestV2 != null) - { - try - { + workItem?.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); + } + } + else if (workItem?.EntityRequestV2 != null) + { + try + { this.Logger.AbandoningEntityWorkItem( workItem.EntityRequestV2.InstanceId, workItem?.CompletionToken ?? string.Empty); @@ -563,155 +563,155 @@ await this.ExecuteWithRetryAsync( cancellation); this.Logger.AbandonedEntityWorkItem( workItem.EntityRequestV2.InstanceId, - workItem?.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); - } - } - } - }); - } - - async Task OnRunOrchestratorAsync( - P.OrchestratorRequest request, - string completionToken, - CancellationToken cancellationToken) - { - var executionStartedEvent = - request - .NewEvents - .Concat(request.PastEvents) - .Where(e => e.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.ExecutionStarted) - .Select(e => e.ExecutionStarted) - .FirstOrDefault(); - - Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution( - executionStartedEvent, - request.OrchestrationTraceContext); - - if (executionStartedEvent is not null) - { - P.HistoryEvent? GetSuborchestrationInstanceCreatedEvent(int eventId) - { - var subOrchestrationEvent = - request - .PastEvents - .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCreated) - .FirstOrDefault(x => x.EventId == eventId); - - return subOrchestrationEvent; - } - - P.HistoryEvent? 
GetTaskScheduledEvent(int eventId) - { - var taskScheduledEvent = - request - .PastEvents - .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.TaskScheduled) - .LastOrDefault(x => x.EventId == eventId); - - return taskScheduledEvent; - } - - foreach (var newEvent in request.NewEvents) - { - switch (newEvent.EventTypeCase) - { - case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCompleted: - { - P.HistoryEvent? subOrchestrationInstanceCreatedEvent = - GetSuborchestrationInstanceCreatedEvent( - newEvent.SubOrchestrationInstanceCompleted.TaskScheduledId); - - TraceHelper.EmitTraceActivityForSubOrchestrationCompleted( - request.InstanceId, - subOrchestrationInstanceCreatedEvent, - subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceFailed: - { - P.HistoryEvent? subOrchestrationInstanceCreatedEvent = - GetSuborchestrationInstanceCreatedEvent( - newEvent.SubOrchestrationInstanceFailed.TaskScheduledId); - - TraceHelper.EmitTraceActivityForSubOrchestrationFailed( - request.InstanceId, - subOrchestrationInstanceCreatedEvent, - subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated, - newEvent.SubOrchestrationInstanceFailed); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TaskCompleted: - { - P.HistoryEvent? taskScheduledEvent = - GetTaskScheduledEvent(newEvent.TaskCompleted.TaskScheduledId); - - TraceHelper.EmitTraceActivityForTaskCompleted( - request.InstanceId, - taskScheduledEvent, - taskScheduledEvent?.TaskScheduled); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TaskFailed: - { - P.HistoryEvent? taskScheduledEvent = - GetTaskScheduledEvent(newEvent.TaskFailed.TaskScheduledId); - - TraceHelper.EmitTraceActivityForTaskFailed( - request.InstanceId, - taskScheduledEvent, - taskScheduledEvent?.TaskScheduled, - newEvent.TaskFailed); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TimerFired: - TraceHelper.EmitTraceActivityForTimer( - request.InstanceId, - executionStartedEvent.Name, - newEvent.Timestamp.ToDateTime(), - newEvent.TimerFired); - break; - } - } - } - - OrchestratorExecutionResult? result = null; - P.TaskFailureDetails? failureDetails = null; - TaskName name = new("(unknown)"); - - ProtoUtils.EntityConversionState? entityConversionState = - this.internalOptions.ConvertOrchestrationEntityEvents - ? new(this.internalOptions.InsertEntityUnlocksOnCompletion) - : null; - - DurableTaskWorkerOptions.VersioningOptions? versioning = this.worker.workerOptions.Versioning; - bool versionFailure = false; - try - { - OrchestrationRuntimeState runtimeState = await this.BuildRuntimeStateAsync( - request, - entityConversionState, - cancellationToken); - - bool filterPassed = true; - if (this.orchestrationFilter != null) - { - filterPassed = await this.orchestrationFilter.IsOrchestrationValidAsync( - new OrchestrationFilterParameters - { - Name = runtimeState.Name, - Tags = runtimeState.Tags != null ? new Dictionary(runtimeState.Tags) : null, - }, - cancellationToken); - } - + workItem?.CompletionToken ?? 
string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); + } + } + } + }); + } + + async Task OnRunOrchestratorAsync( + P.OrchestratorRequest request, + string completionToken, + CancellationToken cancellationToken) + { + var executionStartedEvent = + request + .NewEvents + .Concat(request.PastEvents) + .Where(e => e.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.ExecutionStarted) + .Select(e => e.ExecutionStarted) + .FirstOrDefault(); + + Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution( + executionStartedEvent, + request.OrchestrationTraceContext); + + if (executionStartedEvent is not null) + { + P.HistoryEvent? GetSuborchestrationInstanceCreatedEvent(int eventId) + { + var subOrchestrationEvent = + request + .PastEvents + .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCreated) + .FirstOrDefault(x => x.EventId == eventId); + + return subOrchestrationEvent; + } + + P.HistoryEvent? GetTaskScheduledEvent(int eventId) + { + var taskScheduledEvent = + request + .PastEvents + .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.TaskScheduled) + .LastOrDefault(x => x.EventId == eventId); + + return taskScheduledEvent; + } + + foreach (var newEvent in request.NewEvents) + { + switch (newEvent.EventTypeCase) + { + case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCompleted: + { + P.HistoryEvent? subOrchestrationInstanceCreatedEvent = + GetSuborchestrationInstanceCreatedEvent( + newEvent.SubOrchestrationInstanceCompleted.TaskScheduledId); + + TraceHelper.EmitTraceActivityForSubOrchestrationCompleted( + request.InstanceId, + subOrchestrationInstanceCreatedEvent, + subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceFailed: + { + P.HistoryEvent? subOrchestrationInstanceCreatedEvent = + GetSuborchestrationInstanceCreatedEvent( + newEvent.SubOrchestrationInstanceFailed.TaskScheduledId); + + TraceHelper.EmitTraceActivityForSubOrchestrationFailed( + request.InstanceId, + subOrchestrationInstanceCreatedEvent, + subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated, + newEvent.SubOrchestrationInstanceFailed); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TaskCompleted: + { + P.HistoryEvent? taskScheduledEvent = + GetTaskScheduledEvent(newEvent.TaskCompleted.TaskScheduledId); + + TraceHelper.EmitTraceActivityForTaskCompleted( + request.InstanceId, + taskScheduledEvent, + taskScheduledEvent?.TaskScheduled); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TaskFailed: + { + P.HistoryEvent? taskScheduledEvent = + GetTaskScheduledEvent(newEvent.TaskFailed.TaskScheduledId); + + TraceHelper.EmitTraceActivityForTaskFailed( + request.InstanceId, + taskScheduledEvent, + taskScheduledEvent?.TaskScheduled, + newEvent.TaskFailed); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TimerFired: + TraceHelper.EmitTraceActivityForTimer( + request.InstanceId, + executionStartedEvent.Name, + newEvent.Timestamp.ToDateTime(), + newEvent.TimerFired); + break; + } + } + } + + OrchestratorExecutionResult? result = null; + P.TaskFailureDetails? failureDetails = null; + TaskName name = new("(unknown)"); + + ProtoUtils.EntityConversionState? entityConversionState = + this.internalOptions.ConvertOrchestrationEntityEvents + ? 
new(this.internalOptions.InsertEntityUnlocksOnCompletion) + : null; + + DurableTaskWorkerOptions.VersioningOptions? versioning = this.worker.workerOptions.Versioning; + bool versionFailure = false; + try + { + OrchestrationRuntimeState runtimeState = await this.BuildRuntimeStateAsync( + request, + entityConversionState, + cancellationToken); + + bool filterPassed = true; + if (this.orchestrationFilter != null) + { + filterPassed = await this.orchestrationFilter.IsOrchestrationValidAsync( + new OrchestrationFilterParameters + { + Name = runtimeState.Name, + Tags = runtimeState.Tags != null ? new Dictionary(runtimeState.Tags) : null, + }, + cancellationToken); + } + if (!filterPassed) { this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); @@ -726,96 +726,96 @@ await this.ExecuteWithRetryAsync( cancellationToken); return; - } - - // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. - failureDetails = EvaluateOrchestrationVersioning(versioning, runtimeState.Version, out versionFailure); - - // Only continue with the work if the versioning check passed. - if (failureDetails == null) - { - name = new TaskName(runtimeState.Name); - - this.Logger.ReceivedOrchestratorRequest( - name, - request.InstanceId, - runtimeState.PastEvents.Count, - runtimeState.NewEvents.Count); - - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - if (this.worker.Factory.TryCreateOrchestrator( - name, scope.ServiceProvider, out ITaskOrchestrator? orchestrator)) - { - // Both the factory invocation and the ExecuteAsync could involve user code and need to be handled - // as part of try/catch. - ParentOrchestrationInstance? parent = runtimeState.ParentInstance switch - { - ParentInstance p => new(new(p.Name), p.OrchestrationInstance.InstanceId), - _ => null, - }; - - TaskOrchestration shim = this.shimFactory.CreateOrchestration(name, orchestrator, parent); - TaskOrchestrationExecutor executor = new( - runtimeState, - shim, - BehaviorOnContinueAsNew.Carryover, - request.EntityParameters.ToCore(), - ErrorPropagationMode.UseFailureDetails, - this.exceptionPropertiesProvider); - result = executor.Execute(); - } - else - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "OrchestratorTaskNotFound", - ErrorMessage = $"No orchestrator task named '{name}' was found.", - IsNonRetriable = true, - }; - } - } - } - catch (Exception unexpected) - { - // This is not expected: Normally TaskOrchestrationExecutor handles exceptions in user code. 
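A note on the versioning gate evaluated above: MatchStrategy and FailureStrategy are ordinary worker options rather than anything negotiated over the wire. A minimal configuration sketch follows; only the type, property, and enum names are taken from this patch, and the direct options-object construction (rather than host-builder wiring) is my own illustrative assumption:

// Illustrative only: configures the versioning gate exercised by
// EvaluateOrchestrationVersioning. Hosting/registration wiring is omitted.
var workerOptions = new DurableTaskWorkerOptions
{
    Versioning = new DurableTaskWorkerOptions.VersioningOptions
    {
        Version = "1.2.0", // the version this worker advertises
        MatchStrategy = DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder, // accept orchestrations at or below 1.2.0
        FailureStrategy = DurableTaskWorkerOptions.VersionFailureStrategy.Reject, // abandon mismatches for another worker
    },
};

With Reject, a mismatched work item is abandoned and redelivered later, possibly to a worker running a compatible version; with Fail, the instance is completed with the VersionMismatch failure details instead. Both behaviors appear in the surrounding diff.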
- this.Logger.OrchestratorFailed(name, request.InstanceId, unexpected.ToString()); - failureDetails = unexpected.ToTaskFailureDetails(this.exceptionPropertiesProvider); - } - - P.OrchestratorResponse response; - if (result != null) - { - response = ProtoUtils.ConstructOrchestratorResponse( - request.InstanceId, - request.ExecutionId, - result.CustomStatus, - result.Actions, - completionToken, - entityConversionState, - traceActivity); - } - else if (versioning != null && failureDetails != null && versionFailure) - { - this.Logger.OrchestrationVersionFailure(versioning.FailureStrategy.ToString(), failureDetails.ErrorMessage); - if (versioning.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Fail) - { - response = new P.OrchestratorResponse - { - InstanceId = request.InstanceId, - CompletionToken = completionToken, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = failureDetails, - }, - }, - }, - }; - } + } + + // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. + failureDetails = EvaluateOrchestrationVersioning(versioning, runtimeState.Version, out versionFailure); + + // Only continue with the work if the versioning check passed. + if (failureDetails == null) + { + name = new TaskName(runtimeState.Name); + + this.Logger.ReceivedOrchestratorRequest( + name, + request.InstanceId, + runtimeState.PastEvents.Count, + runtimeState.NewEvents.Count); + + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + if (this.worker.Factory.TryCreateOrchestrator( + name, scope.ServiceProvider, out ITaskOrchestrator? orchestrator)) + { + // Both the factory invocation and the ExecuteAsync could involve user code and need to be handled + // as part of try/catch. + ParentOrchestrationInstance? parent = runtimeState.ParentInstance switch + { + ParentInstance p => new(new(p.Name), p.OrchestrationInstance.InstanceId), + _ => null, + }; + + TaskOrchestration shim = this.shimFactory.CreateOrchestration(name, orchestrator, parent); + TaskOrchestrationExecutor executor = new( + runtimeState, + shim, + BehaviorOnContinueAsNew.Carryover, + request.EntityParameters.ToCore(), + ErrorPropagationMode.UseFailureDetails, + this.exceptionPropertiesProvider); + result = executor.Execute(); + } + else + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "OrchestratorTaskNotFound", + ErrorMessage = $"No orchestrator task named '{name}' was found.", + IsNonRetriable = true, + }; + } + } + } + catch (Exception unexpected) + { + // This is not expected: Normally TaskOrchestrationExecutor handles exceptions in user code. 
+ this.Logger.OrchestratorFailed(name, request.InstanceId, unexpected.ToString()); + failureDetails = unexpected.ToTaskFailureDetails(this.exceptionPropertiesProvider); + } + + P.OrchestratorResponse response; + if (result != null) + { + response = ProtoUtils.ConstructOrchestratorResponse( + request.InstanceId, + request.ExecutionId, + result.CustomStatus, + result.Actions, + completionToken, + entityConversionState, + traceActivity); + } + else if (versioning != null && failureDetails != null && versionFailure) + { + this.Logger.OrchestrationVersionFailure(versioning.FailureStrategy.ToString(), failureDetails.ErrorMessage); + if (versioning.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Fail) + { + response = new P.OrchestratorResponse + { + InstanceId = request.InstanceId, + CompletionToken = completionToken, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = failureDetails, + }, + }, + }, + }; + } else { this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); @@ -830,106 +830,106 @@ await this.ExecuteWithRetryAsync( cancellationToken); return; - } - } - else - { - // This is the case for failures that happened *outside* the orchestrator executor - response = new P.OrchestratorResponse - { - InstanceId = request.InstanceId, - CompletionToken = completionToken, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = failureDetails, - }, - }, - }, - }; - } - - var completeOrchestrationAction = response.Actions.FirstOrDefault( - a => a.CompleteOrchestration is not null); - - if (completeOrchestrationAction is not null) - { - if (completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus == P.OrchestrationStatus.Failed) - { - traceActivity?.SetStatus( - ActivityStatusCode.Error, - completeOrchestrationAction.CompleteOrchestration.Result); - } - - traceActivity?.SetTag( - Schema.Task.Status, - completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus.ToString()); - - traceActivity?.Dispose(); - } - - this.Logger.SendingOrchestratorResponse( - name, - response.InstanceId, - response.Actions.Count, - GetActionsListForLogging(response.Actions)); - - await this.CompleteOrchestratorTaskWithChunkingAsync( - response, - this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, - cancellationToken); - } - - async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) - { - using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); - - OrchestrationInstance instance = request.OrchestrationInstance.ToCore(); - string rawInput = request.Input; - int inputSize = rawInput != null ? Encoding.UTF8.GetByteCount(rawInput) : 0; - this.Logger.ReceivedActivityRequest(request.Name, request.TaskId, instance.InstanceId, inputSize); - - P.TaskFailureDetails? failureDetails = null; - TaskContext innerContext = new(instance); - innerContext.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; - - TaskName name = new(request.Name); - string? 
output = null; - - failureDetails = EvaluateOrchestrationVersioning(this.worker.workerOptions.Versioning, request.Version, out bool versioningFailed); - if (!versioningFailed) - { - try - { - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - if (this.worker.Factory.TryCreateActivity(name, scope.ServiceProvider, out ITaskActivity? activity)) - { - // Both the factory invocation and the RunAsync could involve user code and need to be handled as - // part of try/catch. - TaskActivity shim = this.shimFactory.CreateActivity(name, activity); - output = await shim.RunAsync(innerContext, request.Input); - } - else - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "ActivityTaskNotFound", - ErrorMessage = $"No activity task named '{name}' was found.", - IsNonRetriable = true, - }; - } - } - catch (Exception applicationException) - { - failureDetails = applicationException.ToTaskFailureDetails(this.exceptionPropertiesProvider); - } - } - else - { + } + } + else + { + // This is the case for failures that happened *outside* the orchestrator executor + response = new P.OrchestratorResponse + { + InstanceId = request.InstanceId, + CompletionToken = completionToken, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = failureDetails, + }, + }, + }, + }; + } + + var completeOrchestrationAction = response.Actions.FirstOrDefault( + a => a.CompleteOrchestration is not null); + + if (completeOrchestrationAction is not null) + { + if (completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus == P.OrchestrationStatus.Failed) + { + traceActivity?.SetStatus( + ActivityStatusCode.Error, + completeOrchestrationAction.CompleteOrchestration.Result); + } + + traceActivity?.SetTag( + Schema.Task.Status, + completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus.ToString()); + + traceActivity?.Dispose(); + } + + this.Logger.SendingOrchestratorResponse( + name, + response.InstanceId, + response.Actions.Count, + GetActionsListForLogging(response.Actions)); + + await this.CompleteOrchestratorTaskWithChunkingAsync( + response, + this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, + cancellationToken); + } + + async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) + { + using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); + + OrchestrationInstance instance = request.OrchestrationInstance.ToCore(); + string rawInput = request.Input; + int inputSize = rawInput != null ? Encoding.UTF8.GetByteCount(rawInput) : 0; + this.Logger.ReceivedActivityRequest(request.Name, request.TaskId, instance.InstanceId, inputSize); + + P.TaskFailureDetails? failureDetails = null; + TaskContext innerContext = new(instance); + innerContext.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; + + TaskName name = new(request.Name); + string? output = null; + + failureDetails = EvaluateOrchestrationVersioning(this.worker.workerOptions.Versioning, request.Version, out bool versioningFailed); + if (!versioningFailed) + { + try + { + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + if (this.worker.Factory.TryCreateActivity(name, scope.ServiceProvider, out ITaskActivity? 
activity)) + { + // Both the factory invocation and the RunAsync could involve user code and need to be handled as + // part of try/catch. + TaskActivity shim = this.shimFactory.CreateActivity(name, activity); + output = await shim.RunAsync(innerContext, request.Input); + } + else + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "ActivityTaskNotFound", + ErrorMessage = $"No activity task named '{name}' was found.", + IsNonRetriable = true, + }; + } + } + catch (Exception applicationException) + { + failureDetails = applicationException.ToTaskFailureDetails(this.exceptionPropertiesProvider); + } + } + else + { if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) { this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); @@ -942,36 +942,36 @@ await this.ExecuteWithRetryAsync( cancellationToken: cancellation), nameof(this.client.AbandonTaskActivityWorkItemAsync), cancellation); - } - - return; - } - - int outputSizeInBytes = 0; - if (failureDetails != null) - { - traceActivity?.SetStatus(ActivityStatusCode.Error, failureDetails.ErrorMessage); - - outputSizeInBytes = failureDetails.GetApproximateByteCount(); - } - else if (output != null) - { - outputSizeInBytes = Encoding.UTF8.GetByteCount(output); - } - - string successOrFailure = failureDetails != null ? "failure" : "success"; - this.Logger.SendingActivityResponse( - successOrFailure, name, request.TaskId, instance.InstanceId, outputSizeInBytes); - - P.ActivityResponse response = new() - { - InstanceId = instance.InstanceId, - TaskId = request.TaskId, - Result = output, - FailureDetails = failureDetails, - CompletionToken = completionToken, - }; - + } + + return; + } + + int outputSizeInBytes = 0; + if (failureDetails != null) + { + traceActivity?.SetStatus(ActivityStatusCode.Error, failureDetails.ErrorMessage); + + outputSizeInBytes = failureDetails.GetApproximateByteCount(); + } + else if (output != null) + { + outputSizeInBytes = Encoding.UTF8.GetByteCount(output); + } + + string successOrFailure = failureDetails != null ? "failure" : "success"; + this.Logger.SendingActivityResponse( + successOrFailure, name, request.TaskId, instance.InstanceId, outputSizeInBytes); + + P.ActivityResponse response = new() + { + InstanceId = instance.InstanceId, + TaskId = request.TaskId, + Result = output, + FailureDetails = failureDetails, + CompletionToken = completionToken, + }; + // Stop the trace activity here to avoid including the completion time in the latency calculation traceActivity?.Stop(); @@ -979,67 +979,67 @@ await this.ExecuteWithRetryAsync( async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), nameof(this.client.CompleteActivityTaskAsync), cancellation); - } - - async Task OnRunEntityBatchAsync( - EntityBatchRequest batchRequest, - CancellationToken cancellation, - string? completionToken = null, - List? operationInfos = null) - { - var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); - EntityId entityId = new(coreEntityId.Name, coreEntityId.Key); - - TaskName name = new(entityId.Name); - - EntityBatchResult? batchResult; - - try - { - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - IDurableTaskFactory2 factory = (IDurableTaskFactory2)this.worker.Factory; - - if (factory.TryCreateEntity(name, scope.ServiceProvider, out ITaskEntity? 
entity)) - { - // Both the factory invocation and the RunAsync could involve user code and need to be handled as - // part of try/catch. - TaskEntity shim = this.shimFactory.CreateEntity(name, entity, entityId); - batchResult = await shim.ExecuteOperationBatchAsync(batchRequest); - } - else - { - // we could not find the entity. This is considered an application error, - // so we return a non-retriable error-OperationResult for each operation in the batch. - batchResult = new EntityBatchResult() - { - Actions = [], // no actions - EntityState = batchRequest.EntityState, // state is unmodified - Results = Enumerable.Repeat( - new OperationResult() - { - FailureDetails = new FailureDetails( - errorType: "EntityTaskNotFound", - errorMessage: $"No entity task named '{name}' was found.", - stackTrace: null, - innerFailure: null, - isNonRetriable: true), - }, - batchRequest.Operations!.Count).ToList(), - FailureDetails = null, - }; - } - } - catch (Exception frameworkException) - { - // return a result with failure details. - // this will cause the batch to be abandoned and retried - // (possibly after a delay and on a different worker). - batchResult = new EntityBatchResult() - { - FailureDetails = new FailureDetails(frameworkException), - }; - } - + } + + async Task OnRunEntityBatchAsync( + EntityBatchRequest batchRequest, + CancellationToken cancellation, + string? completionToken = null, + List? operationInfos = null) + { + var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); + EntityId entityId = new(coreEntityId.Name, coreEntityId.Key); + + TaskName name = new(entityId.Name); + + EntityBatchResult? batchResult; + + try + { + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + IDurableTaskFactory2 factory = (IDurableTaskFactory2)this.worker.Factory; + + if (factory.TryCreateEntity(name, scope.ServiceProvider, out ITaskEntity? entity)) + { + // Both the factory invocation and the RunAsync could involve user code and need to be handled as + // part of try/catch. + TaskEntity shim = this.shimFactory.CreateEntity(name, entity, entityId); + batchResult = await shim.ExecuteOperationBatchAsync(batchRequest); + } + else + { + // we could not find the entity. This is considered an application error, + // so we return a non-retriable error-OperationResult for each operation in the batch. + batchResult = new EntityBatchResult() + { + Actions = [], // no actions + EntityState = batchRequest.EntityState, // state is unmodified + Results = Enumerable.Repeat( + new OperationResult() + { + FailureDetails = new FailureDetails( + errorType: "EntityTaskNotFound", + errorMessage: $"No entity task named '{name}' was found.", + stackTrace: null, + innerFailure: null, + isNonRetriable: true), + }, + batchRequest.Operations!.Count).ToList(), + FailureDetails = null, + }; + } + } + catch (Exception frameworkException) + { + // return a result with failure details. + // this will cause the batch to be abandoned and retried + // (possibly after a delay and on a different worker). + batchResult = new EntityBatchResult() + { + FailureDetails = new FailureDetails(frameworkException), + }; + } + P.EntityBatchResult response = batchResult.ToEntityBatchResult( completionToken, operationInfos?.Take(batchResult.Results?.Count ?? 
0)); @@ -1048,67 +1048,67 @@ await this.ExecuteWithRetryAsync( async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), nameof(this.client.CompleteEntityTaskAsync), cancellation); - } - - /// - /// Completes an orchestration task with automatic chunking if the response exceeds the maximum size. - /// - /// The orchestrator response to send. - /// The maximum size in bytes for each chunk. - /// The cancellation token. - async Task CompleteOrchestratorTaskWithChunkingAsync( - P.OrchestratorResponse response, - int maxChunkBytes, - CancellationToken cancellationToken) - { - // Validate that no single action exceeds the maximum chunk size - static P.TaskFailureDetails? ValidateActionsSize(IEnumerable actions, int maxChunkBytes) - { - foreach (P.OrchestratorAction action in actions) - { - int actionSize = action.CalculateSize(); - if (actionSize > maxChunkBytes) - { - // TODO: large payload doc is not available yet on aka.ms, add doc link to below error message - string errorMessage = $"A single orchestrator action of type {action.OrchestratorActionTypeCase} with id {action.Id} " + - $"exceeds the {maxChunkBytes / 1024.0 / 1024.0:F2}MB limit: {actionSize / 1024.0 / 1024.0:F2}MB. " + - "Enable large-payload externalization to Azure Blob Storage to support oversized actions."; - return new P.TaskFailureDetails - { - ErrorType = typeof(InvalidOperationException).FullName, - ErrorMessage = errorMessage, - IsNonRetriable = true, - }; - } - } - - return null; - } - - P.TaskFailureDetails? validationFailure = this.worker.grpcOptions.Capabilities.Contains(P.WorkerCapability.LargePayloads) - ? null - : ValidateActionsSize(response.Actions, maxChunkBytes); - if (validationFailure != null) - { - // Complete the orchestration with a failed status and failure details - P.OrchestratorResponse failureResponse = new() - { - InstanceId = response.InstanceId, - CompletionToken = response.CompletionToken, - OrchestrationTraceContext = response.OrchestrationTraceContext, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = validationFailure, - }, - }, - }, - }; - + } + + /// + /// Completes an orchestration task with automatic chunking if the response exceeds the maximum size. + /// + /// The orchestrator response to send. + /// The maximum size in bytes for each chunk. + /// The cancellation token. + async Task CompleteOrchestratorTaskWithChunkingAsync( + P.OrchestratorResponse response, + int maxChunkBytes, + CancellationToken cancellationToken) + { + // Validate that no single action exceeds the maximum chunk size + static P.TaskFailureDetails? ValidateActionsSize(IEnumerable actions, int maxChunkBytes) + { + foreach (P.OrchestratorAction action in actions) + { + int actionSize = action.CalculateSize(); + if (actionSize > maxChunkBytes) + { + // TODO: large payload doc is not available yet on aka.ms, add doc link to below error message + string errorMessage = $"A single orchestrator action of type {action.OrchestratorActionTypeCase} with id {action.Id} " + + $"exceeds the {maxChunkBytes / 1024.0 / 1024.0:F2}MB limit: {actionSize / 1024.0 / 1024.0:F2}MB. 
" + + "Enable large-payload externalization to Azure Blob Storage to support oversized actions."; + return new P.TaskFailureDetails + { + ErrorType = typeof(InvalidOperationException).FullName, + ErrorMessage = errorMessage, + IsNonRetriable = true, + }; + } + } + + return null; + } + + P.TaskFailureDetails? validationFailure = this.worker.grpcOptions.Capabilities.Contains(P.WorkerCapability.LargePayloads) + ? null + : ValidateActionsSize(response.Actions, maxChunkBytes); + if (validationFailure != null) + { + // Complete the orchestration with a failed status and failure details + P.OrchestratorResponse failureResponse = new() + { + InstanceId = response.InstanceId, + CompletionToken = response.CompletionToken, + OrchestrationTraceContext = response.OrchestrationTraceContext, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = validationFailure, + }, + }, + }, + }; + await this.ExecuteWithRetryAsync( async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), nameof(this.client.CompleteOrchestratorTaskAsync), @@ -1116,24 +1116,24 @@ await this.ExecuteWithRetryAsync( return; } - // Helper to add an action to the current chunk if it fits - static bool TryAddAction( - Google.Protobuf.Collections.RepeatedField dest, - P.OrchestratorAction action, - ref int currentSize, - int maxChunkBytes) - { - int actionSize = action.CalculateSize(); - if (currentSize + actionSize > maxChunkBytes && currentSize > 0) - { - return false; - } - - dest.Add(action); - currentSize += actionSize; - return true; - } - + // Helper to add an action to the current chunk if it fits + static bool TryAddAction( + Google.Protobuf.Collections.RepeatedField dest, + P.OrchestratorAction action, + ref int currentSize, + int maxChunkBytes) + { + int actionSize = action.CalculateSize(); + if (currentSize + actionSize > maxChunkBytes && currentSize > 0) + { + return false; + } + + dest.Add(action); + currentSize += actionSize; + return true; + } + // Check if the entire response fits in one chunk int totalSize = response.CalculateSize(); if (totalSize <= maxChunkBytes) @@ -1144,69 +1144,69 @@ await this.ExecuteWithRetryAsync( nameof(this.client.CompleteOrchestratorTaskAsync), cancellationToken); return; - } - - // Response is too large, split into multiple chunks - int actionsCompletedSoFar = 0, chunkIndex = 0; - List allActions = response.Actions.ToList(); - bool isPartial = true; - bool isChunkedMode = false; - - while (isPartial) - { - P.OrchestratorResponse chunkedResponse = new() - { - InstanceId = response.InstanceId, - CustomStatus = response.CustomStatus, - CompletionToken = response.CompletionToken, - RequiresHistory = response.RequiresHistory, - NumEventsProcessed = 0, - }; - - int chunkPayloadSize = 0; - - // Fill the chunk with actions until we reach the size limit - while (actionsCompletedSoFar < allActions.Count && - TryAddAction(chunkedResponse.Actions, allActions[actionsCompletedSoFar], ref chunkPayloadSize, maxChunkBytes)) - { - actionsCompletedSoFar++; - } - - // Determine if this is a partial chunk (more actions remaining) - isPartial = actionsCompletedSoFar < allActions.Count; - chunkedResponse.IsPartial = isPartial; - - // Only activate chunked mode when we actually need multiple chunks. 
- // A single oversized action that fits in one chunk (via TryAddAction allowing - // the first item in an empty chunk) should be sent as non-chunked to avoid - // backend issues with ChunkIndex=0 + IsPartial=false. - if (isPartial) - { - isChunkedMode = true; - } - - if (isChunkedMode) - { - chunkedResponse.ChunkIndex = chunkIndex; - } - - if (chunkIndex == 0) - { - // The first chunk preserves the original response's NumEventsProcessed value (null) - // When this is set to null, backend by default handles all the messages in the workitem. - // For subsequent chunks, we set it to 0 since all messages are already handled in first chunk. - chunkedResponse.NumEventsProcessed = null; - chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext; - } - + } + + // Response is too large, split into multiple chunks + int actionsCompletedSoFar = 0, chunkIndex = 0; + List allActions = response.Actions.ToList(); + bool isPartial = true; + bool isChunkedMode = false; + + while (isPartial) + { + P.OrchestratorResponse chunkedResponse = new() + { + InstanceId = response.InstanceId, + CustomStatus = response.CustomStatus, + CompletionToken = response.CompletionToken, + RequiresHistory = response.RequiresHistory, + NumEventsProcessed = 0, + }; + + int chunkPayloadSize = 0; + + // Fill the chunk with actions until we reach the size limit + while (actionsCompletedSoFar < allActions.Count && + TryAddAction(chunkedResponse.Actions, allActions[actionsCompletedSoFar], ref chunkPayloadSize, maxChunkBytes)) + { + actionsCompletedSoFar++; + } + + // Determine if this is a partial chunk (more actions remaining) + isPartial = actionsCompletedSoFar < allActions.Count; + chunkedResponse.IsPartial = isPartial; + + // Only activate chunked mode when we actually need multiple chunks. + // A single oversized action that fits in one chunk (via TryAddAction allowing + // the first item in an empty chunk) should be sent as non-chunked to avoid + // backend issues with ChunkIndex=0 + IsPartial=false. + if (isPartial) + { + isChunkedMode = true; + } + + if (isChunkedMode) + { + chunkedResponse.ChunkIndex = chunkIndex; + } + + if (chunkIndex == 0) + { + // The first chunk preserves the original response's NumEventsProcessed value (null) + // When this is set to null, backend by default handles all the messages in the workitem. + // For subsequent chunks, we set it to 0 since all messages are already handled in first chunk. 
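Seen from the other end, IsPartial and ChunkIndex imply a simple reassembly loop on the receiver. The backend's actual implementation is not part of this patch, so the sketch below is an inference from the sender-side protocol; ReceiveNextChunkAsync is an invented placeholder:

// Hypothetical receiver-side reassembly implied by IsPartial/ChunkIndex.
var actions = new List<P.OrchestratorAction>();
int expectedIndex = 0;
P.OrchestratorResponse chunk;
do
{
    chunk = await ReceiveNextChunkAsync(); // invented placeholder
    if (chunk.ChunkIndex != expectedIndex++)
    {
        throw new InvalidOperationException("Chunk arrived out of order.");
    }

    actions.AddRange(chunk.Actions);
}
while (chunk.IsPartial); // the final chunk clears IsPartial

Note that a non-chunked response also satisfies this loop: it arrives with a default ChunkIndex and IsPartial = false, so the loop runs exactly once.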
+ chunkedResponse.NumEventsProcessed = null; + chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext; + } + chunkIndex++; // Send the chunk await this.ExecuteWithRetryAsync( async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - } + cancellationToken); + } } async Task ExecuteWithRetryAsync( @@ -1253,7 +1253,7 @@ async Task ExecuteWithRetryAsync( } catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) { - // If shutting down, propagate original exception + // If shutting down during the retry delay, propagate the cancellation exception throw; } @@ -1264,4 +1264,4 @@ async Task ExecuteWithRetryAsync( } } } -} +} From 61f8a2135e77424a918b680aa68407cb38553e58 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 17:11:50 +0000 Subject: [PATCH 03/36] Add tests for ExecuteWithRetryAsync retry logic Agent-Logs-Url: https://github.com/microsoft/durabletask-dotnet/sessions/e65654b0-082d-4c4a-b2c9-34330dd3eb37 Co-authored-by: sophiatev <38052607+sophiatev@users.noreply.github.com> --- .../Grpc.Tests/ExecuteWithRetryTests.cs | 273 ++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100644 test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs new file mode 100644 index 000000000..b9a308310 --- /dev/null +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -0,0 +1,273 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Reflection; +using Grpc.Core; +using Microsoft.DurableTask.Tests.Logging; +using Microsoft.DurableTask.Worker; +using Microsoft.DurableTask.Worker.Grpc.Internal; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; +using Microsoft.Extensions.Options; +using Xunit.Abstractions; +using P = Microsoft.DurableTask.Protobuf; + +namespace Microsoft.DurableTask.Worker.Grpc.Tests; + +public class ExecuteWithRetryTests +{ + const string Category = "Microsoft.DurableTask.Worker.Grpc"; + + static readonly MethodInfo ExecuteWithRetryAsyncMethod = typeof(GrpcDurableTaskWorker) + .GetNestedType("Processor", BindingFlags.NonPublic)! 
+ .GetMethod("ExecuteWithRetryAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; + + [Fact] + public async Task ExecuteWithRetryAsync_SucceedsOnFirstAttempt_DoesNotRetry() + { + // Arrange + object processor = CreateProcessor(); + int callCount = 0; + + // Act + await InvokeExecuteWithRetryAsync( + processor, + () => { callCount++; return Task.CompletedTask; }, + "TestOperation", + CancellationToken.None); + + // Assert + callCount.Should().Be(1); + } + + [Theory] + [InlineData(StatusCode.Unavailable)] + [InlineData(StatusCode.Unknown)] + [InlineData(StatusCode.DeadlineExceeded)] + [InlineData(StatusCode.Internal)] + public async Task ExecuteWithRetryAsync_TransientError_RetriesAndEventuallySucceeds(StatusCode statusCode) + { + // Arrange + object processor = CreateProcessor(); + int callCount = 0; + + // Act - fail once then succeed + await InvokeExecuteWithRetryAsync( + processor, + () => + { + callCount++; + if (callCount == 1) + { + throw new RpcException(new Status(statusCode, "transient error")); + } + + return Task.CompletedTask; + }, + "TestOperation", + CancellationToken.None); + + // Assert + callCount.Should().Be(2); + } + + [Theory] + [InlineData(StatusCode.InvalidArgument)] + [InlineData(StatusCode.AlreadyExists)] + [InlineData(StatusCode.PermissionDenied)] + public async Task ExecuteWithRetryAsync_NonTransientError_ThrowsWithoutRetrying(StatusCode statusCode) + { + // Arrange + object processor = CreateProcessor(); + int callCount = 0; + + // Act + Func act = () => InvokeExecuteWithRetryAsync( + processor, + () => + { + callCount++; + throw new RpcException(new Status(statusCode, "non-transient error")); + }, + "TestOperation", + CancellationToken.None); + + // Assert + await act.Should().ThrowAsync().Where(e => e.StatusCode == statusCode); + callCount.Should().Be(1); + } + + [Fact] + public async Task ExecuteWithRetryAsync_CancellationRequestedDuringRetryDelay_ThrowsOperationCanceledException() + { + // Arrange + using CancellationTokenSource cts = new(); + object processor = CreateProcessor(); + + // Act - cancel immediately after first failure so the retry delay is cancelled + Func act = () => InvokeExecuteWithRetryAsync( + processor, + () => + { + cts.Cancel(); + throw new RpcException(new Status(StatusCode.Unavailable, "transient error")); + }, + "TestOperation", + cts.Token); + + // Assert + await act.Should().ThrowAsync(); + } + + [Fact] + public async Task ExecuteWithRetryAsync_TransientError_LogsRetryAttempt() + { + // Arrange + TestLogProvider logProvider = new(new NullOutput()); + object processor = CreateProcessor(logProvider); + int callCount = 0; + const string operationName = "CompleteOrchestratorTaskAsync"; + + // Act - fail once then succeed + await InvokeExecuteWithRetryAsync( + processor, + () => + { + callCount++; + if (callCount == 1) + { + throw new RpcException(new Status(StatusCode.Unavailable, "transient error")); + } + + return Task.CompletedTask; + }, + operationName, + CancellationToken.None); + + // Assert + logProvider.TryGetLogs(Category, out IReadOnlyCollection? 
logs).Should().BeTrue(); + logs!.Should().Contain(log => + log.Message.Contains($"Transient gRPC error for '{operationName}'") && + log.Message.Contains("Attempt 1 of 10")); + } + + [Fact] + public async Task ExecuteWithRetryAsync_MultipleTransientErrors_LogsEachRetryAttempt() + { + // Arrange + TestLogProvider logProvider = new(new NullOutput()); + object processor = CreateProcessor(logProvider); + int callCount = 0; + const string operationName = "CompleteActivityTaskAsync"; + + // Act - fail twice then succeed + await InvokeExecuteWithRetryAsync( + processor, + () => + { + callCount++; + if (callCount < 3) + { + throw new RpcException(new Status(StatusCode.Unavailable, "transient error")); + } + + return Task.CompletedTask; + }, + operationName, + CancellationToken.None); + + // Assert + logProvider.TryGetLogs(Category, out IReadOnlyCollection? logs).Should().BeTrue(); + logs!.Should().Contain(log => + log.Message.Contains($"Transient gRPC error for '{operationName}'") && + log.Message.Contains("Attempt 1 of 10")); + logs.Should().Contain(log => + log.Message.Contains($"Transient gRPC error for '{operationName}'") && + log.Message.Contains("Attempt 2 of 10")); + callCount.Should().Be(3); + } + + static object CreateProcessor(TestLogProvider? logProvider = null) + { + ILoggerFactory loggerFactory = logProvider is null + ? NullLoggerFactory.Instance + : new SimpleLoggerFactory(logProvider); + + Mock factoryMock = new(MockBehavior.Strict); + GrpcDurableTaskWorkerOptions grpcOptions = new(); + DurableTaskWorkerOptions workerOptions = new() + { + Logging = { UseLegacyCategories = false }, + }; + + GrpcDurableTaskWorker worker = new( + name: "Test", + factory: factoryMock.Object, + grpcOptions: new OptionsMonitorStub(grpcOptions), + workerOptions: new OptionsMonitorStub(workerOptions), + services: Mock.Of(), + loggerFactory: loggerFactory, + orchestrationFilter: null, + exceptionPropertiesProvider: null); + + CallInvoker callInvoker = Mock.Of(); + P.TaskHubSidecarService.TaskHubSidecarServiceClient client = new(callInvoker); + + Type processorType = typeof(GrpcDurableTaskWorker).GetNestedType("Processor", BindingFlags.NonPublic)!; + return Activator.CreateInstance( + processorType, + BindingFlags.Public | BindingFlags.Instance, + binder: null, + args: new object?[] { worker, client, null, null }, + culture: null)!; + } + + static Task InvokeExecuteWithRetryAsync( + object processor, + Func action, + string operationName, + CancellationToken cancellationToken) + { + return (Task)ExecuteWithRetryAsyncMethod.Invoke( + processor, + new object?[] { action, operationName, cancellationToken })!; + } + + sealed class OptionsMonitorStub : IOptionsMonitor where T : class, new() + { + readonly T value; + + public OptionsMonitorStub(T value) => this.value = value; + + public T CurrentValue => this.value; + + public T Get(string? 
name) => this.value; + + public IDisposable OnChange(Action listener) => NullDisposable.Instance; + + sealed class NullDisposable : IDisposable + { + public static readonly NullDisposable Instance = new(); + public void Dispose() { } + } + } + + sealed class SimpleLoggerFactory : ILoggerFactory + { + readonly ILoggerProvider provider; + + public SimpleLoggerFactory(ILoggerProvider provider) => this.provider = provider; + + public void AddProvider(ILoggerProvider provider) { } + + public ILogger CreateLogger(string categoryName) => this.provider.CreateLogger(categoryName); + + public void Dispose() { } + } + + sealed class NullOutput : ITestOutputHelper + { + public void WriteLine(string message) { } + public void WriteLine(string format, params object[] args) { } + } +} From 32cc282a6719d5b9c353ff4e93e44e46232aee22 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 10:17:57 -0700 Subject: [PATCH 04/36] removed redundant logs --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index abcdfd854..ce45ec2d3 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -474,17 +474,16 @@ void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationTok { try { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem?.CompletionToken ?? string.Empty); + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); await this.ExecuteWithRetryAsync( async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( new P.AbandonOrchestrationTaskRequest { - CompletionToken = workItem?.CompletionToken, + CompletionToken = workItem.CompletionToken, }, cancellationToken: cancellation), nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), cancellation); - this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem?.CompletionToken ?? string.Empty); } catch (Exception abandonException) { @@ -509,11 +508,6 @@ await this.ExecuteWithRetryAsync( cancellationToken: cancellation), nameof(this.client.AbandonTaskActivityWorkItemAsync), cancellation); - this.Logger.AbandonedActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem?.CompletionToken ?? string.Empty); } catch (Exception abandonException) { @@ -536,9 +530,6 @@ await this.ExecuteWithRetryAsync( cancellationToken: cancellation), nameof(this.client.AbandonTaskEntityWorkItemAsync), cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem?.CompletionToken ?? string.Empty); } catch (Exception abandonException) { @@ -561,9 +552,6 @@ await this.ExecuteWithRetryAsync( cancellationToken: cancellation), nameof(this.client.AbandonTaskEntityWorkItemAsync), cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem?.CompletionToken ?? 
string.Empty); } catch (Exception abandonException) { From d640ac670431d12856d6cb5185e46a284c20f29c Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 10:20:11 -0700 Subject: [PATCH 05/36] fixed line endings --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 2510 ++++++++--------- 1 file changed, 1255 insertions(+), 1255 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index ce45ec2d3..28a65a146 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -1,1255 +1,1255 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System.Diagnostics; -using System.Linq; -using System.Text; -using DurableTask.Core; -using DurableTask.Core.Entities; -using DurableTask.Core.Entities.OperationFormat; -using DurableTask.Core.History; -using Google.Protobuf; -using Microsoft.DurableTask.Abstractions; -using Microsoft.DurableTask.Entities; -using Microsoft.DurableTask.Tracing; -using Microsoft.DurableTask.Worker.Grpc.Internal; -using Microsoft.DurableTask.Worker.Shims; -using Microsoft.Extensions.DependencyInjection; -using Microsoft.Extensions.Logging; -using static Microsoft.DurableTask.Protobuf.TaskHubSidecarService; -using ActivityStatusCode = System.Diagnostics.ActivityStatusCode; -using DTCore = DurableTask.Core; -using P = Microsoft.DurableTask.Protobuf; - -namespace Microsoft.DurableTask.Worker.Grpc; - -/// -/// The gRPC Durable Task worker. -/// -sealed partial class GrpcDurableTaskWorker -{ - class Processor - { - static readonly Google.Protobuf.WellKnownTypes.Empty EmptyMessage = new(); - - readonly GrpcDurableTaskWorker worker; - readonly TaskHubSidecarServiceClient client; - readonly DurableTaskShimFactory shimFactory; - readonly GrpcDurableTaskWorkerOptions.InternalOptions internalOptions; - readonly DTCore.IExceptionPropertiesProvider? exceptionPropertiesProvider; - [Obsolete("Experimental")] - readonly IOrchestrationFilter? orchestrationFilter; - - public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient client, IOrchestrationFilter? orchestrationFilter = null, IExceptionPropertiesProvider? exceptionPropertiesProvider = null) - { - this.worker = worker; - this.client = client; - this.shimFactory = new DurableTaskShimFactory(this.worker.grpcOptions, this.worker.loggerFactory); - this.internalOptions = this.worker.grpcOptions.Internal; - this.orchestrationFilter = orchestrationFilter; - this.exceptionPropertiesProvider = exceptionPropertiesProvider is not null - ? new ExceptionPropertiesProviderAdapter(exceptionPropertiesProvider) - : null; - } - - ILogger Logger => this.worker.logger; - - public async Task ExecuteAsync(CancellationToken cancellation) - { - // Tracks consecutive failures against the same channel. Reset only after the stream - // has actually delivered a message (HelloAsync alone is not proof the channel is healthy). - int consecutiveChannelFailures = 0; - - // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. 
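ReconnectBackoff.CreateRandom and ReconnectBackoff.Compute, which consume this attempt counter, are not included in this diff. Based on the call site further down (a base delay, a cap, and a Random instance) and the overflow comment where the attempt counter is capped at 30, a conventional capped exponential backoff with full jitter would look roughly like this; treat the exact formula as an assumption:

// Assumed shape of ReconnectBackoff.Compute; not the committed implementation.
static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random)
{
    // 2^attempt growth, clamped to the cap; the caller limits attempt to 30,
    // which keeps Math.Pow well inside double range.
    double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, attempt);
    double cappedMs = Math.Min(cap.TotalMilliseconds, exponentialMs);

    // Full jitter: draw uniformly from [0, capped] so many reconnecting
    // workers do not stampede the sidecar in lockstep.
    return TimeSpan.FromMilliseconds(random.NextDouble() * cappedMs);
}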
-            int reconnectAttempt = 0;
-            Random backoffRandom = ReconnectBackoff.CreateRandom();
-
-            while (!cancellation.IsCancellationRequested)
-            {
-                bool channelLikelyPoisoned = false;
-                try
-                {
-                    using AsyncServerStreamingCall<P.WorkItem> stream = await this.ConnectAsync(cancellation);
-                    await this.ProcessWorkItemsAsync(
-                        stream,
-                        cancellation,
-                        onFirstMessage: () =>
-                        {
-                            consecutiveChannelFailures = 0;
-                            reconnectAttempt = 0;
-                        },
-                        onChannelLikelyPoisoned: () => channelLikelyPoisoned = true);
-                }
-                catch (RpcException) when (cancellation.IsCancellationRequested)
-                {
-                    // Worker is shutting down - let the method exit gracefully
-                    return ProcessorExitReason.Shutdown;
-                }
-                catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled)
-                {
-                    // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold:
-                    // Cancelled is ambiguous and shouldn't drive recreate storms.
-                    this.Logger.SidecarDisconnected();
-                }
-                catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded)
-                {
-                    // Only HelloAsync carries a deadline. Once the work-item stream is established,
-                    // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines.
-                    // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel.
-                    this.Logger.HelloTimeout(this.internalOptions.HelloDeadline);
-                    channelLikelyPoisoned = true;
-                }
-                catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable)
-                {
-                    // Sidecar is down - keep retrying.
-                    this.Logger.SidecarUnavailable();
-                    channelLikelyPoisoned = true;
-                }
-                catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated)
-                {
-                    // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel
-                    // recreate: a fresh channel won't fix bad credentials. Reset the consecutive-failure
-                    // counters: a status reply is proof the transport itself is healthy, so prior
-                    // transport failures should not combine with later ones to trip the recreate.
-                    this.Logger.AuthenticationFailed(ex);
-                    consecutiveChannelFailures = 0;
-                    reconnectAttempt = 0;
-                }
-                catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound)
-                {
-                    // We retry on a NotFound for several reasons:
-                    // 1. It was the existing behavior through the UnexpectedError path.
-                    // 2. A 404 can be returned for a missing task hub or authentication failure. Authentication takes
-                    //    time to propagate so we should retry instead of making the user restart the application.
-                    // 3. In some cases, a task hub can be created separately from the scheduler. If a worker is deployed
-                    //    between the scheduler and task hub, it would need to be restarted to function.
-                    this.Logger.TaskHubNotFound();
-                }
-                catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
-                {
-                    // Shutting down, let's exit gracefully.
-                    return ProcessorExitReason.Shutdown;
-                }
-                catch (Exception ex)
-                {
-                    // Unknown failure - log it and retry on the next loop iteration.
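The retry helper's transient/permanent split is pinned down by the ExecuteWithRetryTests added earlier in this series: Unavailable, Unknown, DeadlineExceeded, and Internal are retried, while codes like InvalidArgument, AlreadyExists, and PermissionDenied surface immediately. A predicate matching that observed behavior (inferred from the tests, not copied from the helper):

// Classification inferred from ExecuteWithRetryTests; assumed to mirror
// the helper's internal check. Requires: using Grpc.Core;
static bool IsTransient(StatusCode code) => code switch
{
    StatusCode.Unavailable => true,      // sidecar briefly unreachable
    StatusCode.Unknown => true,          // ambiguous transport-level failure
    StatusCode.DeadlineExceeded => true, // slow call; may succeed on retry
    StatusCode.Internal => true,         // server-side hiccup
    _ => false,                          // InvalidArgument, PermissionDenied, etc. won't improve with retries
};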
- this.Logger.UnexpectedError(ex, string.Empty); - } - - if (channelLikelyPoisoned) - { - consecutiveChannelFailures++; - int threshold = this.internalOptions.ChannelRecreateFailureThreshold; - if (threshold > 0 && consecutiveChannelFailures >= threshold) - { - this.Logger.RecreatingChannel(consecutiveChannelFailures); - return ProcessorExitReason.ChannelRecreateRequested; - } - } - - try - { - TimeSpan delay = ReconnectBackoff.Compute( - reconnectAttempt, - this.internalOptions.ReconnectBackoffBase, - this.internalOptions.ReconnectBackoffCap, - backoffRandom); - this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds)); - reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt - await Task.Delay(delay, cancellation); - } - catch (OperationCanceledException) when (cancellation.IsCancellationRequested) - { - // Worker is shutting down - let the method exit gracefully - return ProcessorExitReason.Shutdown; - } - } - - return ProcessorExitReason.Shutdown; - } - - - static string GetActionsListForLogging(IReadOnlyList actions) - { - if (actions.Count == 0) - { - return string.Empty; - } - else if (actions.Count == 1) - { - return actions[0].OrchestratorActionTypeCase.ToString(); - } - else - { - // Returns something like "ScheduleTask x5, CreateTimer x1,..." - return string.Join(", ", actions - .GroupBy(a => a.OrchestratorActionTypeCase) - .Select(group => $"{group.Key} x{group.Count()}")); - } - } - - static P.TaskFailureDetails? EvaluateOrchestrationVersioning(DurableTaskWorkerOptions.VersioningOptions? versioning, string orchestrationVersion, out bool versionCheckFailed) - { - P.TaskFailureDetails? failureDetails = null; - versionCheckFailed = false; - if (versioning != null) - { - int versionComparison = TaskOrchestrationVersioningUtils.CompareVersions(orchestrationVersion, versioning.Version); - - switch (versioning.MatchStrategy) - { - case DurableTaskWorkerOptions.VersionMatchStrategy.None: - // No versioning, breakout. - break; - case DurableTaskWorkerOptions.VersionMatchStrategy.Strict: - // Comparison of 0 indicates equality. - if (versionComparison != 0) - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionMismatch", - ErrorMessage = $"The orchestration version '{orchestrationVersion}' does not match the worker version '{versioning.Version}'.", - IsNonRetriable = true, - }; - } - - break; - case DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder: - // Comparison > 0 indicates the orchestration version is greater than the worker version. - if (versionComparison > 0) - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionMismatch", - ErrorMessage = $"The orchestration version '{orchestrationVersion}' is greater than the worker version '{versioning.Version}'.", - IsNonRetriable = true, - }; - } - - break; - default: - // If there is a type of versioning we don't understand, it is better to treat it as a versioning failure. - failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionError", - ErrorMessage = $"The version match strategy '{orchestrationVersion}' is unknown.", - IsNonRetriable = true, - }; - break; - } - - versionCheckFailed = failureDetails != null; - } - - return failureDetails; - } - - async ValueTask BuildRuntimeStateAsync( - P.OrchestratorRequest orchestratorRequest, - ProtoUtils.EntityConversionState? entityConversionState, - CancellationToken cancellation) - { - Func converter = entityConversionState is null - ? 
ProtoUtils.ConvertHistoryEvent - : entityConversionState.ConvertFromProto; - - IEnumerable pastEvents = []; - if (orchestratorRequest.RequiresHistoryStreaming) - { - // Stream the remaining events from the remote service - P.StreamInstanceHistoryRequest streamRequest = new() - { - InstanceId = orchestratorRequest.InstanceId, - ExecutionId = orchestratorRequest.ExecutionId, - ForWorkItemProcessing = true, - }; - - using AsyncServerStreamingCall streamResponse = - this.client.StreamInstanceHistory(streamRequest, cancellationToken: cancellation); - - await foreach (P.HistoryChunk chunk in streamResponse.ResponseStream.ReadAllAsync(cancellation)) - { - pastEvents = pastEvents.Concat(chunk.Events.Select(converter)); - } - } - else - { - // The history was already provided in the work item request - pastEvents = orchestratorRequest.PastEvents.Select(converter); - } - - IEnumerable newEvents = orchestratorRequest.NewEvents.Select(converter); - - // Reconstruct the orchestration state in a way that correctly distinguishes new events from past events - var runtimeState = new OrchestrationRuntimeState(pastEvents.ToList()); - foreach (HistoryEvent e in newEvents) - { - // AddEvent() puts events into the NewEvents list. - runtimeState.AddEvent(e); - } - - if (runtimeState.ExecutionStartedEvent == null) - { - // TODO: What's the right way to handle this? Callback to the sidecar with a retriable error request? - throw new InvalidOperationException("The provided orchestration history was incomplete"); - } - - return runtimeState; - } - - async Task> ConnectAsync(CancellationToken cancellation) - { - TimeSpan helloDeadline = this.internalOptions.HelloDeadline; - DateTime? deadline = null; - - if (helloDeadline > TimeSpan.Zero) - { - // Clamp to a UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot - // throw ArgumentOutOfRangeException out of DateTime.Add and so the gRPC deadline remains - // unambiguous during internal normalization. - DateTime now = DateTime.UtcNow; - DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc); - TimeSpan maxOffset = maxDeadlineUtc - now; - deadline = helloDeadline >= maxOffset ? maxDeadlineUtc : now.Add(helloDeadline); - } - - await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); - this.Logger.EstablishedWorkItemConnection(); - - DurableTaskWorkerOptions workerOptions = this.worker.workerOptions; - - // Get the stream for receiving work-items - return this.client!.GetWorkItems( - new P.GetWorkItemsRequest - { - MaxConcurrentActivityWorkItems = - workerOptions.Concurrency.MaximumConcurrentActivityWorkItems, - MaxConcurrentOrchestrationWorkItems = - workerOptions.Concurrency.MaximumConcurrentOrchestrationWorkItems, - MaxConcurrentEntityWorkItems = - workerOptions.Concurrency.MaximumConcurrentEntityWorkItems, - Capabilities = { this.worker.grpcOptions.Capabilities }, - WorkItemFilters = this.worker.workItemFilters?.ToGrpcWorkItemFilters(), - }, - cancellationToken: cancellation); - } - - async Task ProcessWorkItemsAsync( - AsyncServerStreamingCall stream, - CancellationToken cancellation, - Action? onFirstMessage = null, - Action? onChannelLikelyPoisoned = null) - { - // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — - // including health pings sent periodically by the server — arrive within the configured - // window. If that fires we treat the stream as silently disconnected and reconnect. 
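BuildRuntimeStateAsync above depends on a specific OrchestrationRuntimeState contract: events handed to the constructor become replayed history (PastEvents), while AddEvent appends to NewEvents and drives the current execution episode. In isolation, with placeholder event sources:

// Minimal illustration of the past-vs-new split; 'replayedHistory' and
// 'freshEvents' are placeholder IEnumerable<HistoryEvent> collections.
var runtimeState = new OrchestrationRuntimeState(replayedHistory.ToList());
foreach (HistoryEvent e in freshEvents)
{
    runtimeState.AddEvent(e); // lands in NewEvents, not PastEvents
}

// As in the method above, an incomplete history (no ExecutionStartedEvent)
// is rejected before execution is attempted.
if (runtimeState.ExecutionStartedEvent == null)
{
    throw new InvalidOperationException("The provided orchestration history was incomplete");
}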
- TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; - - // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop. - // The underlying IAsyncStreamReader is single-use — once the server terminates the stream - // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext - // returns false forever and re-entering await foreach would tight-spin with no yield. - WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( - ct => stream.ResponseStream.ReadAllAsync(ct), - silentDisconnectTimeout, - workItem => this.DispatchWorkItem(workItem, cancellation), - onFirstMessage, - cancellation); - - switch (result.Outcome) - { - case WorkItemStreamOutcome.Shutdown: - return; - - case WorkItemStreamOutcome.SilentDisconnect: - // Stream stopped producing messages (including health pings) for longer than the - // configured window. Treat as a poisoned channel. - this.Logger.ConnectionTimeout(); - onChannelLikelyPoisoned?.Invoke(); - return; - - case WorkItemStreamOutcome.GracefulDrain: - // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY + - // OK trailers when a DTS instance is being replaced). Log it explicitly so - // operators can see it. Only count it toward the channel-poisoned threshold when - // the stream produced no messages: a stream that successfully delivered work and - // was then closed by the server is healthy behavior (e.g. routine rolling - // upgrade), and counting those would let a long-lived process accumulate spurious - // "poison" credits across many healthy drains. An empty drain, on the other hand, - // is a strong signal the channel is latched onto a dead/evacuated backend and - // needs to be recreated to pick up fresh DNS/routing. - this.Logger.StreamEndedByPeer(); - if (!result.FirstMessageObserved) - { - onChannelLikelyPoisoned?.Invoke(); - } - - return; - } - } - - void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) - { - if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunOrchestratorAsync( - workItem.OrchestratorRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunActivityAsync( - workItem.ActivityRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) - { - workItem.EntityRequestV2.ToEntityBatchRequest( - out EntityBatchRequest batchRequest, - out List operationInfos); - - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync( - batchRequest, - cancellation, - workItem.CompletionToken, - operationInfos), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) - { - // Health pings are heartbeat-only signals from the backend; the silent-disconnect - // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. - // Logging at Trace allows operators to confirm liveness without flooding info-level - // telemetry. 
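WorkItemStreamConsumer's internals are not part of this diff; the silent-disconnect behavior the comments describe is commonly built as a timer that re-arms on every message. A rough sketch under that assumption, reusing the names from the call above:

// Illustrative watchdog only; the real WorkItemStreamConsumer may differ.
using var watchdog = CancellationTokenSource.CreateLinkedTokenSource(cancellation);
watchdog.CancelAfter(silentDisconnectTimeout);

await foreach (P.WorkItem workItem in stream.ResponseStream.ReadAllAsync(watchdog.Token))
{
    // Any message, including a HealthPing, proves the stream is alive, so
    // reset the silent-disconnect window; CancelAfter re-arms the timer.
    watchdog.CancelAfter(silentDisconnectTimeout);
    this.DispatchWorkItem(workItem, cancellation);
}

// If the watchdog fired while 'cancellation' itself was not requested, the
// outcome maps to SilentDisconnect and the caller reconnects.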
- this.Logger.ReceivedHealthPing(); - } - else - { - this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); - } - } - - void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) - { - // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? - _ = Task.Run( - async () => - { - try - { - await handler(); - } - catch (OperationCanceledException) - { - // Shutting down - ignore - } - catch (Exception ex) - { - string instanceId = - workItem?.OrchestratorRequest?.InstanceId ?? - workItem?.ActivityRequest?.OrchestrationInstance?.InstanceId ?? - workItem?.EntityRequest?.InstanceId ?? - workItem?.EntityRequestV2?.InstanceId ?? - string.Empty; - this.Logger.UnexpectedError(ex, instanceId); - - if (workItem?.OrchestratorRequest != null) - { - try - { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellation); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } - } - else if (workItem?.ActivityRequest != null) - { - try - { - this.Logger.AbandoningActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem?.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = workItem?.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } - } - else if (workItem?.EntityRequest != null) - { - try - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem?.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem?.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); - } - } - else if (workItem?.EntityRequestV2 != null) - { - try - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem?.CompletionToken ?? 
string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem?.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); - } - } - } - }); - } - - async Task OnRunOrchestratorAsync( - P.OrchestratorRequest request, - string completionToken, - CancellationToken cancellationToken) - { - var executionStartedEvent = - request - .NewEvents - .Concat(request.PastEvents) - .Where(e => e.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.ExecutionStarted) - .Select(e => e.ExecutionStarted) - .FirstOrDefault(); - - Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution( - executionStartedEvent, - request.OrchestrationTraceContext); - - if (executionStartedEvent is not null) - { - P.HistoryEvent? GetSuborchestrationInstanceCreatedEvent(int eventId) - { - var subOrchestrationEvent = - request - .PastEvents - .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCreated) - .FirstOrDefault(x => x.EventId == eventId); - - return subOrchestrationEvent; - } - - P.HistoryEvent? GetTaskScheduledEvent(int eventId) - { - var taskScheduledEvent = - request - .PastEvents - .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.TaskScheduled) - .LastOrDefault(x => x.EventId == eventId); - - return taskScheduledEvent; - } - - foreach (var newEvent in request.NewEvents) - { - switch (newEvent.EventTypeCase) - { - case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCompleted: - { - P.HistoryEvent? subOrchestrationInstanceCreatedEvent = - GetSuborchestrationInstanceCreatedEvent( - newEvent.SubOrchestrationInstanceCompleted.TaskScheduledId); - - TraceHelper.EmitTraceActivityForSubOrchestrationCompleted( - request.InstanceId, - subOrchestrationInstanceCreatedEvent, - subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceFailed: - { - P.HistoryEvent? subOrchestrationInstanceCreatedEvent = - GetSuborchestrationInstanceCreatedEvent( - newEvent.SubOrchestrationInstanceFailed.TaskScheduledId); - - TraceHelper.EmitTraceActivityForSubOrchestrationFailed( - request.InstanceId, - subOrchestrationInstanceCreatedEvent, - subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated, - newEvent.SubOrchestrationInstanceFailed); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TaskCompleted: - { - P.HistoryEvent? taskScheduledEvent = - GetTaskScheduledEvent(newEvent.TaskCompleted.TaskScheduledId); - - TraceHelper.EmitTraceActivityForTaskCompleted( - request.InstanceId, - taskScheduledEvent, - taskScheduledEvent?.TaskScheduled); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TaskFailed: - { - P.HistoryEvent? taskScheduledEvent = - GetTaskScheduledEvent(newEvent.TaskFailed.TaskScheduledId); - - TraceHelper.EmitTraceActivityForTaskFailed( - request.InstanceId, - taskScheduledEvent, - taskScheduledEvent?.TaskScheduled, - newEvent.TaskFailed); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TimerFired: - TraceHelper.EmitTraceActivityForTimer( - request.InstanceId, - executionStartedEvent.Name, - newEvent.Timestamp.ToDateTime(), - newEvent.TimerFired); - break; - } - } - } - - OrchestratorExecutionResult? 
result = null; - P.TaskFailureDetails? failureDetails = null; - TaskName name = new("(unknown)"); - - ProtoUtils.EntityConversionState? entityConversionState = - this.internalOptions.ConvertOrchestrationEntityEvents - ? new(this.internalOptions.InsertEntityUnlocksOnCompletion) - : null; - - DurableTaskWorkerOptions.VersioningOptions? versioning = this.worker.workerOptions.Versioning; - bool versionFailure = false; - try - { - OrchestrationRuntimeState runtimeState = await this.BuildRuntimeStateAsync( - request, - entityConversionState, - cancellationToken); - - bool filterPassed = true; - if (this.orchestrationFilter != null) - { - filterPassed = await this.orchestrationFilter.IsOrchestrationValidAsync( - new OrchestrationFilterParameters - { - Name = runtimeState.Name, - Tags = runtimeState.Tags != null ? new Dictionary(runtimeState.Tags) : null, - }, - cancellationToken); - } - - if (!filterPassed) - { - this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; - } - - // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. - failureDetails = EvaluateOrchestrationVersioning(versioning, runtimeState.Version, out versionFailure); - - // Only continue with the work if the versioning check passed. - if (failureDetails == null) - { - name = new TaskName(runtimeState.Name); - - this.Logger.ReceivedOrchestratorRequest( - name, - request.InstanceId, - runtimeState.PastEvents.Count, - runtimeState.NewEvents.Count); - - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - if (this.worker.Factory.TryCreateOrchestrator( - name, scope.ServiceProvider, out ITaskOrchestrator? orchestrator)) - { - // Both the factory invocation and the ExecuteAsync could involve user code and need to be handled - // as part of try/catch. - ParentOrchestrationInstance? parent = runtimeState.ParentInstance switch - { - ParentInstance p => new(new(p.Name), p.OrchestrationInstance.InstanceId), - _ => null, - }; - - TaskOrchestration shim = this.shimFactory.CreateOrchestration(name, orchestrator, parent); - TaskOrchestrationExecutor executor = new( - runtimeState, - shim, - BehaviorOnContinueAsNew.Carryover, - request.EntityParameters.ToCore(), - ErrorPropagationMode.UseFailureDetails, - this.exceptionPropertiesProvider); - result = executor.Execute(); - } - else - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "OrchestratorTaskNotFound", - ErrorMessage = $"No orchestrator task named '{name}' was found.", - IsNonRetriable = true, - }; - } - } - } - catch (Exception unexpected) - { - // This is not expected: Normally TaskOrchestrationExecutor handles exceptions in user code. 
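The comment above distinguishes user-code failures (normally captured by TaskOrchestrationExecutor and surfaced as failure details) from infrastructure failures that escape to this catch. A minimal sketch of that capture pattern, using a hypothetical helper rather than the executor's real code:

    // User-code failures become data (failure details); they are not rethrown.
    static async Task<(string? Output, Exception? UserFailure)> RunUserCodeAsync(
        Func<Task<string>> userCode)
    {
        try
        {
            return (await userCode(), null);  // success path: output, no failure
        }
        catch (Exception ex)
        {
            return (null, ex);                // failure path: captured, to be serialized
        }
    }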
- this.Logger.OrchestratorFailed(name, request.InstanceId, unexpected.ToString()); - failureDetails = unexpected.ToTaskFailureDetails(this.exceptionPropertiesProvider); - } - - P.OrchestratorResponse response; - if (result != null) - { - response = ProtoUtils.ConstructOrchestratorResponse( - request.InstanceId, - request.ExecutionId, - result.CustomStatus, - result.Actions, - completionToken, - entityConversionState, - traceActivity); - } - else if (versioning != null && failureDetails != null && versionFailure) - { - this.Logger.OrchestrationVersionFailure(versioning.FailureStrategy.ToString(), failureDetails.ErrorMessage); - if (versioning.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Fail) - { - response = new P.OrchestratorResponse - { - InstanceId = request.InstanceId, - CompletionToken = completionToken, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = failureDetails, - }, - }, - }, - }; - } - else - { - this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; - } - } - else - { - // This is the case for failures that happened *outside* the orchestrator executor - response = new P.OrchestratorResponse - { - InstanceId = request.InstanceId, - CompletionToken = completionToken, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = failureDetails, - }, - }, - }, - }; - } - - var completeOrchestrationAction = response.Actions.FirstOrDefault( - a => a.CompleteOrchestration is not null); - - if (completeOrchestrationAction is not null) - { - if (completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus == P.OrchestrationStatus.Failed) - { - traceActivity?.SetStatus( - ActivityStatusCode.Error, - completeOrchestrationAction.CompleteOrchestration.Result); - } - - traceActivity?.SetTag( - Schema.Task.Status, - completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus.ToString()); - - traceActivity?.Dispose(); - } - - this.Logger.SendingOrchestratorResponse( - name, - response.InstanceId, - response.Actions.Count, - GetActionsListForLogging(response.Actions)); - - await this.CompleteOrchestratorTaskWithChunkingAsync( - response, - this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, - cancellationToken); - } - - async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) - { - using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); - - OrchestrationInstance instance = request.OrchestrationInstance.ToCore(); - string rawInput = request.Input; - int inputSize = rawInput != null ? Encoding.UTF8.GetByteCount(rawInput) : 0; - this.Logger.ReceivedActivityRequest(request.Name, request.TaskId, instance.InstanceId, inputSize); - - P.TaskFailureDetails? failureDetails = null; - TaskContext innerContext = new(instance); - innerContext.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; - - TaskName name = new(request.Name); - string? 
output = null; - - failureDetails = EvaluateOrchestrationVersioning(this.worker.workerOptions.Versioning, request.Version, out bool versioningFailed); - if (!versioningFailed) - { - try - { - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - if (this.worker.Factory.TryCreateActivity(name, scope.ServiceProvider, out ITaskActivity? activity)) - { - // Both the factory invocation and the RunAsync could involve user code and need to be handled as - // part of try/catch. - TaskActivity shim = this.shimFactory.CreateActivity(name, activity); - output = await shim.RunAsync(innerContext, request.Input); - } - else - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "ActivityTaskNotFound", - ErrorMessage = $"No activity task named '{name}' was found.", - IsNonRetriable = true, - }; - } - } - catch (Exception applicationException) - { - failureDetails = applicationException.ToTaskFailureDetails(this.exceptionPropertiesProvider); - } - } - else - { - if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) - { - this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); - } - - return; - } - - int outputSizeInBytes = 0; - if (failureDetails != null) - { - traceActivity?.SetStatus(ActivityStatusCode.Error, failureDetails.ErrorMessage); - - outputSizeInBytes = failureDetails.GetApproximateByteCount(); - } - else if (output != null) - { - outputSizeInBytes = Encoding.UTF8.GetByteCount(output); - } - - string successOrFailure = failureDetails != null ? "failure" : "success"; - this.Logger.SendingActivityResponse( - successOrFailure, name, request.TaskId, instance.InstanceId, outputSizeInBytes); - - P.ActivityResponse response = new() - { - InstanceId = instance.InstanceId, - TaskId = request.TaskId, - Result = output, - FailureDetails = failureDetails, - CompletionToken = completionToken, - }; - - // Stop the trace activity here to avoid including the completion time in the latency calculation - traceActivity?.Stop(); - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteActivityTaskAsync), - cancellation); - } - - async Task OnRunEntityBatchAsync( - EntityBatchRequest batchRequest, - CancellationToken cancellation, - string? completionToken = null, - List? operationInfos = null) - { - var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); - EntityId entityId = new(coreEntityId.Name, coreEntityId.Key); - - TaskName name = new(entityId.Name); - - EntityBatchResult? batchResult; - - try - { - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - IDurableTaskFactory2 factory = (IDurableTaskFactory2)this.worker.Factory; - - if (factory.TryCreateEntity(name, scope.ServiceProvider, out ITaskEntity? entity)) - { - // Both the factory invocation and the RunAsync could involve user code and need to be handled as - // part of try/catch. 
- TaskEntity shim = this.shimFactory.CreateEntity(name, entity, entityId); - batchResult = await shim.ExecuteOperationBatchAsync(batchRequest); - } - else - { - // we could not find the entity. This is considered an application error, - // so we return a non-retriable error-OperationResult for each operation in the batch. - batchResult = new EntityBatchResult() - { - Actions = [], // no actions - EntityState = batchRequest.EntityState, // state is unmodified - Results = Enumerable.Repeat( - new OperationResult() - { - FailureDetails = new FailureDetails( - errorType: "EntityTaskNotFound", - errorMessage: $"No entity task named '{name}' was found.", - stackTrace: null, - innerFailure: null, - isNonRetriable: true), - }, - batchRequest.Operations!.Count).ToList(), - FailureDetails = null, - }; - } - } - catch (Exception frameworkException) - { - // return a result with failure details. - // this will cause the batch to be abandoned and retried - // (possibly after a delay and on a different worker). - batchResult = new EntityBatchResult() - { - FailureDetails = new FailureDetails(frameworkException), - }; - } - - P.EntityBatchResult response = batchResult.ToEntityBatchResult( - completionToken, - operationInfos?.Take(batchResult.Results?.Count ?? 0)); - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteEntityTaskAsync), - cancellation); - } - - /// - /// Completes an orchestration task with automatic chunking if the response exceeds the maximum size. - /// - /// The orchestrator response to send. - /// The maximum size in bytes for each chunk. - /// The cancellation token. - async Task CompleteOrchestratorTaskWithChunkingAsync( - P.OrchestratorResponse response, - int maxChunkBytes, - CancellationToken cancellationToken) - { - // Validate that no single action exceeds the maximum chunk size - static P.TaskFailureDetails? ValidateActionsSize(IEnumerable actions, int maxChunkBytes) - { - foreach (P.OrchestratorAction action in actions) - { - int actionSize = action.CalculateSize(); - if (actionSize > maxChunkBytes) - { - // TODO: large payload doc is not available yet on aka.ms, add doc link to below error message - string errorMessage = $"A single orchestrator action of type {action.OrchestratorActionTypeCase} with id {action.Id} " + - $"exceeds the {maxChunkBytes / 1024.0 / 1024.0:F2}MB limit: {actionSize / 1024.0 / 1024.0:F2}MB. " + - "Enable large-payload externalization to Azure Blob Storage to support oversized actions."; - return new P.TaskFailureDetails - { - ErrorType = typeof(InvalidOperationException).FullName, - ErrorMessage = errorMessage, - IsNonRetriable = true, - }; - } - } - - return null; - } - - P.TaskFailureDetails? validationFailure = this.worker.grpcOptions.Capabilities.Contains(P.WorkerCapability.LargePayloads) - ? 
null - : ValidateActionsSize(response.Actions, maxChunkBytes); - if (validationFailure != null) - { - // Complete the orchestration with a failed status and failure details - P.OrchestratorResponse failureResponse = new() - { - InstanceId = response.InstanceId, - CompletionToken = response.CompletionToken, - OrchestrationTraceContext = response.OrchestrationTraceContext, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = validationFailure, - }, - }, - }, - }; - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - return; - } - - // Helper to add an action to the current chunk if it fits - static bool TryAddAction( - Google.Protobuf.Collections.RepeatedField dest, - P.OrchestratorAction action, - ref int currentSize, - int maxChunkBytes) - { - int actionSize = action.CalculateSize(); - if (currentSize + actionSize > maxChunkBytes && currentSize > 0) - { - return false; - } - - dest.Add(action); - currentSize += actionSize; - return true; - } - - // Check if the entire response fits in one chunk - int totalSize = response.CalculateSize(); - if (totalSize <= maxChunkBytes) - { - // Response fits in one chunk, send it directly (isPartial defaults to false) - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - return; - } - - // Response is too large, split into multiple chunks - int actionsCompletedSoFar = 0, chunkIndex = 0; - List allActions = response.Actions.ToList(); - bool isPartial = true; - bool isChunkedMode = false; - - while (isPartial) - { - P.OrchestratorResponse chunkedResponse = new() - { - InstanceId = response.InstanceId, - CustomStatus = response.CustomStatus, - CompletionToken = response.CompletionToken, - RequiresHistory = response.RequiresHistory, - NumEventsProcessed = 0, - }; - - int chunkPayloadSize = 0; - - // Fill the chunk with actions until we reach the size limit - while (actionsCompletedSoFar < allActions.Count && - TryAddAction(chunkedResponse.Actions, allActions[actionsCompletedSoFar], ref chunkPayloadSize, maxChunkBytes)) - { - actionsCompletedSoFar++; - } - - // Determine if this is a partial chunk (more actions remaining) - isPartial = actionsCompletedSoFar < allActions.Count; - chunkedResponse.IsPartial = isPartial; - - // Only activate chunked mode when we actually need multiple chunks. - // A single oversized action that fits in one chunk (via TryAddAction allowing - // the first item in an empty chunk) should be sent as non-chunked to avoid - // backend issues with ChunkIndex=0 + IsPartial=false. - if (isPartial) - { - isChunkedMode = true; - } - - if (isChunkedMode) - { - chunkedResponse.ChunkIndex = chunkIndex; - } - - if (chunkIndex == 0) - { - // The first chunk preserves the original response's NumEventsProcessed value (null) - // When this is set to null, backend by default handles all the messages in the workitem. - // For subsequent chunks, we set it to 0 since all messages are already handled in first chunk. 
- chunkedResponse.NumEventsProcessed = null; - chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext; - } - - chunkIndex++; - - // Send the chunk - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - } - } - - async Task ExecuteWithRetryAsync( - Func action, - string operationName, - CancellationToken cancellationToken) - { - const int maxAttempts = 10; - TimeSpan delay = TimeSpan.FromMilliseconds(200); - - for (int attempt = 1; ; attempt++) - { - try - { - await action(); - return; - } - catch (RpcException ex) when ( - (ex.StatusCode == StatusCode.Unavailable || - ex.StatusCode == StatusCode.Unknown || - ex.StatusCode == StatusCode.DeadlineExceeded || - ex.StatusCode == StatusCode.Internal) && - attempt < maxAttempts) - { - // Back off with jitter for transient transport errors -#if NET6_0_OR_GREATER - int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2)); -#else - int jitterMs = new Random().Next(0, (int)(delay.TotalMilliseconds * 0.2)); -#endif - TimeSpan backoff = delay + TimeSpan.FromMilliseconds(jitterMs); - - this.Logger.TransientGrpcRetry( - operationName, - attempt, - maxAttempts, - backoff.TotalMilliseconds, - (int)ex.StatusCode, - ex); - - try - { - await Task.Delay(backoff, cancellationToken); - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) - { - // If shutting down during the retry delay, propagate the cancellation exception - throw; - } - - // Exponential increase, capping at 15 seconds - delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000)); - continue; - } - } - } - } -} +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Diagnostics; +using System.Linq; +using System.Text; +using DurableTask.Core; +using DurableTask.Core.Entities; +using DurableTask.Core.Entities.OperationFormat; +using DurableTask.Core.History; +using Google.Protobuf; +using Microsoft.DurableTask.Abstractions; +using Microsoft.DurableTask.Entities; +using Microsoft.DurableTask.Tracing; +using Microsoft.DurableTask.Worker.Grpc.Internal; +using Microsoft.DurableTask.Worker.Shims; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using static Microsoft.DurableTask.Protobuf.TaskHubSidecarService; +using ActivityStatusCode = System.Diagnostics.ActivityStatusCode; +using DTCore = DurableTask.Core; +using P = Microsoft.DurableTask.Protobuf; + +namespace Microsoft.DurableTask.Worker.Grpc; + +/// +/// The gRPC Durable Task worker. +/// +sealed partial class GrpcDurableTaskWorker +{ + class Processor + { + static readonly Google.Protobuf.WellKnownTypes.Empty EmptyMessage = new(); + + readonly GrpcDurableTaskWorker worker; + readonly TaskHubSidecarServiceClient client; + readonly DurableTaskShimFactory shimFactory; + readonly GrpcDurableTaskWorkerOptions.InternalOptions internalOptions; + readonly DTCore.IExceptionPropertiesProvider? exceptionPropertiesProvider; + [Obsolete("Experimental")] + readonly IOrchestrationFilter? orchestrationFilter; + + public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient client, IOrchestrationFilter? orchestrationFilter = null, IExceptionPropertiesProvider? 
exceptionPropertiesProvider = null)
+        {
+            this.worker = worker;
+            this.client = client;
+            this.shimFactory = new DurableTaskShimFactory(this.worker.grpcOptions, this.worker.loggerFactory);
+            this.internalOptions = this.worker.grpcOptions.Internal;
+            this.orchestrationFilter = orchestrationFilter;
+            this.exceptionPropertiesProvider = exceptionPropertiesProvider is not null
+                ? new ExceptionPropertiesProviderAdapter(exceptionPropertiesProvider)
+                : null;
+        }
+
+        ILogger Logger => this.worker.logger;
+
+        public async Task<ProcessorExitReason> ExecuteAsync(CancellationToken cancellation)
+        {
+            // Tracks consecutive failures against the same channel. Reset only after the stream
+            // has actually delivered a message (HelloAsync alone is not proof the channel is healthy).
+            int consecutiveChannelFailures = 0;
+
+            // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message.
+            int reconnectAttempt = 0;
+            Random backoffRandom = ReconnectBackoff.CreateRandom();
+
+            while (!cancellation.IsCancellationRequested)
+            {
+                bool channelLikelyPoisoned = false;
+                try
+                {
+                    using AsyncServerStreamingCall<P.WorkItem> stream = await this.ConnectAsync(cancellation);
+                    await this.ProcessWorkItemsAsync(
+                        stream,
+                        cancellation,
+                        onFirstMessage: () =>
+                        {
+                            consecutiveChannelFailures = 0;
+                            reconnectAttempt = 0;
+                        },
+                        onChannelLikelyPoisoned: () => channelLikelyPoisoned = true);
+                }
+                catch (RpcException) when (cancellation.IsCancellationRequested)
+                {
+                    // Worker is shutting down - let the method exit gracefully
+                    return ProcessorExitReason.Shutdown;
+                }
+                catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled)
+                {
+                    // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold:
+                    // Cancelled is ambiguous and shouldn't drive recreate storms.
+                    this.Logger.SidecarDisconnected();
+                }
+                catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded)
+                {
+                    // Only HelloAsync carries a deadline. Once the work-item stream is established,
+                    // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines.
+                    // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel.
+                    this.Logger.HelloTimeout(this.internalOptions.HelloDeadline);
+                    channelLikelyPoisoned = true;
+                }
+                catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable)
+                {
+                    // Sidecar is down - keep retrying.
+                    this.Logger.SidecarUnavailable();
+                    channelLikelyPoisoned = true;
+                }
+                catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated)
+                {
+                    // Auth rejection: log distinctly so it's diagnosable. Do not count toward channel
+                    // recreate: a fresh channel won't fix bad credentials. Reset the consecutive-failure
+                    // counters: a status reply is proof the transport itself is healthy, so prior
+                    // transport failures should not combine with later ones to trip the recreate.
+                    this.Logger.AuthenticationFailed(ex);
+                    consecutiveChannelFailures = 0;
+                    reconnectAttempt = 0;
+                }
+                catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound)
+                {
+                    // We retry on a NotFound for several reasons:
+                    // 1. It was the existing behavior through the UnexpectedError path.
+                    // 2. A 404 can be returned for a missing task hub or authentication failure. Authentication takes
+                    //    time to propagate so we should retry instead of making the user restart the application.
+                    // 3. In some cases, a task hub can be created separately from the scheduler. If a worker is deployed
+                    //    between the scheduler and task hub, it would need to be restarted to function.
+                    this.Logger.TaskHubNotFound();
+                }
+                catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
+                {
+                    // Shutting down; let's exit gracefully.
+                    return ProcessorExitReason.Shutdown;
+                }
+                catch (Exception ex)
+                {
+                    // Unknown failure - log it and retry after the backoff below.
+                    this.Logger.UnexpectedError(ex, string.Empty);
+                }
+
+                if (channelLikelyPoisoned)
+                {
+                    consecutiveChannelFailures++;
+                    int threshold = this.internalOptions.ChannelRecreateFailureThreshold;
+                    if (threshold > 0 && consecutiveChannelFailures >= threshold)
+                    {
+                        this.Logger.RecreatingChannel(consecutiveChannelFailures);
+                        return ProcessorExitReason.ChannelRecreateRequested;
+                    }
+                }
+
+                try
+                {
+                    TimeSpan delay = ReconnectBackoff.Compute(
+                        reconnectAttempt,
+                        this.internalOptions.ReconnectBackoffBase,
+                        this.internalOptions.ReconnectBackoffCap,
+                        backoffRandom);
+                    this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds));
+                    reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt
+                    await Task.Delay(delay, cancellation);
+                }
+                catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
+                {
+                    // Worker is shutting down - let the method exit gracefully
+                    return ProcessorExitReason.Shutdown;
+                }
+            }
+
+            return ProcessorExitReason.Shutdown;
+        }
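ReconnectBackoff's implementation is not included in this diff. From the call above (an attempt counter, a base delay, a cap, and a shared Random), it is presumably a capped exponential backoff with jitter; a minimal sketch under that assumption (the name ComputeBackoff is illustrative):

    static TimeSpan ComputeBackoff(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random)
    {
        // Exponential growth: baseDelay * 2^attempt. The caller clamps the attempt counter
        // (see the Math.Min(reconnectAttempt + 1, 30) above) so the power cannot overflow.
        double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, attempt);
        double cappedMs = Math.Min(exponentialMs, cap.TotalMilliseconds);

        // Full jitter: sample uniformly from [0, capped] so a fleet of workers that all lost
        // the same backend does not reconnect in lockstep.
        return TimeSpan.FromMilliseconds(random.NextDouble() * cappedMs);
    }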
+
+        static string GetActionsListForLogging(IReadOnlyList<P.OrchestratorAction> actions)
+        {
+            if (actions.Count == 0)
+            {
+                return string.Empty;
+            }
+            else if (actions.Count == 1)
+            {
+                return actions[0].OrchestratorActionTypeCase.ToString();
+            }
+            else
+            {
+                // Returns something like "ScheduleTask x5, CreateTimer x1,..."
+                return string.Join(", ", actions
+                    .GroupBy(a => a.OrchestratorActionTypeCase)
+                    .Select(group => $"{group.Key} x{group.Count()}"));
+            }
+        }
+
+        static P.TaskFailureDetails? EvaluateOrchestrationVersioning(DurableTaskWorkerOptions.VersioningOptions? versioning, string orchestrationVersion, out bool versionCheckFailed)
+        {
+            P.TaskFailureDetails? failureDetails = null;
+            versionCheckFailed = false;
+            if (versioning != null)
+            {
+                int versionComparison = TaskOrchestrationVersioningUtils.CompareVersions(orchestrationVersion, versioning.Version);
+
+                switch (versioning.MatchStrategy)
+                {
+                    case DurableTaskWorkerOptions.VersionMatchStrategy.None:
+                        // No versioning; break out.
+                        break;
+                    case DurableTaskWorkerOptions.VersionMatchStrategy.Strict:
+                        // Comparison of 0 indicates equality.
+                        if (versionComparison != 0)
+                        {
+                            failureDetails = new P.TaskFailureDetails
+                            {
+                                ErrorType = "VersionMismatch",
+                                ErrorMessage = $"The orchestration version '{orchestrationVersion}' does not match the worker version '{versioning.Version}'.",
+                                IsNonRetriable = true,
+                            };
+                        }
+
+                        break;
+                    case DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder:
+                        // Comparison > 0 indicates the orchestration version is greater than the worker version.
+                        if (versionComparison > 0)
+                        {
+                            failureDetails = new P.TaskFailureDetails
+                            {
+                                ErrorType = "VersionMismatch",
+                                ErrorMessage = $"The orchestration version '{orchestrationVersion}' is greater than the worker version '{versioning.Version}'.",
+                                IsNonRetriable = true,
+                            };
+                        }
+
+                        break;
+                    default:
+                        // If there is a type of versioning we don't understand, it is better to treat it as a versioning failure.
+                        failureDetails = new P.TaskFailureDetails
+                        {
+                            ErrorType = "VersionError",
+                            ErrorMessage = $"The version match strategy '{versioning.MatchStrategy}' is unknown.",
+                            IsNonRetriable = true,
+                        };
+                        break;
+                }
+
+                versionCheckFailed = failureDetails != null;
+            }
+
+            return failureDetails;
+        }
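To make the two match strategies concrete, here is how they behave on sample versions, assuming CompareVersions orders dotted version strings numerically (TaskOrchestrationVersioningUtils is not shown in this diff, so System.Version stands in for it here):

    static bool WorkerAccepts(string orchestrationVersion, string workerVersion, bool strict)
    {
        // Stand-in comparison: negative = orchestration older, 0 = equal, positive = newer.
        int cmp = new Version(orchestrationVersion).CompareTo(new Version(workerVersion));
        return strict ? cmp == 0 : cmp <= 0;
    }

    // WorkerAccepts("1.1.0", "1.2.0", strict: false) => true  (CurrentOrOlder takes older work)
    // WorkerAccepts("1.3.0", "1.2.0", strict: false) => false (orchestration is newer than worker)
    // WorkerAccepts("1.1.0", "1.2.0", strict: true)  => false (Strict requires an exact match)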
+
+        async ValueTask<OrchestrationRuntimeState> BuildRuntimeStateAsync(
+            P.OrchestratorRequest orchestratorRequest,
+            ProtoUtils.EntityConversionState? entityConversionState,
+            CancellationToken cancellation)
+        {
+            Func<P.HistoryEvent, HistoryEvent> converter = entityConversionState is null
+                ? ProtoUtils.ConvertHistoryEvent
+                : entityConversionState.ConvertFromProto;
+
+            IEnumerable<HistoryEvent> pastEvents = [];
+            if (orchestratorRequest.RequiresHistoryStreaming)
+            {
+                // Stream the remaining events from the remote service
+                P.StreamInstanceHistoryRequest streamRequest = new()
+                {
+                    InstanceId = orchestratorRequest.InstanceId,
+                    ExecutionId = orchestratorRequest.ExecutionId,
+                    ForWorkItemProcessing = true,
+                };
+
+                using AsyncServerStreamingCall<P.HistoryChunk> streamResponse =
+                    this.client.StreamInstanceHistory(streamRequest, cancellationToken: cancellation);
+
+                await foreach (P.HistoryChunk chunk in streamResponse.ResponseStream.ReadAllAsync(cancellation))
+                {
+                    pastEvents = pastEvents.Concat(chunk.Events.Select(converter));
+                }
+            }
+            else
+            {
+                // The history was already provided in the work item request
+                pastEvents = orchestratorRequest.PastEvents.Select(converter);
+            }
+
+            IEnumerable<HistoryEvent> newEvents = orchestratorRequest.NewEvents.Select(converter);
+
+            // Reconstruct the orchestration state in a way that correctly distinguishes new events from past events
+            var runtimeState = new OrchestrationRuntimeState(pastEvents.ToList());
+            foreach (HistoryEvent e in newEvents)
+            {
+                // AddEvent() puts events into the NewEvents list.
+                runtimeState.AddEvent(e);
+            }
+
+            if (runtimeState.ExecutionStartedEvent == null)
+            {
+                // TODO: What's the right way to handle this? Callback to the sidecar with a retriable error request?
+                throw new InvalidOperationException("The provided orchestration history was incomplete");
+            }
+
+            return runtimeState;
+        }
+
+        async Task<AsyncServerStreamingCall<P.WorkItem>> ConnectAsync(CancellationToken cancellation)
+        {
+            TimeSpan helloDeadline = this.internalOptions.HelloDeadline;
+            DateTime? deadline = null;
+
+            if (helloDeadline > TimeSpan.Zero)
+            {
+                // Clamp to a UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot
+                // throw ArgumentOutOfRangeException out of DateTime.Add and so the gRPC deadline remains
+                // unambiguous during internal normalization.
+                DateTime now = DateTime.UtcNow;
+                DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc);
+                TimeSpan maxOffset = maxDeadlineUtc - now;
+                deadline = helloDeadline >= maxOffset ?
maxDeadlineUtc : now.Add(helloDeadline); + } + + await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); + this.Logger.EstablishedWorkItemConnection(); + + DurableTaskWorkerOptions workerOptions = this.worker.workerOptions; + + // Get the stream for receiving work-items + return this.client!.GetWorkItems( + new P.GetWorkItemsRequest + { + MaxConcurrentActivityWorkItems = + workerOptions.Concurrency.MaximumConcurrentActivityWorkItems, + MaxConcurrentOrchestrationWorkItems = + workerOptions.Concurrency.MaximumConcurrentOrchestrationWorkItems, + MaxConcurrentEntityWorkItems = + workerOptions.Concurrency.MaximumConcurrentEntityWorkItems, + Capabilities = { this.worker.grpcOptions.Capabilities }, + WorkItemFilters = this.worker.workItemFilters?.ToGrpcWorkItemFilters(), + }, + cancellationToken: cancellation); + } + + async Task ProcessWorkItemsAsync( + AsyncServerStreamingCall stream, + CancellationToken cancellation, + Action? onFirstMessage = null, + Action? onChannelLikelyPoisoned = null) + { + // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — + // including health pings sent periodically by the server — arrive within the configured + // window. If that fires we treat the stream as silently disconnected and reconnect. + TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; + + // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop. + // The underlying IAsyncStreamReader is single-use — once the server terminates the stream + // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext + // returns false forever and re-entering await foreach would tight-spin with no yield. + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + ct => stream.ResponseStream.ReadAllAsync(ct), + silentDisconnectTimeout, + workItem => this.DispatchWorkItem(workItem, cancellation), + onFirstMessage, + cancellation); + + switch (result.Outcome) + { + case WorkItemStreamOutcome.Shutdown: + return; + + case WorkItemStreamOutcome.SilentDisconnect: + // Stream stopped producing messages (including health pings) for longer than the + // configured window. Treat as a poisoned channel. + this.Logger.ConnectionTimeout(); + onChannelLikelyPoisoned?.Invoke(); + return; + + case WorkItemStreamOutcome.GracefulDrain: + // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY + + // OK trailers when a DTS instance is being replaced). Log it explicitly so + // operators can see it. Only count it toward the channel-poisoned threshold when + // the stream produced no messages: a stream that successfully delivered work and + // was then closed by the server is healthy behavior (e.g. routine rolling + // upgrade), and counting those would let a long-lived process accumulate spurious + // "poison" credits across many healthy drains. An empty drain, on the other hand, + // is a strong signal the channel is latched onto a dead/evacuated backend and + // needs to be recreated to pick up fresh DNS/routing. 
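The counting rule in the comment above reduces to a small predicate over the stream outcome. Restated here with a local enum so the sketch stands alone (the real WorkItemStreamOutcome type carries the same cases):

    enum StreamOutcome { Shutdown, SilentDisconnect, GracefulDrain }

    static bool CountsTowardPoisonThreshold(StreamOutcome outcome, bool firstMessageObserved) =>
        outcome switch
        {
            StreamOutcome.SilentDisconnect => true,               // no traffic at all: always suspicious
            StreamOutcome.GracefulDrain => !firstMessageObserved, // only an *empty* drain counts
            _ => false,                                           // a clean shutdown never counts
        };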
+ this.Logger.StreamEndedByPeer(); + if (!result.FirstMessageObserved) + { + onChannelLikelyPoisoned?.Invoke(); + } + + return; + } + } + + void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) + { + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunOrchestratorAsync( + workItem.OrchestratorRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunActivityAsync( + workItem.ActivityRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) + { + workItem.EntityRequestV2.ToEntityBatchRequest( + out EntityBatchRequest batchRequest, + out List operationInfos); + + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync( + batchRequest, + cancellation, + workItem.CompletionToken, + operationInfos), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) + { + // Health pings are heartbeat-only signals from the backend; the silent-disconnect + // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. + // Logging at Trace allows operators to confirm liveness without flooding info-level + // telemetry. + this.Logger.ReceivedHealthPing(); + } + else + { + this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); + } + } + + void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) + { + // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? + _ = Task.Run( + async () => + { + try + { + await handler(); + } + catch (OperationCanceledException) + { + // Shutting down - ignore + } + catch (Exception ex) + { + string instanceId = + workItem?.OrchestratorRequest?.InstanceId ?? + workItem?.ActivityRequest?.OrchestrationInstance?.InstanceId ?? + workItem?.EntityRequest?.InstanceId ?? + workItem?.EntityRequestV2?.InstanceId ?? + string.Empty; + this.Logger.UnexpectedError(ex, instanceId); + + if (workItem?.OrchestratorRequest != null) + { + try + { + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellation); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instanceId); + } + } + else if (workItem?.ActivityRequest != null) + { + try + { + this.Logger.AbandoningActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem?.CompletionToken ?? 
string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = workItem?.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instanceId); + } + } + else if (workItem?.EntityRequest != null) + { + try + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem?.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem?.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); + } + } + else if (workItem?.EntityRequestV2 != null) + { + try + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem?.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem?.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); + } + } + } + }); + } + + async Task OnRunOrchestratorAsync( + P.OrchestratorRequest request, + string completionToken, + CancellationToken cancellationToken) + { + var executionStartedEvent = + request + .NewEvents + .Concat(request.PastEvents) + .Where(e => e.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.ExecutionStarted) + .Select(e => e.ExecutionStarted) + .FirstOrDefault(); + + Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution( + executionStartedEvent, + request.OrchestrationTraceContext); + + if (executionStartedEvent is not null) + { + P.HistoryEvent? GetSuborchestrationInstanceCreatedEvent(int eventId) + { + var subOrchestrationEvent = + request + .PastEvents + .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCreated) + .FirstOrDefault(x => x.EventId == eventId); + + return subOrchestrationEvent; + } + + P.HistoryEvent? GetTaskScheduledEvent(int eventId) + { + var taskScheduledEvent = + request + .PastEvents + .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.TaskScheduled) + .LastOrDefault(x => x.EventId == eventId); + + return taskScheduledEvent; + } + + foreach (var newEvent in request.NewEvents) + { + switch (newEvent.EventTypeCase) + { + case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCompleted: + { + P.HistoryEvent? subOrchestrationInstanceCreatedEvent = + GetSuborchestrationInstanceCreatedEvent( + newEvent.SubOrchestrationInstanceCompleted.TaskScheduledId); + + TraceHelper.EmitTraceActivityForSubOrchestrationCompleted( + request.InstanceId, + subOrchestrationInstanceCreatedEvent, + subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceFailed: + { + P.HistoryEvent? 
subOrchestrationInstanceCreatedEvent = + GetSuborchestrationInstanceCreatedEvent( + newEvent.SubOrchestrationInstanceFailed.TaskScheduledId); + + TraceHelper.EmitTraceActivityForSubOrchestrationFailed( + request.InstanceId, + subOrchestrationInstanceCreatedEvent, + subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated, + newEvent.SubOrchestrationInstanceFailed); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TaskCompleted: + { + P.HistoryEvent? taskScheduledEvent = + GetTaskScheduledEvent(newEvent.TaskCompleted.TaskScheduledId); + + TraceHelper.EmitTraceActivityForTaskCompleted( + request.InstanceId, + taskScheduledEvent, + taskScheduledEvent?.TaskScheduled); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TaskFailed: + { + P.HistoryEvent? taskScheduledEvent = + GetTaskScheduledEvent(newEvent.TaskFailed.TaskScheduledId); + + TraceHelper.EmitTraceActivityForTaskFailed( + request.InstanceId, + taskScheduledEvent, + taskScheduledEvent?.TaskScheduled, + newEvent.TaskFailed); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TimerFired: + TraceHelper.EmitTraceActivityForTimer( + request.InstanceId, + executionStartedEvent.Name, + newEvent.Timestamp.ToDateTime(), + newEvent.TimerFired); + break; + } + } + } + + OrchestratorExecutionResult? result = null; + P.TaskFailureDetails? failureDetails = null; + TaskName name = new("(unknown)"); + + ProtoUtils.EntityConversionState? entityConversionState = + this.internalOptions.ConvertOrchestrationEntityEvents + ? new(this.internalOptions.InsertEntityUnlocksOnCompletion) + : null; + + DurableTaskWorkerOptions.VersioningOptions? versioning = this.worker.workerOptions.Versioning; + bool versionFailure = false; + try + { + OrchestrationRuntimeState runtimeState = await this.BuildRuntimeStateAsync( + request, + entityConversionState, + cancellationToken); + + bool filterPassed = true; + if (this.orchestrationFilter != null) + { + filterPassed = await this.orchestrationFilter.IsOrchestrationValidAsync( + new OrchestrationFilterParameters + { + Name = runtimeState.Name, + Tags = runtimeState.Tags != null ? new Dictionary(runtimeState.Tags) : null, + }, + cancellationToken); + } + + if (!filterPassed) + { + this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + + return; + } + + // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. + failureDetails = EvaluateOrchestrationVersioning(versioning, runtimeState.Version, out versionFailure); + + // Only continue with the work if the versioning check passed. + if (failureDetails == null) + { + name = new TaskName(runtimeState.Name); + + this.Logger.ReceivedOrchestratorRequest( + name, + request.InstanceId, + runtimeState.PastEvents.Count, + runtimeState.NewEvents.Count); + + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + if (this.worker.Factory.TryCreateOrchestrator( + name, scope.ServiceProvider, out ITaskOrchestrator? orchestrator)) + { + // Both the factory invocation and the ExecuteAsync could involve user code and need to be handled + // as part of try/catch. + ParentOrchestrationInstance? 
parent = runtimeState.ParentInstance switch + { + ParentInstance p => new(new(p.Name), p.OrchestrationInstance.InstanceId), + _ => null, + }; + + TaskOrchestration shim = this.shimFactory.CreateOrchestration(name, orchestrator, parent); + TaskOrchestrationExecutor executor = new( + runtimeState, + shim, + BehaviorOnContinueAsNew.Carryover, + request.EntityParameters.ToCore(), + ErrorPropagationMode.UseFailureDetails, + this.exceptionPropertiesProvider); + result = executor.Execute(); + } + else + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "OrchestratorTaskNotFound", + ErrorMessage = $"No orchestrator task named '{name}' was found.", + IsNonRetriable = true, + }; + } + } + } + catch (Exception unexpected) + { + // This is not expected: Normally TaskOrchestrationExecutor handles exceptions in user code. + this.Logger.OrchestratorFailed(name, request.InstanceId, unexpected.ToString()); + failureDetails = unexpected.ToTaskFailureDetails(this.exceptionPropertiesProvider); + } + + P.OrchestratorResponse response; + if (result != null) + { + response = ProtoUtils.ConstructOrchestratorResponse( + request.InstanceId, + request.ExecutionId, + result.CustomStatus, + result.Actions, + completionToken, + entityConversionState, + traceActivity); + } + else if (versioning != null && failureDetails != null && versionFailure) + { + this.Logger.OrchestrationVersionFailure(versioning.FailureStrategy.ToString(), failureDetails.ErrorMessage); + if (versioning.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Fail) + { + response = new P.OrchestratorResponse + { + InstanceId = request.InstanceId, + CompletionToken = completionToken, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = failureDetails, + }, + }, + }, + }; + } + else + { + this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + + return; + } + } + else + { + // This is the case for failures that happened *outside* the orchestrator executor + response = new P.OrchestratorResponse + { + InstanceId = request.InstanceId, + CompletionToken = completionToken, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = failureDetails, + }, + }, + }, + }; + } + + var completeOrchestrationAction = response.Actions.FirstOrDefault( + a => a.CompleteOrchestration is not null); + + if (completeOrchestrationAction is not null) + { + if (completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus == P.OrchestrationStatus.Failed) + { + traceActivity?.SetStatus( + ActivityStatusCode.Error, + completeOrchestrationAction.CompleteOrchestration.Result); + } + + traceActivity?.SetTag( + Schema.Task.Status, + completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus.ToString()); + + traceActivity?.Dispose(); + } + + this.Logger.SendingOrchestratorResponse( + name, + response.InstanceId, + response.Actions.Count, + GetActionsListForLogging(response.Actions)); + + await this.CompleteOrchestratorTaskWithChunkingAsync( + 
response, + this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, + cancellationToken); + } + + async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) + { + using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); + + OrchestrationInstance instance = request.OrchestrationInstance.ToCore(); + string rawInput = request.Input; + int inputSize = rawInput != null ? Encoding.UTF8.GetByteCount(rawInput) : 0; + this.Logger.ReceivedActivityRequest(request.Name, request.TaskId, instance.InstanceId, inputSize); + + P.TaskFailureDetails? failureDetails = null; + TaskContext innerContext = new(instance); + innerContext.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; + + TaskName name = new(request.Name); + string? output = null; + + failureDetails = EvaluateOrchestrationVersioning(this.worker.workerOptions.Versioning, request.Version, out bool versioningFailed); + if (!versioningFailed) + { + try + { + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + if (this.worker.Factory.TryCreateActivity(name, scope.ServiceProvider, out ITaskActivity? activity)) + { + // Both the factory invocation and the RunAsync could involve user code and need to be handled as + // part of try/catch. + TaskActivity shim = this.shimFactory.CreateActivity(name, activity); + output = await shim.RunAsync(innerContext, request.Input); + } + else + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "ActivityTaskNotFound", + ErrorMessage = $"No activity task named '{name}' was found.", + IsNonRetriable = true, + }; + } + } + catch (Exception applicationException) + { + failureDetails = applicationException.ToTaskFailureDetails(this.exceptionPropertiesProvider); + } + } + else + { + if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) + { + this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + } + + return; + } + + int outputSizeInBytes = 0; + if (failureDetails != null) + { + traceActivity?.SetStatus(ActivityStatusCode.Error, failureDetails.ErrorMessage); + + outputSizeInBytes = failureDetails.GetApproximateByteCount(); + } + else if (output != null) + { + outputSizeInBytes = Encoding.UTF8.GetByteCount(output); + } + + string successOrFailure = failureDetails != null ? "failure" : "success"; + this.Logger.SendingActivityResponse( + successOrFailure, name, request.TaskId, instance.InstanceId, outputSizeInBytes); + + P.ActivityResponse response = new() + { + InstanceId = instance.InstanceId, + TaskId = request.TaskId, + Result = output, + FailureDetails = failureDetails, + CompletionToken = completionToken, + }; + + // Stop the trace activity here to avoid including the completion time in the latency calculation + traceActivity?.Stop(); + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteActivityTaskAsync), + cancellation); + } + + async Task OnRunEntityBatchAsync( + EntityBatchRequest batchRequest, + CancellationToken cancellation, + string? 
completionToken = null,
+            List<P.OperationInfo>? operationInfos = null)
+        {
+            var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!);
+            EntityId entityId = new(coreEntityId.Name, coreEntityId.Key);
+
+            TaskName name = new(entityId.Name);
+
+            EntityBatchResult? batchResult;
+
+            try
+            {
+                await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope();
+                IDurableTaskFactory2 factory = (IDurableTaskFactory2)this.worker.Factory;
+
+                if (factory.TryCreateEntity(name, scope.ServiceProvider, out ITaskEntity? entity))
+                {
+                    // Both the factory invocation and the RunAsync could involve user code and need to be handled as
+                    // part of try/catch.
+                    TaskEntity shim = this.shimFactory.CreateEntity(name, entity, entityId);
+                    batchResult = await shim.ExecuteOperationBatchAsync(batchRequest);
+                }
+                else
+                {
+                    // we could not find the entity. This is considered an application error,
+                    // so we return a non-retriable error-OperationResult for each operation in the batch.
+                    batchResult = new EntityBatchResult()
+                    {
+                        Actions = [], // no actions
+                        EntityState = batchRequest.EntityState, // state is unmodified
+                        Results = Enumerable.Repeat(
+                            new OperationResult()
+                            {
+                                FailureDetails = new FailureDetails(
+                                    errorType: "EntityTaskNotFound",
+                                    errorMessage: $"No entity task named '{name}' was found.",
+                                    stackTrace: null,
+                                    innerFailure: null,
+                                    isNonRetriable: true),
+                            },
+                            batchRequest.Operations!.Count).ToList(),
+                        FailureDetails = null,
+                    };
+                }
+            }
+            catch (Exception frameworkException)
+            {
+                // return a result with failure details.
+                // this will cause the batch to be abandoned and retried
+                // (possibly after a delay and on a different worker).
+                batchResult = new EntityBatchResult()
+                {
+                    FailureDetails = new FailureDetails(frameworkException),
+                };
+            }
+
+            P.EntityBatchResult response = batchResult.ToEntityBatchResult(
+                completionToken,
+                operationInfos?.Take(batchResult.Results?.Count ?? 0));
+
+            await this.ExecuteWithRetryAsync(
+                async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation),
+                nameof(this.client.CompleteEntityTaskAsync),
+                cancellation);
+        }
+
+        /// <summary>
+        /// Completes an orchestration task with automatic chunking if the response exceeds the maximum size.
+        /// </summary>
+        /// <param name="response">The orchestrator response to send.</param>
+        /// <param name="maxChunkBytes">The maximum size in bytes for each chunk.</param>
+        /// <param name="cancellationToken">The cancellation token.</param>
+        async Task CompleteOrchestratorTaskWithChunkingAsync(
+            P.OrchestratorResponse response,
+            int maxChunkBytes,
+            CancellationToken cancellationToken)
+        {
+            // Validate that no single action exceeds the maximum chunk size
+            static P.TaskFailureDetails? ValidateActionsSize(IEnumerable<P.OrchestratorAction> actions, int maxChunkBytes)
+            {
+                foreach (P.OrchestratorAction action in actions)
+                {
+                    int actionSize = action.CalculateSize();
+                    if (actionSize > maxChunkBytes)
+                    {
+                        // TODO: large payload doc is not available yet on aka.ms, add doc link to below error message
+                        string errorMessage = $"A single orchestrator action of type {action.OrchestratorActionTypeCase} with id {action.Id} " +
+                            $"exceeds the {maxChunkBytes / 1024.0 / 1024.0:F2}MB limit: {actionSize / 1024.0 / 1024.0:F2}MB. " +
+                            "Enable large-payload externalization to Azure Blob Storage to support oversized actions.";
+                        return new P.TaskFailureDetails
+                        {
+                            ErrorType = typeof(InvalidOperationException).FullName,
+                            ErrorMessage = errorMessage,
+                            IsNonRetriable = true,
+                        };
+                    }
+                }
+
+                return null;
+            }
+
+            P.TaskFailureDetails?
validationFailure = this.worker.grpcOptions.Capabilities.Contains(P.WorkerCapability.LargePayloads) + ? null + : ValidateActionsSize(response.Actions, maxChunkBytes); + if (validationFailure != null) + { + // Complete the orchestration with a failed status and failure details + P.OrchestratorResponse failureResponse = new() + { + InstanceId = response.InstanceId, + CompletionToken = response.CompletionToken, + OrchestrationTraceContext = response.OrchestrationTraceContext, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = validationFailure, + }, + }, + }, + }; + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + return; + } + + // Helper to add an action to the current chunk if it fits + static bool TryAddAction( + Google.Protobuf.Collections.RepeatedField dest, + P.OrchestratorAction action, + ref int currentSize, + int maxChunkBytes) + { + int actionSize = action.CalculateSize(); + if (currentSize + actionSize > maxChunkBytes && currentSize > 0) + { + return false; + } + + dest.Add(action); + currentSize += actionSize; + return true; + } + + // Check if the entire response fits in one chunk + int totalSize = response.CalculateSize(); + if (totalSize <= maxChunkBytes) + { + // Response fits in one chunk, send it directly (isPartial defaults to false) + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + return; + } + + // Response is too large, split into multiple chunks + int actionsCompletedSoFar = 0, chunkIndex = 0; + List allActions = response.Actions.ToList(); + bool isPartial = true; + bool isChunkedMode = false; + + while (isPartial) + { + P.OrchestratorResponse chunkedResponse = new() + { + InstanceId = response.InstanceId, + CustomStatus = response.CustomStatus, + CompletionToken = response.CompletionToken, + RequiresHistory = response.RequiresHistory, + NumEventsProcessed = 0, + }; + + int chunkPayloadSize = 0; + + // Fill the chunk with actions until we reach the size limit + while (actionsCompletedSoFar < allActions.Count && + TryAddAction(chunkedResponse.Actions, allActions[actionsCompletedSoFar], ref chunkPayloadSize, maxChunkBytes)) + { + actionsCompletedSoFar++; + } + + // Determine if this is a partial chunk (more actions remaining) + isPartial = actionsCompletedSoFar < allActions.Count; + chunkedResponse.IsPartial = isPartial; + + // Only activate chunked mode when we actually need multiple chunks. + // A single oversized action that fits in one chunk (via TryAddAction allowing + // the first item in an empty chunk) should be sent as non-chunked to avoid + // backend issues with ChunkIndex=0 + IsPartial=false. + if (isPartial) + { + isChunkedMode = true; + } + + if (isChunkedMode) + { + chunkedResponse.ChunkIndex = chunkIndex; + } + + if (chunkIndex == 0) + { + // The first chunk preserves the original response's NumEventsProcessed value (null) + // When this is set to null, backend by default handles all the messages in the workitem. + // For subsequent chunks, we set it to 0 since all messages are already handled in first chunk. 
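The loop being built here is a greedy, size-bounded partition. Stripped of the protobuf details, the same shape looks like the sketch below (integer sizes stand in for CalculateSize() results; this is illustrative, not the SDK's code):

    static List<List<int>> Partition(IReadOnlyList<int> sizes, int maxChunkBytes)
    {
        List<List<int>> chunks = new() { new List<int>() };
        int currentSize = 0;

        foreach (int size in sizes)
        {
            // Mirrors TryAddAction's "currentSize > 0" guard: an item always joins an
            // empty chunk, even when it is oversized on its own.
            if (currentSize + size > maxChunkBytes && currentSize > 0)
            {
                chunks.Add(new List<int>());
                currentSize = 0;
            }

            chunks[^1].Add(size);
            currentSize += size;
        }

        return chunks;
    }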
+                    chunkedResponse.NumEventsProcessed = null;
+                    chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext;
+                }
+
+                chunkIndex++;
+
+                // Send the chunk
+                await this.ExecuteWithRetryAsync(
+                    async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken),
+                    nameof(this.client.CompleteOrchestratorTaskAsync),
+                    cancellationToken);
+            }
+        }
+
+        async Task ExecuteWithRetryAsync(
+            Func<Task> action,
+            string operationName,
+            CancellationToken cancellationToken)
+        {
+            const int maxAttempts = 10;
+            TimeSpan delay = TimeSpan.FromMilliseconds(200);
+
+            for (int attempt = 1; ; attempt++)
+            {
+                try
+                {
+                    await action();
+                    return;
+                }
+                catch (RpcException ex) when (
+                    (ex.StatusCode == StatusCode.Unavailable ||
+                    ex.StatusCode == StatusCode.Unknown ||
+                    ex.StatusCode == StatusCode.DeadlineExceeded ||
+                    ex.StatusCode == StatusCode.Internal) &&
+                    attempt < maxAttempts)
+                {
+                    // Back off with jitter for transient transport errors
+#if NET6_0_OR_GREATER
+                    int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2));
+#else
+                    int jitterMs = new Random().Next(0, (int)(delay.TotalMilliseconds * 0.2));
+#endif
+                    TimeSpan backoff = delay + TimeSpan.FromMilliseconds(jitterMs);
+
+                    this.Logger.TransientGrpcRetry(
+                        operationName,
+                        attempt,
+                        maxAttempts,
+                        backoff.TotalMilliseconds,
+                        (int)ex.StatusCode,
+                        ex);
+
+                    try
+                    {
+                        await Task.Delay(backoff, cancellationToken);
+                    }
+                    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
+                    {
+                        // If shutting down during the retry delay, propagate the cancellation exception
+                        throw;
+                    }
+
+                    // Exponential increase, capping at 15 seconds
+                    delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000));
+                    continue;
+                }
+            }
+        }
+    }
+}
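For reference, the retry loop above yields the following nominal backoff schedule. This standalone sketch is an editor illustration only, not part of the patch, and assumes nothing beyond the constants visible in ExecuteWithRetryAsync: a 200 ms initial delay, doubling per attempt, a 15 s cap, up to 20% additive jitter, and 10 attempts.

    using System;

    class BackoffScheduleDemo
    {
        static void Main()
        {
            TimeSpan delay = TimeSpan.FromMilliseconds(200);
            for (int attempt = 1; attempt < 10; attempt++)
            {
                // The worker sleeps 'delay' plus up to 20% jitter before attempt + 1.
                Console.WriteLine(
                    $"after attempt {attempt}: wait {delay.TotalMilliseconds} ms " +
                    $"(+ up to {delay.TotalMilliseconds * 0.2:F0} ms jitter)");
                delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000));
            }

            // Nominal waits: 200, 400, 800, 1600, 3200, 6400, 12800, 15000, 15000 ms,
            // about 55 s total before the final attempt (roughly 66 s with maximal jitter).
        }
    }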
From b18d9709d04c3cd2ecb1950d4944843bfcbaf72c Mon Sep 17 00:00:00 2001
From: sophiatev <38052607+sophiatev@users.noreply.github.com>
Date: Mon, 27 Apr 2026 10:32:19 -0700
Subject: [PATCH 06/36] Apply suggestion from @Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../Grpc.Tests/ExecuteWithRetryTests.cs | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
index b9a308310..155b3e29f 100644
--- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
+++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
@@ -18,10 +18,34 @@ public class ExecuteWithRetryTests
 {
     const string Category = "Microsoft.DurableTask.Worker.Grpc";
 
-    static readonly MethodInfo ExecuteWithRetryAsyncMethod = typeof(GrpcDurableTaskWorker)
-        .GetNestedType("Processor", BindingFlags.NonPublic)!
-        .GetMethod("ExecuteWithRetryAsync", BindingFlags.Instance | BindingFlags.NonPublic)!;
+    static readonly MethodInfo ExecuteWithRetryAsyncMethod = FindExecuteWithRetryAsyncMethod();
 
+    static Type FindProcessorType()
+    {
+        return typeof(GrpcDurableTaskWorker)
+            .GetNestedTypes(BindingFlags.NonPublic)
+            .Single(type => type.GetMethods(BindingFlags.Instance | BindingFlags.NonPublic)
+                .Any(method =>
+                    method.ReturnType == typeof(Task) &&
+                    method.GetParameters() is var parameters &&
+                    parameters.Length == 3 &&
+                    parameters[0].ParameterType == typeof(Func<Task>) &&
+                    parameters[1].ParameterType == typeof(string) &&
+                    parameters[2].ParameterType == typeof(CancellationToken)));
+    }
+
+    static MethodInfo FindExecuteWithRetryAsyncMethod()
+    {
+        return FindProcessorType()
+            .GetMethods(BindingFlags.Instance | BindingFlags.NonPublic)
+            .Single(method =>
+                method.ReturnType == typeof(Task) &&
+                method.GetParameters() is var parameters &&
+                parameters.Length == 3 &&
+                parameters[0].ParameterType == typeof(Func<Task>) &&
+                parameters[1].ParameterType == typeof(string) &&
+                parameters[2].ParameterType == typeof(CancellationToken));
+    }
 
     [Fact]
     public async Task ExecuteWithRetryAsync_SucceedsOnFirstAttempt_DoesNotRetry()
     {
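Worth noting how a test consumes the discovered MethodInfo. The snippet below is an editor sketch, not part of the patch; `processor` stands in for whatever Processor instance the real test harness constructs, and `Assert` is xUnit's:

    // Invoke the private instance method via reflection; the boxed return
    // value is the Task produced by ExecuteWithRetryAsync.
    int calls = 0;
    Func<Task> action = () =>
    {
        calls++;
        return Task.CompletedTask;
    };

    Task task = (Task)ExecuteWithRetryAsyncMethod.Invoke(
        processor, // assumed: a Processor instance built by the test harness
        new object[] { action, "TestOperation", CancellationToken.None })!;
    await task;

    // A call that succeeds on the first attempt is executed exactly once.
    Assert.Equal(1, calls);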
From 5a38c5fbd0d45ac4feee8f255b36e9f7c12495cf Mon Sep 17 00:00:00 2001
From: Sophia Tevosyan
Date: Mon, 27 Apr 2026 10:48:51 -0700
Subject: [PATCH 07/36] returned the completion logs

---
 .../Grpc/GrpcDurableTaskWorker.Processor.cs | 30 +++++++++++++------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
index 28a65a146..2aae8a9d9 100644
--- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
+++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
@@ -483,7 +483,8 @@ await this.ExecuteWithRetryAsync(
                                 },
                                 cancellationToken: cancellation),
                             nameof(this.client.AbandonTaskOrchestratorWorkItemAsync),
-                            cancellation);
+                            cancellation);
+                        this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty);
                     }
                     catch (Exception abandonException)
                     {
@@ -498,16 +499,21 @@ await this.ExecuteWithRetryAsync(
                             instanceId,
                             workItem.ActivityRequest.Name,
                             workItem.ActivityRequest.TaskId,
-                            workItem?.CompletionToken ?? string.Empty);
+                            workItem.CompletionToken ?? string.Empty);
                         await this.ExecuteWithRetryAsync(
                             async () => await this.client.AbandonTaskActivityWorkItemAsync(
                                 new P.AbandonActivityTaskRequest
                                 {
-                                    CompletionToken = workItem?.CompletionToken,
+                                    CompletionToken = workItem.CompletionToken,
                                 },
                                 cancellationToken: cancellation),
                             nameof(this.client.AbandonTaskActivityWorkItemAsync),
-                            cancellation);
+                            cancellation);
+                        this.Logger.AbandonedActivityWorkItem(
+                            instanceId,
+                            workItem.ActivityRequest.Name,
+                            workItem.ActivityRequest.TaskId,
+                            workItem.CompletionToken ?? string.Empty);
                     }
                     catch (Exception abandonException)
                     {
@@ -520,7 +526,7 @@ await this.ExecuteWithRetryAsync(
                     {
                         this.Logger.AbandoningEntityWorkItem(
                             workItem.EntityRequest.InstanceId,
-                            workItem?.CompletionToken ?? string.Empty);
+                            workItem.CompletionToken ?? string.Empty);
                         await this.ExecuteWithRetryAsync(
                             async () => await this.client.AbandonTaskEntityWorkItemAsync(
                                 new P.AbandonEntityTaskRequest
                                 {
@@ -529,7 +535,10 @@ await this.ExecuteWithRetryAsync(
                                 },
                                 cancellationToken: cancellation),
                             nameof(this.client.AbandonTaskEntityWorkItemAsync),
-                            cancellation);
+                            cancellation);
+                        this.Logger.AbandonedEntityWorkItem(
+                            workItem.EntityRequest.InstanceId,
+                            workItem.CompletionToken ?? string.Empty);
                     }
                     catch (Exception abandonException)
                     {
@@ -542,16 +551,19 @@ await this.ExecuteWithRetryAsync(
                     {
                         this.Logger.AbandoningEntityWorkItem(
                             workItem.EntityRequestV2.InstanceId,
-                            workItem?.CompletionToken ?? string.Empty);
+                            workItem.CompletionToken ?? string.Empty);
                         await this.ExecuteWithRetryAsync(
                             async () => await this.client.AbandonTaskEntityWorkItemAsync(
                                 new P.AbandonEntityTaskRequest
                                 {
-                                    CompletionToken = workItem?.CompletionToken,
+                                    CompletionToken = workItem.CompletionToken,
                                 },
                                 cancellationToken: cancellation),
                             nameof(this.client.AbandonTaskEntityWorkItemAsync),
-                            cancellation);
+                            cancellation);
+                        this.Logger.AbandonedEntityWorkItem(
+                            workItem.EntityRequestV2.InstanceId,
+                            workItem.CompletionToken ?? string.Empty);
                     }
                     catch (Exception abandonException)
                     {

From 614893e6f724b2ed71b0130fb5eb2be877ef6c1b Mon Sep 17 00:00:00 2001
From: sophiatev <38052607+sophiatev@users.noreply.github.com>
Date: Mon, 27 Apr 2026 10:49:44 -0700
Subject: [PATCH 08/36] Potential fix for pull request finding 'Local scope
 variable shadows member'

Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com>
---
 test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
index 155b3e29f..c6101d4f8 100644
--- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
+++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
@@ -282,7 +282,7 @@ sealed class SimpleLoggerFactory : ILoggerFactory
 
     public SimpleLoggerFactory(ILoggerProvider provider) => this.provider = provider;
 
-    public void AddProvider(ILoggerProvider provider) { }
+    public void AddProvider(ILoggerProvider loggerProvider) { }
 
     public ILogger CreateLogger(string categoryName) => this.provider.CreateLogger(categoryName);
 

From 6826505f67269fa6e375fa0dbb32aabbe0d3dab4 Mon Sep 17 00:00:00 2001
From: sophiatev <38052607+sophiatev@users.noreply.github.com>
Date: Mon, 27 Apr 2026 10:56:43 -0700
Subject: [PATCH 09/36] Update src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../Grpc/GrpcDurableTaskWorker.Processor.cs | 2514 ++++++++----------
 1 file changed, 1257 insertions(+), 1257 deletions(-)

diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
index 2aae8a9d9..d4f8fa61e 100644
--- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
+++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
@@ -1,1267 +1,1267 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT License.
-
-using System.Diagnostics;
-using System.Linq;
-using System.Text;
-using DurableTask.Core;
-using DurableTask.Core.Entities;
-using DurableTask.Core.Entities.OperationFormat;
-using DurableTask.Core.History;
-using Google.Protobuf;
-using Microsoft.DurableTask.Abstractions;
-using Microsoft.DurableTask.Entities;
-using Microsoft.DurableTask.Tracing;
-using Microsoft.DurableTask.Worker.Grpc.Internal;
-using Microsoft.DurableTask.Worker.Shims;
-using Microsoft.Extensions.DependencyInjection;
-using Microsoft.Extensions.Logging;
-using static Microsoft.DurableTask.Protobuf.TaskHubSidecarService;
-using ActivityStatusCode = System.Diagnostics.ActivityStatusCode;
-using DTCore = DurableTask.Core;
-using P = Microsoft.DurableTask.Protobuf;
-
-namespace Microsoft.DurableTask.Worker.Grpc;
-
-///
-/// The gRPC Durable Task worker.
-/// -sealed partial class GrpcDurableTaskWorker -{ - class Processor - { - static readonly Google.Protobuf.WellKnownTypes.Empty EmptyMessage = new(); - - readonly GrpcDurableTaskWorker worker; - readonly TaskHubSidecarServiceClient client; - readonly DurableTaskShimFactory shimFactory; - readonly GrpcDurableTaskWorkerOptions.InternalOptions internalOptions; - readonly DTCore.IExceptionPropertiesProvider? exceptionPropertiesProvider; - [Obsolete("Experimental")] - readonly IOrchestrationFilter? orchestrationFilter; - - public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient client, IOrchestrationFilter? orchestrationFilter = null, IExceptionPropertiesProvider? exceptionPropertiesProvider = null) - { - this.worker = worker; - this.client = client; - this.shimFactory = new DurableTaskShimFactory(this.worker.grpcOptions, this.worker.loggerFactory); - this.internalOptions = this.worker.grpcOptions.Internal; - this.orchestrationFilter = orchestrationFilter; - this.exceptionPropertiesProvider = exceptionPropertiesProvider is not null - ? new ExceptionPropertiesProviderAdapter(exceptionPropertiesProvider) - : null; - } - - ILogger Logger => this.worker.logger; - - public async Task ExecuteAsync(CancellationToken cancellation) - { - // Tracks consecutive failures against the same channel. Reset only after the stream - // has actually delivered a message (HelloAsync alone is not proof the channel is healthy). - int consecutiveChannelFailures = 0; - - // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. - int reconnectAttempt = 0; - Random backoffRandom = ReconnectBackoff.CreateRandom(); - - while (!cancellation.IsCancellationRequested) - { - bool channelLikelyPoisoned = false; - try - { - using AsyncServerStreamingCall stream = await this.ConnectAsync(cancellation); - await this.ProcessWorkItemsAsync( - stream, - cancellation, - onFirstMessage: () => - { - consecutiveChannelFailures = 0; - reconnectAttempt = 0; - }, - onChannelLikelyPoisoned: () => channelLikelyPoisoned = true); - } - catch (RpcException) when (cancellation.IsCancellationRequested) - { - // Worker is shutting down - let the method exit gracefully - return ProcessorExitReason.Shutdown; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled) - { - // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold: - // Cancelled is ambiguous and shouldn't drive recreate storms. - this.Logger.SidecarDisconnected(); - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded) - { - // Only HelloAsync carries a deadline. Once the work-item stream is established, - // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines. - // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel. - this.Logger.HelloTimeout(this.internalOptions.HelloDeadline); - channelLikelyPoisoned = true; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable) - { - // Sidecar is down - keep retrying. - this.Logger.SidecarUnavailable(); - channelLikelyPoisoned = true; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated) - { - // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel - // recreate: a fresh channel won't fix bad credentials. 
Reset the consecutive-failure - // counters: a status reply is proof the transport itself is healthy, so prior - // transport failures should not combine with later ones to trip the recreate. - this.Logger.AuthenticationFailed(ex); - consecutiveChannelFailures = 0; - reconnectAttempt = 0; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound) - { - // We retry on a NotFound for several reasons: - // 1. It was the existing behavior through the UnexpectedError path. - // 2. A 404 can be returned for a missing task hub or authentication failure. Authentication takes - // time to propagate so we should retry instead of making the user restart the application. - // 3. In some cases, a task hub can be created separately from the scheduler. If a worker is deployed - // between the scheduler and task hub, it would need to be restarted to function. - this.Logger.TaskHubNotFound(); - } - catch (OperationCanceledException) when (cancellation.IsCancellationRequested) - { - // Shutting down, lets exit gracefully. - return ProcessorExitReason.Shutdown; - } - catch (Exception ex) - { - // Unknown failure - retry? - this.Logger.UnexpectedError(ex, string.Empty); - } - - if (channelLikelyPoisoned) - { - consecutiveChannelFailures++; - int threshold = this.internalOptions.ChannelRecreateFailureThreshold; - if (threshold > 0 && consecutiveChannelFailures >= threshold) - { - this.Logger.RecreatingChannel(consecutiveChannelFailures); - return ProcessorExitReason.ChannelRecreateRequested; - } - } - - try - { - TimeSpan delay = ReconnectBackoff.Compute( - reconnectAttempt, - this.internalOptions.ReconnectBackoffBase, - this.internalOptions.ReconnectBackoffCap, - backoffRandom); - this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds)); - reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt - await Task.Delay(delay, cancellation); - } - catch (OperationCanceledException) when (cancellation.IsCancellationRequested) - { - // Worker is shutting down - let the method exit gracefully - return ProcessorExitReason.Shutdown; - } - } - - return ProcessorExitReason.Shutdown; - } - - - static string GetActionsListForLogging(IReadOnlyList actions) - { - if (actions.Count == 0) - { - return string.Empty; - } - else if (actions.Count == 1) - { - return actions[0].OrchestratorActionTypeCase.ToString(); - } - else - { - // Returns something like "ScheduleTask x5, CreateTimer x1,..." - return string.Join(", ", actions - .GroupBy(a => a.OrchestratorActionTypeCase) - .Select(group => $"{group.Key} x{group.Count()}")); - } - } - - static P.TaskFailureDetails? EvaluateOrchestrationVersioning(DurableTaskWorkerOptions.VersioningOptions? versioning, string orchestrationVersion, out bool versionCheckFailed) - { - P.TaskFailureDetails? failureDetails = null; - versionCheckFailed = false; - if (versioning != null) - { - int versionComparison = TaskOrchestrationVersioningUtils.CompareVersions(orchestrationVersion, versioning.Version); - - switch (versioning.MatchStrategy) - { - case DurableTaskWorkerOptions.VersionMatchStrategy.None: - // No versioning, breakout. - break; - case DurableTaskWorkerOptions.VersionMatchStrategy.Strict: - // Comparison of 0 indicates equality. 
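// (Editor illustration, not part of the patch: the switch below reduces to this
// predicate, where "comparison" is CompareVersions(orchestrationVersion, versioning.Version):
//
//     static bool IsAllowed(int comparison, DurableTaskWorkerOptions.VersionMatchStrategy strategy) =>
//         strategy switch
//         {
//             DurableTaskWorkerOptions.VersionMatchStrategy.None => true,                      // no gate
//             DurableTaskWorkerOptions.VersionMatchStrategy.Strict => comparison == 0,         // exact match only
//             DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder => comparison <= 0, // orchestration not newer
//             _ => false,                                                                      // unknown strategy fails
//         };)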
- if (versionComparison != 0) - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionMismatch", - ErrorMessage = $"The orchestration version '{orchestrationVersion}' does not match the worker version '{versioning.Version}'.", - IsNonRetriable = true, - }; - } - - break; - case DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder: - // Comparison > 0 indicates the orchestration version is greater than the worker version. - if (versionComparison > 0) - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionMismatch", - ErrorMessage = $"The orchestration version '{orchestrationVersion}' is greater than the worker version '{versioning.Version}'.", - IsNonRetriable = true, - }; - } - - break; - default: - // If there is a type of versioning we don't understand, it is better to treat it as a versioning failure. - failureDetails = new P.TaskFailureDetails - { - ErrorType = "VersionError", - ErrorMessage = $"The version match strategy '{orchestrationVersion}' is unknown.", - IsNonRetriable = true, - }; - break; - } - - versionCheckFailed = failureDetails != null; - } - - return failureDetails; - } - - async ValueTask BuildRuntimeStateAsync( - P.OrchestratorRequest orchestratorRequest, - ProtoUtils.EntityConversionState? entityConversionState, - CancellationToken cancellation) - { - Func converter = entityConversionState is null - ? ProtoUtils.ConvertHistoryEvent - : entityConversionState.ConvertFromProto; - - IEnumerable pastEvents = []; - if (orchestratorRequest.RequiresHistoryStreaming) - { - // Stream the remaining events from the remote service - P.StreamInstanceHistoryRequest streamRequest = new() - { - InstanceId = orchestratorRequest.InstanceId, - ExecutionId = orchestratorRequest.ExecutionId, - ForWorkItemProcessing = true, - }; - - using AsyncServerStreamingCall streamResponse = - this.client.StreamInstanceHistory(streamRequest, cancellationToken: cancellation); - - await foreach (P.HistoryChunk chunk in streamResponse.ResponseStream.ReadAllAsync(cancellation)) - { - pastEvents = pastEvents.Concat(chunk.Events.Select(converter)); - } - } - else - { - // The history was already provided in the work item request - pastEvents = orchestratorRequest.PastEvents.Select(converter); - } - - IEnumerable newEvents = orchestratorRequest.NewEvents.Select(converter); - - // Reconstruct the orchestration state in a way that correctly distinguishes new events from past events - var runtimeState = new OrchestrationRuntimeState(pastEvents.ToList()); - foreach (HistoryEvent e in newEvents) - { - // AddEvent() puts events into the NewEvents list. - runtimeState.AddEvent(e); - } - - if (runtimeState.ExecutionStartedEvent == null) - { - // TODO: What's the right way to handle this? Callback to the sidecar with a retriable error request? - throw new InvalidOperationException("The provided orchestration history was incomplete"); - } - - return runtimeState; - } - - async Task> ConnectAsync(CancellationToken cancellation) - { - TimeSpan helloDeadline = this.internalOptions.HelloDeadline; - DateTime? deadline = null; - - if (helloDeadline > TimeSpan.Zero) - { - // Clamp to a UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot - // throw ArgumentOutOfRangeException out of DateTime.Add and so the gRPC deadline remains - // unambiguous during internal normalization. 
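// (Editor illustration, not part of the patch: the clamp matters because DateTime.Add
// throws when the result falls outside the representable range. For example:
//
//     DateTime bad = DateTime.UtcNow.Add(TimeSpan.MaxValue);   // throws ArgumentOutOfRangeException
//     TimeSpan maxOffset = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc) - DateTime.UtcNow;
//     DateTime ok = helloDeadline >= maxOffset
//         ? DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc)
//         : DateTime.UtcNow.Add(helloDeadline);                // mirrors the lines below)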
- DateTime now = DateTime.UtcNow; - DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc); - TimeSpan maxOffset = maxDeadlineUtc - now; - deadline = helloDeadline >= maxOffset ? maxDeadlineUtc : now.Add(helloDeadline); - } - - await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); - this.Logger.EstablishedWorkItemConnection(); - - DurableTaskWorkerOptions workerOptions = this.worker.workerOptions; - - // Get the stream for receiving work-items - return this.client!.GetWorkItems( - new P.GetWorkItemsRequest - { - MaxConcurrentActivityWorkItems = - workerOptions.Concurrency.MaximumConcurrentActivityWorkItems, - MaxConcurrentOrchestrationWorkItems = - workerOptions.Concurrency.MaximumConcurrentOrchestrationWorkItems, - MaxConcurrentEntityWorkItems = - workerOptions.Concurrency.MaximumConcurrentEntityWorkItems, - Capabilities = { this.worker.grpcOptions.Capabilities }, - WorkItemFilters = this.worker.workItemFilters?.ToGrpcWorkItemFilters(), - }, - cancellationToken: cancellation); - } - - async Task ProcessWorkItemsAsync( - AsyncServerStreamingCall stream, - CancellationToken cancellation, - Action? onFirstMessage = null, - Action? onChannelLikelyPoisoned = null) - { - // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — - // including health pings sent periodically by the server — arrive within the configured - // window. If that fires we treat the stream as silently disconnected and reconnect. - TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; - - // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop. - // The underlying IAsyncStreamReader is single-use — once the server terminates the stream - // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext - // returns false forever and re-entering await foreach would tight-spin with no yield. - WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( - ct => stream.ResponseStream.ReadAllAsync(ct), - silentDisconnectTimeout, - workItem => this.DispatchWorkItem(workItem, cancellation), - onFirstMessage, - cancellation); - - switch (result.Outcome) - { - case WorkItemStreamOutcome.Shutdown: - return; - - case WorkItemStreamOutcome.SilentDisconnect: - // Stream stopped producing messages (including health pings) for longer than the - // configured window. Treat as a poisoned channel. - this.Logger.ConnectionTimeout(); - onChannelLikelyPoisoned?.Invoke(); - return; - - case WorkItemStreamOutcome.GracefulDrain: - // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY + - // OK trailers when a DTS instance is being replaced). Log it explicitly so - // operators can see it. Only count it toward the channel-poisoned threshold when - // the stream produced no messages: a stream that successfully delivered work and - // was then closed by the server is healthy behavior (e.g. routine rolling - // upgrade), and counting those would let a long-lived process accumulate spurious - // "poison" credits across many healthy drains. An empty drain, on the other hand, - // is a strong signal the channel is latched onto a dead/evacuated backend and - // needs to be recreated to pick up fresh DNS/routing. 
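// (Editor illustration, not part of the patch: the counting policy described above
// condenses to the following predicate over the consumer's result:
//
//     static bool CountsAsPoisoned(WorkItemStreamOutcome outcome, bool firstMessageObserved) =>
//         outcome switch
//         {
//             WorkItemStreamOutcome.SilentDisconnect => true,               // always suspicious
//             WorkItemStreamOutcome.GracefulDrain => !firstMessageObserved, // only an *empty* drain
//             _ => false,                                                   // shutdown never counts
//         };)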
- this.Logger.StreamEndedByPeer(); - if (!result.FirstMessageObserved) - { - onChannelLikelyPoisoned?.Invoke(); - } - - return; - } - } - - void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) - { - if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunOrchestratorAsync( - workItem.OrchestratorRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunActivityAsync( - workItem.ActivityRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) - { - workItem.EntityRequestV2.ToEntityBatchRequest( - out EntityBatchRequest batchRequest, - out List operationInfos); - - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync( - batchRequest, - cancellation, - workItem.CompletionToken, - operationInfos), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) - { - // Health pings are heartbeat-only signals from the backend; the silent-disconnect - // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. - // Logging at Trace allows operators to confirm liveness without flooding info-level - // telemetry. - this.Logger.ReceivedHealthPing(); - } - else - { - this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); - } - } - - void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) - { - // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? - _ = Task.Run( - async () => - { - try - { - await handler(); - } - catch (OperationCanceledException) - { - // Shutting down - ignore - } - catch (Exception ex) - { - string instanceId = - workItem?.OrchestratorRequest?.InstanceId ?? - workItem?.ActivityRequest?.OrchestrationInstance?.InstanceId ?? - workItem?.EntityRequest?.InstanceId ?? - workItem?.EntityRequestV2?.InstanceId ?? - string.Empty; - this.Logger.UnexpectedError(ex, instanceId); - - if (workItem?.OrchestratorRequest != null) - { - try - { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
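// (Editor illustration, not part of this revision's diff: the pattern this patch
// series threads through the processor -- completion and abandon RPCs are funneled
// through the retry helper rather than called directly. The wrapped form, as used
// throughout this file, looks like:
//
//     await this.ExecuteWithRetryAsync(
//         async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation),
//         nameof(this.client.CompleteActivityTaskAsync),
//         cancellation);
//
// versus the unwrapped call it replaced earlier in the series:
//
//     await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation);)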
+ +using System.Diagnostics; +using System.Linq; +using System.Text; +using DurableTask.Core; +using DurableTask.Core.Entities; +using DurableTask.Core.Entities.OperationFormat; +using DurableTask.Core.History; +using Google.Protobuf; +using Microsoft.DurableTask.Abstractions; +using Microsoft.DurableTask.Entities; +using Microsoft.DurableTask.Tracing; +using Microsoft.DurableTask.Worker.Grpc.Internal; +using Microsoft.DurableTask.Worker.Shims; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using static Microsoft.DurableTask.Protobuf.TaskHubSidecarService; +using ActivityStatusCode = System.Diagnostics.ActivityStatusCode; +using DTCore = DurableTask.Core; +using P = Microsoft.DurableTask.Protobuf; + +namespace Microsoft.DurableTask.Worker.Grpc; + +/// +/// The gRPC Durable Task worker. +/// +sealed partial class GrpcDurableTaskWorker +{ + class Processor + { + static readonly Google.Protobuf.WellKnownTypes.Empty EmptyMessage = new(); + + readonly GrpcDurableTaskWorker worker; + readonly TaskHubSidecarServiceClient client; + readonly DurableTaskShimFactory shimFactory; + readonly GrpcDurableTaskWorkerOptions.InternalOptions internalOptions; + readonly DTCore.IExceptionPropertiesProvider? exceptionPropertiesProvider; + [Obsolete("Experimental")] + readonly IOrchestrationFilter? orchestrationFilter; + + public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient client, IOrchestrationFilter? orchestrationFilter = null, IExceptionPropertiesProvider? exceptionPropertiesProvider = null) + { + this.worker = worker; + this.client = client; + this.shimFactory = new DurableTaskShimFactory(this.worker.grpcOptions, this.worker.loggerFactory); + this.internalOptions = this.worker.grpcOptions.Internal; + this.orchestrationFilter = orchestrationFilter; + this.exceptionPropertiesProvider = exceptionPropertiesProvider is not null + ? new ExceptionPropertiesProviderAdapter(exceptionPropertiesProvider) + : null; + } + + ILogger Logger => this.worker.logger; + + public async Task ExecuteAsync(CancellationToken cancellation) + { + // Tracks consecutive failures against the same channel. Reset only after the stream + // has actually delivered a message (HelloAsync alone is not proof the channel is healthy). + int consecutiveChannelFailures = 0; + + // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. + int reconnectAttempt = 0; + Random backoffRandom = ReconnectBackoff.CreateRandom(); + + while (!cancellation.IsCancellationRequested) + { + bool channelLikelyPoisoned = false; + try + { + using AsyncServerStreamingCall stream = await this.ConnectAsync(cancellation); + await this.ProcessWorkItemsAsync( + stream, + cancellation, + onFirstMessage: () => + { + consecutiveChannelFailures = 0; + reconnectAttempt = 0; + }, + onChannelLikelyPoisoned: () => channelLikelyPoisoned = true); + } + catch (RpcException) when (cancellation.IsCancellationRequested) + { + // Worker is shutting down - let the method exit gracefully + return ProcessorExitReason.Shutdown; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled) + { + // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold: + // Cancelled is ambiguous and shouldn't drive recreate storms. + this.Logger.SidecarDisconnected(); + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded) + { + // Only HelloAsync carries a deadline. 
Once the work-item stream is established, + // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines. + // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel. + this.Logger.HelloTimeout(this.internalOptions.HelloDeadline); + channelLikelyPoisoned = true; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable) + { + // Sidecar is down - keep retrying. + this.Logger.SidecarUnavailable(); + channelLikelyPoisoned = true; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated) + { + // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel + // recreate: a fresh channel won't fix bad credentials. Reset the consecutive-failure + // counters: a status reply is proof the transport itself is healthy, so prior + // transport failures should not combine with later ones to trip the recreate. + this.Logger.AuthenticationFailed(ex); + consecutiveChannelFailures = 0; + reconnectAttempt = 0; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound) + { + // We retry on a NotFound for several reasons: + // 1. It was the existing behavior through the UnexpectedError path. + // 2. A 404 can be returned for a missing task hub or authentication failure. Authentication takes + // time to propagate so we should retry instead of making the user restart the application. + // 3. In some cases, a task hub can be created separately from the scheduler. If a worker is deployed + // between the scheduler and task hub, it would need to be restarted to function. + this.Logger.TaskHubNotFound(); + } + catch (OperationCanceledException) when (cancellation.IsCancellationRequested) + { + // Shutting down, lets exit gracefully. + return ProcessorExitReason.Shutdown; + } + catch (Exception ex) + { + // Unknown failure - retry? + this.Logger.UnexpectedError(ex, string.Empty); + } + + if (channelLikelyPoisoned) + { + consecutiveChannelFailures++; + int threshold = this.internalOptions.ChannelRecreateFailureThreshold; + if (threshold > 0 && consecutiveChannelFailures >= threshold) + { + this.Logger.RecreatingChannel(consecutiveChannelFailures); + return ProcessorExitReason.ChannelRecreateRequested; + } + } + + try + { + TimeSpan delay = ReconnectBackoff.Compute( + reconnectAttempt, + this.internalOptions.ReconnectBackoffBase, + this.internalOptions.ReconnectBackoffCap, + backoffRandom); + this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds)); + reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt + await Task.Delay(delay, cancellation); + } + catch (OperationCanceledException) when (cancellation.IsCancellationRequested) + { + // Worker is shutting down - let the method exit gracefully + return ProcessorExitReason.Shutdown; + } + } + + return ProcessorExitReason.Shutdown; + } + + + static string GetActionsListForLogging(IReadOnlyList actions) + { + if (actions.Count == 0) + { + return string.Empty; + } + else if (actions.Count == 1) + { + return actions[0].OrchestratorActionTypeCase.ToString(); + } + else + { + // Returns something like "ScheduleTask x5, CreateTimer x1,..." + return string.Join(", ", actions + .GroupBy(a => a.OrchestratorActionTypeCase) + .Select(group => $"{group.Key} x{group.Count()}")); + } + } + + static P.TaskFailureDetails? EvaluateOrchestrationVersioning(DurableTaskWorkerOptions.VersioningOptions? 
versioning, string orchestrationVersion, out bool versionCheckFailed) + { + P.TaskFailureDetails? failureDetails = null; + versionCheckFailed = false; + if (versioning != null) + { + int versionComparison = TaskOrchestrationVersioningUtils.CompareVersions(orchestrationVersion, versioning.Version); + + switch (versioning.MatchStrategy) + { + case DurableTaskWorkerOptions.VersionMatchStrategy.None: + // No versioning, breakout. + break; + case DurableTaskWorkerOptions.VersionMatchStrategy.Strict: + // Comparison of 0 indicates equality. + if (versionComparison != 0) + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "VersionMismatch", + ErrorMessage = $"The orchestration version '{orchestrationVersion}' does not match the worker version '{versioning.Version}'.", + IsNonRetriable = true, + }; + } + + break; + case DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder: + // Comparison > 0 indicates the orchestration version is greater than the worker version. + if (versionComparison > 0) + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "VersionMismatch", + ErrorMessage = $"The orchestration version '{orchestrationVersion}' is greater than the worker version '{versioning.Version}'.", + IsNonRetriable = true, + }; + } + + break; + default: + // If there is a type of versioning we don't understand, it is better to treat it as a versioning failure. + failureDetails = new P.TaskFailureDetails + { + ErrorType = "VersionError", + ErrorMessage = $"The version match strategy '{orchestrationVersion}' is unknown.", + IsNonRetriable = true, + }; + break; + } + + versionCheckFailed = failureDetails != null; + } + + return failureDetails; + } + + async ValueTask BuildRuntimeStateAsync( + P.OrchestratorRequest orchestratorRequest, + ProtoUtils.EntityConversionState? entityConversionState, + CancellationToken cancellation) + { + Func converter = entityConversionState is null + ? ProtoUtils.ConvertHistoryEvent + : entityConversionState.ConvertFromProto; + + IEnumerable pastEvents = []; + if (orchestratorRequest.RequiresHistoryStreaming) + { + // Stream the remaining events from the remote service + P.StreamInstanceHistoryRequest streamRequest = new() + { + InstanceId = orchestratorRequest.InstanceId, + ExecutionId = orchestratorRequest.ExecutionId, + ForWorkItemProcessing = true, + }; + + using AsyncServerStreamingCall streamResponse = + this.client.StreamInstanceHistory(streamRequest, cancellationToken: cancellation); + + await foreach (P.HistoryChunk chunk in streamResponse.ResponseStream.ReadAllAsync(cancellation)) + { + pastEvents = pastEvents.Concat(chunk.Events.Select(converter)); + } + } + else + { + // The history was already provided in the work item request + pastEvents = orchestratorRequest.PastEvents.Select(converter); + } + + IEnumerable newEvents = orchestratorRequest.NewEvents.Select(converter); + + // Reconstruct the orchestration state in a way that correctly distinguishes new events from past events + var runtimeState = new OrchestrationRuntimeState(pastEvents.ToList()); + foreach (HistoryEvent e in newEvents) + { + // AddEvent() puts events into the NewEvents list. + runtimeState.AddEvent(e); + } + + if (runtimeState.ExecutionStartedEvent == null) + { + // TODO: What's the right way to handle this? Callback to the sidecar with a retriable error request? 
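// (Editor illustration, not part of the patch: the reconstruction above separates
// replay history from new work using only the two members referenced here --
// events passed to the OrchestrationRuntimeState constructor populate PastEvents
// (replayed without side effects), while events added via AddEvent land in
// NewEvents (executed as new work):
//
//     var state = new OrchestrationRuntimeState(pastEvents.ToList()); // replay set
//     foreach (HistoryEvent e in newEvents)
//     {
//         state.AddEvent(e); // new-work set
//     }
//
// A state without an ExecutionStartedEvent is incomplete, hence the throw below.)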
+ throw new InvalidOperationException("The provided orchestration history was incomplete"); + } + + return runtimeState; + } + + async Task> ConnectAsync(CancellationToken cancellation) + { + TimeSpan helloDeadline = this.internalOptions.HelloDeadline; + DateTime? deadline = null; + + if (helloDeadline > TimeSpan.Zero) + { + // Clamp to a UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot + // throw ArgumentOutOfRangeException out of DateTime.Add and so the gRPC deadline remains + // unambiguous during internal normalization. + DateTime now = DateTime.UtcNow; + DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc); + TimeSpan maxOffset = maxDeadlineUtc - now; + deadline = helloDeadline >= maxOffset ? maxDeadlineUtc : now.Add(helloDeadline); + } + + await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); + this.Logger.EstablishedWorkItemConnection(); + + DurableTaskWorkerOptions workerOptions = this.worker.workerOptions; + + // Get the stream for receiving work-items + return this.client!.GetWorkItems( + new P.GetWorkItemsRequest + { + MaxConcurrentActivityWorkItems = + workerOptions.Concurrency.MaximumConcurrentActivityWorkItems, + MaxConcurrentOrchestrationWorkItems = + workerOptions.Concurrency.MaximumConcurrentOrchestrationWorkItems, + MaxConcurrentEntityWorkItems = + workerOptions.Concurrency.MaximumConcurrentEntityWorkItems, + Capabilities = { this.worker.grpcOptions.Capabilities }, + WorkItemFilters = this.worker.workItemFilters?.ToGrpcWorkItemFilters(), + }, + cancellationToken: cancellation); + } + + async Task ProcessWorkItemsAsync( + AsyncServerStreamingCall stream, + CancellationToken cancellation, + Action? onFirstMessage = null, + Action? onChannelLikelyPoisoned = null) + { + // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — + // including health pings sent periodically by the server — arrive within the configured + // window. If that fires we treat the stream as silently disconnected and reconnect. + TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; + + // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop. + // The underlying IAsyncStreamReader is single-use — once the server terminates the stream + // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext + // returns false forever and re-entering await foreach would tight-spin with no yield. + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + ct => stream.ResponseStream.ReadAllAsync(ct), + silentDisconnectTimeout, + workItem => this.DispatchWorkItem(workItem, cancellation), + onFirstMessage, + cancellation); + + switch (result.Outcome) + { + case WorkItemStreamOutcome.Shutdown: + return; + + case WorkItemStreamOutcome.SilentDisconnect: + // Stream stopped producing messages (including health pings) for longer than the + // configured window. Treat as a poisoned channel. + this.Logger.ConnectionTimeout(); + onChannelLikelyPoisoned?.Invoke(); + return; + + case WorkItemStreamOutcome.GracefulDrain: + // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY + + // OK trailers when a DTS instance is being replaced). Log it explicitly so + // operators can see it. Only count it toward the channel-poisoned threshold when + // the stream produced no messages: a stream that successfully delivered work and + // was then closed by the server is healthy behavior (e.g. 
routine rolling + // upgrade), and counting those would let a long-lived process accumulate spurious + // "poison" credits across many healthy drains. An empty drain, on the other hand, + // is a strong signal the channel is latched onto a dead/evacuated backend and + // needs to be recreated to pick up fresh DNS/routing. + this.Logger.StreamEndedByPeer(); + if (!result.FirstMessageObserved) + { + onChannelLikelyPoisoned?.Invoke(); + } + + return; + } + } + + void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) + { + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunOrchestratorAsync( + workItem.OrchestratorRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunActivityAsync( + workItem.ActivityRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) + { + workItem.EntityRequestV2.ToEntityBatchRequest( + out EntityBatchRequest batchRequest, + out List operationInfos); + + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync( + batchRequest, + cancellation, + workItem.CompletionToken, + operationInfos), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) + { + // Health pings are heartbeat-only signals from the backend; the silent-disconnect + // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. + // Logging at Trace allows operators to confirm liveness without flooding info-level + // telemetry. + this.Logger.ReceivedHealthPing(); + } + else + { + this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); + } + } + + void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) + { + // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? + _ = Task.Run( + async () => + { + try + { + await handler(); + } + catch (OperationCanceledException) + { + // Shutting down - ignore + } + catch (Exception ex) + { + string instanceId = + workItem?.OrchestratorRequest?.InstanceId ?? + workItem?.ActivityRequest?.OrchestrationInstance?.InstanceId ?? + workItem?.EntityRequest?.InstanceId ?? + workItem?.EntityRequestV2?.InstanceId ?? + string.Empty; + this.Logger.UnexpectedError(ex, instanceId); + + if (workItem?.OrchestratorRequest != null) + { + try + { + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), cancellation); - this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? 
string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } - } - else if (workItem?.ActivityRequest != null) - { - try - { - this.Logger.AbandoningActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), + this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instanceId); + } + } + else if (workItem?.ActivityRequest != null) + { + try + { + this.Logger.AbandoningActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), cancellation); this.Logger.AbandonedActivityWorkItem( instanceId, workItem.ActivityRequest.Name, workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } - } - else if (workItem?.EntityRequest != null) - { - try - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem?.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), + workItem.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instanceId); + } + } + else if (workItem?.EntityRequest != null) + { + try + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), cancellation); this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); - } - } - else if (workItem?.EntityRequestV2 != null) - { - try - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? 
string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); + } + } + else if (workItem?.EntityRequestV2 != null) + { + try + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), cancellation); this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); - } - } - } - }); - } - - async Task OnRunOrchestratorAsync( - P.OrchestratorRequest request, - string completionToken, - CancellationToken cancellationToken) - { - var executionStartedEvent = - request - .NewEvents - .Concat(request.PastEvents) - .Where(e => e.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.ExecutionStarted) - .Select(e => e.ExecutionStarted) - .FirstOrDefault(); - - Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution( - executionStartedEvent, - request.OrchestrationTraceContext); - - if (executionStartedEvent is not null) - { - P.HistoryEvent? GetSuborchestrationInstanceCreatedEvent(int eventId) - { - var subOrchestrationEvent = - request - .PastEvents - .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCreated) - .FirstOrDefault(x => x.EventId == eventId); - - return subOrchestrationEvent; - } - - P.HistoryEvent? GetTaskScheduledEvent(int eventId) - { - var taskScheduledEvent = - request - .PastEvents - .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.TaskScheduled) - .LastOrDefault(x => x.EventId == eventId); - - return taskScheduledEvent; - } - - foreach (var newEvent in request.NewEvents) - { - switch (newEvent.EventTypeCase) - { - case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCompleted: - { - P.HistoryEvent? subOrchestrationInstanceCreatedEvent = - GetSuborchestrationInstanceCreatedEvent( - newEvent.SubOrchestrationInstanceCompleted.TaskScheduledId); - - TraceHelper.EmitTraceActivityForSubOrchestrationCompleted( - request.InstanceId, - subOrchestrationInstanceCreatedEvent, - subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceFailed: - { - P.HistoryEvent? subOrchestrationInstanceCreatedEvent = - GetSuborchestrationInstanceCreatedEvent( - newEvent.SubOrchestrationInstanceFailed.TaskScheduledId); - - TraceHelper.EmitTraceActivityForSubOrchestrationFailed( - request.InstanceId, - subOrchestrationInstanceCreatedEvent, - subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated, - newEvent.SubOrchestrationInstanceFailed); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TaskCompleted: - { - P.HistoryEvent? taskScheduledEvent = - GetTaskScheduledEvent(newEvent.TaskCompleted.TaskScheduledId); - - TraceHelper.EmitTraceActivityForTaskCompleted( - request.InstanceId, - taskScheduledEvent, - taskScheduledEvent?.TaskScheduled); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TaskFailed: - { - P.HistoryEvent? 
taskScheduledEvent = - GetTaskScheduledEvent(newEvent.TaskFailed.TaskScheduledId); - - TraceHelper.EmitTraceActivityForTaskFailed( - request.InstanceId, - taskScheduledEvent, - taskScheduledEvent?.TaskScheduled, - newEvent.TaskFailed); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TimerFired: - TraceHelper.EmitTraceActivityForTimer( - request.InstanceId, - executionStartedEvent.Name, - newEvent.Timestamp.ToDateTime(), - newEvent.TimerFired); - break; - } - } - } - - OrchestratorExecutionResult? result = null; - P.TaskFailureDetails? failureDetails = null; - TaskName name = new("(unknown)"); - - ProtoUtils.EntityConversionState? entityConversionState = - this.internalOptions.ConvertOrchestrationEntityEvents - ? new(this.internalOptions.InsertEntityUnlocksOnCompletion) - : null; - - DurableTaskWorkerOptions.VersioningOptions? versioning = this.worker.workerOptions.Versioning; - bool versionFailure = false; - try - { - OrchestrationRuntimeState runtimeState = await this.BuildRuntimeStateAsync( - request, - entityConversionState, - cancellationToken); - - bool filterPassed = true; - if (this.orchestrationFilter != null) - { - filterPassed = await this.orchestrationFilter.IsOrchestrationValidAsync( - new OrchestrationFilterParameters - { - Name = runtimeState.Name, - Tags = runtimeState.Tags != null ? new Dictionary(runtimeState.Tags) : null, - }, - cancellationToken); - } - - if (!filterPassed) - { - this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; - } - - // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. - failureDetails = EvaluateOrchestrationVersioning(versioning, runtimeState.Version, out versionFailure); - - // Only continue with the work if the versioning check passed. - if (failureDetails == null) - { - name = new TaskName(runtimeState.Name); - - this.Logger.ReceivedOrchestratorRequest( - name, - request.InstanceId, - runtimeState.PastEvents.Count, - runtimeState.NewEvents.Count); - - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - if (this.worker.Factory.TryCreateOrchestrator( - name, scope.ServiceProvider, out ITaskOrchestrator? orchestrator)) - { - // Both the factory invocation and the ExecuteAsync could involve user code and need to be handled - // as part of try/catch. - ParentOrchestrationInstance? 
parent = runtimeState.ParentInstance switch - { - ParentInstance p => new(new(p.Name), p.OrchestrationInstance.InstanceId), - _ => null, - }; - - TaskOrchestration shim = this.shimFactory.CreateOrchestration(name, orchestrator, parent); - TaskOrchestrationExecutor executor = new( - runtimeState, - shim, - BehaviorOnContinueAsNew.Carryover, - request.EntityParameters.ToCore(), - ErrorPropagationMode.UseFailureDetails, - this.exceptionPropertiesProvider); - result = executor.Execute(); - } - else - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "OrchestratorTaskNotFound", - ErrorMessage = $"No orchestrator task named '{name}' was found.", - IsNonRetriable = true, - }; - } - } - } - catch (Exception unexpected) - { - // This is not expected: Normally TaskOrchestrationExecutor handles exceptions in user code. - this.Logger.OrchestratorFailed(name, request.InstanceId, unexpected.ToString()); - failureDetails = unexpected.ToTaskFailureDetails(this.exceptionPropertiesProvider); - } - - P.OrchestratorResponse response; - if (result != null) - { - response = ProtoUtils.ConstructOrchestratorResponse( - request.InstanceId, - request.ExecutionId, - result.CustomStatus, - result.Actions, - completionToken, - entityConversionState, - traceActivity); - } - else if (versioning != null && failureDetails != null && versionFailure) - { - this.Logger.OrchestrationVersionFailure(versioning.FailureStrategy.ToString(), failureDetails.ErrorMessage); - if (versioning.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Fail) - { - response = new P.OrchestratorResponse - { - InstanceId = request.InstanceId, - CompletionToken = completionToken, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = failureDetails, - }, - }, - }, - }; - } - else - { - this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; - } - } - else - { - // This is the case for failures that happened *outside* the orchestrator executor - response = new P.OrchestratorResponse - { - InstanceId = request.InstanceId, - CompletionToken = completionToken, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = failureDetails, - }, - }, - }, - }; - } - - var completeOrchestrationAction = response.Actions.FirstOrDefault( - a => a.CompleteOrchestration is not null); - - if (completeOrchestrationAction is not null) - { - if (completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus == P.OrchestrationStatus.Failed) - { - traceActivity?.SetStatus( - ActivityStatusCode.Error, - completeOrchestrationAction.CompleteOrchestration.Result); - } - - traceActivity?.SetTag( - Schema.Task.Status, - completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus.ToString()); - - traceActivity?.Dispose(); - } - - this.Logger.SendingOrchestratorResponse( - name, - response.InstanceId, - response.Actions.Count, - GetActionsListForLogging(response.Actions)); - - await this.CompleteOrchestratorTaskWithChunkingAsync( - 
response, - this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, - cancellationToken); - } - - async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) - { - using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); - - OrchestrationInstance instance = request.OrchestrationInstance.ToCore(); - string rawInput = request.Input; - int inputSize = rawInput != null ? Encoding.UTF8.GetByteCount(rawInput) : 0; - this.Logger.ReceivedActivityRequest(request.Name, request.TaskId, instance.InstanceId, inputSize); - - P.TaskFailureDetails? failureDetails = null; - TaskContext innerContext = new(instance); - innerContext.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; - - TaskName name = new(request.Name); - string? output = null; - - failureDetails = EvaluateOrchestrationVersioning(this.worker.workerOptions.Versioning, request.Version, out bool versioningFailed); - if (!versioningFailed) - { - try - { - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - if (this.worker.Factory.TryCreateActivity(name, scope.ServiceProvider, out ITaskActivity? activity)) - { - // Both the factory invocation and the RunAsync could involve user code and need to be handled as - // part of try/catch. - TaskActivity shim = this.shimFactory.CreateActivity(name, activity); - output = await shim.RunAsync(innerContext, request.Input); - } - else - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "ActivityTaskNotFound", - ErrorMessage = $"No activity task named '{name}' was found.", - IsNonRetriable = true, - }; - } - } - catch (Exception applicationException) - { - failureDetails = applicationException.ToTaskFailureDetails(this.exceptionPropertiesProvider); - } - } - else - { - if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) - { - this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); - } - - return; - } - - int outputSizeInBytes = 0; - if (failureDetails != null) - { - traceActivity?.SetStatus(ActivityStatusCode.Error, failureDetails.ErrorMessage); - - outputSizeInBytes = failureDetails.GetApproximateByteCount(); - } - else if (output != null) - { - outputSizeInBytes = Encoding.UTF8.GetByteCount(output); - } - - string successOrFailure = failureDetails != null ? "failure" : "success"; - this.Logger.SendingActivityResponse( - successOrFailure, name, request.TaskId, instance.InstanceId, outputSizeInBytes); - - P.ActivityResponse response = new() - { - InstanceId = instance.InstanceId, - TaskId = request.TaskId, - Result = output, - FailureDetails = failureDetails, - CompletionToken = completionToken, - }; - - // Stop the trace activity here to avoid including the completion time in the latency calculation - traceActivity?.Stop(); - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteActivityTaskAsync), - cancellation); - } - - async Task OnRunEntityBatchAsync( - EntityBatchRequest batchRequest, - CancellationToken cancellation, - string? 
completionToken = null, - List? operationInfos = null) - { - var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); - EntityId entityId = new(coreEntityId.Name, coreEntityId.Key); - - TaskName name = new(entityId.Name); - - EntityBatchResult? batchResult; - - try - { - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - IDurableTaskFactory2 factory = (IDurableTaskFactory2)this.worker.Factory; - - if (factory.TryCreateEntity(name, scope.ServiceProvider, out ITaskEntity? entity)) - { - // Both the factory invocation and the RunAsync could involve user code and need to be handled as - // part of try/catch. - TaskEntity shim = this.shimFactory.CreateEntity(name, entity, entityId); - batchResult = await shim.ExecuteOperationBatchAsync(batchRequest); - } - else - { - // we could not find the entity. This is considered an application error, - // so we return a non-retriable error-OperationResult for each operation in the batch. - batchResult = new EntityBatchResult() - { - Actions = [], // no actions - EntityState = batchRequest.EntityState, // state is unmodified - Results = Enumerable.Repeat( - new OperationResult() - { - FailureDetails = new FailureDetails( - errorType: "EntityTaskNotFound", - errorMessage: $"No entity task named '{name}' was found.", - stackTrace: null, - innerFailure: null, - isNonRetriable: true), - }, - batchRequest.Operations!.Count).ToList(), - FailureDetails = null, - }; - } - } - catch (Exception frameworkException) - { - // return a result with failure details. - // this will cause the batch to be abandoned and retried - // (possibly after a delay and on a different worker). - batchResult = new EntityBatchResult() - { - FailureDetails = new FailureDetails(frameworkException), - }; - } - - P.EntityBatchResult response = batchResult.ToEntityBatchResult( - completionToken, - operationInfos?.Take(batchResult.Results?.Count ?? 0)); - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteEntityTaskAsync), - cancellation); - } - - /// - /// Completes an orchestration task with automatic chunking if the response exceeds the maximum size. - /// - /// The orchestrator response to send. - /// The maximum size in bytes for each chunk. - /// The cancellation token. - async Task CompleteOrchestratorTaskWithChunkingAsync( - P.OrchestratorResponse response, - int maxChunkBytes, - CancellationToken cancellationToken) - { - // Validate that no single action exceeds the maximum chunk size - static P.TaskFailureDetails? ValidateActionsSize(IEnumerable actions, int maxChunkBytes) - { - foreach (P.OrchestratorAction action in actions) - { - int actionSize = action.CalculateSize(); - if (actionSize > maxChunkBytes) - { - // TODO: large payload doc is not available yet on aka.ms, add doc link to below error message - string errorMessage = $"A single orchestrator action of type {action.OrchestratorActionTypeCase} with id {action.Id} " + - $"exceeds the {maxChunkBytes / 1024.0 / 1024.0:F2}MB limit: {actionSize / 1024.0 / 1024.0:F2}MB. " + - "Enable large-payload externalization to Azure Blob Storage to support oversized actions."; - return new P.TaskFailureDetails - { - ErrorType = typeof(InvalidOperationException).FullName, - ErrorMessage = errorMessage, - IsNonRetriable = true, - }; - } - } - - return null; - } - - P.TaskFailureDetails? 
validationFailure = this.worker.grpcOptions.Capabilities.Contains(P.WorkerCapability.LargePayloads) - ? null - : ValidateActionsSize(response.Actions, maxChunkBytes); - if (validationFailure != null) - { - // Complete the orchestration with a failed status and failure details - P.OrchestratorResponse failureResponse = new() - { - InstanceId = response.InstanceId, - CompletionToken = response.CompletionToken, - OrchestrationTraceContext = response.OrchestrationTraceContext, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = validationFailure, - }, - }, - }, - }; - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - return; - } - - // Helper to add an action to the current chunk if it fits - static bool TryAddAction( - Google.Protobuf.Collections.RepeatedField dest, - P.OrchestratorAction action, - ref int currentSize, - int maxChunkBytes) - { - int actionSize = action.CalculateSize(); - if (currentSize + actionSize > maxChunkBytes && currentSize > 0) - { - return false; - } - - dest.Add(action); - currentSize += actionSize; - return true; - } - - // Check if the entire response fits in one chunk - int totalSize = response.CalculateSize(); - if (totalSize <= maxChunkBytes) - { - // Response fits in one chunk, send it directly (isPartial defaults to false) - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - return; - } - - // Response is too large, split into multiple chunks - int actionsCompletedSoFar = 0, chunkIndex = 0; - List allActions = response.Actions.ToList(); - bool isPartial = true; - bool isChunkedMode = false; - - while (isPartial) - { - P.OrchestratorResponse chunkedResponse = new() - { - InstanceId = response.InstanceId, - CustomStatus = response.CustomStatus, - CompletionToken = response.CompletionToken, - RequiresHistory = response.RequiresHistory, - NumEventsProcessed = 0, - }; - - int chunkPayloadSize = 0; - - // Fill the chunk with actions until we reach the size limit - while (actionsCompletedSoFar < allActions.Count && - TryAddAction(chunkedResponse.Actions, allActions[actionsCompletedSoFar], ref chunkPayloadSize, maxChunkBytes)) - { - actionsCompletedSoFar++; - } - - // Determine if this is a partial chunk (more actions remaining) - isPartial = actionsCompletedSoFar < allActions.Count; - chunkedResponse.IsPartial = isPartial; - - // Only activate chunked mode when we actually need multiple chunks. - // A single oversized action that fits in one chunk (via TryAddAction allowing - // the first item in an empty chunk) should be sent as non-chunked to avoid - // backend issues with ChunkIndex=0 + IsPartial=false. - if (isPartial) - { - isChunkedMode = true; - } - - if (isChunkedMode) - { - chunkedResponse.ChunkIndex = chunkIndex; - } - - if (chunkIndex == 0) - { - // The first chunk preserves the original response's NumEventsProcessed value (null) - // When this is set to null, backend by default handles all the messages in the workitem. - // For subsequent chunks, we set it to 0 since all messages are already handled in first chunk. 
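// Illustrative walkthrough (assumed values, not part of this patch): with a 4 MB cap and
// three actions of 3 MB, 3 MB, and 1 MB, the greedy fill above produces chunk 0 = [3 MB]
// (adding the second action would exceed the cap) and chunk 1 = [3 MB, 1 MB]. Chunk 0 is
// sent with IsPartial = true and NumEventsProcessed = null; chunk 1 with IsPartial = false
// and NumEventsProcessed = 0. Both carry ChunkIndex values (0 and 1), because chunked mode
// latches on once any chunk is partial.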
- chunkedResponse.NumEventsProcessed = null; - chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext; - } - - chunkIndex++; - - // Send the chunk - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - } - } - - async Task ExecuteWithRetryAsync( - Func action, - string operationName, - CancellationToken cancellationToken) - { - const int maxAttempts = 10; - TimeSpan delay = TimeSpan.FromMilliseconds(200); - - for (int attempt = 1; ; attempt++) - { - try - { - await action(); - return; - } - catch (RpcException ex) when ( - (ex.StatusCode == StatusCode.Unavailable || - ex.StatusCode == StatusCode.Unknown || - ex.StatusCode == StatusCode.DeadlineExceeded || - ex.StatusCode == StatusCode.Internal) && - attempt < maxAttempts) - { - // Back off with jitter for transient transport errors -#if NET6_0_OR_GREATER - int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2)); -#else - int jitterMs = new Random().Next(0, (int)(delay.TotalMilliseconds * 0.2)); -#endif - TimeSpan backoff = delay + TimeSpan.FromMilliseconds(jitterMs); - - this.Logger.TransientGrpcRetry( - operationName, - attempt, - maxAttempts, - backoff.TotalMilliseconds, - (int)ex.StatusCode, - ex); - - try - { - await Task.Delay(backoff, cancellationToken); - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) - { - // If shutting down during the retry delay, propagate the cancellation exception - throw; - } - - // Exponential increase, capping at 15 seconds - delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000)); - continue; - } - } - } - } -} + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); + } + } + } + }); + } + + async Task OnRunOrchestratorAsync( + P.OrchestratorRequest request, + string completionToken, + CancellationToken cancellationToken) + { + var executionStartedEvent = + request + .NewEvents + .Concat(request.PastEvents) + .Where(e => e.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.ExecutionStarted) + .Select(e => e.ExecutionStarted) + .FirstOrDefault(); + + Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution( + executionStartedEvent, + request.OrchestrationTraceContext); + + if (executionStartedEvent is not null) + { + P.HistoryEvent? GetSuborchestrationInstanceCreatedEvent(int eventId) + { + var subOrchestrationEvent = + request + .PastEvents + .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCreated) + .FirstOrDefault(x => x.EventId == eventId); + + return subOrchestrationEvent; + } + + P.HistoryEvent? GetTaskScheduledEvent(int eventId) + { + var taskScheduledEvent = + request + .PastEvents + .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.TaskScheduled) + .LastOrDefault(x => x.EventId == eventId); + + return taskScheduledEvent; + } + + foreach (var newEvent in request.NewEvents) + { + switch (newEvent.EventTypeCase) + { + case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCompleted: + { + P.HistoryEvent? 
subOrchestrationInstanceCreatedEvent = + GetSuborchestrationInstanceCreatedEvent( + newEvent.SubOrchestrationInstanceCompleted.TaskScheduledId); + + TraceHelper.EmitTraceActivityForSubOrchestrationCompleted( + request.InstanceId, + subOrchestrationInstanceCreatedEvent, + subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceFailed: + { + P.HistoryEvent? subOrchestrationInstanceCreatedEvent = + GetSuborchestrationInstanceCreatedEvent( + newEvent.SubOrchestrationInstanceFailed.TaskScheduledId); + + TraceHelper.EmitTraceActivityForSubOrchestrationFailed( + request.InstanceId, + subOrchestrationInstanceCreatedEvent, + subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated, + newEvent.SubOrchestrationInstanceFailed); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TaskCompleted: + { + P.HistoryEvent? taskScheduledEvent = + GetTaskScheduledEvent(newEvent.TaskCompleted.TaskScheduledId); + + TraceHelper.EmitTraceActivityForTaskCompleted( + request.InstanceId, + taskScheduledEvent, + taskScheduledEvent?.TaskScheduled); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TaskFailed: + { + P.HistoryEvent? taskScheduledEvent = + GetTaskScheduledEvent(newEvent.TaskFailed.TaskScheduledId); + + TraceHelper.EmitTraceActivityForTaskFailed( + request.InstanceId, + taskScheduledEvent, + taskScheduledEvent?.TaskScheduled, + newEvent.TaskFailed); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TimerFired: + TraceHelper.EmitTraceActivityForTimer( + request.InstanceId, + executionStartedEvent.Name, + newEvent.Timestamp.ToDateTime(), + newEvent.TimerFired); + break; + } + } + } + + OrchestratorExecutionResult? result = null; + P.TaskFailureDetails? failureDetails = null; + TaskName name = new("(unknown)"); + + ProtoUtils.EntityConversionState? entityConversionState = + this.internalOptions.ConvertOrchestrationEntityEvents + ? new(this.internalOptions.InsertEntityUnlocksOnCompletion) + : null; + + DurableTaskWorkerOptions.VersioningOptions? versioning = this.worker.workerOptions.Versioning; + bool versionFailure = false; + try + { + OrchestrationRuntimeState runtimeState = await this.BuildRuntimeStateAsync( + request, + entityConversionState, + cancellationToken); + + bool filterPassed = true; + if (this.orchestrationFilter != null) + { + filterPassed = await this.orchestrationFilter.IsOrchestrationValidAsync( + new OrchestrationFilterParameters + { + Name = runtimeState.Name, + Tags = runtimeState.Tags != null ? new Dictionary(runtimeState.Tags) : null, + }, + cancellationToken); + } + + if (!filterPassed) + { + this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + + return; + } + + // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. + failureDetails = EvaluateOrchestrationVersioning(versioning, runtimeState.Version, out versionFailure); + + // Only continue with the work if the versioning check passed. 
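// Aside: a minimal sketch of the comparison contract the check above relies on. The real
// logic lives in TaskOrchestrationVersioningUtils.CompareVersions; this local stand-in is
// hypothetical. Negative = orchestration older than the worker, 0 = equal, positive = newer.
// Strict demands a result of exactly 0; CurrentOrOlder rejects anything greater than 0.
static int CompareVersionsSketch(string orchestration, string worker) =>
    Version.TryParse(orchestration, out Version? o) && Version.TryParse(worker, out Version? w)
        ? o.CompareTo(w)
        : string.Compare(orchestration, worker, StringComparison.OrdinalIgnoreCase);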
+ if (failureDetails == null) + { + name = new TaskName(runtimeState.Name); + + this.Logger.ReceivedOrchestratorRequest( + name, + request.InstanceId, + runtimeState.PastEvents.Count, + runtimeState.NewEvents.Count); + + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + if (this.worker.Factory.TryCreateOrchestrator( + name, scope.ServiceProvider, out ITaskOrchestrator? orchestrator)) + { + // Both the factory invocation and the ExecuteAsync could involve user code and need to be handled + // as part of try/catch. + ParentOrchestrationInstance? parent = runtimeState.ParentInstance switch + { + ParentInstance p => new(new(p.Name), p.OrchestrationInstance.InstanceId), + _ => null, + }; + + TaskOrchestration shim = this.shimFactory.CreateOrchestration(name, orchestrator, parent); + TaskOrchestrationExecutor executor = new( + runtimeState, + shim, + BehaviorOnContinueAsNew.Carryover, + request.EntityParameters.ToCore(), + ErrorPropagationMode.UseFailureDetails, + this.exceptionPropertiesProvider); + result = executor.Execute(); + } + else + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "OrchestratorTaskNotFound", + ErrorMessage = $"No orchestrator task named '{name}' was found.", + IsNonRetriable = true, + }; + } + } + } + catch (Exception unexpected) + { + // This is not expected: Normally TaskOrchestrationExecutor handles exceptions in user code. + this.Logger.OrchestratorFailed(name, request.InstanceId, unexpected.ToString()); + failureDetails = unexpected.ToTaskFailureDetails(this.exceptionPropertiesProvider); + } + + P.OrchestratorResponse response; + if (result != null) + { + response = ProtoUtils.ConstructOrchestratorResponse( + request.InstanceId, + request.ExecutionId, + result.CustomStatus, + result.Actions, + completionToken, + entityConversionState, + traceActivity); + } + else if (versioning != null && failureDetails != null && versionFailure) + { + this.Logger.OrchestrationVersionFailure(versioning.FailureStrategy.ToString(), failureDetails.ErrorMessage); + if (versioning.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Fail) + { + response = new P.OrchestratorResponse + { + InstanceId = request.InstanceId, + CompletionToken = completionToken, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = failureDetails, + }, + }, + }, + }; + } + else + { + this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + + return; + } + } + else + { + // This is the case for failures that happened *outside* the orchestrator executor + response = new P.OrchestratorResponse + { + InstanceId = request.InstanceId, + CompletionToken = completionToken, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = failureDetails, + }, + }, + }, + }; + } + + var completeOrchestrationAction = response.Actions.FirstOrDefault( + a => a.CompleteOrchestration is not null); + + if (completeOrchestrationAction is not null) + { + if 
(completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus == P.OrchestrationStatus.Failed) + { + traceActivity?.SetStatus( + ActivityStatusCode.Error, + completeOrchestrationAction.CompleteOrchestration.Result); + } + + traceActivity?.SetTag( + Schema.Task.Status, + completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus.ToString()); + + traceActivity?.Dispose(); + } + + this.Logger.SendingOrchestratorResponse( + name, + response.InstanceId, + response.Actions.Count, + GetActionsListForLogging(response.Actions)); + + await this.CompleteOrchestratorTaskWithChunkingAsync( + response, + this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, + cancellationToken); + } + + async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) + { + using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); + + OrchestrationInstance instance = request.OrchestrationInstance.ToCore(); + string rawInput = request.Input; + int inputSize = rawInput != null ? Encoding.UTF8.GetByteCount(rawInput) : 0; + this.Logger.ReceivedActivityRequest(request.Name, request.TaskId, instance.InstanceId, inputSize); + + P.TaskFailureDetails? failureDetails = null; + TaskContext innerContext = new(instance); + innerContext.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; + + TaskName name = new(request.Name); + string? output = null; + + failureDetails = EvaluateOrchestrationVersioning(this.worker.workerOptions.Versioning, request.Version, out bool versioningFailed); + if (!versioningFailed) + { + try + { + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + if (this.worker.Factory.TryCreateActivity(name, scope.ServiceProvider, out ITaskActivity? activity)) + { + // Both the factory invocation and the RunAsync could involve user code and need to be handled as + // part of try/catch. + TaskActivity shim = this.shimFactory.CreateActivity(name, activity); + output = await shim.RunAsync(innerContext, request.Input); + } + else + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "ActivityTaskNotFound", + ErrorMessage = $"No activity task named '{name}' was found.", + IsNonRetriable = true, + }; + } + } + catch (Exception applicationException) + { + failureDetails = applicationException.ToTaskFailureDetails(this.exceptionPropertiesProvider); + } + } + else + { + if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) + { + this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + } + + return; + } + + int outputSizeInBytes = 0; + if (failureDetails != null) + { + traceActivity?.SetStatus(ActivityStatusCode.Error, failureDetails.ErrorMessage); + + outputSizeInBytes = failureDetails.GetApproximateByteCount(); + } + else if (output != null) + { + outputSizeInBytes = Encoding.UTF8.GetByteCount(output); + } + + string successOrFailure = failureDetails != null ? 
"failure" : "success"; + this.Logger.SendingActivityResponse( + successOrFailure, name, request.TaskId, instance.InstanceId, outputSizeInBytes); + + P.ActivityResponse response = new() + { + InstanceId = instance.InstanceId, + TaskId = request.TaskId, + Result = output, + FailureDetails = failureDetails, + CompletionToken = completionToken, + }; + + // Stop the trace activity here to avoid including the completion time in the latency calculation + traceActivity?.Stop(); + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteActivityTaskAsync), + cancellation); + } + + async Task OnRunEntityBatchAsync( + EntityBatchRequest batchRequest, + CancellationToken cancellation, + string? completionToken = null, + List? operationInfos = null) + { + var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); + EntityId entityId = new(coreEntityId.Name, coreEntityId.Key); + + TaskName name = new(entityId.Name); + + EntityBatchResult? batchResult; + + try + { + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + IDurableTaskFactory2 factory = (IDurableTaskFactory2)this.worker.Factory; + + if (factory.TryCreateEntity(name, scope.ServiceProvider, out ITaskEntity? entity)) + { + // Both the factory invocation and the RunAsync could involve user code and need to be handled as + // part of try/catch. + TaskEntity shim = this.shimFactory.CreateEntity(name, entity, entityId); + batchResult = await shim.ExecuteOperationBatchAsync(batchRequest); + } + else + { + // we could not find the entity. This is considered an application error, + // so we return a non-retriable error-OperationResult for each operation in the batch. + batchResult = new EntityBatchResult() + { + Actions = [], // no actions + EntityState = batchRequest.EntityState, // state is unmodified + Results = Enumerable.Repeat( + new OperationResult() + { + FailureDetails = new FailureDetails( + errorType: "EntityTaskNotFound", + errorMessage: $"No entity task named '{name}' was found.", + stackTrace: null, + innerFailure: null, + isNonRetriable: true), + }, + batchRequest.Operations!.Count).ToList(), + FailureDetails = null, + }; + } + } + catch (Exception frameworkException) + { + // return a result with failure details. + // this will cause the batch to be abandoned and retried + // (possibly after a delay and on a different worker). + batchResult = new EntityBatchResult() + { + FailureDetails = new FailureDetails(frameworkException), + }; + } + + P.EntityBatchResult response = batchResult.ToEntityBatchResult( + completionToken, + operationInfos?.Take(batchResult.Results?.Count ?? 0)); + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteEntityTaskAsync), + cancellation); + } + + /// + /// Completes an orchestration task with automatic chunking if the response exceeds the maximum size. + /// + /// The orchestrator response to send. + /// The maximum size in bytes for each chunk. + /// The cancellation token. + async Task CompleteOrchestratorTaskWithChunkingAsync( + P.OrchestratorResponse response, + int maxChunkBytes, + CancellationToken cancellationToken) + { + // Validate that no single action exceeds the maximum chunk size + static P.TaskFailureDetails? 
ValidateActionsSize(IEnumerable actions, int maxChunkBytes) + { + foreach (P.OrchestratorAction action in actions) + { + int actionSize = action.CalculateSize(); + if (actionSize > maxChunkBytes) + { + // TODO: large payload doc is not available yet on aka.ms, add doc link to below error message + string errorMessage = $"A single orchestrator action of type {action.OrchestratorActionTypeCase} with id {action.Id} " + + $"exceeds the {maxChunkBytes / 1024.0 / 1024.0:F2}MB limit: {actionSize / 1024.0 / 1024.0:F2}MB. " + + "Enable large-payload externalization to Azure Blob Storage to support oversized actions."; + return new P.TaskFailureDetails + { + ErrorType = typeof(InvalidOperationException).FullName, + ErrorMessage = errorMessage, + IsNonRetriable = true, + }; + } + } + + return null; + } + + P.TaskFailureDetails? validationFailure = this.worker.grpcOptions.Capabilities.Contains(P.WorkerCapability.LargePayloads) + ? null + : ValidateActionsSize(response.Actions, maxChunkBytes); + if (validationFailure != null) + { + // Complete the orchestration with a failed status and failure details + P.OrchestratorResponse failureResponse = new() + { + InstanceId = response.InstanceId, + CompletionToken = response.CompletionToken, + OrchestrationTraceContext = response.OrchestrationTraceContext, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = validationFailure, + }, + }, + }, + }; + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + return; + } + + // Helper to add an action to the current chunk if it fits + static bool TryAddAction( + Google.Protobuf.Collections.RepeatedField dest, + P.OrchestratorAction action, + ref int currentSize, + int maxChunkBytes) + { + int actionSize = action.CalculateSize(); + if (currentSize + actionSize > maxChunkBytes && currentSize > 0) + { + return false; + } + + dest.Add(action); + currentSize += actionSize; + return true; + } + + // Check if the entire response fits in one chunk + int totalSize = response.CalculateSize(); + if (totalSize <= maxChunkBytes) + { + // Response fits in one chunk, send it directly (isPartial defaults to false) + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + return; + } + + // Response is too large, split into multiple chunks + int actionsCompletedSoFar = 0, chunkIndex = 0; + List allActions = response.Actions.ToList(); + bool isPartial = true; + bool isChunkedMode = false; + + while (isPartial) + { + P.OrchestratorResponse chunkedResponse = new() + { + InstanceId = response.InstanceId, + CustomStatus = response.CustomStatus, + CompletionToken = response.CompletionToken, + RequiresHistory = response.RequiresHistory, + NumEventsProcessed = 0, + }; + + int chunkPayloadSize = 0; + + // Fill the chunk with actions until we reach the size limit + while (actionsCompletedSoFar < allActions.Count && + TryAddAction(chunkedResponse.Actions, allActions[actionsCompletedSoFar], ref chunkPayloadSize, maxChunkBytes)) + { + actionsCompletedSoFar++; + } + + // Determine if this is a partial chunk (more actions remaining) + isPartial = actionsCompletedSoFar < 
allActions.Count; + chunkedResponse.IsPartial = isPartial; + + // Only activate chunked mode when we actually need multiple chunks. + // A single oversized action that fits in one chunk (via TryAddAction allowing + // the first item in an empty chunk) should be sent as non-chunked to avoid + // backend issues with ChunkIndex=0 + IsPartial=false. + if (isPartial) + { + isChunkedMode = true; + } + + if (isChunkedMode) + { + chunkedResponse.ChunkIndex = chunkIndex; + } + + if (chunkIndex == 0) + { + // The first chunk preserves the original response's NumEventsProcessed value (null) + // When this is set to null, backend by default handles all the messages in the workitem. + // For subsequent chunks, we set it to 0 since all messages are already handled in first chunk. + chunkedResponse.NumEventsProcessed = null; + chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext; + } + + chunkIndex++; + + // Send the chunk + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + } + } + + async Task ExecuteWithRetryAsync( + Func action, + string operationName, + CancellationToken cancellationToken) + { + const int maxAttempts = 10; + TimeSpan delay = TimeSpan.FromMilliseconds(200); + + for (int attempt = 1; ; attempt++) + { + try + { + await action(); + return; + } + catch (RpcException ex) when ( + (ex.StatusCode == StatusCode.Unavailable || + ex.StatusCode == StatusCode.Unknown || + ex.StatusCode == StatusCode.DeadlineExceeded || + ex.StatusCode == StatusCode.Internal) && + attempt < maxAttempts) + { + // Back off with jitter for transient transport errors +#if NET6_0_OR_GREATER + int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2)); +#else + int jitterMs = new Random().Next(0, (int)(delay.TotalMilliseconds * 0.2)); +#endif + TimeSpan backoff = delay + TimeSpan.FromMilliseconds(jitterMs); + + this.Logger.TransientGrpcRetry( + operationName, + attempt, + maxAttempts, + backoff.TotalMilliseconds, + (int)ex.StatusCode, + ex); + + try + { + await Task.Delay(backoff, cancellationToken); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // If shutting down during the retry delay, propagate the cancellation exception + throw; + } + + // Exponential increase, capping at 15 seconds + delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000)); + continue; + } + } + } + } +} From 28b2839d80f2272b3a325346f87d89310b5951d3 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 10:57:12 -0700 Subject: [PATCH 10/36] fixed the line endings --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 2534 ++++++++--------- 1 file changed, 1267 insertions(+), 1267 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index d4f8fa61e..ce30b7afa 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -1,1267 +1,1267 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. 
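// Aside - a compact, self-contained recap of the retry policy this patch series introduces
// (the helper name here is illustrative, not part of the patch): the delay starts at 200 ms,
// doubles per attempt, is capped at 15 s, and each wait is padded with up to 20% jitter.
static IEnumerable<TimeSpan> BackoffScheduleSketch(int maxAttempts = 10)
{
    double delayMs = 200;
    for (int attempt = 1; attempt < maxAttempts; attempt++)
    {
        // The real helper adds jitter on top of this base: delay + Random(0 .. 0.2 * delay).
        yield return TimeSpan.FromMilliseconds(delayMs);
        delayMs = Math.Min(delayMs * 2, 15_000);
    }
}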
- -using System.Diagnostics; -using System.Linq; -using System.Text; -using DurableTask.Core; -using DurableTask.Core.Entities; -using DurableTask.Core.Entities.OperationFormat; -using DurableTask.Core.History; -using Google.Protobuf; -using Microsoft.DurableTask.Abstractions; -using Microsoft.DurableTask.Entities; -using Microsoft.DurableTask.Tracing; -using Microsoft.DurableTask.Worker.Grpc.Internal; -using Microsoft.DurableTask.Worker.Shims; -using Microsoft.Extensions.DependencyInjection; -using Microsoft.Extensions.Logging; -using static Microsoft.DurableTask.Protobuf.TaskHubSidecarService; -using ActivityStatusCode = System.Diagnostics.ActivityStatusCode; -using DTCore = DurableTask.Core; -using P = Microsoft.DurableTask.Protobuf; - -namespace Microsoft.DurableTask.Worker.Grpc; - -/// -/// The gRPC Durable Task worker. -/// -sealed partial class GrpcDurableTaskWorker -{ - class Processor - { - static readonly Google.Protobuf.WellKnownTypes.Empty EmptyMessage = new(); - - readonly GrpcDurableTaskWorker worker; - readonly TaskHubSidecarServiceClient client; - readonly DurableTaskShimFactory shimFactory; - readonly GrpcDurableTaskWorkerOptions.InternalOptions internalOptions; - readonly DTCore.IExceptionPropertiesProvider? exceptionPropertiesProvider; - [Obsolete("Experimental")] - readonly IOrchestrationFilter? orchestrationFilter; - - public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient client, IOrchestrationFilter? orchestrationFilter = null, IExceptionPropertiesProvider? exceptionPropertiesProvider = null) - { - this.worker = worker; - this.client = client; - this.shimFactory = new DurableTaskShimFactory(this.worker.grpcOptions, this.worker.loggerFactory); - this.internalOptions = this.worker.grpcOptions.Internal; - this.orchestrationFilter = orchestrationFilter; - this.exceptionPropertiesProvider = exceptionPropertiesProvider is not null - ? new ExceptionPropertiesProviderAdapter(exceptionPropertiesProvider) - : null; - } - - ILogger Logger => this.worker.logger; - - public async Task ExecuteAsync(CancellationToken cancellation) - { - // Tracks consecutive failures against the same channel. Reset only after the stream - // has actually delivered a message (HelloAsync alone is not proof the channel is healthy). - int consecutiveChannelFailures = 0; - - // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. - int reconnectAttempt = 0; - Random backoffRandom = ReconnectBackoff.CreateRandom(); - - while (!cancellation.IsCancellationRequested) - { - bool channelLikelyPoisoned = false; - try - { - using AsyncServerStreamingCall stream = await this.ConnectAsync(cancellation); - await this.ProcessWorkItemsAsync( - stream, - cancellation, - onFirstMessage: () => - { - consecutiveChannelFailures = 0; - reconnectAttempt = 0; - }, - onChannelLikelyPoisoned: () => channelLikelyPoisoned = true); - } - catch (RpcException) when (cancellation.IsCancellationRequested) - { - // Worker is shutting down - let the method exit gracefully - return ProcessorExitReason.Shutdown; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled) - { - // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold: - // Cancelled is ambiguous and shouldn't drive recreate storms. - this.Logger.SidecarDisconnected(); - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded) - { - // Only HelloAsync carries a deadline. 
Once the work-item stream is established, - // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines. - // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel. - this.Logger.HelloTimeout(this.internalOptions.HelloDeadline); - channelLikelyPoisoned = true; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable) - { - // Sidecar is down - keep retrying. - this.Logger.SidecarUnavailable(); - channelLikelyPoisoned = true; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated) - { - // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel - // recreate: a fresh channel won't fix bad credentials. Reset the consecutive-failure - // counters: a status reply is proof the transport itself is healthy, so prior - // transport failures should not combine with later ones to trip the recreate. - this.Logger.AuthenticationFailed(ex); - consecutiveChannelFailures = 0; - reconnectAttempt = 0; - } - catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound) - { - // We retry on a NotFound for several reasons: - // 1. It was the existing behavior through the UnexpectedError path. - // 2. A 404 can be returned for a missing task hub or authentication failure. Authentication takes - // time to propagate so we should retry instead of making the user restart the application. - // 3. In some cases, a task hub can be created separately from the scheduler. If a worker is deployed - // between the scheduler and task hub, it would need to be restarted to function. - this.Logger.TaskHubNotFound(); - } - catch (OperationCanceledException) when (cancellation.IsCancellationRequested) - { - // Shutting down, lets exit gracefully. - return ProcessorExitReason.Shutdown; - } - catch (Exception ex) - { - // Unknown failure - retry? - this.Logger.UnexpectedError(ex, string.Empty); - } - - if (channelLikelyPoisoned) - { - consecutiveChannelFailures++; - int threshold = this.internalOptions.ChannelRecreateFailureThreshold; - if (threshold > 0 && consecutiveChannelFailures >= threshold) - { - this.Logger.RecreatingChannel(consecutiveChannelFailures); - return ProcessorExitReason.ChannelRecreateRequested; - } - } - - try - { - TimeSpan delay = ReconnectBackoff.Compute( - reconnectAttempt, - this.internalOptions.ReconnectBackoffBase, - this.internalOptions.ReconnectBackoffCap, - backoffRandom); - this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds)); - reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt - await Task.Delay(delay, cancellation); - } - catch (OperationCanceledException) when (cancellation.IsCancellationRequested) - { - // Worker is shutting down - let the method exit gracefully - return ProcessorExitReason.Shutdown; - } - } - - return ProcessorExitReason.Shutdown; - } - - - static string GetActionsListForLogging(IReadOnlyList actions) - { - if (actions.Count == 0) - { - return string.Empty; - } - else if (actions.Count == 1) - { - return actions[0].OrchestratorActionTypeCase.ToString(); - } - else - { - // Returns something like "ScheduleTask x5, CreateTimer x1,..." - return string.Join(", ", actions - .GroupBy(a => a.OrchestratorActionTypeCase) - .Select(group => $"{group.Key} x{group.Count()}")); - } - } - - static P.TaskFailureDetails? EvaluateOrchestrationVersioning(DurableTaskWorkerOptions.VersioningOptions? 
versioning, string orchestrationVersion, out bool versionCheckFailed)
-        {
-            P.TaskFailureDetails? failureDetails = null;
-            versionCheckFailed = false;
-            if (versioning != null)
-            {
-                int versionComparison = TaskOrchestrationVersioningUtils.CompareVersions(orchestrationVersion, versioning.Version);
-
-                switch (versioning.MatchStrategy)
-                {
-                    case DurableTaskWorkerOptions.VersionMatchStrategy.None:
-                        // No versioning, break out.
-                        break;
-                    case DurableTaskWorkerOptions.VersionMatchStrategy.Strict:
-                        // A comparison of 0 indicates equality.
-                        if (versionComparison != 0)
-                        {
-                            failureDetails = new P.TaskFailureDetails
-                            {
-                                ErrorType = "VersionMismatch",
-                                ErrorMessage = $"The orchestration version '{orchestrationVersion}' does not match the worker version '{versioning.Version}'.",
-                                IsNonRetriable = true,
-                            };
-                        }
-
-                        break;
-                    case DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder:
-                        // A comparison > 0 indicates the orchestration version is greater than the worker version.
-                        if (versionComparison > 0)
-                        {
-                            failureDetails = new P.TaskFailureDetails
-                            {
-                                ErrorType = "VersionMismatch",
-                                ErrorMessage = $"The orchestration version '{orchestrationVersion}' is greater than the worker version '{versioning.Version}'.",
-                                IsNonRetriable = true,
-                            };
-                        }
-
-                        break;
-                    default:
-                        // If the match strategy is one we don't understand, it is safer to treat it as a versioning failure.
-                        failureDetails = new P.TaskFailureDetails
-                        {
-                            ErrorType = "VersionError",
-                            ErrorMessage = $"The version match strategy '{versioning.MatchStrategy}' is unknown.",
-                            IsNonRetriable = true,
-                        };
-                        break;
-                }
-
-                versionCheckFailed = failureDetails != null;
-            }
-
-            return failureDetails;
-        }
-
-        async ValueTask<OrchestrationRuntimeState> BuildRuntimeStateAsync(
-            P.OrchestratorRequest orchestratorRequest,
-            ProtoUtils.EntityConversionState? entityConversionState,
-            CancellationToken cancellation)
-        {
-            Func<P.HistoryEvent, HistoryEvent> converter = entityConversionState is null
-                ? ProtoUtils.ConvertHistoryEvent
-                : entityConversionState.ConvertFromProto;
-
-            IEnumerable<HistoryEvent> pastEvents = [];
-            if (orchestratorRequest.RequiresHistoryStreaming)
-            {
-                // Stream the remaining events from the remote service
-                P.StreamInstanceHistoryRequest streamRequest = new()
-                {
-                    InstanceId = orchestratorRequest.InstanceId,
-                    ExecutionId = orchestratorRequest.ExecutionId,
-                    ForWorkItemProcessing = true,
-                };
-
-                using AsyncServerStreamingCall<P.HistoryChunk> streamResponse =
-                    this.client.StreamInstanceHistory(streamRequest, cancellationToken: cancellation);
-
-                await foreach (P.HistoryChunk chunk in streamResponse.ResponseStream.ReadAllAsync(cancellation))
-                {
-                    pastEvents = pastEvents.Concat(chunk.Events.Select(converter));
-                }
-            }
-            else
-            {
-                // The history was already provided in the work item request
-                pastEvents = orchestratorRequest.PastEvents.Select(converter);
-            }
-
-            IEnumerable<HistoryEvent> newEvents = orchestratorRequest.NewEvents.Select(converter);
-
-            // Reconstruct the orchestration state in a way that correctly distinguishes new events from past events
-            var runtimeState = new OrchestrationRuntimeState(pastEvents.ToList());
-            foreach (HistoryEvent e in newEvents)
-            {
-                // AddEvent() puts events into the NewEvents list.
-                runtimeState.AddEvent(e);
-            }
-
-            if (runtimeState.ExecutionStartedEvent == null)
-            {
-                // TODO: What's the right way to handle this? Callback to the sidecar with a retriable error request?
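                // (Without an ExecutionStartedEvent there is no orchestration name, version, or
                // input to dispatch against, so failing fast is the only safe choice here.)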
-                throw new InvalidOperationException("The provided orchestration history was incomplete");
-            }
-
-            return runtimeState;
-        }
-
-        async Task<AsyncServerStreamingCall<P.WorkItem>> ConnectAsync(CancellationToken cancellation)
-        {
-            TimeSpan helloDeadline = this.internalOptions.HelloDeadline;
-            DateTime? deadline = null;
-
-            if (helloDeadline > TimeSpan.Zero)
-            {
-                // Clamp to a UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot
-                // throw ArgumentOutOfRangeException out of DateTime.Add and so the gRPC deadline remains
-                // unambiguous during internal normalization.
-                DateTime now = DateTime.UtcNow;
-                DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc);
-                TimeSpan maxOffset = maxDeadlineUtc - now;
-                deadline = helloDeadline >= maxOffset ? maxDeadlineUtc : now.Add(helloDeadline);
-            }
-
-            await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation);
-            this.Logger.EstablishedWorkItemConnection();
-
-            DurableTaskWorkerOptions workerOptions = this.worker.workerOptions;
-
-            // Get the stream for receiving work-items
-            return this.client!.GetWorkItems(
-                new P.GetWorkItemsRequest
-                {
-                    MaxConcurrentActivityWorkItems =
-                        workerOptions.Concurrency.MaximumConcurrentActivityWorkItems,
-                    MaxConcurrentOrchestrationWorkItems =
-                        workerOptions.Concurrency.MaximumConcurrentOrchestrationWorkItems,
-                    MaxConcurrentEntityWorkItems =
-                        workerOptions.Concurrency.MaximumConcurrentEntityWorkItems,
-                    Capabilities = { this.worker.grpcOptions.Capabilities },
-                    WorkItemFilters = this.worker.workItemFilters?.ToGrpcWorkItemFilters(),
-                },
-                cancellationToken: cancellation);
-        }
-
-        async Task ProcessWorkItemsAsync(
-            AsyncServerStreamingCall<P.WorkItem> stream,
-            CancellationToken cancellation,
-            Action? onFirstMessage = null,
-            Action? onChannelLikelyPoisoned = null)
-        {
-            // The timeout token (managed by WorkItemStreamConsumer) detects when no messages —
-            // including health pings sent periodically by the server — arrive within the configured
-            // window. If that fires we treat the stream as silently disconnected and reconnect.
-            TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout;
-
-            // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop.
-            // The underlying IAsyncStreamReader is single-use — once the server terminates the stream
-            // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext
-            // returns false forever and re-entering await foreach would tight-spin with no yield.
-            WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync(
-                ct => stream.ResponseStream.ReadAllAsync(ct),
-                silentDisconnectTimeout,
-                workItem => this.DispatchWorkItem(workItem, cancellation),
-                onFirstMessage,
-                cancellation);
-
-            switch (result.Outcome)
-            {
-                case WorkItemStreamOutcome.Shutdown:
-                    return;
-
-                case WorkItemStreamOutcome.SilentDisconnect:
-                    // Stream stopped producing messages (including health pings) for longer than the
-                    // configured window. Treat as a poisoned channel.
-                    this.Logger.ConnectionTimeout();
-                    onChannelLikelyPoisoned?.Invoke();
-                    return;
-
-                case WorkItemStreamOutcome.GracefulDrain:
-                    // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY +
-                    // OK trailers when a DTS instance is being replaced). Log it explicitly so
-                    // operators can see it. Only count it toward the channel-poisoned threshold when
-                    // the stream produced no messages: a stream that successfully delivered work and
-                    // was then closed by the server is healthy behavior (e.g.
routine rolling - // upgrade), and counting those would let a long-lived process accumulate spurious - // "poison" credits across many healthy drains. An empty drain, on the other hand, - // is a strong signal the channel is latched onto a dead/evacuated backend and - // needs to be recreated to pick up fresh DNS/routing. - this.Logger.StreamEndedByPeer(); - if (!result.FirstMessageObserved) - { - onChannelLikelyPoisoned?.Invoke(); - } - - return; - } - } - - void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) - { - if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunOrchestratorAsync( - workItem.OrchestratorRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunActivityAsync( - workItem.ActivityRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) - { - workItem.EntityRequestV2.ToEntityBatchRequest( - out EntityBatchRequest batchRequest, - out List operationInfos); - - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync( - batchRequest, - cancellation, - workItem.CompletionToken, - operationInfos), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) - { - // Health pings are heartbeat-only signals from the backend; the silent-disconnect - // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. - // Logging at Trace allows operators to confirm liveness without flooding info-level - // telemetry. - this.Logger.ReceivedHealthPing(); - } - else - { - this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); - } - } - - void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) - { - // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? - _ = Task.Run( - async () => - { - try - { - await handler(); - } - catch (OperationCanceledException) - { - // Shutting down - ignore - } - catch (Exception ex) - { - string instanceId = - workItem?.OrchestratorRequest?.InstanceId ?? - workItem?.ActivityRequest?.OrchestrationInstance?.InstanceId ?? - workItem?.EntityRequest?.InstanceId ?? - workItem?.EntityRequestV2?.InstanceId ?? - string.Empty; - this.Logger.UnexpectedError(ex, instanceId); - - if (workItem?.OrchestratorRequest != null) - { - try - { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellation); - this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? 
string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } - } - else if (workItem?.ActivityRequest != null) - { - try - { - this.Logger.AbandoningActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); - this.Logger.AbandonedActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } - } - else if (workItem?.EntityRequest != null) - { - try - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); - } - } - else if (workItem?.EntityRequestV2 != null) - { - try - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); - } - } - } - }); - } - - async Task OnRunOrchestratorAsync( - P.OrchestratorRequest request, - string completionToken, - CancellationToken cancellationToken) - { - var executionStartedEvent = - request - .NewEvents - .Concat(request.PastEvents) - .Where(e => e.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.ExecutionStarted) - .Select(e => e.ExecutionStarted) - .FirstOrDefault(); - - Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution( - executionStartedEvent, - request.OrchestrationTraceContext); - - if (executionStartedEvent is not null) - { - P.HistoryEvent? GetSuborchestrationInstanceCreatedEvent(int eventId) - { - var subOrchestrationEvent = - request - .PastEvents - .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCreated) - .FirstOrDefault(x => x.EventId == eventId); - - return subOrchestrationEvent; - } - - P.HistoryEvent? 
GetTaskScheduledEvent(int eventId) - { - var taskScheduledEvent = - request - .PastEvents - .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.TaskScheduled) - .LastOrDefault(x => x.EventId == eventId); - - return taskScheduledEvent; - } - - foreach (var newEvent in request.NewEvents) - { - switch (newEvent.EventTypeCase) - { - case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCompleted: - { - P.HistoryEvent? subOrchestrationInstanceCreatedEvent = - GetSuborchestrationInstanceCreatedEvent( - newEvent.SubOrchestrationInstanceCompleted.TaskScheduledId); - - TraceHelper.EmitTraceActivityForSubOrchestrationCompleted( - request.InstanceId, - subOrchestrationInstanceCreatedEvent, - subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceFailed: - { - P.HistoryEvent? subOrchestrationInstanceCreatedEvent = - GetSuborchestrationInstanceCreatedEvent( - newEvent.SubOrchestrationInstanceFailed.TaskScheduledId); - - TraceHelper.EmitTraceActivityForSubOrchestrationFailed( - request.InstanceId, - subOrchestrationInstanceCreatedEvent, - subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated, - newEvent.SubOrchestrationInstanceFailed); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TaskCompleted: - { - P.HistoryEvent? taskScheduledEvent = - GetTaskScheduledEvent(newEvent.TaskCompleted.TaskScheduledId); - - TraceHelper.EmitTraceActivityForTaskCompleted( - request.InstanceId, - taskScheduledEvent, - taskScheduledEvent?.TaskScheduled); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TaskFailed: - { - P.HistoryEvent? taskScheduledEvent = - GetTaskScheduledEvent(newEvent.TaskFailed.TaskScheduledId); - - TraceHelper.EmitTraceActivityForTaskFailed( - request.InstanceId, - taskScheduledEvent, - taskScheduledEvent?.TaskScheduled, - newEvent.TaskFailed); - break; - } - - case P.HistoryEvent.EventTypeOneofCase.TimerFired: - TraceHelper.EmitTraceActivityForTimer( - request.InstanceId, - executionStartedEvent.Name, - newEvent.Timestamp.ToDateTime(), - newEvent.TimerFired); - break; - } - } - } - - OrchestratorExecutionResult? result = null; - P.TaskFailureDetails? failureDetails = null; - TaskName name = new("(unknown)"); - - ProtoUtils.EntityConversionState? entityConversionState = - this.internalOptions.ConvertOrchestrationEntityEvents - ? new(this.internalOptions.InsertEntityUnlocksOnCompletion) - : null; - - DurableTaskWorkerOptions.VersioningOptions? versioning = this.worker.workerOptions.Versioning; - bool versionFailure = false; - try - { - OrchestrationRuntimeState runtimeState = await this.BuildRuntimeStateAsync( - request, - entityConversionState, - cancellationToken); - - bool filterPassed = true; - if (this.orchestrationFilter != null) - { - filterPassed = await this.orchestrationFilter.IsOrchestrationValidAsync( - new OrchestrationFilterParameters - { - Name = runtimeState.Name, - Tags = runtimeState.Tags != null ? 
new Dictionary(runtimeState.Tags) : null, - }, - cancellationToken); - } - - if (!filterPassed) - { - this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; - } - - // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. - failureDetails = EvaluateOrchestrationVersioning(versioning, runtimeState.Version, out versionFailure); - - // Only continue with the work if the versioning check passed. - if (failureDetails == null) - { - name = new TaskName(runtimeState.Name); - - this.Logger.ReceivedOrchestratorRequest( - name, - request.InstanceId, - runtimeState.PastEvents.Count, - runtimeState.NewEvents.Count); - - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - if (this.worker.Factory.TryCreateOrchestrator( - name, scope.ServiceProvider, out ITaskOrchestrator? orchestrator)) - { - // Both the factory invocation and the ExecuteAsync could involve user code and need to be handled - // as part of try/catch. - ParentOrchestrationInstance? parent = runtimeState.ParentInstance switch - { - ParentInstance p => new(new(p.Name), p.OrchestrationInstance.InstanceId), - _ => null, - }; - - TaskOrchestration shim = this.shimFactory.CreateOrchestration(name, orchestrator, parent); - TaskOrchestrationExecutor executor = new( - runtimeState, - shim, - BehaviorOnContinueAsNew.Carryover, - request.EntityParameters.ToCore(), - ErrorPropagationMode.UseFailureDetails, - this.exceptionPropertiesProvider); - result = executor.Execute(); - } - else - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "OrchestratorTaskNotFound", - ErrorMessage = $"No orchestrator task named '{name}' was found.", - IsNonRetriable = true, - }; - } - } - } - catch (Exception unexpected) - { - // This is not expected: Normally TaskOrchestrationExecutor handles exceptions in user code. 
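// (Exceptions that reach this catch come from the infrastructure around the executor,
// e.g. history streaming, the DI scope, or the shim factory, rather than from user code.)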
- this.Logger.OrchestratorFailed(name, request.InstanceId, unexpected.ToString()); - failureDetails = unexpected.ToTaskFailureDetails(this.exceptionPropertiesProvider); - } - - P.OrchestratorResponse response; - if (result != null) - { - response = ProtoUtils.ConstructOrchestratorResponse( - request.InstanceId, - request.ExecutionId, - result.CustomStatus, - result.Actions, - completionToken, - entityConversionState, - traceActivity); - } - else if (versioning != null && failureDetails != null && versionFailure) - { - this.Logger.OrchestrationVersionFailure(versioning.FailureStrategy.ToString(), failureDetails.ErrorMessage); - if (versioning.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Fail) - { - response = new P.OrchestratorResponse - { - InstanceId = request.InstanceId, - CompletionToken = completionToken, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = failureDetails, - }, - }, - }, - }; - } - else - { - this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; - } - } - else - { - // This is the case for failures that happened *outside* the orchestrator executor - response = new P.OrchestratorResponse - { - InstanceId = request.InstanceId, - CompletionToken = completionToken, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = failureDetails, - }, - }, - }, - }; - } - - var completeOrchestrationAction = response.Actions.FirstOrDefault( - a => a.CompleteOrchestration is not null); - - if (completeOrchestrationAction is not null) - { - if (completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus == P.OrchestrationStatus.Failed) - { - traceActivity?.SetStatus( - ActivityStatusCode.Error, - completeOrchestrationAction.CompleteOrchestration.Result); - } - - traceActivity?.SetTag( - Schema.Task.Status, - completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus.ToString()); - - traceActivity?.Dispose(); - } - - this.Logger.SendingOrchestratorResponse( - name, - response.InstanceId, - response.Actions.Count, - GetActionsListForLogging(response.Actions)); - - await this.CompleteOrchestratorTaskWithChunkingAsync( - response, - this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, - cancellationToken); - } - - async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) - { - using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); - - OrchestrationInstance instance = request.OrchestrationInstance.ToCore(); - string rawInput = request.Input; - int inputSize = rawInput != null ? Encoding.UTF8.GetByteCount(rawInput) : 0; - this.Logger.ReceivedActivityRequest(request.Name, request.TaskId, instance.InstanceId, inputSize); - - P.TaskFailureDetails? failureDetails = null; - TaskContext innerContext = new(instance); - innerContext.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; - - TaskName name = new(request.Name); - string? 
output = null; - - failureDetails = EvaluateOrchestrationVersioning(this.worker.workerOptions.Versioning, request.Version, out bool versioningFailed); - if (!versioningFailed) - { - try - { - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - if (this.worker.Factory.TryCreateActivity(name, scope.ServiceProvider, out ITaskActivity? activity)) - { - // Both the factory invocation and the RunAsync could involve user code and need to be handled as - // part of try/catch. - TaskActivity shim = this.shimFactory.CreateActivity(name, activity); - output = await shim.RunAsync(innerContext, request.Input); - } - else - { - failureDetails = new P.TaskFailureDetails - { - ErrorType = "ActivityTaskNotFound", - ErrorMessage = $"No activity task named '{name}' was found.", - IsNonRetriable = true, - }; - } - } - catch (Exception applicationException) - { - failureDetails = applicationException.ToTaskFailureDetails(this.exceptionPropertiesProvider); - } - } - else - { - if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) - { - this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); - } - - return; - } - - int outputSizeInBytes = 0; - if (failureDetails != null) - { - traceActivity?.SetStatus(ActivityStatusCode.Error, failureDetails.ErrorMessage); - - outputSizeInBytes = failureDetails.GetApproximateByteCount(); - } - else if (output != null) - { - outputSizeInBytes = Encoding.UTF8.GetByteCount(output); - } - - string successOrFailure = failureDetails != null ? "failure" : "success"; - this.Logger.SendingActivityResponse( - successOrFailure, name, request.TaskId, instance.InstanceId, outputSizeInBytes); - - P.ActivityResponse response = new() - { - InstanceId = instance.InstanceId, - TaskId = request.TaskId, - Result = output, - FailureDetails = failureDetails, - CompletionToken = completionToken, - }; - - // Stop the trace activity here to avoid including the completion time in the latency calculation - traceActivity?.Stop(); - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteActivityTaskAsync), - cancellation); - } - - async Task OnRunEntityBatchAsync( - EntityBatchRequest batchRequest, - CancellationToken cancellation, - string? completionToken = null, - List? operationInfos = null) - { - var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); - EntityId entityId = new(coreEntityId.Name, coreEntityId.Key); - - TaskName name = new(entityId.Name); - - EntityBatchResult? batchResult; - - try - { - await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); - IDurableTaskFactory2 factory = (IDurableTaskFactory2)this.worker.Factory; - - if (factory.TryCreateEntity(name, scope.ServiceProvider, out ITaskEntity? entity)) - { - // Both the factory invocation and the RunAsync could involve user code and need to be handled as - // part of try/catch. 
- TaskEntity shim = this.shimFactory.CreateEntity(name, entity, entityId); - batchResult = await shim.ExecuteOperationBatchAsync(batchRequest); - } - else - { - // we could not find the entity. This is considered an application error, - // so we return a non-retriable error-OperationResult for each operation in the batch. - batchResult = new EntityBatchResult() - { - Actions = [], // no actions - EntityState = batchRequest.EntityState, // state is unmodified - Results = Enumerable.Repeat( - new OperationResult() - { - FailureDetails = new FailureDetails( - errorType: "EntityTaskNotFound", - errorMessage: $"No entity task named '{name}' was found.", - stackTrace: null, - innerFailure: null, - isNonRetriable: true), - }, - batchRequest.Operations!.Count).ToList(), - FailureDetails = null, - }; - } - } - catch (Exception frameworkException) - { - // return a result with failure details. - // this will cause the batch to be abandoned and retried - // (possibly after a delay and on a different worker). - batchResult = new EntityBatchResult() - { - FailureDetails = new FailureDetails(frameworkException), - }; - } - - P.EntityBatchResult response = batchResult.ToEntityBatchResult( - completionToken, - operationInfos?.Take(batchResult.Results?.Count ?? 0)); - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteEntityTaskAsync), - cancellation); - } - - /// - /// Completes an orchestration task with automatic chunking if the response exceeds the maximum size. - /// - /// The orchestrator response to send. - /// The maximum size in bytes for each chunk. - /// The cancellation token. - async Task CompleteOrchestratorTaskWithChunkingAsync( - P.OrchestratorResponse response, - int maxChunkBytes, - CancellationToken cancellationToken) - { - // Validate that no single action exceeds the maximum chunk size - static P.TaskFailureDetails? ValidateActionsSize(IEnumerable actions, int maxChunkBytes) - { - foreach (P.OrchestratorAction action in actions) - { - int actionSize = action.CalculateSize(); - if (actionSize > maxChunkBytes) - { - // TODO: large payload doc is not available yet on aka.ms, add doc link to below error message - string errorMessage = $"A single orchestrator action of type {action.OrchestratorActionTypeCase} with id {action.Id} " + - $"exceeds the {maxChunkBytes / 1024.0 / 1024.0:F2}MB limit: {actionSize / 1024.0 / 1024.0:F2}MB. " + - "Enable large-payload externalization to Azure Blob Storage to support oversized actions."; - return new P.TaskFailureDetails - { - ErrorType = typeof(InvalidOperationException).FullName, - ErrorMessage = errorMessage, - IsNonRetriable = true, - }; - } - } - - return null; - } - - P.TaskFailureDetails? validationFailure = this.worker.grpcOptions.Capabilities.Contains(P.WorkerCapability.LargePayloads) - ? 
null - : ValidateActionsSize(response.Actions, maxChunkBytes); - if (validationFailure != null) - { - // Complete the orchestration with a failed status and failure details - P.OrchestratorResponse failureResponse = new() - { - InstanceId = response.InstanceId, - CompletionToken = response.CompletionToken, - OrchestrationTraceContext = response.OrchestrationTraceContext, - Actions = - { - new P.OrchestratorAction - { - CompleteOrchestration = new P.CompleteOrchestrationAction - { - OrchestrationStatus = P.OrchestrationStatus.Failed, - FailureDetails = validationFailure, - }, - }, - }, - }; - - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - return; - } - - // Helper to add an action to the current chunk if it fits - static bool TryAddAction( - Google.Protobuf.Collections.RepeatedField dest, - P.OrchestratorAction action, - ref int currentSize, - int maxChunkBytes) - { - int actionSize = action.CalculateSize(); - if (currentSize + actionSize > maxChunkBytes && currentSize > 0) - { - return false; - } - - dest.Add(action); - currentSize += actionSize; - return true; - } - - // Check if the entire response fits in one chunk - int totalSize = response.CalculateSize(); - if (totalSize <= maxChunkBytes) - { - // Response fits in one chunk, send it directly (isPartial defaults to false) - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - return; - } - - // Response is too large, split into multiple chunks - int actionsCompletedSoFar = 0, chunkIndex = 0; - List allActions = response.Actions.ToList(); - bool isPartial = true; - bool isChunkedMode = false; - - while (isPartial) - { - P.OrchestratorResponse chunkedResponse = new() - { - InstanceId = response.InstanceId, - CustomStatus = response.CustomStatus, - CompletionToken = response.CompletionToken, - RequiresHistory = response.RequiresHistory, - NumEventsProcessed = 0, - }; - - int chunkPayloadSize = 0; - - // Fill the chunk with actions until we reach the size limit - while (actionsCompletedSoFar < allActions.Count && - TryAddAction(chunkedResponse.Actions, allActions[actionsCompletedSoFar], ref chunkPayloadSize, maxChunkBytes)) - { - actionsCompletedSoFar++; - } - - // Determine if this is a partial chunk (more actions remaining) - isPartial = actionsCompletedSoFar < allActions.Count; - chunkedResponse.IsPartial = isPartial; - - // Only activate chunked mode when we actually need multiple chunks. - // A single oversized action that fits in one chunk (via TryAddAction allowing - // the first item in an empty chunk) should be sent as non-chunked to avoid - // backend issues with ChunkIndex=0 + IsPartial=false. - if (isPartial) - { - isChunkedMode = true; - } - - if (isChunkedMode) - { - chunkedResponse.ChunkIndex = chunkIndex; - } - - if (chunkIndex == 0) - { - // The first chunk preserves the original response's NumEventsProcessed value (null) - // When this is set to null, backend by default handles all the messages in the workitem. - // For subsequent chunks, we set it to 0 since all messages are already handled in first chunk. 
- chunkedResponse.NumEventsProcessed = null; - chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext; - } - - chunkIndex++; - - // Send the chunk - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - } - } - - async Task ExecuteWithRetryAsync( - Func action, - string operationName, - CancellationToken cancellationToken) - { - const int maxAttempts = 10; - TimeSpan delay = TimeSpan.FromMilliseconds(200); - - for (int attempt = 1; ; attempt++) - { - try - { - await action(); - return; - } - catch (RpcException ex) when ( - (ex.StatusCode == StatusCode.Unavailable || - ex.StatusCode == StatusCode.Unknown || - ex.StatusCode == StatusCode.DeadlineExceeded || - ex.StatusCode == StatusCode.Internal) && - attempt < maxAttempts) - { - // Back off with jitter for transient transport errors -#if NET6_0_OR_GREATER - int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2)); -#else - int jitterMs = new Random().Next(0, (int)(delay.TotalMilliseconds * 0.2)); -#endif - TimeSpan backoff = delay + TimeSpan.FromMilliseconds(jitterMs); - - this.Logger.TransientGrpcRetry( - operationName, - attempt, - maxAttempts, - backoff.TotalMilliseconds, - (int)ex.StatusCode, - ex); - - try - { - await Task.Delay(backoff, cancellationToken); - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) - { - // If shutting down during the retry delay, propagate the cancellation exception - throw; - } - - // Exponential increase, capping at 15 seconds - delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000)); - continue; - } - } - } - } -} +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Diagnostics; +using System.Linq; +using System.Text; +using DurableTask.Core; +using DurableTask.Core.Entities; +using DurableTask.Core.Entities.OperationFormat; +using DurableTask.Core.History; +using Google.Protobuf; +using Microsoft.DurableTask.Abstractions; +using Microsoft.DurableTask.Entities; +using Microsoft.DurableTask.Tracing; +using Microsoft.DurableTask.Worker.Grpc.Internal; +using Microsoft.DurableTask.Worker.Shims; +using Microsoft.Extensions.DependencyInjection; +using Microsoft.Extensions.Logging; +using static Microsoft.DurableTask.Protobuf.TaskHubSidecarService; +using ActivityStatusCode = System.Diagnostics.ActivityStatusCode; +using DTCore = DurableTask.Core; +using P = Microsoft.DurableTask.Protobuf; + +namespace Microsoft.DurableTask.Worker.Grpc; + +/// +/// The gRPC Durable Task worker. +/// +sealed partial class GrpcDurableTaskWorker +{ + class Processor + { + static readonly Google.Protobuf.WellKnownTypes.Empty EmptyMessage = new(); + + readonly GrpcDurableTaskWorker worker; + readonly TaskHubSidecarServiceClient client; + readonly DurableTaskShimFactory shimFactory; + readonly GrpcDurableTaskWorkerOptions.InternalOptions internalOptions; + readonly DTCore.IExceptionPropertiesProvider? exceptionPropertiesProvider; + [Obsolete("Experimental")] + readonly IOrchestrationFilter? orchestrationFilter; + + public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient client, IOrchestrationFilter? orchestrationFilter = null, IExceptionPropertiesProvider? 
exceptionPropertiesProvider = null) + { + this.worker = worker; + this.client = client; + this.shimFactory = new DurableTaskShimFactory(this.worker.grpcOptions, this.worker.loggerFactory); + this.internalOptions = this.worker.grpcOptions.Internal; + this.orchestrationFilter = orchestrationFilter; + this.exceptionPropertiesProvider = exceptionPropertiesProvider is not null + ? new ExceptionPropertiesProviderAdapter(exceptionPropertiesProvider) + : null; + } + + ILogger Logger => this.worker.logger; + + public async Task ExecuteAsync(CancellationToken cancellation) + { + // Tracks consecutive failures against the same channel. Reset only after the stream + // has actually delivered a message (HelloAsync alone is not proof the channel is healthy). + int consecutiveChannelFailures = 0; + + // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. + int reconnectAttempt = 0; + Random backoffRandom = ReconnectBackoff.CreateRandom(); + + while (!cancellation.IsCancellationRequested) + { + bool channelLikelyPoisoned = false; + try + { + using AsyncServerStreamingCall stream = await this.ConnectAsync(cancellation); + await this.ProcessWorkItemsAsync( + stream, + cancellation, + onFirstMessage: () => + { + consecutiveChannelFailures = 0; + reconnectAttempt = 0; + }, + onChannelLikelyPoisoned: () => channelLikelyPoisoned = true); + } + catch (RpcException) when (cancellation.IsCancellationRequested) + { + // Worker is shutting down - let the method exit gracefully + return ProcessorExitReason.Shutdown; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled) + { + // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold: + // Cancelled is ambiguous and shouldn't drive recreate storms. + this.Logger.SidecarDisconnected(); + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded) + { + // Only HelloAsync carries a deadline. Once the work-item stream is established, + // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines. + // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel. + this.Logger.HelloTimeout(this.internalOptions.HelloDeadline); + channelLikelyPoisoned = true; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable) + { + // Sidecar is down - keep retrying. + this.Logger.SidecarUnavailable(); + channelLikelyPoisoned = true; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated) + { + // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel + // recreate: a fresh channel won't fix bad credentials. Reset the consecutive-failure + // counters: a status reply is proof the transport itself is healthy, so prior + // transport failures should not combine with later ones to trip the recreate. + this.Logger.AuthenticationFailed(ex); + consecutiveChannelFailures = 0; + reconnectAttempt = 0; + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound) + { + // We retry on a NotFound for several reasons: + // 1. It was the existing behavior through the UnexpectedError path. + // 2. A 404 can be returned for a missing task hub or authentication failure. Authentication takes + // time to propagate so we should retry instead of making the user restart the application. + // 3. In some cases, a task hub can be created separately from the scheduler. 
If a worker is deployed
+                    //    between the scheduler and task hub, it would need to be restarted to function.
+                    this.Logger.TaskHubNotFound();
+                }
+                catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
+                {
+                    // Shutting down, let's exit gracefully.
+                    return ProcessorExitReason.Shutdown;
+                }
+                catch (Exception ex)
+                {
+                    // Unknown failure - log it and retry.
+                    this.Logger.UnexpectedError(ex, string.Empty);
+                }
+
+                if (channelLikelyPoisoned)
+                {
+                    consecutiveChannelFailures++;
+                    int threshold = this.internalOptions.ChannelRecreateFailureThreshold;
+                    if (threshold > 0 && consecutiveChannelFailures >= threshold)
+                    {
+                        this.Logger.RecreatingChannel(consecutiveChannelFailures);
+                        return ProcessorExitReason.ChannelRecreateRequested;
+                    }
+                }
+
+                try
+                {
+                    TimeSpan delay = ReconnectBackoff.Compute(
+                        reconnectAttempt,
+                        this.internalOptions.ReconnectBackoffBase,
+                        this.internalOptions.ReconnectBackoffCap,
+                        backoffRandom);
+                    this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds));
+                    reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt
+                    await Task.Delay(delay, cancellation);
+                }
+                catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
+                {
+                    // Worker is shutting down - let the method exit gracefully
+                    return ProcessorExitReason.Shutdown;
+                }
+            }
+
+            return ProcessorExitReason.Shutdown;
+        }
+
+
+        static string GetActionsListForLogging(IReadOnlyList<P.OrchestratorAction> actions)
+        {
+            if (actions.Count == 0)
+            {
+                return string.Empty;
+            }
+            else if (actions.Count == 1)
+            {
+                return actions[0].OrchestratorActionTypeCase.ToString();
+            }
+            else
+            {
+                // Returns something like "ScheduleTask x5, CreateTimer x1,..."
+                return string.Join(", ", actions
+                    .GroupBy(a => a.OrchestratorActionTypeCase)
+                    .Select(group => $"{group.Key} x{group.Count()}"));
+            }
+        }
+
+        static P.TaskFailureDetails? EvaluateOrchestrationVersioning(DurableTaskWorkerOptions.VersioningOptions? versioning, string orchestrationVersion, out bool versionCheckFailed)
+        {
+            P.TaskFailureDetails? failureDetails = null;
+            versionCheckFailed = false;
+            if (versioning != null)
+            {
+                int versionComparison = TaskOrchestrationVersioningUtils.CompareVersions(orchestrationVersion, versioning.Version);
+
+                switch (versioning.MatchStrategy)
+                {
+                    case DurableTaskWorkerOptions.VersionMatchStrategy.None:
+                        // No versioning configured; break out.
+                        break;
+                    case DurableTaskWorkerOptions.VersionMatchStrategy.Strict:
+                        // Comparison of 0 indicates equality.
+                        if (versionComparison != 0)
+                        {
+                            failureDetails = new P.TaskFailureDetails
+                            {
+                                ErrorType = "VersionMismatch",
+                                ErrorMessage = $"The orchestration version '{orchestrationVersion}' does not match the worker version '{versioning.Version}'.",
+                                IsNonRetriable = true,
+                            };
+                        }
+
+                        break;
+                    case DurableTaskWorkerOptions.VersionMatchStrategy.CurrentOrOlder:
+                        // Comparison > 0 indicates the orchestration version is greater than the worker version.
+                        if (versionComparison > 0)
+                        {
+                            failureDetails = new P.TaskFailureDetails
+                            {
+                                ErrorType = "VersionMismatch",
+                                ErrorMessage = $"The orchestration version '{orchestrationVersion}' is greater than the worker version '{versioning.Version}'.",
+                                IsNonRetriable = true,
+                            };
+                        }
+
+                        break;
+                    default:
+                        // If there is a version match strategy we don't understand, it is better to treat it as a versioning failure.
+                        failureDetails = new P.TaskFailureDetails
+                        {
+                            ErrorType = "VersionError",
+                            ErrorMessage = $"The version match strategy '{versioning.MatchStrategy}' is unknown.",
+                            IsNonRetriable = true,
+                        };
+                        break;
+                }
+
+                versionCheckFailed = failureDetails != null;
+            }
+
+            return failureDetails;
+        }
+
+        async ValueTask<OrchestrationRuntimeState> BuildRuntimeStateAsync(
+            P.OrchestratorRequest orchestratorRequest,
+            ProtoUtils.EntityConversionState? entityConversionState,
+            CancellationToken cancellation)
+        {
+            Func<P.HistoryEvent, HistoryEvent> converter = entityConversionState is null
+                ? ProtoUtils.ConvertHistoryEvent
+                : entityConversionState.ConvertFromProto;
+
+            IEnumerable<HistoryEvent> pastEvents = [];
+            if (orchestratorRequest.RequiresHistoryStreaming)
+            {
+                // Stream the remaining events from the remote service
+                P.StreamInstanceHistoryRequest streamRequest = new()
+                {
+                    InstanceId = orchestratorRequest.InstanceId,
+                    ExecutionId = orchestratorRequest.ExecutionId,
+                    ForWorkItemProcessing = true,
+                };
+
+                using AsyncServerStreamingCall<P.HistoryChunk> streamResponse =
+                    this.client.StreamInstanceHistory(streamRequest, cancellationToken: cancellation);
+
+                await foreach (P.HistoryChunk chunk in streamResponse.ResponseStream.ReadAllAsync(cancellation))
+                {
+                    pastEvents = pastEvents.Concat(chunk.Events.Select(converter));
+                }
+            }
+            else
+            {
+                // The history was already provided in the work item request
+                pastEvents = orchestratorRequest.PastEvents.Select(converter);
+            }
+
+            IEnumerable<HistoryEvent> newEvents = orchestratorRequest.NewEvents.Select(converter);
+
+            // Reconstruct the orchestration state in a way that correctly distinguishes new events from past events
+            var runtimeState = new OrchestrationRuntimeState(pastEvents.ToList());
+            foreach (HistoryEvent e in newEvents)
+            {
+                // AddEvent() puts events into the NewEvents list.
+                runtimeState.AddEvent(e);
+            }
+
+            if (runtimeState.ExecutionStartedEvent == null)
+            {
+                // TODO: What's the right way to handle this? Callback to the sidecar with a retriable error request?
+                throw new InvalidOperationException("The provided orchestration history was incomplete");
+            }
+
+            return runtimeState;
+        }
+
+        async Task<AsyncServerStreamingCall<P.WorkItem>> ConnectAsync(CancellationToken cancellation)
+        {
+            TimeSpan helloDeadline = this.internalOptions.HelloDeadline;
+            DateTime? deadline = null;
+
+            if (helloDeadline > TimeSpan.Zero)
+            {
+                // Clamp to UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot
+                // throw ArgumentOutOfRangeException out of DateTime.Add, and so the gRPC deadline remains
+                // unambiguous during internal normalization.
+                DateTime now = DateTime.UtcNow;
+                DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc);
+                TimeSpan maxOffset = maxDeadlineUtc - now;
+                deadline = helloDeadline >= maxOffset ?
maxDeadlineUtc : now.Add(helloDeadline); + } + + await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); + this.Logger.EstablishedWorkItemConnection(); + + DurableTaskWorkerOptions workerOptions = this.worker.workerOptions; + + // Get the stream for receiving work-items + return this.client!.GetWorkItems( + new P.GetWorkItemsRequest + { + MaxConcurrentActivityWorkItems = + workerOptions.Concurrency.MaximumConcurrentActivityWorkItems, + MaxConcurrentOrchestrationWorkItems = + workerOptions.Concurrency.MaximumConcurrentOrchestrationWorkItems, + MaxConcurrentEntityWorkItems = + workerOptions.Concurrency.MaximumConcurrentEntityWorkItems, + Capabilities = { this.worker.grpcOptions.Capabilities }, + WorkItemFilters = this.worker.workItemFilters?.ToGrpcWorkItemFilters(), + }, + cancellationToken: cancellation); + } + + async Task ProcessWorkItemsAsync( + AsyncServerStreamingCall stream, + CancellationToken cancellation, + Action? onFirstMessage = null, + Action? onChannelLikelyPoisoned = null) + { + // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — + // including health pings sent periodically by the server — arrive within the configured + // window. If that fires we treat the stream as silently disconnected and reconnect. + TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; + + // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop. + // The underlying IAsyncStreamReader is single-use — once the server terminates the stream + // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext + // returns false forever and re-entering await foreach would tight-spin with no yield. + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + ct => stream.ResponseStream.ReadAllAsync(ct), + silentDisconnectTimeout, + workItem => this.DispatchWorkItem(workItem, cancellation), + onFirstMessage, + cancellation); + + switch (result.Outcome) + { + case WorkItemStreamOutcome.Shutdown: + return; + + case WorkItemStreamOutcome.SilentDisconnect: + // Stream stopped producing messages (including health pings) for longer than the + // configured window. Treat as a poisoned channel. + this.Logger.ConnectionTimeout(); + onChannelLikelyPoisoned?.Invoke(); + return; + + case WorkItemStreamOutcome.GracefulDrain: + // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY + + // OK trailers when a DTS instance is being replaced). Log it explicitly so + // operators can see it. Only count it toward the channel-poisoned threshold when + // the stream produced no messages: a stream that successfully delivered work and + // was then closed by the server is healthy behavior (e.g. routine rolling + // upgrade), and counting those would let a long-lived process accumulate spurious + // "poison" credits across many healthy drains. An empty drain, on the other hand, + // is a strong signal the channel is latched onto a dead/evacuated backend and + // needs to be recreated to pick up fresh DNS/routing. 
+ this.Logger.StreamEndedByPeer(); + if (!result.FirstMessageObserved) + { + onChannelLikelyPoisoned?.Invoke(); + } + + return; + } + } + + void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) + { + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunOrchestratorAsync( + workItem.OrchestratorRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunActivityAsync( + workItem.ActivityRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) + { + workItem.EntityRequestV2.ToEntityBatchRequest( + out EntityBatchRequest batchRequest, + out List operationInfos); + + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync( + batchRequest, + cancellation, + workItem.CompletionToken, + operationInfos), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) + { + // Health pings are heartbeat-only signals from the backend; the silent-disconnect + // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. + // Logging at Trace allows operators to confirm liveness without flooding info-level + // telemetry. + this.Logger.ReceivedHealthPing(); + } + else + { + this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); + } + } + + void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) + { + // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? + _ = Task.Run( + async () => + { + try + { + await handler(); + } + catch (OperationCanceledException) + { + // Shutting down - ignore + } + catch (Exception ex) + { + string instanceId = + workItem?.OrchestratorRequest?.InstanceId ?? + workItem?.ActivityRequest?.OrchestrationInstance?.InstanceId ?? + workItem?.EntityRequest?.InstanceId ?? + workItem?.EntityRequestV2?.InstanceId ?? + string.Empty; + this.Logger.UnexpectedError(ex, instanceId); + + if (workItem?.OrchestratorRequest != null) + { + try + { + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellation); + this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instanceId); + } + } + else if (workItem?.ActivityRequest != null) + { + try + { + this.Logger.AbandoningActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? 
string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + this.Logger.AbandonedActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instanceId); + } + } + else if (workItem?.EntityRequest != null) + { + try + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); + } + } + else if (workItem?.EntityRequestV2 != null) + { + try + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); + } + } + } + }); + } + + async Task OnRunOrchestratorAsync( + P.OrchestratorRequest request, + string completionToken, + CancellationToken cancellationToken) + { + var executionStartedEvent = + request + .NewEvents + .Concat(request.PastEvents) + .Where(e => e.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.ExecutionStarted) + .Select(e => e.ExecutionStarted) + .FirstOrDefault(); + + Activity? traceActivity = TraceHelper.StartTraceActivityForOrchestrationExecution( + executionStartedEvent, + request.OrchestrationTraceContext); + + if (executionStartedEvent is not null) + { + P.HistoryEvent? GetSuborchestrationInstanceCreatedEvent(int eventId) + { + var subOrchestrationEvent = + request + .PastEvents + .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCreated) + .FirstOrDefault(x => x.EventId == eventId); + + return subOrchestrationEvent; + } + + P.HistoryEvent? GetTaskScheduledEvent(int eventId) + { + var taskScheduledEvent = + request + .PastEvents + .Where(x => x.EventTypeCase == P.HistoryEvent.EventTypeOneofCase.TaskScheduled) + .LastOrDefault(x => x.EventId == eventId); + + return taskScheduledEvent; + } + + foreach (var newEvent in request.NewEvents) + { + switch (newEvent.EventTypeCase) + { + case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceCompleted: + { + P.HistoryEvent? 
subOrchestrationInstanceCreatedEvent = + GetSuborchestrationInstanceCreatedEvent( + newEvent.SubOrchestrationInstanceCompleted.TaskScheduledId); + + TraceHelper.EmitTraceActivityForSubOrchestrationCompleted( + request.InstanceId, + subOrchestrationInstanceCreatedEvent, + subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.SubOrchestrationInstanceFailed: + { + P.HistoryEvent? subOrchestrationInstanceCreatedEvent = + GetSuborchestrationInstanceCreatedEvent( + newEvent.SubOrchestrationInstanceFailed.TaskScheduledId); + + TraceHelper.EmitTraceActivityForSubOrchestrationFailed( + request.InstanceId, + subOrchestrationInstanceCreatedEvent, + subOrchestrationInstanceCreatedEvent?.SubOrchestrationInstanceCreated, + newEvent.SubOrchestrationInstanceFailed); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TaskCompleted: + { + P.HistoryEvent? taskScheduledEvent = + GetTaskScheduledEvent(newEvent.TaskCompleted.TaskScheduledId); + + TraceHelper.EmitTraceActivityForTaskCompleted( + request.InstanceId, + taskScheduledEvent, + taskScheduledEvent?.TaskScheduled); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TaskFailed: + { + P.HistoryEvent? taskScheduledEvent = + GetTaskScheduledEvent(newEvent.TaskFailed.TaskScheduledId); + + TraceHelper.EmitTraceActivityForTaskFailed( + request.InstanceId, + taskScheduledEvent, + taskScheduledEvent?.TaskScheduled, + newEvent.TaskFailed); + break; + } + + case P.HistoryEvent.EventTypeOneofCase.TimerFired: + TraceHelper.EmitTraceActivityForTimer( + request.InstanceId, + executionStartedEvent.Name, + newEvent.Timestamp.ToDateTime(), + newEvent.TimerFired); + break; + } + } + } + + OrchestratorExecutionResult? result = null; + P.TaskFailureDetails? failureDetails = null; + TaskName name = new("(unknown)"); + + ProtoUtils.EntityConversionState? entityConversionState = + this.internalOptions.ConvertOrchestrationEntityEvents + ? new(this.internalOptions.InsertEntityUnlocksOnCompletion) + : null; + + DurableTaskWorkerOptions.VersioningOptions? versioning = this.worker.workerOptions.Versioning; + bool versionFailure = false; + try + { + OrchestrationRuntimeState runtimeState = await this.BuildRuntimeStateAsync( + request, + entityConversionState, + cancellationToken); + + bool filterPassed = true; + if (this.orchestrationFilter != null) + { + filterPassed = await this.orchestrationFilter.IsOrchestrationValidAsync( + new OrchestrationFilterParameters + { + Name = runtimeState.Name, + Tags = runtimeState.Tags != null ? new Dictionary(runtimeState.Tags) : null, + }, + cancellationToken); + } + + if (!filterPassed) + { + this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + + return; + } + + // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. + failureDetails = EvaluateOrchestrationVersioning(versioning, runtimeState.Version, out versionFailure); + + // Only continue with the work if the versioning check passed. 
+ if (failureDetails == null) + { + name = new TaskName(runtimeState.Name); + + this.Logger.ReceivedOrchestratorRequest( + name, + request.InstanceId, + runtimeState.PastEvents.Count, + runtimeState.NewEvents.Count); + + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + if (this.worker.Factory.TryCreateOrchestrator( + name, scope.ServiceProvider, out ITaskOrchestrator? orchestrator)) + { + // Both the factory invocation and the ExecuteAsync could involve user code and need to be handled + // as part of try/catch. + ParentOrchestrationInstance? parent = runtimeState.ParentInstance switch + { + ParentInstance p => new(new(p.Name), p.OrchestrationInstance.InstanceId), + _ => null, + }; + + TaskOrchestration shim = this.shimFactory.CreateOrchestration(name, orchestrator, parent); + TaskOrchestrationExecutor executor = new( + runtimeState, + shim, + BehaviorOnContinueAsNew.Carryover, + request.EntityParameters.ToCore(), + ErrorPropagationMode.UseFailureDetails, + this.exceptionPropertiesProvider); + result = executor.Execute(); + } + else + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "OrchestratorTaskNotFound", + ErrorMessage = $"No orchestrator task named '{name}' was found.", + IsNonRetriable = true, + }; + } + } + } + catch (Exception unexpected) + { + // This is not expected: Normally TaskOrchestrationExecutor handles exceptions in user code. + this.Logger.OrchestratorFailed(name, request.InstanceId, unexpected.ToString()); + failureDetails = unexpected.ToTaskFailureDetails(this.exceptionPropertiesProvider); + } + + P.OrchestratorResponse response; + if (result != null) + { + response = ProtoUtils.ConstructOrchestratorResponse( + request.InstanceId, + request.ExecutionId, + result.CustomStatus, + result.Actions, + completionToken, + entityConversionState, + traceActivity); + } + else if (versioning != null && failureDetails != null && versionFailure) + { + this.Logger.OrchestrationVersionFailure(versioning.FailureStrategy.ToString(), failureDetails.ErrorMessage); + if (versioning.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Fail) + { + response = new P.OrchestratorResponse + { + InstanceId = request.InstanceId, + CompletionToken = completionToken, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = failureDetails, + }, + }, + }, + }; + } + else + { + this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + + return; + } + } + else + { + // This is the case for failures that happened *outside* the orchestrator executor + response = new P.OrchestratorResponse + { + InstanceId = request.InstanceId, + CompletionToken = completionToken, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = failureDetails, + }, + }, + }, + }; + } + + var completeOrchestrationAction = response.Actions.FirstOrDefault( + a => a.CompleteOrchestration is not null); + + if (completeOrchestrationAction is not null) + { + if 
(completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus == P.OrchestrationStatus.Failed) + { + traceActivity?.SetStatus( + ActivityStatusCode.Error, + completeOrchestrationAction.CompleteOrchestration.Result); + } + + traceActivity?.SetTag( + Schema.Task.Status, + completeOrchestrationAction.CompleteOrchestration.OrchestrationStatus.ToString()); + + traceActivity?.Dispose(); + } + + this.Logger.SendingOrchestratorResponse( + name, + response.InstanceId, + response.Actions.Count, + GetActionsListForLogging(response.Actions)); + + await this.CompleteOrchestratorTaskWithChunkingAsync( + response, + this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, + cancellationToken); + } + + async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) + { + using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); + + OrchestrationInstance instance = request.OrchestrationInstance.ToCore(); + string rawInput = request.Input; + int inputSize = rawInput != null ? Encoding.UTF8.GetByteCount(rawInput) : 0; + this.Logger.ReceivedActivityRequest(request.Name, request.TaskId, instance.InstanceId, inputSize); + + P.TaskFailureDetails? failureDetails = null; + TaskContext innerContext = new(instance); + innerContext.ExceptionPropertiesProvider = this.exceptionPropertiesProvider; + + TaskName name = new(request.Name); + string? output = null; + + failureDetails = EvaluateOrchestrationVersioning(this.worker.workerOptions.Versioning, request.Version, out bool versioningFailed); + if (!versioningFailed) + { + try + { + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + if (this.worker.Factory.TryCreateActivity(name, scope.ServiceProvider, out ITaskActivity? activity)) + { + // Both the factory invocation and the RunAsync could involve user code and need to be handled as + // part of try/catch. + TaskActivity shim = this.shimFactory.CreateActivity(name, activity); + output = await shim.RunAsync(innerContext, request.Input); + } + else + { + failureDetails = new P.TaskFailureDetails + { + ErrorType = "ActivityTaskNotFound", + ErrorMessage = $"No activity task named '{name}' was found.", + IsNonRetriable = true, + }; + } + } + catch (Exception applicationException) + { + failureDetails = applicationException.ToTaskFailureDetails(this.exceptionPropertiesProvider); + } + } + else + { + if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) + { + this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + } + + return; + } + + int outputSizeInBytes = 0; + if (failureDetails != null) + { + traceActivity?.SetStatus(ActivityStatusCode.Error, failureDetails.ErrorMessage); + + outputSizeInBytes = failureDetails.GetApproximateByteCount(); + } + else if (output != null) + { + outputSizeInBytes = Encoding.UTF8.GetByteCount(output); + } + + string successOrFailure = failureDetails != null ? 
"failure" : "success"; + this.Logger.SendingActivityResponse( + successOrFailure, name, request.TaskId, instance.InstanceId, outputSizeInBytes); + + P.ActivityResponse response = new() + { + InstanceId = instance.InstanceId, + TaskId = request.TaskId, + Result = output, + FailureDetails = failureDetails, + CompletionToken = completionToken, + }; + + // Stop the trace activity here to avoid including the completion time in the latency calculation + traceActivity?.Stop(); + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteActivityTaskAsync), + cancellation); + } + + async Task OnRunEntityBatchAsync( + EntityBatchRequest batchRequest, + CancellationToken cancellation, + string? completionToken = null, + List? operationInfos = null) + { + var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); + EntityId entityId = new(coreEntityId.Name, coreEntityId.Key); + + TaskName name = new(entityId.Name); + + EntityBatchResult? batchResult; + + try + { + await using AsyncServiceScope scope = this.worker.services.CreateAsyncScope(); + IDurableTaskFactory2 factory = (IDurableTaskFactory2)this.worker.Factory; + + if (factory.TryCreateEntity(name, scope.ServiceProvider, out ITaskEntity? entity)) + { + // Both the factory invocation and the RunAsync could involve user code and need to be handled as + // part of try/catch. + TaskEntity shim = this.shimFactory.CreateEntity(name, entity, entityId); + batchResult = await shim.ExecuteOperationBatchAsync(batchRequest); + } + else + { + // we could not find the entity. This is considered an application error, + // so we return a non-retriable error-OperationResult for each operation in the batch. + batchResult = new EntityBatchResult() + { + Actions = [], // no actions + EntityState = batchRequest.EntityState, // state is unmodified + Results = Enumerable.Repeat( + new OperationResult() + { + FailureDetails = new FailureDetails( + errorType: "EntityTaskNotFound", + errorMessage: $"No entity task named '{name}' was found.", + stackTrace: null, + innerFailure: null, + isNonRetriable: true), + }, + batchRequest.Operations!.Count).ToList(), + FailureDetails = null, + }; + } + } + catch (Exception frameworkException) + { + // return a result with failure details. + // this will cause the batch to be abandoned and retried + // (possibly after a delay and on a different worker). + batchResult = new EntityBatchResult() + { + FailureDetails = new FailureDetails(frameworkException), + }; + } + + P.EntityBatchResult response = batchResult.ToEntityBatchResult( + completionToken, + operationInfos?.Take(batchResult.Results?.Count ?? 0)); + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteEntityTaskAsync), + cancellation); + } + + /// + /// Completes an orchestration task with automatic chunking if the response exceeds the maximum size. + /// + /// The orchestrator response to send. + /// The maximum size in bytes for each chunk. + /// The cancellation token. + async Task CompleteOrchestratorTaskWithChunkingAsync( + P.OrchestratorResponse response, + int maxChunkBytes, + CancellationToken cancellationToken) + { + // Validate that no single action exceeds the maximum chunk size + static P.TaskFailureDetails? 
ValidateActionsSize(IEnumerable actions, int maxChunkBytes) + { + foreach (P.OrchestratorAction action in actions) + { + int actionSize = action.CalculateSize(); + if (actionSize > maxChunkBytes) + { + // TODO: large payload doc is not available yet on aka.ms, add doc link to below error message + string errorMessage = $"A single orchestrator action of type {action.OrchestratorActionTypeCase} with id {action.Id} " + + $"exceeds the {maxChunkBytes / 1024.0 / 1024.0:F2}MB limit: {actionSize / 1024.0 / 1024.0:F2}MB. " + + "Enable large-payload externalization to Azure Blob Storage to support oversized actions."; + return new P.TaskFailureDetails + { + ErrorType = typeof(InvalidOperationException).FullName, + ErrorMessage = errorMessage, + IsNonRetriable = true, + }; + } + } + + return null; + } + + P.TaskFailureDetails? validationFailure = this.worker.grpcOptions.Capabilities.Contains(P.WorkerCapability.LargePayloads) + ? null + : ValidateActionsSize(response.Actions, maxChunkBytes); + if (validationFailure != null) + { + // Complete the orchestration with a failed status and failure details + P.OrchestratorResponse failureResponse = new() + { + InstanceId = response.InstanceId, + CompletionToken = response.CompletionToken, + OrchestrationTraceContext = response.OrchestrationTraceContext, + Actions = + { + new P.OrchestratorAction + { + CompleteOrchestration = new P.CompleteOrchestrationAction + { + OrchestrationStatus = P.OrchestrationStatus.Failed, + FailureDetails = validationFailure, + }, + }, + }, + }; + + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + return; + } + + // Helper to add an action to the current chunk if it fits + static bool TryAddAction( + Google.Protobuf.Collections.RepeatedField dest, + P.OrchestratorAction action, + ref int currentSize, + int maxChunkBytes) + { + int actionSize = action.CalculateSize(); + if (currentSize + actionSize > maxChunkBytes && currentSize > 0) + { + return false; + } + + dest.Add(action); + currentSize += actionSize; + return true; + } + + // Check if the entire response fits in one chunk + int totalSize = response.CalculateSize(); + if (totalSize <= maxChunkBytes) + { + // Response fits in one chunk, send it directly (isPartial defaults to false) + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + return; + } + + // Response is too large, split into multiple chunks + int actionsCompletedSoFar = 0, chunkIndex = 0; + List allActions = response.Actions.ToList(); + bool isPartial = true; + bool isChunkedMode = false; + + while (isPartial) + { + P.OrchestratorResponse chunkedResponse = new() + { + InstanceId = response.InstanceId, + CustomStatus = response.CustomStatus, + CompletionToken = response.CompletionToken, + RequiresHistory = response.RequiresHistory, + NumEventsProcessed = 0, + }; + + int chunkPayloadSize = 0; + + // Fill the chunk with actions until we reach the size limit + while (actionsCompletedSoFar < allActions.Count && + TryAddAction(chunkedResponse.Actions, allActions[actionsCompletedSoFar], ref chunkPayloadSize, maxChunkBytes)) + { + actionsCompletedSoFar++; + } + + // Determine if this is a partial chunk (more actions remaining) + isPartial = actionsCompletedSoFar < 
allActions.Count; + chunkedResponse.IsPartial = isPartial; + + // Only activate chunked mode when we actually need multiple chunks. + // A single oversized action that fits in one chunk (via TryAddAction allowing + // the first item in an empty chunk) should be sent as non-chunked to avoid + // backend issues with ChunkIndex=0 + IsPartial=false. + if (isPartial) + { + isChunkedMode = true; + } + + if (isChunkedMode) + { + chunkedResponse.ChunkIndex = chunkIndex; + } + + if (chunkIndex == 0) + { + // The first chunk preserves the original response's NumEventsProcessed value (null) + // When this is set to null, backend by default handles all the messages in the workitem. + // For subsequent chunks, we set it to 0 since all messages are already handled in first chunk. + chunkedResponse.NumEventsProcessed = null; + chunkedResponse.OrchestrationTraceContext = response.OrchestrationTraceContext; + } + + chunkIndex++; + + // Send the chunk + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + cancellationToken); + } + } + + async Task ExecuteWithRetryAsync( + Func action, + string operationName, + CancellationToken cancellationToken) + { + const int maxAttempts = 10; + TimeSpan delay = TimeSpan.FromMilliseconds(200); + + for (int attempt = 1; ; attempt++) + { + try + { + await action(); + return; + } + catch (RpcException ex) when ( + (ex.StatusCode == StatusCode.Unavailable || + ex.StatusCode == StatusCode.Unknown || + ex.StatusCode == StatusCode.DeadlineExceeded || + ex.StatusCode == StatusCode.Internal) && + attempt < maxAttempts) + { + // Back off with jitter for transient transport errors +#if NET6_0_OR_GREATER + int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2)); +#else + int jitterMs = new Random().Next(0, (int)(delay.TotalMilliseconds * 0.2)); +#endif + TimeSpan backoff = delay + TimeSpan.FromMilliseconds(jitterMs); + + this.Logger.TransientGrpcRetry( + operationName, + attempt, + maxAttempts, + backoff.TotalMilliseconds, + (int)ex.StatusCode, + ex); + + try + { + await Task.Delay(backoff, cancellationToken); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // If shutting down during the retry delay, propagate the cancellation exception + throw; + } + + // Exponential increase, capping at 15 seconds + delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000)); + continue; + } + } + } + } +} From 55535e342b1c9c2928d02a1fb155a05b154c6da5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 18:00:39 +0000 Subject: [PATCH 11/36] Add max-attempts exhaustion test for ExecuteWithRetryAsync Agent-Logs-Url: https://github.com/microsoft/durabletask-dotnet/sessions/bdf20a7a-2447-4ef2-92fc-1d2a5d16833c Co-authored-by: sophiatev <38052607+sophiatev@users.noreply.github.com> --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 5 ++- .../Grpc.Tests/ExecuteWithRetryTests.cs | 41 ++++++++++++++++--- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index ce30b7afa..190b99c6a 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -1212,10 +1212,11 @@ await this.ExecuteWithRetryAsync( async Task 
ExecuteWithRetryAsync( Func action, string operationName, - CancellationToken cancellationToken) + CancellationToken cancellationToken, + TimeSpan? initialDelay = null) { const int maxAttempts = 10; - TimeSpan delay = TimeSpan.FromMilliseconds(200); + TimeSpan delay = initialDelay ?? TimeSpan.FromMilliseconds(200); for (int attempt = 1; ; attempt++) { diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs index c6101d4f8..8b49c7eb1 100644 --- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -28,10 +28,11 @@ static Type FindProcessorType() .Any(method => method.ReturnType == typeof(Task) && method.GetParameters() is var parameters && - parameters.Length == 3 && + parameters.Length == 4 && parameters[0].ParameterType == typeof(Func) && parameters[1].ParameterType == typeof(string) && - parameters[2].ParameterType == typeof(CancellationToken))); + parameters[2].ParameterType == typeof(CancellationToken) && + parameters[3].ParameterType == typeof(TimeSpan?))); } static MethodInfo FindExecuteWithRetryAsyncMethod() @@ -41,10 +42,11 @@ static MethodInfo FindExecuteWithRetryAsyncMethod() .Single(method => method.ReturnType == typeof(Task) && method.GetParameters() is var parameters && - parameters.Length == 3 && + parameters.Length == 4 && parameters[0].ParameterType == typeof(Func) && parameters[1].ParameterType == typeof(string) && - parameters[2].ParameterType == typeof(CancellationToken)); + parameters[2].ParameterType == typeof(CancellationToken) && + parameters[3].ParameterType == typeof(TimeSpan?)); } [Fact] public async Task ExecuteWithRetryAsync_SucceedsOnFirstAttempt_DoesNotRetry() @@ -211,6 +213,32 @@ await InvokeExecuteWithRetryAsync( callCount.Should().Be(3); } + [Fact] + public async Task ExecuteWithRetryAsync_TransientErrorExceedsMaxAttempts_ThrowsLastRpcException() + { + // Arrange + object processor = CreateProcessor(); + const int maxAttempts = 10; + int callCount = 0; + StatusCode lastStatusCode = StatusCode.Unavailable; + + // Act - always fail with a transient error + Func act = () => InvokeExecuteWithRetryAsync( + processor, + () => + { + callCount++; + throw new RpcException(new Status(lastStatusCode, "persistent transient error")); + }, + "TestOperation", + CancellationToken.None, + initialDelay: TimeSpan.Zero); + + // Assert - the last RpcException should be surfaced after max attempts + await act.Should().ThrowAsync().Where(e => e.StatusCode == lastStatusCode); + callCount.Should().Be(maxAttempts); + } + static object CreateProcessor(TestLogProvider? logProvider = null) { ILoggerFactory loggerFactory = logProvider is null @@ -250,11 +278,12 @@ static Task InvokeExecuteWithRetryAsync( object processor, Func action, string operationName, - CancellationToken cancellationToken) + CancellationToken cancellationToken, + TimeSpan? 
initialDelay = null)
     {
         return (Task)ExecuteWithRetryAsyncMethod.Invoke(
             processor,
-            new object?[] { action, operationName, cancellationToken })!;
+            new object?[] { action, operationName, cancellationToken, initialDelay })!;
     }
 
     sealed class OptionsMonitorStub<T> : IOptionsMonitor<T> where T : class, new()
From ba6c0c039180289c1f8621435709cf80cd532fdf Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Mon, 27 Apr 2026 18:04:06 +0000
Subject: [PATCH 12/36] Assert status code in TransientGrpcRetry log test

Agent-Logs-Url: https://github.com/microsoft/durabletask-dotnet/sessions/380b9325-d404-4397-b025-648ee19b5d51
Co-authored-by: sophiatev <38052607+sophiatev@users.noreply.github.com>
---
 test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
index 8b49c7eb1..b2d8ffaf6 100644
--- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
+++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
@@ -174,7 +174,8 @@ await InvokeExecuteWithRetryAsync(
         logProvider.TryGetLogs(Category, out IReadOnlyCollection<LogEntry>? logs).Should().BeTrue();
         logs!.Should().Contain(log =>
             log.Message.Contains($"Transient gRPC error for '{operationName}'") &&
-            log.Message.Contains("Attempt 1 of 10"));
+            log.Message.Contains("Attempt 1 of 10") &&
+            log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}"));
     }

     [Fact]
From e60922755013ab54f3f742336ff7534fffbb4472 Mon Sep 17 00:00:00 2001
From: Sophia Tevosyan
Date: Mon, 27 Apr 2026 11:13:41 -0700
Subject: [PATCH 13/36] simplified method extraction

---
 .../Grpc.Tests/ExecuteWithRetryTests.cs | 28 +++----------------
 1 file changed, 4 insertions(+), 24 deletions(-)

diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
index c6101d4f8..30c4de7bc 100644
--- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
+++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs
@@ -20,32 +20,12 @@ public class ExecuteWithRetryTests
     static readonly MethodInfo ExecuteWithRetryAsyncMethod = FindExecuteWithRetryAsyncMethod();

-    static Type FindProcessorType()
-    {
-        return typeof(GrpcDurableTaskWorker)
-            .GetNestedTypes(BindingFlags.NonPublic)
-            .Single(type => type.GetMethods(BindingFlags.Instance | BindingFlags.NonPublic)
-                .Any(method =>
-                    method.ReturnType == typeof(Task) &&
-                    method.GetParameters() is var parameters &&
-                    parameters.Length == 3 &&
-                    parameters[0].ParameterType == typeof(Func<Task>) &&
-                    parameters[1].ParameterType == typeof(string) &&
-                    parameters[2].ParameterType == typeof(CancellationToken))));
-    }
-
     static MethodInfo FindExecuteWithRetryAsyncMethod()
-    {
-        return FindProcessorType()
-            .GetMethods(BindingFlags.Instance | BindingFlags.NonPublic)
-            .Single(method =>
-                method.ReturnType == typeof(Task) &&
-                method.GetParameters() is var parameters &&
-                parameters.Length == 3 &&
-                parameters[0].ParameterType == typeof(Func<Task>) &&
-                parameters[1].ParameterType == typeof(string) &&
-                parameters[2].ParameterType == typeof(CancellationToken));
-    }
+    {
+        Type processorType = typeof(GrpcDurableTaskWorker).GetNestedType("Processor", BindingFlags.NonPublic)!;
+        return processorType.GetMethod("ExecuteWithRetryAsync", BindingFlags.Instance | BindingFlags.NonPublic)!;
+    }
+
     [Fact]
     public async Task ExecuteWithRetryAsync_SucceedsOnFirstAttempt_DoesNotRetry()
     {
From 96258123a0907e79e2454764ab052ac510fa3512 Mon Sep 17 00:00:00 2001
From:
"copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 18:27:13 +0000 Subject: [PATCH 14/36] Add status code assertion to MultipleTransientErrors log test Agent-Logs-Url: https://github.com/microsoft/durabletask-dotnet/sessions/9df186ec-820e-418e-8d32-c60328aabfdf Co-authored-by: sophiatev <38052607+sophiatev@users.noreply.github.com> --- test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs index b2d8ffaf6..188f1a370 100644 --- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -207,10 +207,12 @@ await InvokeExecuteWithRetryAsync( logProvider.TryGetLogs(Category, out IReadOnlyCollection? logs).Should().BeTrue(); logs!.Should().Contain(log => log.Message.Contains($"Transient gRPC error for '{operationName}'") && - log.Message.Contains("Attempt 1 of 10")); + log.Message.Contains("Attempt 1 of 10") && + log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}")); logs.Should().Contain(log => log.Message.Contains($"Transient gRPC error for '{operationName}'") && - log.Message.Contains("Attempt 2 of 10")); + log.Message.Contains("Attempt 2 of 10") && + log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}")); callCount.Should().Be(3); } From 8ec4113c6e2fa2fa04a9bb58a7d0cd6dae02f3f7 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 12:20:16 -0700 Subject: [PATCH 15/36] refactored so the retry also uses the shared backoff class --- .../{ReconnectBackoff.cs => GrpcBackoff.cs} | 43 +- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 503 +++++++++--------- .../Grpc/GrpcDurableTaskWorkerOptions.cs | 20 + src/Worker/Grpc/Logs.cs | 2 +- test/Worker/Grpc.Tests/GrpcBackoffTests.cs | 214 ++++++++ ...pcDurableTaskWorkerOptionsInternalTests.cs | 5 +- .../Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 4 +- .../Grpc.Tests/ReconnectBackoffTests.cs | 145 ----- 8 files changed, 529 insertions(+), 407 deletions(-) rename src/Worker/Grpc/{ReconnectBackoff.cs => GrpcBackoff.cs} (72%) create mode 100644 test/Worker/Grpc.Tests/GrpcBackoffTests.cs delete mode 100644 test/Worker/Grpc.Tests/ReconnectBackoffTests.cs diff --git a/src/Worker/Grpc/ReconnectBackoff.cs b/src/Worker/Grpc/GrpcBackoff.cs similarity index 72% rename from src/Worker/Grpc/ReconnectBackoff.cs rename to src/Worker/Grpc/GrpcBackoff.cs index dd08a58ce..167bd41f7 100644 --- a/src/Worker/Grpc/ReconnectBackoff.cs +++ b/src/Worker/Grpc/GrpcBackoff.cs @@ -6,9 +6,9 @@ namespace Microsoft.DurableTask.Worker.Grpc; /// -/// Helpers for computing reconnect backoff delays in the gRPC worker. +/// Helpers for computing reconnect and retry backoff delays in the gRPC worker. /// -static class ReconnectBackoff +static class GrpcBackoff { /// /// Creates a random source for reconnect jitter using an explicit random seed so multiple workers on @@ -31,30 +31,37 @@ public static Random CreateRandom() /// The retry attempt index, starting at 0. /// The base delay used for the exponential growth. /// The maximum delay before jitter is applied. - /// The random source used for jitter. + /// The random source used for jitter. + /// If true, applies full jitter. If false, applies a smaller jitter that is biased towards the upper bound. /// The computed jittered delay. 
-    public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random)
-    {
-        if (baseDelay <= TimeSpan.Zero)
-        {
-            return TimeSpan.Zero;
-        }
-
-        if (attempt < 0)
-        {
-            attempt = 0;
+    public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random, bool fullJitter)
+    {
+        if (baseDelay <= TimeSpan.Zero || cap <= TimeSpan.Zero)
+        {
+            return TimeSpan.Zero;
+        }
+
+        if (attempt < 0)
+        {
+            attempt = 0;
         }

         // Cap the exponent to avoid overflow in 2^attempt for pathological attempt values.
         int safeAttempt = Math.Min(attempt, 30);
-        double capMs = Math.Max(0, cap.TotalMilliseconds);
         double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt);
-        double upperBoundMs = Math.Min(capMs, exponentialMs);
+        double upperBoundMs = Math.Min(cap.TotalMilliseconds, exponentialMs);
+
+        double jitteredMs = 0;
+        if (fullJitter)
+        {
+            jitteredMs = random.NextDouble() * upperBoundMs;
+        }
+        else
+        {
+            jitteredMs = upperBoundMs + (random.NextDouble() * (upperBoundMs * .2));
+        }

-        // Full jitter intentionally allows any value in the retry window. The wide spread keeps many
-        // workers that saw the same outage from reconnecting in lockstep against the backend.
-        double jitteredMs = random.NextDouble() * upperBoundMs;
         return TimeSpan.FromMilliseconds(jitteredMs);
     }
 }
diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
index ce30b7afa..4b8d5da49 100644
--- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
+++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
@@ -62,7 +62,7 @@ public async Task ExecuteAsync(CancellationToken cancellation)
             // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message.
             int reconnectAttempt = 0;
-            Random backoffRandom = ReconnectBackoff.CreateRandom();
+            Random random = GrpcBackoff.CreateRandom();

             while (!cancellation.IsCancellationRequested)
             {
@@ -71,7 +71,8 @@
                 {
                     using AsyncServerStreamingCall<P.WorkItem> stream = await this.ConnectAsync(cancellation);
                     await this.ProcessWorkItemsAsync(
-                        stream,
+                        stream,
+                        random,
                         cancellation,
                         onFirstMessage: () =>
                         {
@@ -148,14 +149,16 @@ await this.ProcessWorkItemsAsync(
                 }

                 try
-                {
-                    TimeSpan delay = ReconnectBackoff.Compute(
+                {
+                    // Full jitter intentionally allows any value in the retry window. The wide spread keeps many
+                    // workers that saw the same outage from reconnecting in lockstep against the backend.
+                    TimeSpan delay = GrpcBackoff.Compute(
                         reconnectAttempt,
                         this.internalOptions.ReconnectBackoffBase,
                         this.internalOptions.ReconnectBackoffCap,
-                        backoffRandom);
-                    this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds));
-                    reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt
+                        random,
+                        fullJitter: true);
+                    this.Logger.GrpcBackoff(reconnectAttempt, (int)delay.TotalMilliseconds);
                     await Task.Delay(delay, cancellation);
                 }
                 catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
@@ -335,7 +338,8 @@ async ValueTask BuildRuntimeStateAsync(
         }

         async Task ProcessWorkItemsAsync(
-            AsyncServerStreamingCall<P.WorkItem> stream,
+            AsyncServerStreamingCall<P.WorkItem> stream,
+            Random retryRandom,
             CancellationToken cancellation,
             Action? onFirstMessage = null,
             Action? 
onChannelLikelyPoisoned = null) @@ -352,7 +356,7 @@ async Task ProcessWorkItemsAsync( WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( ct => stream.ResponseStream.ReadAllAsync(ct), silentDisconnectTimeout, - workItem => this.DispatchWorkItem(workItem, cancellation), + workItem => this.DispatchWorkItem(workItem, retryRandom, cancellation), onFirstMessage, cancellation); @@ -388,49 +392,56 @@ async Task ProcessWorkItemsAsync( } } - void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) - { - if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunOrchestratorAsync( - workItem.OrchestratorRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunActivityAsync( - workItem.ActivityRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) - { - workItem.EntityRequestV2.ToEntityBatchRequest( - out EntityBatchRequest batchRequest, - out List operationInfos); - - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync( - batchRequest, - cancellation, - workItem.CompletionToken, - operationInfos), - cancellation); + void DispatchWorkItem(P.WorkItem workItem, Random retryRandom, CancellationToken cancellation) + { + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunOrchestratorAsync( + workItem.OrchestratorRequest, + workItem.CompletionToken, + retryRandom, + cancellation), + retryRandom, + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunActivityAsync( + workItem.ActivityRequest, + workItem.CompletionToken, + retryRandom, + cancellation), + retryRandom, + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), retryRandom, cancellation), + retryRandom, + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) + { + workItem.EntityRequestV2.ToEntityBatchRequest( + out EntityBatchRequest batchRequest, + out List operationInfos); + + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync( + batchRequest, + retryRandom, + cancellation, + workItem.CompletionToken, + operationInfos), + retryRandom, + cancellation); } else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) { @@ -446,7 +457,7 @@ void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) } } - void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) + void RunBackgroundTask(P.WorkItem? workItem, Func handler, Random retryRandom, CancellationToken cancellation) { // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? _ = Task.Run( @@ -474,16 +485,17 @@ void RunBackgroundTask(P.WorkItem? 
workItem, Func handler, CancellationTok { try { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellation); + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + retryRandom, + cancellation); this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); } catch (Exception abandonException) @@ -495,19 +507,20 @@ await this.ExecuteWithRetryAsync( { try { - this.Logger.AbandoningActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), + this.Logger.AbandoningActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + retryRandom, cancellation); this.Logger.AbandonedActivityWorkItem( instanceId, @@ -524,20 +537,21 @@ await this.ExecuteWithRetryAsync( { try { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequest.InstanceId, + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + retryRandom, + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequest.InstanceId, workItem.CompletionToken ?? string.Empty); } catch (Exception abandonException) @@ -549,20 +563,21 @@ await this.ExecuteWithRetryAsync( { try { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? 
string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequestV2.InstanceId, + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + retryRandom, + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequestV2.InstanceId, workItem.CompletionToken ?? string.Empty); } catch (Exception abandonException) @@ -574,9 +589,10 @@ await this.ExecuteWithRetryAsync( }); } - async Task OnRunOrchestratorAsync( - P.OrchestratorRequest request, - string completionToken, + async Task OnRunOrchestratorAsync( + P.OrchestratorRequest request, + string completionToken, + Random retryRandom, CancellationToken cancellationToken) { var executionStartedEvent = @@ -712,20 +728,21 @@ async Task OnRunOrchestratorAsync( cancellationToken); } - if (!filterPassed) - { - this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; + if (!filterPassed) + { + this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + retryRandom, + cancellationToken); + + return; } // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. 
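The hunks above keep repeating one shape: wrap a unary gRPC call in a retry-with-backoff helper and surface the last failure. A tiny self-contained sketch of that control flow with a stand-in action (everything here is illustrative; the real wrapper is the ExecuteWithRetryAsync method shown later in this patch, and it computes an exponential jittered backoff rather than the fixed delay used below):

    using System;
    using System.Threading;
    using System.Threading.Tasks;

    // Stand-in for the retried gRPC call: fails twice, then succeeds.
    int calls = 0;
    Func<Task> action = () =>
    {
        if (++calls < 3) throw new InvalidOperationException("transient");
        return Task.CompletedTask;
    };

    // Minimal retry loop mirroring the wrapper's control flow. A non-transient
    // error, or the final attempt, falls out of the catch filter and propagates.
    for (int attempt = 1; ; attempt++)
    {
        try
        {
            await action();
            break;
        }
        catch (InvalidOperationException) when (attempt < 10)
        {
            await Task.Delay(TimeSpan.FromMilliseconds(50), CancellationToken.None);
        }
    }

    Console.WriteLine($"succeeded after {calls} calls");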
@@ -816,20 +833,21 @@ await this.ExecuteWithRetryAsync( }, }; } - else - { - this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; + else + { + this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + retryRandom, + cancellationToken); + + return; } } else @@ -878,13 +896,14 @@ await this.ExecuteWithRetryAsync( response.Actions.Count, GetActionsListForLogging(response.Actions)); - await this.CompleteOrchestratorTaskWithChunkingAsync( - response, - this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, + await this.CompleteOrchestratorTaskWithChunkingAsync( + response, + this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, + retryRandom, cancellationToken); } - async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) + async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, Random retryRandom, CancellationToken cancellation) { using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); @@ -930,18 +949,19 @@ async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, } else { - if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) - { - this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); + if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) + { + this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + retryRandom, + cancellation); } return; @@ -975,16 +995,18 @@ await this.ExecuteWithRetryAsync( // Stop the trace activity here to avoid including the completion time in the latency calculation traceActivity?.Stop(); - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteActivityTaskAsync), - cancellation); - } - - async Task OnRunEntityBatchAsync( - EntityBatchRequest batchRequest, - CancellationToken cancellation, - string? 
completionToken = null, + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteActivityTaskAsync), + retryRandom, + cancellation); + } + + async Task OnRunEntityBatchAsync( + EntityBatchRequest batchRequest, + Random retryRandom, + CancellationToken cancellation, + string? completionToken = null, List? operationInfos = null) { var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); @@ -1044,10 +1066,11 @@ async Task OnRunEntityBatchAsync( completionToken, operationInfos?.Take(batchResult.Results?.Count ?? 0)); - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteEntityTaskAsync), - cancellation); + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteEntityTaskAsync), + retryRandom, + cancellation); } /// @@ -1056,9 +1079,10 @@ await this.ExecuteWithRetryAsync( /// The orchestrator response to send. /// The maximum size in bytes for each chunk. /// The cancellation token. - async Task CompleteOrchestratorTaskWithChunkingAsync( - P.OrchestratorResponse response, - int maxChunkBytes, + async Task CompleteOrchestratorTaskWithChunkingAsync( + P.OrchestratorResponse response, + int maxChunkBytes, + Random retryRandom, CancellationToken cancellationToken) { // Validate that no single action exceeds the maximum chunk size @@ -1109,13 +1133,14 @@ async Task CompleteOrchestratorTaskWithChunkingAsync( }, }; - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - return; - } - + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + retryRandom, + cancellationToken); + return; + } + // Helper to add an action to the current chunk if it fits static bool TryAddAction( Google.Protobuf.Collections.RepeatedField dest, @@ -1138,12 +1163,13 @@ static bool TryAddAction( int totalSize = response.CalculateSize(); if (totalSize <= maxChunkBytes) { - // Response fits in one chunk, send it directly (isPartial defaults to false) - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - return; + // Response fits in one chunk, send it directly (isPartial defaults to false) + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + retryRandom, + cancellationToken); + return; } // Response is too large, split into multiple chunks @@ -1201,67 +1227,64 @@ await this.ExecuteWithRetryAsync( chunkIndex++; - // Send the chunk - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - cancellationToken); - } + // Send the chunk + await this.ExecuteWithRetryAsync( + async () => await 
this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken),
+                    nameof(this.client.CompleteOrchestratorTaskAsync),
+                    retryRandom,
+                    cancellationToken);
+            }
         }

-        async Task ExecuteWithRetryAsync(
-            Func<Task> action,
-            string operationName,
-            CancellationToken cancellationToken)
-        {
-            const int maxAttempts = 10;
-            TimeSpan delay = TimeSpan.FromMilliseconds(200);
-
-            for (int attempt = 1; ; attempt++)
-            {
-                try
-                {
-                    await action();
-                    return;
-                }
-                catch (RpcException ex) when (
-                    (ex.StatusCode == StatusCode.Unavailable ||
-                    ex.StatusCode == StatusCode.Unknown ||
-                    ex.StatusCode == StatusCode.DeadlineExceeded ||
-                    ex.StatusCode == StatusCode.Internal) &&
-                    attempt < maxAttempts)
-                {
-                    // Back off with jitter for transient transport errors
-#if NET6_0_OR_GREATER
-                    int jitterMs = Random.Shared.Next(0, (int)(delay.TotalMilliseconds * 0.2));
-#else
-                    int jitterMs = new Random().Next(0, (int)(delay.TotalMilliseconds * 0.2));
-#endif
-                    TimeSpan backoff = delay + TimeSpan.FromMilliseconds(jitterMs);
-
-                    this.Logger.TransientGrpcRetry(
-                        operationName,
-                        attempt,
-                        maxAttempts,
-                        backoff.TotalMilliseconds,
-                        (int)ex.StatusCode,
-                        ex);
-
-                    try
-                    {
-                        await Task.Delay(backoff, cancellationToken);
-                    }
-                    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
-                    {
-                        // If shutting down during the retry delay, propagate the cancellation exception
-                        throw;
-                    }
-
-                    // Exponential increase, capping at 15 seconds
-                    delay = TimeSpan.FromMilliseconds(Math.Min(delay.TotalMilliseconds * 2, 15000));
-                    continue;
-                }
-            }
+        async Task ExecuteWithRetryAsync(
+            Func<Task> action,
+            string operationName,
+            Random retryRandom,
+            CancellationToken cancellationToken)
+        {
+            int maxAttempts = this.internalOptions.TransientRetryMaxAttempts;
+            TimeSpan baseDelay = this.internalOptions.TransientRetryBackoffBase;
+            TimeSpan cap = this.internalOptions.TransientRetryBackoffCap;
+
+            for (int attempt = 1; ; attempt++)
+            {
+                try
+                {
+                    await action();
+                    return;
+                }
+                catch (RpcException ex) when (
+                    (ex.StatusCode == StatusCode.Unavailable ||
+                    ex.StatusCode == StatusCode.Unknown ||
+                    ex.StatusCode == StatusCode.DeadlineExceeded ||
+                    ex.StatusCode == StatusCode.Internal) &&
+                    attempt < maxAttempts)
+                {
+                    // Don't use full jitter since we want to keep the retry interval fairly fixed and increasing with
+                    // each attempt. We don't have lockstep concerns in this case
+                    TimeSpan backoff = GrpcBackoff.Compute(attempt - 1, baseDelay, cap, retryRandom, fullJitter: false);
+
+                    this.Logger.TransientGrpcRetry(
+                        operationName,
+                        attempt,
+                        maxAttempts,
+                        backoff.TotalMilliseconds,
+                        (int)ex.StatusCode,
+                        ex);
+
+                    try
+                    {
+                        await Task.Delay(backoff, cancellationToken);
+                    }
+                    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
+                    {
+                        // If shutting down during the retry delay, propagate the cancellation exception
+                        throw;
+                    }
+
+                    continue;
+                }
+            }
         }
     }
 }
diff --git a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs
index 464c50a8b..8616264f1 100644
--- a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs
+++ b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs
@@ -135,6 +135,26 @@ internal class InternalOptions
     /// </summary>
     public TimeSpan ReconnectBackoffCap { get; set; } = TimeSpan.FromSeconds(30);

+    /// <summary>
+    /// Gets or sets the maximum number of attempts the worker will make when retrying a transient
+    /// gRPC call (such as completing or abandoning a work item).
 Once this many attempts have failed,
+    /// the most recent exception is rethrown. Defaults to 10.
+    /// </summary>
+    public int TransientRetryMaxAttempts { get; set; } = 10;
+
+    /// <summary>
+    /// Gets or sets the initial delay used when computing exponential backoff between retries of a
+    /// transient gRPC call. The delay doubles after each failed attempt, with a small uniform jitter
+    /// added on top, until <see cref="TransientRetryBackoffCap"/> is reached. Defaults to 200 ms.
+    /// </summary>
+    public TimeSpan TransientRetryBackoffBase { get; set; } = TimeSpan.FromMilliseconds(200);
+
+    /// <summary>
+    /// Gets or sets the maximum delay between retries of a transient gRPC call. The exponentially
+    /// increasing delay is clamped to this value. Defaults to 15 seconds.
+    /// </summary>
+    public TimeSpan TransientRetryBackoffCap { get; set; } = TimeSpan.FromSeconds(15);
+
     /// <summary>
     /// Gets or sets an optional callback invoked when the worker requests a fresh gRPC channel after
     /// repeated connect failures. The callback receives the previously-used channel and should return
diff --git a/src/Worker/Grpc/Logs.cs b/src/Worker/Grpc/Logs.cs
index 878efe9c8..df8b3c0b9 100644
--- a/src/Worker/Grpc/Logs.cs
+++ b/src/Worker/Grpc/Logs.cs
@@ -93,7 +93,7 @@ static partial class Logs
     public static partial void ChannelRecreated(this ILogger logger, string endpoint);

     [LoggerMessage(EventId = 74, Level = LogLevel.Debug, Message = "Reconnect attempt {attempt} will retry after {delayMs} ms.")]
-    public static partial void ReconnectBackoff(this ILogger logger, int attempt, int delayMs);
+    public static partial void GrpcBackoff(this ILogger logger, int attempt, int delayMs);

     [LoggerMessage(EventId = 75, Level = LogLevel.Trace, Message = "Received health ping from the backend.")]
     public static partial void ReceivedHealthPing(this ILogger logger);
diff --git a/test/Worker/Grpc.Tests/GrpcBackoffTests.cs b/test/Worker/Grpc.Tests/GrpcBackoffTests.cs
new file mode 100644
index 000000000..93b1cf119
--- /dev/null
+++ b/test/Worker/Grpc.Tests/GrpcBackoffTests.cs
@@ -0,0 +1,214 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
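Before the new tests, a quick orientation on how the three knobs documented above compose. A hedged configuration sketch, following the same `Internal` access pattern the existing worker tests use (the `Internal` options are internal API surface, reachable from tests presumably via InternalsVisibleTo; the values below are arbitrary):

    using System;
    using Microsoft.DurableTask.Worker.Grpc;

    // Sketch: tightening the transient-retry knobs, e.g. for a fast-failing
    // test environment. Shipped defaults: 10 attempts / 200 ms base / 15 s cap.
    GrpcDurableTaskWorkerOptions grpcOptions = new();
    grpcOptions.Internal.TransientRetryMaxAttempts = 3;
    grpcOptions.Internal.TransientRetryBackoffBase = TimeSpan.FromMilliseconds(50);
    grpcOptions.Internal.TransientRetryBackoffCap = TimeSpan.FromSeconds(1);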
+ +namespace Microsoft.DurableTask.Worker.Grpc.Tests; + +public class GrpcBackoffTests +{ + [Theory] + [InlineData(true)] + [InlineData(false)] + public void Compute_ZeroBase_ReturnsZero(bool fullJitter) + { + // Arrange + Random random = new(42); + + // Act + TimeSpan delay = GrpcBackoff.Compute(attempt: 5, baseDelay: TimeSpan.Zero, cap: TimeSpan.FromSeconds(30), random, fullJitter); + + // Assert + delay.Should().Be(TimeSpan.Zero); + } + + [Theory] + [InlineData(true)] + [InlineData(false)] + public void Compute_NegativeBase_ReturnsZero(bool fullJitter) + { + // Arrange + Random random = new(42); + + // Act + TimeSpan delay = GrpcBackoff.Compute(attempt: 0, baseDelay: TimeSpan.FromMilliseconds(-100), cap: TimeSpan.FromSeconds(30), random, fullJitter); + + // Assert + delay.Should().Be(TimeSpan.Zero); + } + + [Theory] + [InlineData(true)] + [InlineData(false)] + public void Compute_NonPositiveCap_ReturnsZero(bool fullJitter) + { + // Arrange + DeterministicRandom random = new(0.999999); + + // Act + TimeSpan zero = GrpcBackoff.Compute(attempt: 3, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.Zero, random, fullJitter); + TimeSpan negative = GrpcBackoff.Compute(attempt: 3, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.FromSeconds(-1), random, fullJitter); + + // Assert + zero.Should().Be(TimeSpan.Zero); + negative.Should().Be(TimeSpan.Zero); + } + + [Fact] + public void Compute_FullJitter_NeverExceedsCap() + { + // Arrange + TimeSpan cap = TimeSpan.FromSeconds(30); + TimeSpan baseDelay = TimeSpan.FromSeconds(1); + Random random = new(1); + + // Act + Assert: try a wide range of attempts, including pathological values. + // Note: this invariant is full-jitter-specific — biased mode intentionally returns up to + // 1.2x the upper bound and so can legally exceed the cap. + for (int attempt = 0; attempt < 50; attempt++) + { + TimeSpan delay = GrpcBackoff.Compute(attempt, baseDelay, cap, random, fullJitter: true); + delay.Should().BeLessThanOrEqualTo(cap, $"attempt {attempt} produced {delay}"); + delay.Should().BeGreaterThanOrEqualTo(TimeSpan.Zero); + } + } + + [Fact] + public void Compute_FullJitter_GrowsExponentiallyUntilCap() + { + // Arrange: a Random that always returns ~1.0 forces the upper bound of the jitter window. + DeterministicRandom random = new(value: 0.999999); + TimeSpan baseDelay = TimeSpan.FromSeconds(1); + TimeSpan cap = TimeSpan.FromSeconds(30); + + // Act + double d0 = GrpcBackoff.Compute(0, baseDelay, cap, random, fullJitter: true).TotalMilliseconds; + double d1 = GrpcBackoff.Compute(1, baseDelay, cap, random, fullJitter: true).TotalMilliseconds; + double d2 = GrpcBackoff.Compute(2, baseDelay, cap, random, fullJitter: true).TotalMilliseconds; + double d3 = GrpcBackoff.Compute(3, baseDelay, cap, random, fullJitter: true).TotalMilliseconds; + double d10 = GrpcBackoff.Compute(10, baseDelay, cap, random, fullJitter: true).TotalMilliseconds; + + // Assert: roughly doubles each step until cap is reached. + d0.Should().BeApproximately(1000, 1); + d1.Should().BeApproximately(2000, 1); + d2.Should().BeApproximately(4000, 1); + d3.Should().BeApproximately(8000, 1); + d10.Should().BeApproximately(30000, 1, "should be clamped at the cap"); + } + + [Fact] + public void Compute_FullJitter_StaysWithinBounds() + { + // Arrange: with random=0 the result is 0; with random=1 the result is the bound. 
+ TimeSpan baseDelay = TimeSpan.FromSeconds(1); + TimeSpan cap = TimeSpan.FromSeconds(30); + + // Act + Assert: random=0 → 0 + TimeSpan low = GrpcBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.0), fullJitter: true); + low.TotalMilliseconds.Should().BeApproximately(0, 0.5); + + // random ~1 → bound (= 8s for attempt=3, base=1s) + TimeSpan high = GrpcBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.999999), fullJitter: true); + high.TotalMilliseconds.Should().BeApproximately(8000, 1); + } + + [Fact] + public void Compute_BiasedJitter_StaysWithinBounds() + { + // Arrange: biased jitter returns a value in [upperBound, upperBound * 1.2]. + // attempt=3, base=1s → upperBound=8s. + TimeSpan baseDelay = TimeSpan.FromSeconds(1); + TimeSpan cap = TimeSpan.FromSeconds(30); + + // Act + Assert: random=0 → upperBound (lower edge of biased window). + TimeSpan low = GrpcBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.0), fullJitter: false); + low.TotalMilliseconds.Should().BeApproximately(8000, 1); + + // random ~1 → upperBound * 1.2 (upper edge). + TimeSpan high = GrpcBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.999999), fullJitter: false); + high.TotalMilliseconds.Should().BeApproximately(9600, 1); + + // mid value → halfway. + TimeSpan mid = GrpcBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.5), fullJitter: false); + mid.TotalMilliseconds.Should().BeApproximately(8800, 1); + } + + [Fact] + public void Compute_NegativeAttempt_TreatedAsZero() + { + // Arrange + DeterministicRandom random = new(0.999999); + + // Act + TimeSpan delay = GrpcBackoff.Compute(attempt: -5, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.FromSeconds(30), random, fullJitter: true); + + // Assert + delay.TotalMilliseconds.Should().BeApproximately(1000, 1); + } + + [Fact] + public void Compute_FullJitter_CapSmallerThanBase_ClampsToCap() + { + // Arrange: cap is intentionally smaller than baseDelay; the cap must still be honored. + // Note: biased mode would return up to 1.2 * cap here by design, so this invariant is + // full-jitter-only. + DeterministicRandom random = new(0.999999); + TimeSpan baseDelay = TimeSpan.FromSeconds(5); + TimeSpan cap = TimeSpan.FromSeconds(1); + + // Act + TimeSpan delay = GrpcBackoff.Compute(attempt: 3, baseDelay, cap, random, fullJitter: true); + + // Assert: with random ~ 1 the result is the bound, which must equal the cap. + delay.TotalMilliseconds.Should().BeApproximately(1000, 1); + delay.Should().BeLessThanOrEqualTo(cap); + } + + [Theory] + [InlineData(true)] + [InlineData(false)] + public void Compute_AttemptIsCappedAt30(bool fullJitter) + { + // Arrange: pick a base/cap where the cap is large enough that 2^30 * base does not saturate it, + // so the exponent — not the cap — drives the upper bound. This lets us observe the internal + // attempt clamp at 30: any attempt ≥ 30 must yield the same upper bound as attempt = 30. + TimeSpan baseDelay = TimeSpan.FromMilliseconds(1); + TimeSpan cap = TimeSpan.FromDays(365); // 2^30 ms ≈ 12.4 days < cap. 
+ + // Act: produce a fresh DeterministicRandom for each call so the same NextDouble() value is + // sampled + TimeSpan at30 = GrpcBackoff.Compute(30, baseDelay, cap, new DeterministicRandom(1.0), fullJitter); + TimeSpan at31 = GrpcBackoff.Compute(31, baseDelay, cap, new DeterministicRandom(1.0), fullJitter); + TimeSpan at100 = GrpcBackoff.Compute(100, baseDelay, cap, new DeterministicRandom(1.0), fullJitter); + TimeSpan atIntMax = GrpcBackoff.Compute(int.MaxValue, baseDelay, cap, new DeterministicRandom(1.0), fullJitter); + + // Assert: all produce the same delay, equal to the attempt=30 value (sanity-checked against + // the analytical upper bound of 2^30 ms — exact for full jitter at random=1, and 2^30 * 1.2 + // for biased mode at random=1). + double expectedUpperBoundMs = Math.Pow(2, 30); // 2^30 ms + if (fullJitter) + { + // random = 1 → result == upper bound + at30.TotalMilliseconds.Should().BeApproximately(expectedUpperBoundMs, 1); + } + else + { + // random = 1 → result == upper bound * 1.2 + at30.TotalMilliseconds.Should().BeApproximately(expectedUpperBoundMs * 1.2, 1); + } + + at31.Should().Be(at30); + at100.Should().Be(at30); + atIntMax.Should().Be(at30); + } + + sealed class DeterministicRandom : Random + { + readonly double value; + + public DeterministicRandom(double value) + { + this.value = value; + } + + public override double NextDouble() => this.value; + } +} diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs index 5813bc683..87e704849 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs @@ -20,7 +20,10 @@ public void InternalOptions_HasSafeDefaults() internalOptions.HelloDeadline.Should().Be(TimeSpan.FromSeconds(30)); internalOptions.ChannelRecreateFailureThreshold.Should().Be(5); internalOptions.ReconnectBackoffBase.Should().Be(TimeSpan.FromSeconds(1)); - internalOptions.ReconnectBackoffCap.Should().Be(TimeSpan.FromSeconds(30)); + internalOptions.ReconnectBackoffCap.Should().Be(TimeSpan.FromSeconds(30)); + internalOptions.TransientRetryBackoffBase.Should().Be(TimeSpan.FromMilliseconds(200)); + internalOptions.TransientRetryBackoffCap.Should().Be(TimeSpan.FromSeconds(15)); + internalOptions.TransientRetryMaxAttempts.Should().Be(10); internalOptions.SilentDisconnectTimeout.Should().Be(TimeSpan.FromSeconds(120)); internalOptions.ChannelRecreator.Should().BeNull(); } diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs index db6c98da5..9adcc9ef8 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -52,7 +52,7 @@ public async Task ExecuteAsync_ConnectFailureThreshold_RecreatesConfiguredChanne Channel = currentChannel, }; grpcOptions.Internal.ChannelRecreateFailureThreshold = 2; - grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; DurableTaskWorkerOptions workerOptions = new() @@ -205,7 +205,7 @@ public async Task ProcessorExecuteAsync_GracefulDrainAfterFirstMessage_Reconnect // Arrange GrpcDurableTaskWorkerOptions grpcOptions = new(); grpcOptions.Internal.ChannelRecreateFailureThreshold = 1; - grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffBase = 
TimeSpan.Zero; grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; grpcOptions.Internal.SilentDisconnectTimeout = TimeSpan.FromSeconds(5); diff --git a/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs b/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs deleted file mode 100644 index 024f179eb..000000000 --- a/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -namespace Microsoft.DurableTask.Worker.Grpc.Tests; - -public class ReconnectBackoffTests -{ - [Fact] - public void Compute_ZeroBase_ReturnsZero() - { - // Arrange - Random random = new(42); - - // Act - TimeSpan delay = ReconnectBackoff.Compute(attempt: 5, baseDelay: TimeSpan.Zero, cap: TimeSpan.FromSeconds(30), random); - - // Assert - delay.Should().Be(TimeSpan.Zero); - } - - [Fact] - public void Compute_NegativeBase_ReturnsZero() - { - // Arrange - Random random = new(42); - - // Act - TimeSpan delay = ReconnectBackoff.Compute(attempt: 0, baseDelay: TimeSpan.FromMilliseconds(-100), cap: TimeSpan.FromSeconds(30), random); - - // Assert - delay.Should().Be(TimeSpan.Zero); - } - - [Fact] - public void Compute_NeverExceedsCap() - { - // Arrange - TimeSpan cap = TimeSpan.FromSeconds(30); - TimeSpan baseDelay = TimeSpan.FromSeconds(1); - Random random = new(1); - - // Act + Assert: try a wide range of attempts, including pathological values. - for (int attempt = 0; attempt < 50; attempt++) - { - TimeSpan delay = ReconnectBackoff.Compute(attempt, baseDelay, cap, random); - delay.Should().BeLessThanOrEqualTo(cap, $"attempt {attempt} produced {delay}"); - delay.Should().BeGreaterThanOrEqualTo(TimeSpan.Zero); - } - } - - [Fact] - public void Compute_GrowsExponentiallyUntilCap() - { - // Arrange: a Random that always returns 1.0 forces the upper bound of the jitter window. - DeterministicRandom random = new(value: 0.999999); - TimeSpan baseDelay = TimeSpan.FromSeconds(1); - TimeSpan cap = TimeSpan.FromSeconds(30); - - // Act - double d0 = ReconnectBackoff.Compute(0, baseDelay, cap, random).TotalMilliseconds; - double d1 = ReconnectBackoff.Compute(1, baseDelay, cap, random).TotalMilliseconds; - double d2 = ReconnectBackoff.Compute(2, baseDelay, cap, random).TotalMilliseconds; - double d3 = ReconnectBackoff.Compute(3, baseDelay, cap, random).TotalMilliseconds; - double d10 = ReconnectBackoff.Compute(10, baseDelay, cap, random).TotalMilliseconds; - - // Assert: roughly doubles each step until cap is reached. - d0.Should().BeApproximately(1000, 1); - d1.Should().BeApproximately(2000, 1); - d2.Should().BeApproximately(4000, 1); - d3.Should().BeApproximately(8000, 1); - d10.Should().BeApproximately(30000, 1, "should be clamped at the cap"); - } - - [Fact] - public void Compute_WithFullJitter_StaysWithinBounds() - { - // Arrange: with random=0 the result is 0; with random=1 the result is the bound. 
- TimeSpan baseDelay = TimeSpan.FromSeconds(1); - TimeSpan cap = TimeSpan.FromSeconds(30); - - // Act + Assert: random=0 → 0 - TimeSpan low = ReconnectBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.0)); - low.TotalMilliseconds.Should().BeApproximately(0, 0.5); - - // random ~1 → bound (= 8s for attempt=3, base=1s) - TimeSpan high = ReconnectBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.999999)); - high.TotalMilliseconds.Should().BeApproximately(8000, 1); - } - - [Fact] - public void Compute_NegativeAttempt_TreatedAsZero() - { - // Arrange - DeterministicRandom random = new(0.999999); - - // Act - TimeSpan delay = ReconnectBackoff.Compute(attempt: -5, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.FromSeconds(30), random); - - // Assert - delay.TotalMilliseconds.Should().BeApproximately(1000, 1); - } - - [Fact] - public void Compute_CapSmallerThanBase_ClampsToCap() - { - // Arrange: cap is intentionally smaller than baseDelay; the cap must still be honored. - DeterministicRandom random = new(0.999999); - TimeSpan baseDelay = TimeSpan.FromSeconds(5); - TimeSpan cap = TimeSpan.FromSeconds(1); - - // Act - TimeSpan delay = ReconnectBackoff.Compute(attempt: 3, baseDelay, cap, random); - - // Assert: with random ~ 1 the result is the bound, which must equal the cap. - delay.TotalMilliseconds.Should().BeApproximately(1000, 1); - delay.Should().BeLessThanOrEqualTo(cap); - } - - [Fact] - public void Compute_NonPositiveCap_ReturnsZero() - { - // Arrange - DeterministicRandom random = new(0.999999); - - // Act - TimeSpan zero = ReconnectBackoff.Compute(attempt: 3, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.Zero, random); - TimeSpan negative = ReconnectBackoff.Compute(attempt: 3, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.FromSeconds(-1), random); - - // Assert - zero.Should().Be(TimeSpan.Zero); - negative.Should().Be(TimeSpan.Zero); - } - - sealed class DeterministicRandom : Random - { - readonly double value; - - public DeterministicRandom(double value) - { - this.value = value; - } - - public override double NextDouble() => this.value; - } -} From 08922fb5c619c87fde4ba8574ee67373eef8bb34 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 12:24:39 -0700 Subject: [PATCH 16/36] Trying to fix line endings --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 508 +++++++++--------- .../Grpc.Tests/ExecuteWithRetryTests.cs | 2 +- 2 files changed, 255 insertions(+), 255 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 4b8d5da49..a9b153aeb 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -71,7 +71,7 @@ public async Task ExecuteAsync(CancellationToken cancellati { using AsyncServerStreamingCall stream = await this.ConnectAsync(cancellation); await this.ProcessWorkItemsAsync( - stream, + stream, random, cancellation, onFirstMessage: () => @@ -149,14 +149,14 @@ await this.ProcessWorkItemsAsync( } try - { - // Full jitter intentionally allows any value in the retry window. The wide spread keeps many - // workers that saw the same outage from reconnecting in lockstep against the backend. + { + // Full jitter intentionally allows any value in the retry window. The wide spread keeps many + // workers that saw the same outage from reconnecting in lockstep against the backend. 
TimeSpan delay = GrpcBackoff.Compute( reconnectAttempt, this.internalOptions.ReconnectBackoffBase, this.internalOptions.ReconnectBackoffCap, - random, + random, fullJitter: true); this.Logger.GrpcBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); await Task.Delay(delay, cancellation); @@ -338,7 +338,7 @@ async ValueTask BuildRuntimeStateAsync( } async Task ProcessWorkItemsAsync( - AsyncServerStreamingCall stream, + AsyncServerStreamingCall stream, Random retryRandom, CancellationToken cancellation, Action? onFirstMessage = null, @@ -392,56 +392,56 @@ async Task ProcessWorkItemsAsync( } } - void DispatchWorkItem(P.WorkItem workItem, Random retryRandom, CancellationToken cancellation) - { - if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunOrchestratorAsync( - workItem.OrchestratorRequest, - workItem.CompletionToken, - retryRandom, - cancellation), - retryRandom, - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunActivityAsync( - workItem.ActivityRequest, - workItem.CompletionToken, - retryRandom, - cancellation), - retryRandom, - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), retryRandom, cancellation), - retryRandom, - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) - { - workItem.EntityRequestV2.ToEntityBatchRequest( - out EntityBatchRequest batchRequest, - out List operationInfos); - - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync( - batchRequest, - retryRandom, - cancellation, - workItem.CompletionToken, - operationInfos), - retryRandom, - cancellation); + void DispatchWorkItem(P.WorkItem workItem, Random retryRandom, CancellationToken cancellation) + { + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunOrchestratorAsync( + workItem.OrchestratorRequest, + workItem.CompletionToken, + retryRandom, + cancellation), + retryRandom, + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunActivityAsync( + workItem.ActivityRequest, + workItem.CompletionToken, + retryRandom, + cancellation), + retryRandom, + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), retryRandom, cancellation), + retryRandom, + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) + { + workItem.EntityRequestV2.ToEntityBatchRequest( + out EntityBatchRequest batchRequest, + out List operationInfos); + + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync( + batchRequest, + retryRandom, + cancellation, + workItem.CompletionToken, + operationInfos), + retryRandom, + cancellation); } else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) { @@ -485,17 +485,17 @@ void RunBackgroundTask(P.WorkItem? workItem, Func handler, Random retryRan { try { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? 
string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - retryRandom, - cancellation); + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + retryRandom, + cancellation); this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); } catch (Exception abandonException) @@ -507,20 +507,20 @@ await this.ExecuteWithRetryAsync( { try { - this.Logger.AbandoningActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - retryRandom, + this.Logger.AbandoningActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + retryRandom, cancellation); this.Logger.AbandonedActivityWorkItem( instanceId, @@ -537,21 +537,21 @@ await this.ExecuteWithRetryAsync( { try { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - retryRandom, - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequest.InstanceId, + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + retryRandom, + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequest.InstanceId, workItem.CompletionToken ?? string.Empty); } catch (Exception abandonException) @@ -563,21 +563,21 @@ await this.ExecuteWithRetryAsync( { try { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? 
string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - retryRandom, - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequestV2.InstanceId, + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + retryRandom, + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequestV2.InstanceId, workItem.CompletionToken ?? string.Empty); } catch (Exception abandonException) @@ -589,10 +589,10 @@ await this.ExecuteWithRetryAsync( }); } - async Task OnRunOrchestratorAsync( - P.OrchestratorRequest request, - string completionToken, - Random retryRandom, + async Task OnRunOrchestratorAsync( + P.OrchestratorRequest request, + string completionToken, + Random retryRandom, CancellationToken cancellationToken) { var executionStartedEvent = @@ -728,21 +728,21 @@ async Task OnRunOrchestratorAsync( cancellationToken); } - if (!filterPassed) - { - this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - retryRandom, - cancellationToken); - - return; + if (!filterPassed) + { + this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + retryRandom, + cancellationToken); + + return; } // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. 
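Worth keeping in mind while reading these re-indented hunks: every ExecuteWithRetryAsync call site is gated by the same transient-status check. An equivalent standalone predicate (a hypothetical helper for illustration only; the patch keeps these checks inline in a catch filter):

    using Grpc.Core;

    // Hypothetical helper mirroring the inline catch filter in ExecuteWithRetryAsync:
    // only these four gRPC status codes are treated as retryable transport failures.
    static bool IsTransient(RpcException ex) => ex.StatusCode
        is StatusCode.Unavailable
        or StatusCode.Unknown
        or StatusCode.DeadlineExceeded
        or StatusCode.Internal;

    var ex = new RpcException(new Status(StatusCode.Unavailable, "transport closed"));
    System.Console.WriteLine(IsTransient(ex)); // True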
@@ -833,21 +833,21 @@ await this.ExecuteWithRetryAsync( }, }; } - else - { - this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - retryRandom, - cancellationToken); - - return; + else + { + this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + retryRandom, + cancellationToken); + + return; } } else @@ -896,10 +896,10 @@ await this.ExecuteWithRetryAsync( response.Actions.Count, GetActionsListForLogging(response.Actions)); - await this.CompleteOrchestratorTaskWithChunkingAsync( - response, - this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, - retryRandom, + await this.CompleteOrchestratorTaskWithChunkingAsync( + response, + this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, + retryRandom, cancellationToken); } @@ -949,19 +949,19 @@ async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, } else { - if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) - { - this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - retryRandom, - cancellation); + if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) + { + this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + retryRandom, + cancellation); } return; @@ -995,18 +995,18 @@ await this.ExecuteWithRetryAsync( // Stop the trace activity here to avoid including the completion time in the latency calculation traceActivity?.Stop(); - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteActivityTaskAsync), - retryRandom, - cancellation); - } - - async Task OnRunEntityBatchAsync( - EntityBatchRequest batchRequest, - Random retryRandom, - CancellationToken cancellation, - string? completionToken = null, + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteActivityTaskAsync), + retryRandom, + cancellation); + } + + async Task OnRunEntityBatchAsync( + EntityBatchRequest batchRequest, + Random retryRandom, + CancellationToken cancellation, + string? 
completionToken = null, List? operationInfos = null) { var coreEntityId = DTCore.Entities.EntityId.FromString(batchRequest.InstanceId!); @@ -1066,11 +1066,11 @@ async Task OnRunEntityBatchAsync( completionToken, operationInfos?.Take(batchResult.Results?.Count ?? 0)); - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), - nameof(this.client.CompleteEntityTaskAsync), - retryRandom, - cancellation); + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), + nameof(this.client.CompleteEntityTaskAsync), + retryRandom, + cancellation); } /// @@ -1079,10 +1079,10 @@ await this.ExecuteWithRetryAsync( /// The orchestrator response to send. /// The maximum size in bytes for each chunk. /// The cancellation token. - async Task CompleteOrchestratorTaskWithChunkingAsync( - P.OrchestratorResponse response, - int maxChunkBytes, - Random retryRandom, + async Task CompleteOrchestratorTaskWithChunkingAsync( + P.OrchestratorResponse response, + int maxChunkBytes, + Random retryRandom, CancellationToken cancellationToken) { // Validate that no single action exceeds the maximum chunk size @@ -1133,14 +1133,14 @@ async Task CompleteOrchestratorTaskWithChunkingAsync( }, }; - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - retryRandom, - cancellationToken); - return; - } - + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + retryRandom, + cancellationToken); + return; + } + // Helper to add an action to the current chunk if it fits static bool TryAddAction( Google.Protobuf.Collections.RepeatedField dest, @@ -1163,13 +1163,13 @@ static bool TryAddAction( int totalSize = response.CalculateSize(); if (totalSize <= maxChunkBytes) { - // Response fits in one chunk, send it directly (isPartial defaults to false) - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - retryRandom, - cancellationToken); - return; + // Response fits in one chunk, send it directly (isPartial defaults to false) + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + retryRandom, + cancellationToken); + return; } // Response is too large, split into multiple chunks @@ -1227,64 +1227,64 @@ await this.ExecuteWithRetryAsync( chunkIndex++; - // Send the chunk - await this.ExecuteWithRetryAsync( - async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), - nameof(this.client.CompleteOrchestratorTaskAsync), - retryRandom, - cancellationToken); - } + // Send the chunk + await this.ExecuteWithRetryAsync( + async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), + nameof(this.client.CompleteOrchestratorTaskAsync), + retryRandom, + cancellationToken); + } } - async Task ExecuteWithRetryAsync( - Func action, - string operationName, - Random retryRandom, - CancellationToken 
cancellationToken) - { - int maxAttempts = this.internalOptions.TransientRetryMaxAttempts; - TimeSpan baseDelay = this.internalOptions.TransientRetryBackoffBase; - TimeSpan cap = this.internalOptions.TransientRetryBackoffCap; - - for (int attempt = 1; ; attempt++) - { - try - { - await action(); - return; - } - catch (RpcException ex) when ( - (ex.StatusCode == StatusCode.Unavailable || - ex.StatusCode == StatusCode.Unknown || - ex.StatusCode == StatusCode.DeadlineExceeded || - ex.StatusCode == StatusCode.Internal) && - attempt < maxAttempts) - { - // Don't use full jitter since we want to keep the retry interval fairly fixed and increasing with - // each attempt. We don't have lockstep concerns in this case - TimeSpan backoff = GrpcBackoff.Compute(attempt - 1, baseDelay, cap, retryRandom, fullJitter: false); - - this.Logger.TransientGrpcRetry( - operationName, - attempt, - maxAttempts, - backoff.TotalMilliseconds, - (int)ex.StatusCode, - ex); - - try - { - await Task.Delay(backoff, cancellationToken); - } - catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) - { - // If shutting down during the retry delay, propagate the cancellation exception - throw; - } - - continue; - } - } + async Task ExecuteWithRetryAsync( + Func action, + string operationName, + Random retryRandom, + CancellationToken cancellationToken) + { + int maxAttempts = this.internalOptions.TransientRetryMaxAttempts; + TimeSpan baseDelay = this.internalOptions.TransientRetryBackoffBase; + TimeSpan cap = this.internalOptions.TransientRetryBackoffCap; + + for (int attempt = 1; ; attempt++) + { + try + { + await action(); + return; + } + catch (RpcException ex) when ( + (ex.StatusCode == StatusCode.Unavailable || + ex.StatusCode == StatusCode.Unknown || + ex.StatusCode == StatusCode.DeadlineExceeded || + ex.StatusCode == StatusCode.Internal) && + attempt < maxAttempts) + { + // Don't use full jitter since we want to keep the retry interval fairly fixed and increasing with + // each attempt. 
We don't have lockstep concerns in this case + TimeSpan backoff = GrpcBackoff.Compute(attempt - 1, baseDelay, cap, retryRandom, fullJitter: false); + + this.Logger.TransientGrpcRetry( + operationName, + attempt, + maxAttempts, + backoff.TotalMilliseconds, + (int)ex.StatusCode, + ex); + + try + { + await Task.Delay(backoff, cancellationToken); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + // If shutting down during the retry delay, propagate the cancellation exception + throw; + } + + continue; + } + } } } } diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs index b1cff2aac..736dbe60d 100644 --- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -25,7 +25,7 @@ static MethodInfo FindExecuteWithRetryAsyncMethod() { Type processorType = typeof(GrpcDurableTaskWorker).GetNestedType("Processor", BindingFlags.NonPublic)!; return processorType.GetMethod("ExecuteWithRetryAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; - } + } [Fact] public async Task ExecuteWithRetryAsync_SucceedsOnFirstAttempt_DoesNotRetry() From aa1930016dba1a02d3311556fe4ee2a07dbee30f Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 12:25:49 -0700 Subject: [PATCH 17/36] reverting some unnecessary changes --- src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs | 2 +- test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index a9b153aeb..bbc99e270 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -158,7 +158,7 @@ await this.ProcessWorkItemsAsync( this.internalOptions.ReconnectBackoffCap, random, fullJitter: true); - this.Logger.GrpcBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); + this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); await Task.Delay(delay, cancellation); } catch (OperationCanceledException) when (cancellation.IsCancellationRequested) diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs index 9adcc9ef8..db6c98da5 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -52,7 +52,7 @@ public async Task ExecuteAsync_ConnectFailureThreshold_RecreatesConfiguredChanne Channel = currentChannel, }; grpcOptions.Internal.ChannelRecreateFailureThreshold = 2; - grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; DurableTaskWorkerOptions workerOptions = new() @@ -205,7 +205,7 @@ public async Task ProcessorExecuteAsync_GracefulDrainAfterFirstMessage_Reconnect // Arrange GrpcDurableTaskWorkerOptions grpcOptions = new(); grpcOptions.Internal.ChannelRecreateFailureThreshold = 1; - grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; grpcOptions.Internal.SilentDisconnectTimeout = TimeSpan.FromSeconds(5); From 3df94d7c19f0acd7e66c10f4f46134fc3ddeb11f Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 12:27:51 -0700 Subject: [PATCH 18/36] missed the log changes --- 
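Only the generated logger method's name changes here; EventId 74 and the message
template are untouched, so any filtering or alerting keyed on the event id keeps
working. For readers unfamiliar with the [LoggerMessage] source-generator pattern: the
generator emits the body of the partial method at build time, and the call site reads
like an ordinary extension method. Declaration and call-site shape, both as they appear
elsewhere in this series:

    [LoggerMessage(EventId = 74, Level = LogLevel.Debug,
        Message = "Reconnect attempt {attempt} will retry after {delayMs} ms.")]
    public static partial void ReconnectBackoff(this ILogger logger, int attempt, int delayMs);

    // Call site in the reconnect loop:
    this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds);
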
src/Worker/Grpc/Logs.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Worker/Grpc/Logs.cs b/src/Worker/Grpc/Logs.cs index df8b3c0b9..878efe9c8 100644 --- a/src/Worker/Grpc/Logs.cs +++ b/src/Worker/Grpc/Logs.cs @@ -93,7 +93,7 @@ static partial class Logs public static partial void ChannelRecreated(this ILogger logger, string endpoint); [LoggerMessage(EventId = 74, Level = LogLevel.Debug, Message = "Reconnect attempt {attempt} will retry after {delayMs} ms.")] - public static partial void GrpcBackoff(this ILogger logger, int attempt, int delayMs); + public static partial void ReconnectBackoff(this ILogger logger, int attempt, int delayMs); [LoggerMessage(EventId = 75, Level = LogLevel.Trace, Message = "Received health ping from the backend.")] public static partial void ReceivedHealthPing(this ILogger logger); From 155b5edd1130381ed70e88f44a4ef462ec4e9019 Mon Sep 17 00:00:00 2001 From: sophiatev <38052607+sophiatev@users.noreply.github.com> Date: Mon, 27 Apr 2026 12:34:33 -0700 Subject: [PATCH 19/36] Potential fix for pull request finding 'Missed ternary opportunity' Co-authored-by: Copilot Autofix powered by AI <223894421+github-code-quality[bot]@users.noreply.github.com> --- src/Worker/Grpc/GrpcBackoff.cs | 104 ++++++++++++++++----------------- 1 file changed, 49 insertions(+), 55 deletions(-) diff --git a/src/Worker/Grpc/GrpcBackoff.cs b/src/Worker/Grpc/GrpcBackoff.cs index 167bd41f7..6d3a75b9e 100644 --- a/src/Worker/Grpc/GrpcBackoff.cs +++ b/src/Worker/Grpc/GrpcBackoff.cs @@ -1,39 +1,39 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System.Security.Cryptography; - -namespace Microsoft.DurableTask.Worker.Grpc; - -/// -/// Helpers for computing reconnect and retry backoff delays in the gRPC worker. -/// -static class GrpcBackoff -{ - /// - /// Creates a random source for reconnect jitter using an explicit random seed so multiple workers on - /// older runtimes don't converge on the same time-based seed. - /// - /// A random source suitable for reconnect jitter. - public static Random CreateRandom() - { - byte[] seedBytes = new byte[sizeof(int)]; - using RandomNumberGenerator randomNumberGenerator = RandomNumberGenerator.Create(); - randomNumberGenerator.GetBytes(seedBytes); - return new Random(BitConverter.ToInt32(seedBytes, 0)); - } - - /// - /// Computes a full-jitter exponential backoff delay: a uniformly random TimeSpan in - /// [0, min(cap, base * 2^attempt)]. Returns when - /// or is non-positive. - /// - /// The retry attempt index, starting at 0. - /// The base delay used for the exponential growth. - /// The maximum delay before jitter is applied. +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Security.Cryptography; + +namespace Microsoft.DurableTask.Worker.Grpc; + +/// +/// Helpers for computing reconnect and retry backoff delays in the gRPC worker. +/// +static class GrpcBackoff +{ + /// + /// Creates a random source for reconnect jitter using an explicit random seed so multiple workers on + /// older runtimes don't converge on the same time-based seed. + /// + /// A random source suitable for reconnect jitter. 
+ public static Random CreateRandom() + { + byte[] seedBytes = new byte[sizeof(int)]; + using RandomNumberGenerator randomNumberGenerator = RandomNumberGenerator.Create(); + randomNumberGenerator.GetBytes(seedBytes); + return new Random(BitConverter.ToInt32(seedBytes, 0)); + } + + /// + /// Computes a full-jitter exponential backoff delay: a uniformly random TimeSpan in + /// [0, min(cap, base * 2^attempt)]. Returns when + /// or is non-positive. + /// + /// The retry attempt index, starting at 0. + /// The base delay used for the exponential growth. + /// The maximum delay before jitter is applied. /// The random source used for jitter. - /// If true, applies full jitter. If false, applies a smaller jitter that is biased towards the upper bound. - /// The computed jittered delay. + /// If true, applies full jitter. If false, applies a smaller jitter that is biased towards the upper bound. + /// The computed jittered delay. public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random, bool fullJitter) { if (baseDelay <= TimeSpan.Zero || cap <= TimeSpan.Zero) @@ -44,24 +44,18 @@ public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Ra if (attempt < 0) { attempt = 0; - } - - // Cap the exponent to avoid overflow in 2^attempt for pathological attempt values. - int safeAttempt = Math.Min(attempt, 30); - - double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt); - double upperBoundMs = Math.Min(cap.TotalMilliseconds, exponentialMs); - - double jitteredMs = 0; - if (fullJitter) - { - jitteredMs = random.NextDouble() * upperBoundMs; } - else - { - jitteredMs = upperBoundMs + (random.NextDouble() * (upperBoundMs * .2)); - } - - return TimeSpan.FromMilliseconds(jitteredMs); - } -} + + // Cap the exponent to avoid overflow in 2^attempt for pathological attempt values. + int safeAttempt = Math.Min(attempt, 30); + + double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt); + double upperBoundMs = Math.Min(cap.TotalMilliseconds, exponentialMs); + + double jitteredMs = fullJitter + ? random.NextDouble() * upperBoundMs + : upperBoundMs + (random.NextDouble() * (upperBoundMs * .2)); + + return TimeSpan.FromMilliseconds(jitteredMs); + } +} From 05a0958657c35205db4db0a0f5d44f8fcadec4bf Mon Sep 17 00:00:00 2001 From: sophiatev <38052607+sophiatev@users.noreply.github.com> Date: Mon, 27 Apr 2026 12:46:24 -0700 Subject: [PATCH 20/36] Update src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs index 8616264f1..49bd63507 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs @@ -144,14 +144,17 @@ internal class InternalOptions /// /// Gets or sets the initial delay used when computing exponential backoff between retries of a - /// transient gRPC call. The delay doubles after each failed attempt, with a small uniform jitter - /// added on top, until is reached. Defaults to 200 ms. + /// transient gRPC call. The delay doubles after each failed attempt, and the exponential component + /// is capped at before jitter is applied. In the default + /// biased-jitter mode, the final delay may therefore slightly exceed + /// . Defaults to 200 ms. 
/// public TimeSpan TransientRetryBackoffBase { get; set; } = TimeSpan.FromMilliseconds(200); /// - /// Gets or sets the maximum delay between retries of a transient gRPC call. The exponentially - /// increasing delay is clamped to this value. Defaults to 15 second. + /// Gets or sets the cap applied to the exponential backoff component between retries of a transient + /// gRPC call before jitter is applied. In the default biased-jitter mode, the final computed delay + /// may be slightly greater than this value. Defaults to 15 seconds. /// public TimeSpan TransientRetryBackoffCap { get; set; } = TimeSpan.FromSeconds(15); From fc5c1ff22e015635f8c67409ee1b877420781c6d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 19:54:45 +0000 Subject: [PATCH 21/36] Fix thread-safety: create Random per ExecuteWithRetryAsync call, remove from call chain Agent-Logs-Url: https://github.com/microsoft/durabletask-dotnet/sessions/16ec67ea-9c12-4fad-8ef9-325eb2985d5d Co-authored-by: sophiatev <38052607+sophiatev@users.noreply.github.com> --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 42 ++++--------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index bbc99e270..c46166384 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -72,7 +72,6 @@ public async Task ExecuteAsync(CancellationToken cancellati using AsyncServerStreamingCall stream = await this.ConnectAsync(cancellation); await this.ProcessWorkItemsAsync( stream, - random, cancellation, onFirstMessage: () => { @@ -339,7 +338,6 @@ async ValueTask BuildRuntimeStateAsync( async Task ProcessWorkItemsAsync( AsyncServerStreamingCall stream, - Random retryRandom, CancellationToken cancellation, Action? onFirstMessage = null, Action? 
onChannelLikelyPoisoned = null) @@ -356,7 +354,7 @@ async Task ProcessWorkItemsAsync( WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( ct => stream.ResponseStream.ReadAllAsync(ct), silentDisconnectTimeout, - workItem => this.DispatchWorkItem(workItem, retryRandom, cancellation), + workItem => this.DispatchWorkItem(workItem, cancellation), onFirstMessage, cancellation); @@ -392,7 +390,7 @@ async Task ProcessWorkItemsAsync( } } - void DispatchWorkItem(P.WorkItem workItem, Random retryRandom, CancellationToken cancellation) + void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) { if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) { @@ -401,9 +399,7 @@ void DispatchWorkItem(P.WorkItem workItem, Random retryRandom, CancellationToken () => this.OnRunOrchestratorAsync( workItem.OrchestratorRequest, workItem.CompletionToken, - retryRandom, cancellation), - retryRandom, cancellation); } else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) @@ -413,17 +409,14 @@ void DispatchWorkItem(P.WorkItem workItem, Random retryRandom, CancellationToken () => this.OnRunActivityAsync( workItem.ActivityRequest, workItem.CompletionToken, - retryRandom, cancellation), - retryRandom, cancellation); } else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) { this.RunBackgroundTask( workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), retryRandom, cancellation), - retryRandom, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), cancellation); } else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) @@ -436,11 +429,9 @@ void DispatchWorkItem(P.WorkItem workItem, Random retryRandom, CancellationToken workItem, () => this.OnRunEntityBatchAsync( batchRequest, - retryRandom, cancellation, workItem.CompletionToken, operationInfos), - retryRandom, cancellation); } else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) @@ -457,7 +448,7 @@ void DispatchWorkItem(P.WorkItem workItem, Random retryRandom, CancellationToken } } - void RunBackgroundTask(P.WorkItem? workItem, Func handler, Random retryRandom, CancellationToken cancellation) + void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationToken cancellation) { // TODO: is Task.Run appropriate here? Should we have finer control over the tasks and their threads? _ = Task.Run( @@ -494,7 +485,6 @@ await this.ExecuteWithRetryAsync( }, cancellationToken: cancellation), nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - retryRandom, cancellation); this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? 
string.Empty); } @@ -520,7 +510,6 @@ await this.ExecuteWithRetryAsync( }, cancellationToken: cancellation), nameof(this.client.AbandonTaskActivityWorkItemAsync), - retryRandom, cancellation); this.Logger.AbandonedActivityWorkItem( instanceId, @@ -548,7 +537,6 @@ await this.ExecuteWithRetryAsync( }, cancellationToken: cancellation), nameof(this.client.AbandonTaskEntityWorkItemAsync), - retryRandom, cancellation); this.Logger.AbandonedEntityWorkItem( workItem.EntityRequest.InstanceId, @@ -574,7 +562,6 @@ await this.ExecuteWithRetryAsync( }, cancellationToken: cancellation), nameof(this.client.AbandonTaskEntityWorkItemAsync), - retryRandom, cancellation); this.Logger.AbandonedEntityWorkItem( workItem.EntityRequestV2.InstanceId, @@ -592,7 +579,6 @@ await this.ExecuteWithRetryAsync( async Task OnRunOrchestratorAsync( P.OrchestratorRequest request, string completionToken, - Random retryRandom, CancellationToken cancellationToken) { var executionStartedEvent = @@ -739,7 +725,6 @@ await this.ExecuteWithRetryAsync( }, cancellationToken: cancellationToken), nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - retryRandom, cancellationToken); return; @@ -844,7 +829,6 @@ await this.ExecuteWithRetryAsync( }, cancellationToken: cancellationToken), nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - retryRandom, cancellationToken); return; @@ -899,11 +883,10 @@ await this.ExecuteWithRetryAsync( await this.CompleteOrchestratorTaskWithChunkingAsync( response, this.worker.grpcOptions.CompleteOrchestrationWorkItemChunkSizeInBytes, - retryRandom, cancellationToken); } - async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, Random retryRandom, CancellationToken cancellation) + async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, CancellationToken cancellation) { using Activity? traceActivity = TraceHelper.StartTraceActivityForTaskExecution(request); @@ -960,7 +943,6 @@ await this.ExecuteWithRetryAsync( }, cancellationToken: cancellation), nameof(this.client.AbandonTaskActivityWorkItemAsync), - retryRandom, cancellation); } @@ -998,13 +980,11 @@ await this.ExecuteWithRetryAsync( await this.ExecuteWithRetryAsync( async () => await this.client.CompleteActivityTaskAsync(response, cancellationToken: cancellation), nameof(this.client.CompleteActivityTaskAsync), - retryRandom, cancellation); } async Task OnRunEntityBatchAsync( EntityBatchRequest batchRequest, - Random retryRandom, CancellationToken cancellation, string? completionToken = null, List? 
operationInfos = null) @@ -1069,7 +1049,6 @@ async Task OnRunEntityBatchAsync( await this.ExecuteWithRetryAsync( async () => await this.client.CompleteEntityTaskAsync(response, cancellationToken: cancellation), nameof(this.client.CompleteEntityTaskAsync), - retryRandom, cancellation); } @@ -1082,7 +1061,6 @@ await this.ExecuteWithRetryAsync( async Task CompleteOrchestratorTaskWithChunkingAsync( P.OrchestratorResponse response, int maxChunkBytes, - Random retryRandom, CancellationToken cancellationToken) { // Validate that no single action exceeds the maximum chunk size @@ -1136,7 +1114,6 @@ async Task CompleteOrchestratorTaskWithChunkingAsync( await this.ExecuteWithRetryAsync( async () => await this.client.CompleteOrchestratorTaskAsync(failureResponse, cancellationToken: cancellationToken), nameof(this.client.CompleteOrchestratorTaskAsync), - retryRandom, cancellationToken); return; } @@ -1167,7 +1144,6 @@ static bool TryAddAction( await this.ExecuteWithRetryAsync( async () => await this.client.CompleteOrchestratorTaskAsync(response, cancellationToken: cancellationToken), nameof(this.client.CompleteOrchestratorTaskAsync), - retryRandom, cancellationToken); return; } @@ -1231,7 +1207,6 @@ await this.ExecuteWithRetryAsync( await this.ExecuteWithRetryAsync( async () => await this.client.CompleteOrchestratorTaskAsync(chunkedResponse, cancellationToken: cancellationToken), nameof(this.client.CompleteOrchestratorTaskAsync), - retryRandom, cancellationToken); } } @@ -1239,12 +1214,13 @@ await this.ExecuteWithRetryAsync( async Task ExecuteWithRetryAsync( Func action, string operationName, - Random retryRandom, - CancellationToken cancellationToken) + CancellationToken cancellationToken, + TimeSpan? initialDelay = null) { int maxAttempts = this.internalOptions.TransientRetryMaxAttempts; - TimeSpan baseDelay = this.internalOptions.TransientRetryBackoffBase; + TimeSpan baseDelay = initialDelay ?? this.internalOptions.TransientRetryBackoffBase; TimeSpan cap = this.internalOptions.TransientRetryBackoffCap; + Random retryRandom = GrpcBackoff.CreateRandom(); for (int attempt = 1; ; attempt++) { From 89957bbab300225d6d214792931a3be4b79eaf90 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 13:06:00 -0700 Subject: [PATCH 22/36] fixed attempt logic --- src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index c46166384..25a1a311c 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -157,7 +157,8 @@ await this.ProcessWorkItemsAsync( this.internalOptions.ReconnectBackoffCap, random, fullJitter: true); - this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); + this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); + reconnectAttempt++; await Task.Delay(delay, cancellation); } catch (OperationCanceledException) when (cancellation.IsCancellationRequested) @@ -1222,7 +1223,7 @@ async Task ExecuteWithRetryAsync( TimeSpan cap = this.internalOptions.TransientRetryBackoffCap; Random retryRandom = GrpcBackoff.CreateRandom(); - for (int attempt = 1; ; attempt++) + for (int attempt = 0; ; attempt++) { try { @@ -1238,7 +1239,7 @@ async Task ExecuteWithRetryAsync( { // Don't use full jitter since we want to keep the retry interval fairly fixed and increasing with // each attempt. 
We don't have lockstep concerns in this case - TimeSpan backoff = GrpcBackoff.Compute(attempt - 1, baseDelay, cap, retryRandom, fullJitter: false); + TimeSpan backoff = GrpcBackoff.Compute(attempt, baseDelay, cap, retryRandom, fullJitter: false); this.Logger.TransientGrpcRetry( operationName, From 35051fab469c82d22e04005898b09f041c275225 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 13:21:06 -0700 Subject: [PATCH 23/36] fixing line endings --- src/Worker/Grpc/GrpcBackoff.cs | 122 ++++++++++++++++----------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/src/Worker/Grpc/GrpcBackoff.cs b/src/Worker/Grpc/GrpcBackoff.cs index 6d3a75b9e..f60e19028 100644 --- a/src/Worker/Grpc/GrpcBackoff.cs +++ b/src/Worker/Grpc/GrpcBackoff.cs @@ -1,61 +1,61 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT License. - -using System.Security.Cryptography; - -namespace Microsoft.DurableTask.Worker.Grpc; - -/// -/// Helpers for computing reconnect and retry backoff delays in the gRPC worker. -/// -static class GrpcBackoff -{ - /// - /// Creates a random source for reconnect jitter using an explicit random seed so multiple workers on - /// older runtimes don't converge on the same time-based seed. - /// - /// A random source suitable for reconnect jitter. - public static Random CreateRandom() - { - byte[] seedBytes = new byte[sizeof(int)]; - using RandomNumberGenerator randomNumberGenerator = RandomNumberGenerator.Create(); - randomNumberGenerator.GetBytes(seedBytes); - return new Random(BitConverter.ToInt32(seedBytes, 0)); - } - - /// - /// Computes a full-jitter exponential backoff delay: a uniformly random TimeSpan in - /// [0, min(cap, base * 2^attempt)]. Returns when - /// or is non-positive. - /// - /// The retry attempt index, starting at 0. - /// The base delay used for the exponential growth. - /// The maximum delay before jitter is applied. - /// The random source used for jitter. - /// If true, applies full jitter. If false, applies a smaller jitter that is biased towards the upper bound. - /// The computed jittered delay. - public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random, bool fullJitter) - { - if (baseDelay <= TimeSpan.Zero || cap <= TimeSpan.Zero) - { - return TimeSpan.Zero; - } - - if (attempt < 0) - { - attempt = 0; - } - - // Cap the exponent to avoid overflow in 2^attempt for pathological attempt values. - int safeAttempt = Math.Min(attempt, 30); - - double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt); - double upperBoundMs = Math.Min(cap.TotalMilliseconds, exponentialMs); - - double jitteredMs = fullJitter - ? random.NextDouble() * upperBoundMs - : upperBoundMs + (random.NextDouble() * (upperBoundMs * .2)); - - return TimeSpan.FromMilliseconds(jitteredMs); - } -} +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Security.Cryptography; + +namespace Microsoft.DurableTask.Worker.Grpc; + +/// +/// Helpers for computing reconnect and retry backoff delays in the gRPC worker. +/// +static class GrpcBackoff +{ + /// + /// Creates a random source for reconnect jitter using an explicit random seed so multiple workers on + /// older runtimes don't converge on the same time-based seed. + /// + /// A random source suitable for reconnect jitter. 
+ public static Random CreateRandom() + { + byte[] seedBytes = new byte[sizeof(int)]; + using RandomNumberGenerator randomNumberGenerator = RandomNumberGenerator.Create(); + randomNumberGenerator.GetBytes(seedBytes); + return new Random(BitConverter.ToInt32(seedBytes, 0)); + } + + /// + /// Computes a full-jitter exponential backoff delay: a uniformly random TimeSpan in + /// [0, min(cap, base * 2^attempt)]. Returns when + /// or is non-positive. + /// + /// The retry attempt index, starting at 0. + /// The base delay used for the exponential growth. + /// The maximum delay before jitter is applied. + /// The random source used for jitter. + /// If true, applies full jitter. If false, applies a smaller jitter that is biased towards the upper bound. + /// The computed jittered delay. + public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random, bool fullJitter) + { + if (baseDelay <= TimeSpan.Zero || cap <= TimeSpan.Zero) + { + return TimeSpan.Zero; + } + + if (attempt < 0) + { + attempt = 0; + } + + // Cap the exponent to avoid overflow in 2^attempt for pathological attempt values. + int safeAttempt = Math.Min(attempt, 30); + + double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt); + double upperBoundMs = Math.Min(cap.TotalMilliseconds, exponentialMs); + + double jitteredMs = fullJitter + ? random.NextDouble() * upperBoundMs + : upperBoundMs + (random.NextDouble() * (upperBoundMs * .2)); + + return TimeSpan.FromMilliseconds(jitteredMs); + } +} From 6d116877b1d07ec4a61fd090051d15ac6231dc62 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 13:27:49 -0700 Subject: [PATCH 24/36] reverting some more unnecessary changes --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 25a1a311c..3a01f89f9 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -62,7 +62,7 @@ public async Task ExecuteAsync(CancellationToken cancellati // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. int reconnectAttempt = 0; - Random random = GrpcBackoff.CreateRandom(); + Random backoffRandom = GrpcBackoff.CreateRandom(); while (!cancellation.IsCancellationRequested) { @@ -155,7 +155,7 @@ await this.ProcessWorkItemsAsync( reconnectAttempt, this.internalOptions.ReconnectBackoffBase, this.internalOptions.ReconnectBackoffCap, - random, + backoffRandom, fullJitter: true); this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); reconnectAttempt++; @@ -1215,13 +1215,17 @@ await this.ExecuteWithRetryAsync( async Task ExecuteWithRetryAsync( Func action, string operationName, - CancellationToken cancellationToken, - TimeSpan? initialDelay = null) + CancellationToken cancellationToken) { int maxAttempts = this.internalOptions.TransientRetryMaxAttempts; - TimeSpan baseDelay = initialDelay ?? 
this.internalOptions.TransientRetryBackoffBase; - TimeSpan cap = this.internalOptions.TransientRetryBackoffCap; - Random retryRandom = GrpcBackoff.CreateRandom(); + TimeSpan baseDelay = this.internalOptions.TransientRetryBackoffBase; + TimeSpan cap = this.internalOptions.TransientRetryBackoffCap; + Random retryRandom; +#if NET6_0_OR_GREATER + retryRandom = Random.Shared; +#else + retryRandom = new Random(); +#endif for (int attempt = 0; ; attempt++) { From de9c357fa8db2b397a41d3203a09bfd0518771f7 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 13:41:02 -0700 Subject: [PATCH 25/36] fixing the failing tests --- test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs index 736dbe60d..9a919526a 100644 --- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -25,7 +25,7 @@ static MethodInfo FindExecuteWithRetryAsyncMethod() { Type processorType = typeof(GrpcDurableTaskWorker).GetNestedType("Processor", BindingFlags.NonPublic)!; return processorType.GetMethod("ExecuteWithRetryAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; - } + } [Fact] public async Task ExecuteWithRetryAsync_SucceedsOnFirstAttempt_DoesNotRetry() @@ -213,8 +213,7 @@ public async Task ExecuteWithRetryAsync_TransientErrorExceedsMaxAttempts_ThrowsL throw new RpcException(new Status(lastStatusCode, "persistent transient error")); }, "TestOperation", - CancellationToken.None, - initialDelay: TimeSpan.Zero); + CancellationToken.None); // Assert - the last RpcException should be surfaced after max attempts await act.Should().ThrowAsync().Where(e => e.StatusCode == lastStatusCode); @@ -260,12 +259,11 @@ static Task InvokeExecuteWithRetryAsync( object processor, Func action, string operationName, - CancellationToken cancellationToken, - TimeSpan? initialDelay = null) + CancellationToken cancellationToken) { return (Task)ExecuteWithRetryAsyncMethod.Invoke( processor, - new object?[] { action, operationName, cancellationToken, initialDelay })!; + new object?[] { action, operationName, cancellationToken })!; } sealed class OptionsMonitorStub : IOptionsMonitor where T : class, new() From 2eda8a95766c65b31fa3611c4f7cad95645d3711 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 13:50:10 -0700 Subject: [PATCH 26/36] fixed the log tests --- test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs index 9a919526a..acb2b5efc 100644 --- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -153,7 +153,7 @@ await InvokeExecuteWithRetryAsync( logProvider.TryGetLogs(Category, out IReadOnlyCollection? logs).Should().BeTrue(); logs!.Should().Contain(log => log.Message.Contains($"Transient gRPC error for '{operationName}'") && - log.Message.Contains("Attempt 1 of 10") && + log.Message.Contains("Attempt 0 of 10") && log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}")); } @@ -186,11 +186,11 @@ await InvokeExecuteWithRetryAsync( logProvider.TryGetLogs(Category, out IReadOnlyCollection? 
logs).Should().BeTrue(); logs!.Should().Contain(log => log.Message.Contains($"Transient gRPC error for '{operationName}'") && - log.Message.Contains("Attempt 1 of 10") && + log.Message.Contains("Attempt 0 of 10") && log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}")); logs.Should().Contain(log => log.Message.Contains($"Transient gRPC error for '{operationName}'") && - log.Message.Contains("Attempt 2 of 10") && + log.Message.Contains("Attempt 1 of 10") && log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}")); callCount.Should().Be(3); } From d16cdc3d6f262e848ed15c8cd26ce82e99155e09 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 13:54:03 -0700 Subject: [PATCH 27/36] fixing the failing tests --- src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs | 11 ++++++----- test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs | 6 +++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 3a01f89f9..b304afcfb 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -156,9 +156,9 @@ await this.ProcessWorkItemsAsync( this.internalOptions.ReconnectBackoffBase, this.internalOptions.ReconnectBackoffCap, backoffRandom, - fullJitter: true); - this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); + fullJitter: true); reconnectAttempt++; + this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); await Task.Delay(delay, cancellation); } catch (OperationCanceledException) when (cancellation.IsCancellationRequested) @@ -1227,7 +1227,7 @@ async Task ExecuteWithRetryAsync( retryRandom = new Random(); #endif - for (int attempt = 0; ; attempt++) + for (int attempt = 1; ; attempt++) { try { @@ -1242,8 +1242,9 @@ async Task ExecuteWithRetryAsync( attempt < maxAttempts) { // Don't use full jitter since we want to keep the retry interval fairly fixed and increasing with - // each attempt. We don't have lockstep concerns in this case - TimeSpan backoff = GrpcBackoff.Compute(attempt, baseDelay, cap, retryRandom, fullJitter: false); + // each attempt. We don't have lockstep concerns in this case. + // Also make sure to zero-index the attempts + TimeSpan backoff = GrpcBackoff.Compute(attempt - 1, baseDelay, cap, retryRandom, fullJitter: false); this.Logger.TransientGrpcRetry( operationName, diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs index acb2b5efc..9a919526a 100644 --- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -153,7 +153,7 @@ await InvokeExecuteWithRetryAsync( logProvider.TryGetLogs(Category, out IReadOnlyCollection? logs).Should().BeTrue(); logs!.Should().Contain(log => log.Message.Contains($"Transient gRPC error for '{operationName}'") && - log.Message.Contains("Attempt 0 of 10") && + log.Message.Contains("Attempt 1 of 10") && log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}")); } @@ -186,11 +186,11 @@ await InvokeExecuteWithRetryAsync( logProvider.TryGetLogs(Category, out IReadOnlyCollection? 
logs).Should().BeTrue(); logs!.Should().Contain(log => log.Message.Contains($"Transient gRPC error for '{operationName}'") && - log.Message.Contains("Attempt 0 of 10") && + log.Message.Contains("Attempt 1 of 10") && log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}")); logs.Should().Contain(log => log.Message.Contains($"Transient gRPC error for '{operationName}'") && - log.Message.Contains("Attempt 1 of 10") && + log.Message.Contains("Attempt 2 of 10") && log.Message.Contains($"StatusCode={(int)StatusCode.Unavailable}")); callCount.Should().Be(3); } From 1379dd4058f2ff4b757465998c9b940edc7df6f0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 20:59:43 +0000 Subject: [PATCH 28/36] test: add integration-level retry tests to RunBackgroundTaskLoggingTests and fix unit test assertion Agent-Logs-Url: https://github.com/microsoft/durabletask-dotnet/sessions/9ea46ebc-6803-49aa-8fe1-582fc56d49d7 Co-authored-by: sophiatev <38052607+sophiatev@users.noreply.github.com> --- .../Grpc.Tests/ExecuteWithRetryTests.cs | 25 ++- .../RunBackgroundTaskLoggingTests.cs | 182 +++++++++++++++++- 2 files changed, 200 insertions(+), 7 deletions(-) diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs index 9a919526a..1e3554d13 100644 --- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -198,9 +198,9 @@ await InvokeExecuteWithRetryAsync( [Fact] public async Task ExecuteWithRetryAsync_TransientErrorExceedsMaxAttempts_ThrowsLastRpcException() { - // Arrange - object processor = CreateProcessor(); + // Arrange - use a small backoff base to avoid long delays in the test const int maxAttempts = 10; + object processor = CreateProcessor(transientRetryMaxAttempts: maxAttempts, transientRetryBackoffBase: TimeSpan.FromMilliseconds(1)); int callCount = 0; StatusCode lastStatusCode = StatusCode.Unavailable; @@ -215,12 +215,17 @@ public async Task ExecuteWithRetryAsync_TransientErrorExceedsMaxAttempts_ThrowsL "TestOperation", CancellationToken.None); - // Assert - the last RpcException should be surfaced after max attempts + // Assert - the last RpcException should be surfaced after max attempts. + // The loop makes maxAttempts retries (attempts 0..maxAttempts-1 are retried) and + // then one final call at attempt=maxAttempts that is not retried, for maxAttempts+1 total calls. await act.Should().ThrowAsync().Where(e => e.StatusCode == lastStatusCode); - callCount.Should().Be(maxAttempts); + callCount.Should().Be(maxAttempts + 1); } - static object CreateProcessor(TestLogProvider? logProvider = null) + static object CreateProcessor( + TestLogProvider? logProvider = null, + int? transientRetryMaxAttempts = null, + TimeSpan? transientRetryBackoffBase = null) { ILoggerFactory loggerFactory = logProvider is null ? NullLoggerFactory.Instance @@ -228,6 +233,16 @@ static object CreateProcessor(TestLogProvider? 
logProvider = null) Mock factoryMock = new(MockBehavior.Strict); GrpcDurableTaskWorkerOptions grpcOptions = new(); + if (transientRetryMaxAttempts.HasValue) + { + grpcOptions.Internal.TransientRetryMaxAttempts = transientRetryMaxAttempts.Value; + } + + if (transientRetryBackoffBase.HasValue) + { + grpcOptions.Internal.TransientRetryBackoffBase = transientRetryBackoffBase.Value; + } + DurableTaskWorkerOptions workerOptions = new() { Logging = { UseLegacyCategories = false }, diff --git a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs index 86f9afec5..186f54c7d 100644 --- a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs +++ b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs @@ -285,6 +285,159 @@ public async Task Logs_Abandoning_And_NoAbandoned_When_EntityV2_Abandon_Fails() await AssertEventually(() => fixture.GetLogs().Any(l => l.Message.Contains("Unexpected error") && l.Message.Contains(instanceId))); } + [Fact] + public async Task Retries_Abandon_Orchestrator_On_Transient_Error_Eventually_Succeeds() + { + await using var fixture = await TestFixture.CreateAsync(transientRetryBackoffBase: TimeSpan.FromMilliseconds(1)); + + string instanceId = Guid.NewGuid().ToString("N"); + string completionToken = Guid.NewGuid().ToString("N"); + + int abandonCallCount = 0; + var tcs = new TaskCompletionSource(); + fixture.ClientMock + .Setup(c => c.AbandonTaskOrchestratorWorkItemAsync( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns((P.AbandonOrchestrationTaskRequest r, Metadata h, DateTime? d, CancellationToken ct) => + { + abandonCallCount++; + if (abandonCallCount == 1) + { + // First call: simulate a transient gRPC error + return RpcExceptionAsyncUnaryCall(StatusCode.Unavailable); + } + + // Second call: succeed + return CompletedAsyncUnaryCall(new P.AbandonOrchestrationTaskResponse(), () => tcs.TrySetResult(true)); + }); + + P.WorkItem workItem = new() + { + OrchestratorRequest = new P.OrchestratorRequest { InstanceId = instanceId }, + CompletionToken = completionToken, + }; + + fixture.InvokeRunBackgroundTask(workItem, () => Task.FromException(new Exception("boom"))); + + await WaitAsync(tcs.Task); + + // Verify the call was retried (called twice total) + abandonCallCount.Should().Be(2); + + // Verify the Abandoned log is present (retry succeeded) + await AssertEventually(() => fixture.GetLogs().Any(l => l.Message.Contains("Abandoned orchestrator work item") && l.Message.Contains(instanceId))); + + // Verify a retry warning was logged + await AssertEventually(() => fixture.GetLogs().Any(l => + l.EventId.Name == "TransientGrpcRetry" && + l.Message.Contains("AbandonTaskOrchestratorWorkItemAsync"))); + } + + [Fact] + public async Task Retries_Abandon_Activity_On_Transient_Error_Eventually_Succeeds() + { + await using var fixture = await TestFixture.CreateAsync(transientRetryBackoffBase: TimeSpan.FromMilliseconds(1)); + + string instanceId = Guid.NewGuid().ToString("N"); + string completionToken = Guid.NewGuid().ToString("N"); + + int abandonCallCount = 0; + var tcs = new TaskCompletionSource(); + fixture.ClientMock + .Setup(c => c.AbandonTaskActivityWorkItemAsync( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns((P.AbandonActivityTaskRequest r, Metadata h, DateTime? 
d, CancellationToken ct) => + { + abandonCallCount++; + if (abandonCallCount == 1) + { + // First call: simulate a transient gRPC error + return RpcExceptionAsyncUnaryCall(StatusCode.Unavailable); + } + + // Second call: succeed + return CompletedAsyncUnaryCall(new P.AbandonActivityTaskResponse(), () => tcs.TrySetResult(true)); + }); + + P.WorkItem workItem = new() + { + ActivityRequest = new P.ActivityRequest + { + Name = "MyActivity", + TaskId = 42, + OrchestrationInstance = new P.OrchestrationInstance { InstanceId = instanceId }, + }, + CompletionToken = completionToken, + }; + + fixture.InvokeRunBackgroundTask(workItem, () => Task.FromException(new Exception("boom"))); + + await WaitAsync(tcs.Task); + + abandonCallCount.Should().Be(2); + await AssertEventually(() => fixture.GetLogs().Any(l => l.Message.Contains("Abandoned activity work item") && l.Message.Contains(instanceId))); + await AssertEventually(() => fixture.GetLogs().Any(l => + l.EventId.Name == "TransientGrpcRetry" && + l.Message.Contains("AbandonTaskActivityWorkItemAsync"))); + } + + [Fact] + public async Task Retries_Abandon_Orchestrator_Until_MaxAttempts_Then_Fails() + { + const int maxAttempts = 3; + await using var fixture = await TestFixture.CreateAsync( + transientRetryMaxAttempts: maxAttempts, + transientRetryBackoffBase: TimeSpan.FromMilliseconds(1)); + + string instanceId = Guid.NewGuid().ToString("N"); + string completionToken = Guid.NewGuid().ToString("N"); + + int abandonCallCount = 0; + fixture.ClientMock + .Setup(c => c.AbandonTaskOrchestratorWorkItemAsync( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns((P.AbandonOrchestrationTaskRequest r, Metadata h, DateTime? d, CancellationToken ct) => + { + abandonCallCount++; + return RpcExceptionAsyncUnaryCall(StatusCode.Unavailable); + }); + + P.WorkItem workItem = new() + { + OrchestratorRequest = new P.OrchestratorRequest { InstanceId = instanceId }, + CompletionToken = completionToken, + }; + + fixture.InvokeRunBackgroundTask(workItem, () => Task.FromException(new Exception("boom"))); + + // Wait for all retries to be exhausted. The background task logs an "Unexpected error" (for the + // abandon exception) after the retry loop gives up, which signals the task has completed. + await AssertEventually( + () => fixture.GetLogs().Count(l => l.EventId.Name == "UnexpectedError") >= 2, + timeoutMs: 10000); + + // The Abandoned log should NOT be present since the abandon never succeeded + Assert.DoesNotContain(fixture.GetLogs(), l => l.Message.Contains("Abandoned orchestrator work item") && l.Message.Contains(instanceId)); + + // Verify retry warnings were logged: one per retry attempt (attempts 0..maxAttempts-1) + IEnumerable retryLogs = fixture.GetLogs().Where(l => + l.EventId.Name == "TransientGrpcRetry" && + l.Message.Contains("AbandonTaskOrchestratorWorkItemAsync")); + retryLogs.Should().HaveCount(maxAttempts); + + // The abandon RPC was called maxAttempts times (retried) plus one final call that propagated + abandonCallCount.Should().Be(maxAttempts + 1); + } + [Fact] public async Task Forwards_CancellationToken_To_Abandon_Orchestrator() { @@ -366,7 +519,9 @@ sealed class TestFixture : IAsyncDisposable this.RunBackgroundTaskMethod = runBackgroundTaskMethod; } - public static async Task CreateAsync() + public static async Task CreateAsync( + int? transientRetryMaxAttempts = null, + TimeSpan? 
transientRetryBackoffBase = null) { // Logging var logProvider = new TestLogProvider(new NullOutput()); @@ -375,7 +530,18 @@ public static async Task CreateAsync() var loggerFactory = new SimpleLoggerFactory(logProvider); // Options - var grpcOptions = new OptionsMonitorStub(new GrpcDurableTaskWorkerOptions()); + GrpcDurableTaskWorkerOptions grpcOptionsValue = new(); + if (transientRetryMaxAttempts.HasValue) + { + grpcOptionsValue.Internal.TransientRetryMaxAttempts = transientRetryMaxAttempts.Value; + } + + if (transientRetryBackoffBase.HasValue) + { + grpcOptionsValue.Internal.TransientRetryBackoffBase = transientRetryBackoffBase.Value; + } + + var grpcOptions = new OptionsMonitorStub(grpcOptionsValue); var workerOptions = new OptionsMonitorStub(new DurableTaskWorkerOptions()); // Factory (not used in these tests) @@ -450,6 +616,18 @@ static AsyncUnaryCall FaultedAsyncUnaryCall(Exception ex) () => { }); } + static AsyncUnaryCall RpcExceptionAsyncUnaryCall(StatusCode statusCode, string detail = "transient error") + { + RpcException ex = new(new Status(statusCode, detail)); + var respTask = Task.FromException(ex); + return new AsyncUnaryCall( + respTask, + Task.FromResult(new Metadata()), + () => new Status(statusCode, detail), + () => new Metadata(), + () => { }); + } + sealed class NullOutput : ITestOutputHelper { public void WriteLine(string message) { } From cf46ceb87af5c30750907999843a1651bdacc8fb Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 14:19:24 -0700 Subject: [PATCH 29/36] fixed the failing test --- test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs index 1e3554d13..50ae19b52 100644 --- a/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs +++ b/test/Worker/Grpc.Tests/ExecuteWithRetryTests.cs @@ -216,10 +216,8 @@ public async Task ExecuteWithRetryAsync_TransientErrorExceedsMaxAttempts_ThrowsL CancellationToken.None); // Assert - the last RpcException should be surfaced after max attempts. - // The loop makes maxAttempts retries (attempts 0..maxAttempts-1 are retried) and - // then one final call at attempt=maxAttempts that is not retried, for maxAttempts+1 total calls. 
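        // (With the 1-indexed retry loop from PATCH 27, the catch filter only permits a
        // retry while attempt < maxAttempts, so the action runs exactly maxAttempts times
        // in total before the final RpcException propagates.)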
await act.Should().ThrowAsync().Where(e => e.StatusCode == lastStatusCode); - callCount.Should().Be(maxAttempts + 1); + callCount.Should().Be(maxAttempts); } static object CreateProcessor( From ed926d0e62a142ee226043de69a5ae3b79a05c52 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 14:33:24 -0700 Subject: [PATCH 30/36] fixed another failing max attempt test --- test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs index 186f54c7d..f755de8fc 100644 --- a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs +++ b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs @@ -428,14 +428,14 @@ await AssertEventually( // The Abandoned log should NOT be present since the abandon never succeeded Assert.DoesNotContain(fixture.GetLogs(), l => l.Message.Contains("Abandoned orchestrator work item") && l.Message.Contains(instanceId)); - // Verify retry warnings were logged: one per retry attempt (attempts 0..maxAttempts-1) + // Verify retry warnings were logged: one per retry attempt IEnumerable retryLogs = fixture.GetLogs().Where(l => l.EventId.Name == "TransientGrpcRetry" && l.Message.Contains("AbandonTaskOrchestratorWorkItemAsync")); - retryLogs.Should().HaveCount(maxAttempts); + retryLogs.Should().HaveCount(maxAttempts - 1); - // The abandon RPC was called maxAttempts times (retried) plus one final call that propagated - abandonCallCount.Should().Be(maxAttempts + 1); + // The abandon RPC was called maxAttempts-1 times (retried) plus one final call that propagated + abandonCallCount.Should().Be(maxAttempts); } [Fact] From 33700f2bb1617de53d4976b5e4ce14f9b4c630f7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 27 Apr 2026 21:40:52 +0000 Subject: [PATCH 31/36] test: add Non_Transient_Abandon_Orchestrator_Error_Is_Not_Retried integration test Agent-Logs-Url: https://github.com/microsoft/durabletask-dotnet/sessions/8d495e35-19ea-440a-80a7-e7538566e906 Co-authored-by: sophiatev <38052607+sophiatev@users.noreply.github.com> --- global.json | 2 +- .../RunBackgroundTaskLoggingTests.cs | 51 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/global.json b/global.json index f4fd6619c..73683e9c5 100644 --- a/global.json +++ b/global.json @@ -1,6 +1,6 @@ { "sdk": { - "version": "10.0.203", + "version": "10.0.201", "rollForward": "latestFeature" }, "msbuild-sdks": { diff --git a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs index f755de8fc..8efdbde6f 100644 --- a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs +++ b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs @@ -438,6 +438,57 @@ await AssertEventually( abandonCallCount.Should().Be(maxAttempts); } + [Theory] + [InlineData(StatusCode.InvalidArgument)] + [InlineData(StatusCode.PermissionDenied)] + [InlineData(StatusCode.NotFound)] + public async Task Non_Transient_Abandon_Orchestrator_Error_Is_Not_Retried(StatusCode statusCode) + { + await using var fixture = await TestFixture.CreateAsync(); + + string instanceId = Guid.NewGuid().ToString("N"); + string completionToken = Guid.NewGuid().ToString("N"); + + // Signal fires after the (single) abandon call, giving us a reliable completion signal + var callDoneTcs = new 
TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + int abandonCallCount = 0; + fixture.ClientMock + .Setup(c => c.AbandonTaskOrchestratorWorkItemAsync( + It.IsAny(), + It.IsAny(), + It.IsAny(), + It.IsAny())) + .Returns((P.AbandonOrchestrationTaskRequest r, Metadata h, DateTime? d, CancellationToken ct) => + { + Interlocked.Increment(ref abandonCallCount); + callDoneTcs.TrySetResult(true); + return RpcExceptionAsyncUnaryCall(statusCode); + }); + + P.WorkItem workItem = new() + { + OrchestratorRequest = new P.OrchestratorRequest { InstanceId = instanceId }, + CompletionToken = completionToken, + }; + + fixture.InvokeRunBackgroundTask(workItem, () => Task.FromException(new Exception("boom"))); + + // Wait for the single abandon call to complete + await WaitAsync(callDoneTcs.Task); + + // Give a brief moment for the final log lines to flush + await Task.Delay(100); + + // The non-transient error must not have been retried – exactly one abandon call + abandonCallCount.Should().Be(1); + + // No retry warning should have been logged + Assert.DoesNotContain(fixture.GetLogs(), l => l.EventId.Name == "TransientGrpcRetry"); + + // The Abandoned log must not be present since the RPC failed without retry + Assert.DoesNotContain(fixture.GetLogs(), l => l.Message.Contains("Abandoned orchestrator work item") && l.Message.Contains(instanceId)); + } + [Fact] public async Task Forwards_CancellationToken_To_Abandon_Orchestrator() { From b3e93e383cdc687875aa98b2a295d6931c7b7386 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 14:53:30 -0700 Subject: [PATCH 32/36] updated the tests slightly --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 104 +++++++++++------- .../RunBackgroundTaskLoggingTests.cs | 12 +- 2 files changed, 72 insertions(+), 44 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index b304afcfb..19bbdf843 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -715,20 +715,28 @@ async Task OnRunOrchestratorAsync( cancellationToken); } - if (!filterPassed) - { - this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; + if (!filterPassed) + { + this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); + try + { + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + this.Logger.AbandonedOrchestratorWorkItem(request.InstanceId, completionToken); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, request.InstanceId); + } + + return; } // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. 
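Each abandon site in this commit now converges on the same shape, which is worth seeing
once in isolation. A sketch, not new code: AbandonRpcAsync is a hypothetical stand-in
for whichever generated abandon stub applies, and the logger calls mirror the ones in
the surrounding hunks:

    // Per-site abandon pattern (fragment; assumes the members used in the hunks above).
    this.Logger.AbandoningOrchestratorWorkItem(instanceId, completionToken);
    try
    {
        await this.ExecuteWithRetryAsync(
            AbandonRpcAsync,
            nameof(AbandonRpcAsync),
            cancellationToken);
        this.Logger.AbandonedOrchestratorWorkItem(instanceId, completionToken);
    }
    catch (Exception abandonException)
    {
        // Best effort: an abandon that still fails after retries is logged, not rethrown,
        // so the worker moves on instead of crashing the processing loop.
        this.Logger.UnexpectedError(abandonException, instanceId);
    }

    return;
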
@@ -819,20 +827,28 @@ await this.ExecuteWithRetryAsync( }, }; } - else - { - this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; + else + { + this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); + try + { + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + this.Logger.AbandonedOrchestratorWorkItem(request.InstanceId, completionToken); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, request.InstanceId); + } + + return; } } else @@ -933,18 +949,26 @@ async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, } else { - if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) - { - this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); + if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) + { + this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + try + { + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + this.Logger.AbandonedActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + } + catch (Exception abandonException) + { + this.Logger.UnexpectedError(abandonException, instance.InstanceId); + } } return; diff --git a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs index 8efdbde6f..0c8847d88 100644 --- a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs +++ b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs @@ -423,9 +423,11 @@ public async Task Retries_Abandon_Orchestrator_Until_MaxAttempts_Then_Fails() // abandon exception) after the retry loop gives up, which signals the task has completed. 
await AssertEventually( () => fixture.GetLogs().Count(l => l.EventId.Name == "UnexpectedError") >= 2, - timeoutMs: 10000); - - // The Abandoned log should NOT be present since the abandon never succeeded + timeoutMs: 10000); + + // The Abandoned log should NOT be present since the abandon never succeeded (but the abandoning should be present) + await AssertEventually(() => fixture.GetLogs().Any(l => l.Message.Contains("Abandoning orchestrator work item") && l.Message.Contains(instanceId))); + await AssertEventually(() => fixture.GetLogs().Any(l => l.Message.Contains("Unexpected error") && l.Message.Contains(instanceId))); Assert.DoesNotContain(fixture.GetLogs(), l => l.Message.Contains("Abandoned orchestrator work item") && l.Message.Contains(instanceId)); // Verify retry warnings were logged: one per retry attempt @@ -485,7 +487,9 @@ public async Task Non_Transient_Abandon_Orchestrator_Error_Is_Not_Retried(Status // No retry warning should have been logged Assert.DoesNotContain(fixture.GetLogs(), l => l.EventId.Name == "TransientGrpcRetry"); - // The Abandoned log must not be present since the RPC failed without retry + // The Abandoned log must not be present since the RPC failed without retry (but the abandoning should be present) + await AssertEventually(() => fixture.GetLogs().Any(l => l.Message.Contains("Abandoning orchestrator work item") && l.Message.Contains(instanceId))); + await AssertEventually(() => fixture.GetLogs().Any(l => l.Message.Contains("Unexpected error") && l.Message.Contains(instanceId))); Assert.DoesNotContain(fixture.GetLogs(), l => l.Message.Contains("Abandoned orchestrator work item") && l.Message.Contains(instanceId)); } From 1142c65db8aeaf7912efc6cb0e3e178cd3b53932 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 15:20:24 -0700 Subject: [PATCH 33/36] removed try-catch, updated tests --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 242 +++++++----------- .../RunBackgroundTaskLoggingTests.cs | 4 +- 2 files changed, 97 insertions(+), 149 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 19bbdf843..a8a80cfa6 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -473,105 +473,77 @@ void RunBackgroundTask(P.WorkItem? workItem, Func handler, CancellationTok string.Empty; this.Logger.UnexpectedError(ex, instanceId); - if (workItem?.OrchestratorRequest != null) - { - try - { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellation); - this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } + if (workItem?.OrchestratorRequest != null) + { + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? 
string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellation); + this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); } - else if (workItem?.ActivityRequest != null) - { - try - { - this.Logger.AbandoningActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); - this.Logger.AbandonedActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instanceId); - } + else if (workItem?.ActivityRequest != null) + { + this.Logger.AbandoningActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + this.Logger.AbandonedActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? string.Empty); } - else if (workItem?.EntityRequest != null) - { - try - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequest.InstanceId); - } + else if (workItem?.EntityRequest != null) + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? string.Empty); } - else if (workItem?.EntityRequestV2 != null) - { - try - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? 
string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? string.Empty); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, workItem.EntityRequestV2.InstanceId); - } + else if (workItem?.EntityRequestV2 != null) + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); } } }); @@ -718,23 +690,15 @@ async Task OnRunOrchestratorAsync( if (!filterPassed) { this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - try - { - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - this.Logger.AbandonedOrchestratorWorkItem(request.InstanceId, completionToken); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, request.InstanceId); - } + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); return; } @@ -830,23 +794,15 @@ await this.ExecuteWithRetryAsync( else { this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - try - { - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - this.Logger.AbandonedOrchestratorWorkItem(request.InstanceId, completionToken); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, request.InstanceId); - } + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); return; } @@ -952,23 +908,15 @@ async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) { this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - try - { - await 
this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); - this.Logger.AbandonedActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - } - catch (Exception abandonException) - { - this.Logger.UnexpectedError(abandonException, instance.InstanceId); - } + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); } return; diff --git a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs index 0c8847d88..5786d7499 100644 --- a/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs +++ b/test/Worker/Grpc.Tests/RunBackgroundTaskLoggingTests.cs @@ -419,10 +419,10 @@ public async Task Retries_Abandon_Orchestrator_Until_MaxAttempts_Then_Fails() fixture.InvokeRunBackgroundTask(workItem, () => Task.FromException(new Exception("boom"))); - // Wait for all retries to be exhausted. The background task logs an "Unexpected error" (for the + // Wait for all retries to be exhausted. The ExecuteAsync loop in the worker logs an "Unexpected error" (for the // abandon exception) after the retry loop gives up, which signals the task has completed. await AssertEventually( - () => fixture.GetLogs().Count(l => l.EventId.Name == "UnexpectedError") >= 2, + () => fixture.GetLogs().Count(l => l.EventId.Name == "UnexpectedError") >= 1, timeoutMs: 10000); // The Abandoned log should NOT be present since the abandon never succeeded (but the abandoning should be present) From c3c95719da0cb651e77812d59cbabff422a9b85a Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 16:53:12 -0700 Subject: [PATCH 34/36] remove json change --- global.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/global.json b/global.json index 73683e9c5..f4fd6619c 100644 --- a/global.json +++ b/global.json @@ -1,6 +1,6 @@ { "sdk": { - "version": "10.0.201", + "version": "10.0.203", "rollForward": "latestFeature" }, "msbuild-sdks": { From f9052ddb9b1c135bee3f022d78600220c58c19e0 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 17:05:35 -0700 Subject: [PATCH 35/36] fix line endings, add a few more logs --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 235 +++++++++--------- 1 file changed, 118 insertions(+), 117 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index a8a80cfa6..2be713647 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -156,9 +156,9 @@ await this.ProcessWorkItemsAsync( this.internalOptions.ReconnectBackoffBase, this.internalOptions.ReconnectBackoffCap, backoffRandom, - fullJitter: true); + fullJitter: true); reconnectAttempt++; - this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); + this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); await Task.Delay(delay, cancellation); } catch (OperationCanceledException) when (cancellation.IsCancellationRequested) @@ -473,77 +473,77 @@ void RunBackgroundTask(P.WorkItem? 
workItem, Func handler, CancellationTok string.Empty; this.Logger.UnexpectedError(ex, instanceId); - if (workItem?.OrchestratorRequest != null) - { - this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellation); - this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + if (workItem?.OrchestratorRequest != null) + { + this.Logger.AbandoningOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellation); + this.Logger.AbandonedOrchestratorWorkItem(instanceId, workItem.CompletionToken ?? string.Empty); } - else if (workItem?.ActivityRequest != null) - { - this.Logger.AbandoningActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); - this.Logger.AbandonedActivityWorkItem( - instanceId, - workItem.ActivityRequest.Name, - workItem.ActivityRequest.TaskId, - workItem.CompletionToken ?? string.Empty); + else if (workItem?.ActivityRequest != null) + { + this.Logger.AbandoningActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + this.Logger.AbandonedActivityWorkItem( + instanceId, + workItem.ActivityRequest.Name, + workItem.ActivityRequest.TaskId, + workItem.CompletionToken ?? string.Empty); } - else if (workItem?.EntityRequest != null) - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequest.InstanceId, - workItem.CompletionToken ?? string.Empty); + else if (workItem?.EntityRequest != null) + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? 
string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequest.InstanceId, + workItem.CompletionToken ?? string.Empty); } - else if (workItem?.EntityRequestV2 != null) - { - this.Logger.AbandoningEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? string.Empty); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskEntityWorkItemAsync( - new P.AbandonEntityTaskRequest - { - CompletionToken = workItem.CompletionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskEntityWorkItemAsync), - cancellation); - this.Logger.AbandonedEntityWorkItem( - workItem.EntityRequestV2.InstanceId, - workItem.CompletionToken ?? string.Empty); + else if (workItem?.EntityRequestV2 != null) + { + this.Logger.AbandoningEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskEntityWorkItemAsync( + new P.AbandonEntityTaskRequest + { + CompletionToken = workItem.CompletionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskEntityWorkItemAsync), + cancellation); + this.Logger.AbandonedEntityWorkItem( + workItem.EntityRequestV2.InstanceId, + workItem.CompletionToken ?? string.Empty); } } }); @@ -687,20 +687,20 @@ async Task OnRunOrchestratorAsync( cancellationToken); } - if (!filterPassed) - { - this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; + if (!filterPassed) + { + this.Logger.AbandoningOrchestrationDueToOrchestrationFilter(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + this.Logger.AbandonedOrchestratorWorkItem(request.InstanceId, completionToken); + return; } // If versioning has been explicitly set, we attempt to follow that pattern. If it is not set, we don't compare versions here. 
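The hunk above gives the filter-abandon path the same Abandoning/Abandoned log bracketing as the failure path in RunBackgroundTask, so the success log can now be asserted the same way the earlier tests in this series do. A hypothetical follow-up assertion reusing the fixture, AssertEventually, and GetLogs helpers that already appear above (the filter setup itself is assumed and not shown):

    // Assumes a fixture whose orchestration filter rejects instanceId, so the
    // worker takes the !filterPassed branch and abandons the work item.
    await AssertEventually(() => fixture.GetLogs().Any(l =>
        l.Message.Contains("Abandoned orchestrator work item") &&
        l.Message.Contains(instanceId)));
    Assert.DoesNotContain(fixture.GetLogs(), l => l.EventId.Name == "TransientGrpcRetry");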
@@ -791,20 +791,20 @@ await this.ExecuteWithRetryAsync( }, }; } - else - { - this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( - new P.AbandonOrchestrationTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellationToken), - nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), - cancellationToken); - - return; + else + { + this.Logger.AbandoningOrchestrationDueToVersioning(request.InstanceId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskOrchestratorWorkItemAsync( + new P.AbandonOrchestrationTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellationToken), + nameof(this.client.AbandonTaskOrchestratorWorkItemAsync), + cancellationToken); + this.Logger.AbandonedOrchestratorWorkItem(request.InstanceId, completionToken); + return; } } else @@ -905,18 +905,19 @@ async Task OnRunActivityAsync(P.ActivityRequest request, string completionToken, } else { - if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) - { - this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); - await this.ExecuteWithRetryAsync( - async () => await this.client.AbandonTaskActivityWorkItemAsync( - new P.AbandonActivityTaskRequest - { - CompletionToken = completionToken, - }, - cancellationToken: cancellation), - nameof(this.client.AbandonTaskActivityWorkItemAsync), - cancellation); + if (this.worker.workerOptions.Versioning?.FailureStrategy == DurableTaskWorkerOptions.VersionFailureStrategy.Reject) + { + this.Logger.AbandoningActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); + await this.ExecuteWithRetryAsync( + async () => await this.client.AbandonTaskActivityWorkItemAsync( + new P.AbandonActivityTaskRequest + { + CompletionToken = completionToken, + }, + cancellationToken: cancellation), + nameof(this.client.AbandonTaskActivityWorkItemAsync), + cancellation); + this.Logger.AbandonedActivityWorkItem(instance.InstanceId, request.Name, request.TaskId, completionToken); } return; @@ -1191,12 +1192,12 @@ async Task ExecuteWithRetryAsync( { int maxAttempts = this.internalOptions.TransientRetryMaxAttempts; TimeSpan baseDelay = this.internalOptions.TransientRetryBackoffBase; - TimeSpan cap = this.internalOptions.TransientRetryBackoffCap; - Random retryRandom; -#if NET6_0_OR_GREATER - retryRandom = Random.Shared; -#else - retryRandom = new Random(); + TimeSpan cap = this.internalOptions.TransientRetryBackoffCap; + Random retryRandom; +#if NET6_0_OR_GREATER + retryRandom = Random.Shared; +#else + retryRandom = new Random(); #endif for (int attempt = 1; ; attempt++) @@ -1214,7 +1215,7 @@ async Task ExecuteWithRetryAsync( attempt < maxAttempts) { // Don't use full jitter since we want to keep the retry interval fairly fixed and increasing with - // each attempt. We don't have lockstep concerns in this case. + // each attempt. We don't have lockstep concerns in this case. 
// Also make sure to zero-index the attempts TimeSpan backoff = GrpcBackoff.Compute(attempt - 1, baseDelay, cap, retryRandom, fullJitter: false); From 81801a38132a9c9047cfa74b2dd1635d589c6405 Mon Sep 17 00:00:00 2001 From: Sophia Tevosyan Date: Mon, 27 Apr 2026 17:10:28 -0700 Subject: [PATCH 36/36] change order of a log and reconnect attempt --- src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 2be713647..4c5a18b2b 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -156,9 +156,9 @@ await this.ProcessWorkItemsAsync( this.internalOptions.ReconnectBackoffBase, this.internalOptions.ReconnectBackoffCap, backoffRandom, - fullJitter: true); - reconnectAttempt++; + fullJitter: true); this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); + reconnectAttempt++; await Task.Delay(delay, cancellation); } catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
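The per-RPC call site above shows GrpcBackoff.Compute's full argument list (zero-based attempt, base delay, cap, Random, fullJitter flag); the reconnect call site passes fullJitter: true, with its first argument sitting above the visible hunk and assumed to be reconnectAttempt. A plausible reading of Compute consistent with both call sites and with the lockstep comment in the diff. This is a sketch under stated assumptions, not the actual implementation, which may clamp, round, or jitter differently:

    using System;

    static class GrpcBackoffSketch
    {
        // Assumed semantics: exponential growth from baseDelay, clamped to cap.
        // fullJitter: true draws uniformly from [0, capped) to decorrelate many
        // workers reconnecting at once; fullJitter: false keeps the interval
        // "fairly fixed and increasing", as the comment in the diff puts it.
        public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random, bool fullJitter)
        {
            double exponential = baseDelay.TotalMilliseconds * Math.Pow(2, attempt);
            double capped = Math.Min(exponential, cap.TotalMilliseconds);
            double ms = fullJitter ? random.NextDouble() * capped : capped;
            return TimeSpan.FromMilliseconds(ms);
        }
    }

On the final patch itself: assuming reconnectAttempt feeds Compute's first argument, moving the ReconnectBackoff log ahead of the increment means the log now reports the same zero-based attempt number that produced the delay, where previously it ran one ahead of the value Compute actually used.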