Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
using Trax.Scheduler.Services.JobSubmitter;
using Trax.Scheduler.Services.ManifestManagerPollingService;
using Trax.Scheduler.Services.MetadataCleanupPollingService;
using Trax.Scheduler.Services.Operations;
using Trax.Scheduler.Services.SchedulerStartupService;
using Trax.Scheduler.Services.TraxScheduler;
using Trax.Scheduler.Trains.JobDispatcher;
Expand Down Expand Up @@ -114,6 +115,14 @@ internal void Build()
// Register ITraxScheduler
_parentBuilder.ServiceCollection.AddScoped<ITraxScheduler, TraxScheduler>();

// Register IOperationsService — shared between dashboard UI and GraphQL operations
// mutations so both surfaces have identical validation and persistence behaviour.
_parentBuilder.ServiceCollection.AddScoped<IOperationsService, OperationsService>();

// Reads the persisted scheduler_config row at startup and applies it to the
// in-memory SchedulerConfiguration singleton.
_parentBuilder.ServiceCollection.AddHostedService<SchedulerConfigBootstrapHostedService>();

// Register IDormantDependentContext with forwarding so both concrete type
// (for RunScheduledTrainJunction.Initialize) and interface (for user steps)
// resolve to the same scoped instance
Expand Down
74 changes: 74 additions & 0 deletions src/Trax.Scheduler/Services/Operations/DashboardMetrics.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
namespace Trax.Scheduler.Services.Operations;

/// <summary>
/// Snapshot of dashboard-relevant metrics. Returned by
/// <see cref="IOperationsService.GetDashboardMetricsAsync"/>; consumed by the dashboard
/// Index page and the GraphQL <c>operations.metrics.dashboard</c> query.
/// </summary>
/// <param name="Kpis">Headline KPI values; see <see cref="DashboardKpis"/>.</param>
/// <param name="ExecutionsOverTime">
/// Buckets of the executions-over-time chart. Bucket width is selected by the
/// <see cref="MetricsRange"/> passed to the service.
/// </param>
/// <param name="TopFailures">Per-train failure counts over the last 7 days.</param>
/// <param name="TopAverageDurations">Per-train mean execution times over the last 7 days.</param>
/// <param name="ThroughputSeries">Per-train series for the 7-day throughput sparkline.</param>
public record DashboardMetrics(
DashboardKpis Kpis,
IReadOnlyList<ExecutionsBucket> ExecutionsOverTime,
IReadOnlyList<TrainFailureCount> TopFailures,
IReadOnlyList<TrainAverageDuration> TopAverageDurations,
IReadOnlyList<ThroughputSeries> ThroughputSeries
);

/// <summary>
/// Headline KPI values shown on the dashboard; carried in <see cref="DashboardMetrics"/>
/// as its <c>Kpis</c> member.
/// </summary>
/// <remarks>
/// NOTE(review): the scale of <c>SuccessRate</c> (0–100 vs 0–1) and the time window it is
/// computed over are not visible from this declaration — confirm against the implementation.
/// </remarks>
/// <param name="ExecutionsToday">Total executions started today (UTC), all states.</param>
/// <param name="SuccessRate">Completed / (Completed + Failed) as a percentage. Zero when no terminal executions.</param>
/// <param name="CurrentlyRunning">Executions in <c>InProgress</c> state right now.</param>
/// <param name="UnresolvedDeadLetters">Dead letters in <c>AwaitingIntervention</c> state.</param>
public record DashboardKpis(
int ExecutionsToday,
double SuccessRate,
int CurrentlyRunning,
int UnresolvedDeadLetters
);

/// <summary>
/// One bucket of the executions-over-time chart, broken down by terminal state.
/// The bucket width follows the requested <see cref="MetricsRange"/> (1 minute or 1 hour).
/// </summary>
/// <param name="Timestamp">UTC start of the bucket.</param>
/// <param name="Completed">Count of completed executions in this bucket.</param>
/// <param name="Failed">Count of failed executions in this bucket.</param>
/// <param name="Cancelled">Count of cancelled executions in this bucket.</param>
public record ExecutionsBucket(DateTime Timestamp, int Completed, int Failed, int Cancelled);

/// <summary>
/// Failure count for a single train; one entry of the <c>TopFailures</c> list on
/// <see cref="DashboardMetrics"/>.
/// </summary>
/// <param name="TrainName">Train interface FullName as stored on <c>Metadata.Name</c>.</param>
/// <param name="Count">Number of failures over the time window (last 7 days).</param>
public record TrainFailureCount(string TrainName, int Count);

/// <summary>
/// Mean execution time for a single train; one entry of the <c>TopAverageDurations</c>
/// list on <see cref="DashboardMetrics"/>.
/// </summary>
/// <param name="TrainName">Train interface FullName.</param>
/// <param name="AverageMilliseconds">Mean execution time over completed root-level runs in the last 7 days.</param>
public record TrainAverageDuration(string TrainName, double AverageMilliseconds);

/// <summary>
/// One per-train series for the 7-day throughput sparkline. The service emits the
/// top-N (default 3) trains plus an aggregated "Other" series, all in the same shape.
/// The dashboard renders the top 3 trains plus "Other"; other consumers can render
/// the series however they want.
/// </summary>
/// <param name="TrainName">
/// Train interface FullName, or the literal string <c>"Other"</c> for the aggregated
/// remainder series.
/// </param>
/// <param name="Buckets">28 buckets of 6 hours each, oldest first.</param>
public record ThroughputSeries(string TrainName, IReadOnlyList<ThroughputBucket> Buckets);

/// <summary>
/// One time bucket of a <see cref="ThroughputSeries"/>.
/// </summary>
/// <param name="Timestamp">UTC start of the bucket.</param>
/// <param name="Count">Number of completed executions in the bucket.</param>
public record ThroughputBucket(DateTime Timestamp, int Count);

/// <summary>
/// Process-level health snapshot, returned by
/// <see cref="IOperationsService.GetServerMetrics"/>. Memory/GC/uptime are exact at
/// call time; CPU% requires consumer-side sampling state and is intentionally omitted
/// from the shared service.
/// </summary>
/// <param name="ProcessStartTimeUtc">When the host process started.</param>
/// <param name="UptimeSeconds">Seconds since process start.</param>
/// <param name="WorkingSetBytes">Resident set size (Process.WorkingSet64).</param>
/// <param name="GcHeapBytes">Total GC-managed heap (GC.GetTotalMemory(false)).</param>
public record ServerMetrics(
DateTime ProcessStartTimeUtc,
double UptimeSeconds,
long WorkingSetBytes,
long GcHeapBytes
);
107 changes: 107 additions & 0 deletions src/Trax.Scheduler/Services/Operations/IOperationsService.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
namespace Trax.Scheduler.Services.Operations;

/// <summary>
/// Shared service for high-level operations performed by both the dashboard UI and the
/// GraphQL <c>operations</c> namespace. Centralising the logic here keeps both surfaces
/// behaviourally identical: a queue/cancel/update from the React (or Blazor) dashboard
/// runs the same code path as the same call from the GraphQL API.
/// </summary>
/// <remarks>
/// Mutating members report their outcome through <see cref="OperationResult"/>:
/// failures are returned with <c>Success = false</c> and a populated <c>Message</c>
/// rather than thrown.
/// </remarks>
public interface IOperationsService
{
/// <summary>
/// Validates the input against the registered train's input type and inserts a new
/// <see cref="Effect.Models.WorkQueue.WorkQueue"/> row in the <c>Queued</c> state.
/// </summary>
/// <param name="input">Queue request carrying the <c>TrainName</c> and <c>InputJson</c> to validate.</param>
/// <param name="ct">Cancellation token for the asynchronous operation.</param>
/// <returns>
/// <c>OperationResult(true, Id: newEntryId, Count: 1, ...)</c> on success;
/// <c>OperationResult(false, ...)</c> with a populated <c>Message</c> for unknown
/// trains, missing <c>TrainName</c>, or invalid <c>InputJson</c>.
/// </returns>
Task<OperationResult> QueueTrainAsync(QueueTrainInput input, CancellationToken ct);

/// <summary>
/// Transitions a queued work queue entry to <c>Cancelled</c>. Only entries currently
/// in the <c>Queued</c> state are eligible. Entries that are already dispatched or
/// already cancelled return a failure result without modifying the row.
/// </summary>
/// <param name="id">ID of the work queue entry to cancel.</param>
/// <param name="ct">Cancellation token for the asynchronous operation.</param>
/// <returns>
/// A success result when the entry was cancelled; a failure result when the entry
/// is not in the <c>Queued</c> state.
/// </returns>
Task<OperationResult> CancelWorkQueueEntryAsync(long id, CancellationToken ct);

/// <summary>
/// Patches mutable settings on a manifest group (max active jobs, priority, enabled
/// flag). Each field on <paramref name="input"/> is optional and "no change by default":
/// only properties explicitly set on the input are written. <c>UpdatedAt</c> is bumped
/// when at least one field changed.
/// </summary>
/// <param name="id">ID of the manifest group to patch.</param>
/// <param name="input">The fields to write; unset fields are left unchanged.</param>
/// <param name="ct">Cancellation token for the asynchronous operation.</param>
/// <returns>
/// <c>OperationResult(true, Id: groupId, Count: N, ...)</c> where <c>N</c> is the number
/// of fields written; <c>OperationResult(false, ...)</c> if the group does not exist.
/// </returns>
Task<OperationResult> UpdateManifestGroupAsync(
long id,
UpdateManifestGroupInput input,
CancellationToken ct
);

/// <summary>
/// Returns the 1-hop cross-group dependency neighborhood for a manifest group:
/// every group that contains a manifest the focal group's manifests depend on
/// (upstream), every group that contains a manifest depending on the focal group's
/// manifests (downstream), and the focal group itself. Edges are directed
/// parent → dependent.
/// </summary>
/// <param name="groupId">ID of the focal manifest group.</param>
/// <param name="ct">Cancellation token for the asynchronous operation.</param>
/// <returns>
/// <c>null</c> if the group does not exist or contains no manifests with cross-group
/// dependencies; otherwise a graph that always includes the focal group as a node.
/// </returns>
Task<ManifestGroupDependencyGraph?> GetManifestGroupDependencyGraphAsync(
long groupId,
CancellationToken ct
);

/// <summary>
/// Returns a snapshot of dashboard-relevant metrics: today's KPI counts, an
/// executions-over-time chart at the chosen granularity, top failing trains over
/// the last 7 days, top average durations over the last 7 days, and per-train
/// throughput sparklines over the last 7 days (28 buckets of 6 hours each).
/// </summary>
/// <param name="range">Granularity of the executions-over-time chart only.</param>
/// <param name="hideAdminTrains">
/// When true, framework admin trains (matching <c>AdminTrains.FullNames</c>) are
/// excluded from every series. Mirrors the dashboard's "Hide admin trains" toggle.
/// </param>
/// <param name="ct">Cancellation token for the asynchronous operation.</param>
/// <returns>The assembled <see cref="DashboardMetrics"/> snapshot.</returns>
Task<DashboardMetrics> GetDashboardMetricsAsync(
MetricsRange range,
bool hideAdminTrains,
CancellationToken ct
);

/// <summary>
/// Returns a snapshot of host-process health: working set, GC heap, uptime, and
/// process start time. Synchronous because all data comes from in-process APIs
/// (<see cref="System.Diagnostics.Process"/> and the GC).
/// </summary>
/// <returns>The <see cref="ServerMetrics"/> values as of the call.</returns>
ServerMetrics GetServerMetrics();

/// <summary>
/// Returns the live scheduler runtime settings, reading from the in-memory
/// <c>SchedulerConfiguration</c> singleton (and <c>LocalWorkerOptions</c> /
/// <c>MetadataCleanupConfiguration</c> if registered). The singleton is the
/// source of truth at runtime; the persisted row is loaded into it at startup
/// by <c>SchedulerConfigBootstrapHostedService</c>.
/// </summary>
/// <returns>A snapshot of the current in-memory settings.</returns>
SchedulerConfigSnapshot GetSchedulerConfig();

/// <summary>
/// Patches the live scheduler runtime settings. Writes are applied to both the
/// in-memory singleton (so changes take effect immediately) and to the persisted
/// <c>trax.scheduler_config</c> row (so changes survive restart).
/// </summary>
/// <param name="input">The settings to patch.</param>
/// <param name="ct">Cancellation token for the asynchronous operation.</param>
/// <returns>
/// <c>OperationResult(true, Count: N, ...)</c> where <c>N</c> is the number of
/// fields actually changed.
/// </returns>
Task<OperationResult> UpdateSchedulerConfigAsync(
UpdateSchedulerConfigInput input,
CancellationToken ct
);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
namespace Trax.Scheduler.Services.Operations;

/// <summary>
/// 1-hop neighborhood of cross-group dependencies for a manifest group.
/// Returned by <see cref="IOperationsService.GetManifestGroupDependencyGraphAsync"/> and
/// consumed by both the GraphQL <c>operations.manifestGroups.graph</c> query and the
/// dashboard's dependency DAG visualisation.
/// </summary>
/// <param name="Nodes">All groups in the neighborhood plus the focal group itself.</param>
/// <param name="Edges">
/// Directed edges between groups (parent → dependent). Edge endpoints are manifest
/// group IDs, the same identifier space as <see cref="DependencyGraphNode"/>'s <c>Id</c>.
/// </param>
public record ManifestGroupDependencyGraph(
IReadOnlyList<DependencyGraphNode> Nodes,
IReadOnlyList<DependencyGraphEdge> Edges
);

/// <summary>
/// One manifest group in a <see cref="ManifestGroupDependencyGraph"/>.
/// </summary>
/// <param name="Id">Database ID of the manifest group.</param>
/// <param name="Name">Group name as stored on the row.</param>
/// <param name="IsHighlighted">True for the focal group; the UI uses this to render it differently.</param>
public record DependencyGraphNode(long Id, string Name, bool IsHighlighted);

/// <summary>
/// One directed dependency (parent → dependent) between two manifest groups in a
/// <see cref="ManifestGroupDependencyGraph"/>.
/// </summary>
/// <param name="FromId">Parent group ID (the group whose manifests are depended on).</param>
/// <param name="ToId">Dependent group ID (the group whose manifests depend on the parent).</param>
public record DependencyGraphEdge(long FromId, long ToId);
15 changes: 15 additions & 0 deletions src/Trax.Scheduler/Services/Operations/MetricsRange.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
namespace Trax.Scheduler.Services.Operations;

/// <summary>
/// Selects the granularity of the executions-over-time series in
/// <see cref="DashboardMetrics"/>; passed as the <c>range</c> argument of
/// <see cref="IOperationsService.GetDashboardMetricsAsync"/>. The other series in the
/// dashboard metrics block (throughput, top failures, avg durations) are always over
/// the last 7 days.
/// </summary>
public enum MetricsRange
{
/// <summary>Last 60 minutes, 60 buckets, 1 minute each.</summary>
Last60Minutes = 0,

/// <summary>Last 24 hours, 24 buckets, 1 hour each.</summary>
Last24Hours = 1,
}
21 changes: 21 additions & 0 deletions src/Trax.Scheduler/Services/Operations/OperationResult.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
namespace Trax.Scheduler.Services.Operations;

/// <summary>
/// Generic result envelope for <see cref="IOperationsService"/> calls.
/// Failures are returned as <c>OperationResult(false, ..., Message)</c> rather than
/// thrown so both the GraphQL layer and the dashboard can surface user-facing errors
/// without try/catch noise.
/// </summary>
/// <param name="Success">Whether the operation succeeded.</param>
/// <param name="Id">
/// Database ID of the affected entity when the operation targets a single row (for
/// example the new work queue entry ID from <c>QueueTrainAsync</c>, or the group ID from
/// <c>UpdateManifestGroupAsync</c>). Null when no row was touched (e.g. on validation
/// failure).
/// </param>
/// <param name="Count">
/// Operation-specific count, when meaningful: <c>1</c> for a newly queued entry, or the
/// number of fields written/changed for the update operations.
/// </param>
/// <param name="Message">Human-readable explanation. Always populated on failure.</param>
public record OperationResult(
bool Success,
long? Id = null,
int? Count = null,
string? Message = null
);
Loading
Loading