diff --git a/Cargo.lock b/Cargo.lock index 340aef0..713f040 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -488,7 +488,7 @@ checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" [[package]] name = "ferro-airflow-dag-parser" -version = "1.0.0" +version = "1.0.1" dependencies = [ "chrono", "dashmap", diff --git a/crates/ferro-airflow-dag-parser/CHANGELOG.md b/crates/ferro-airflow-dag-parser/CHANGELOG.md index 6ef76f4..dd1afef 100644 --- a/crates/ferro-airflow-dag-parser/CHANGELOG.md +++ b/crates/ferro-airflow-dag-parser/CHANGELOG.md @@ -8,6 +8,46 @@ public API require a major bump. ## [Unreleased] +## [1.0.1] - 2026-06-16 + +Security (recursion-DoS hardening). No public-API change — additive, fully +semver-compatible. + +### Security +- **Closed a parser stack-overflow DoS (FP5).** Parsing attacker-controlled + Python could overflow the vendored `littrs-ruff-python-parser` 0.6.2 + recursive-descent parser, aborting the host process with a `SIGSEGV` + (`catch_unwind` cannot intercept a guard-page fault). The previous + pre-screen capped only bracket nesting (32) and *consecutive* single-operator + runs (64), so the non-bracket recursion vectors — `not`/`await` keyword + chains, `~`/`-`/`+` runs, right-associative `a**b**c`, `a if b else …` + conditional / `lambda:` chains, deeply nested compound statements, + *mixed* prefix-operator chains (fuzz Finding 2, `crash-0665b68…`), + `yield`/`yield from` chains, and `async async … def` error-recovery chains — + slipped through and overflowed the parser. The last three were surfaced by + the adversarial design-review (Codex DD) convergence pass and closed by + counting `Yield`/`From`/`Async` in the lexer recursion metric. + + The fix ports FerroAir's complete three-layer recursion guard + (`ferroair-dag-parser`, FA1) into `panic_safe.rs`: (1) an iterative bracket + pre-scan (cap 256), (2) a single real-tokenizer pass that bounds combined + expression recursion (`brackets + operator-run + per-line right-recursion + + indent`, cap 1024) and rejects PEP-750 t-strings (which the parser panics + on), and (3) execution of the parse **and** AST walk on a dedicated 128 MiB + stack so the numeric cap — not the caller's ~2 MiB stack — is the binding + limit. The recursive AST walkers (`collect_shift_edges`, `stringify_expr`, + `resolve_to_task_id`, …) additionally truncate past a 1024 depth so a deep + left-leaning `>>` / attribute / call chain that survives the parser cannot + overflow the walk either. + + This is not a claim of bulletproof input handling — see + `dd-pack/11-known-limitations.md` for the honest residual (a single + left-leaning chain of hundreds of thousands of trailers in a multi-MB file + can still overflow on recursive AST construction/drop, bounded by the + 128 MiB stack). The realistic FP5 parser-recursion shapes (each well under + 4 KiB) are fully closed, with regression tests under `tests/stack_safety.rs` + and an adversarial design-review (Codex DD) convergence pass. + ## [1.0.0] - 2026-06-08 First semver-stable release; the public API is committed under semver. diff --git a/crates/ferro-airflow-dag-parser/Cargo.toml b/crates/ferro-airflow-dag-parser/Cargo.toml index 839aedc..43aecee 100644 --- a/crates/ferro-airflow-dag-parser/Cargo.toml +++ b/crates/ferro-airflow-dag-parser/Cargo.toml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 [package] name = "ferro-airflow-dag-parser" -version = "1.0.0" +version = "1.0.1" description = "Static AST-based extractor for Apache Airflow™ Python DAG files. Recovers dag_id, task_ids, dependencies, schedule, and dynamic-fallback markers without running the source. Extracted from the Ferro ecosystem." categories = ["development-tools", "parser-implementations"] keywords = ["airflow", "dag", "parser", "static-analysis", "python"] diff --git a/crates/ferro-airflow-dag-parser/src/dynamic_markers.rs b/crates/ferro-airflow-dag-parser/src/dynamic_markers.rs index 077335f..fe27933 100644 --- a/crates/ferro-airflow-dag-parser/src/dynamic_markers.rs +++ b/crates/ferro-airflow-dag-parser/src/dynamic_markers.rs @@ -154,6 +154,17 @@ const TASK_DECORATORS: &[&str] = &[ /// /// Returns [`ParseError::Parse`] when ruff cannot parse the source. pub fn detect_dynamic_markers(source: &str) -> Result, ParseError> { + // Same stack-safety shield as the static extractor: the marker + // detector parses the source a second time and walks it with its own + // recursive `MarkerVisitor`, so it must run on the large dedicated + // stack behind the same pre-scan caps — a deeply nested DAG that + // survived the static path would otherwise crash here. + crate::panic_safe::shield_parser_panic("ruff-markers", source, || { + detect_dynamic_markers_impl(source) + }) +} + +fn detect_dynamic_markers_impl(source: &str) -> Result, ParseError> { let parsed = parse_module_safely(source)?; let module: &ModModule = parsed.syntax(); let line_index = LineIndex::new(source); @@ -424,16 +435,28 @@ fn is_dag_callable(expr: &Expr) -> bool { } } +/// Stack-safety cap for the decorator-callee chain walk below. A +/// `@dag()()()…` / `@task()()()…` decorator builds a left-leaning call +/// chain the parser produces iteratively, so it is not bounded by the +/// lexer recursion cap; truncate past this depth (such a decorator is not +/// a `@dag` / `@task` decorator anyway). Mirrors `ruff_impl`'s +/// `MAX_WALK_DEPTH` so both decorator paths are bounded identically +/// (Codex DD R8, 2026-06-16). +const MAX_DECORATOR_CHAIN_DEPTH: usize = 1024; + fn match_dag_decorator(expr: &Expr) -> bool { - fn inner(expr: &Expr) -> Option<&str> { + fn inner(expr: &Expr, depth: usize) -> Option<&str> { + if depth > MAX_DECORATOR_CHAIN_DEPTH { + return None; + } match expr { Expr::Name(ExprName { id, .. }) => Some(id.as_str()), Expr::Attribute(ExprAttribute { attr, .. }) => Some(attr.as_str()), - Expr::Call(call) => inner(&call.func), + Expr::Call(call) => inner(&call.func, depth + 1), _ => None, } } - matches!(inner(expr), Some(name) if DAG_DECORATOR_NAMES.contains(&name)) + matches!(inner(expr, 0), Some(name) if DAG_DECORATOR_NAMES.contains(&name)) } fn callee_is_chain_helper(expr: &Expr) -> bool { @@ -454,15 +477,18 @@ fn is_operator_constructor(expr: &Expr) -> bool { } fn is_task_decorator_call(call: &ExprCall) -> bool { - fn inner(expr: &Expr) -> Option<&str> { + fn inner(expr: &Expr, depth: usize) -> Option<&str> { + if depth > MAX_DECORATOR_CHAIN_DEPTH { + return None; + } match expr { Expr::Name(ExprName { id, .. }) => Some(id.as_str()), Expr::Attribute(ExprAttribute { attr, .. }) => Some(attr.as_str()), - Expr::Call(c) => inner(&c.func), + Expr::Call(c) => inner(&c.func, depth + 1), _ => None, } } - matches!(inner(&call.func), Some(name) if TASK_DECORATORS.contains(&name)) + matches!(inner(&call.func, 0), Some(name) if TASK_DECORATORS.contains(&name)) } /// Conservative: anything that is not `@task()` (zero-arg) is considered diff --git a/crates/ferro-airflow-dag-parser/src/panic_safe.rs b/crates/ferro-airflow-dag-parser/src/panic_safe.rs index 5cb66bb..5d73d62 100644 --- a/crates/ferro-airflow-dag-parser/src/panic_safe.rs +++ b/crates/ferro-airflow-dag-parser/src/panic_safe.rs @@ -1,19 +1,52 @@ // SPDX-License-Identifier: Apache-2.0 -//! Panic isolation around the upstream `ruff_python_parser`. +//! Stack-safety + panic isolation around the upstream +//! `littrs-ruff-python-parser` recursive-descent parser. //! //! `littrs-ruff-python-parser` (the crates.io mirror of -//! `astral-sh/ruff`'s parser) panics on a small number of malformed -//! inputs — we have observed this directly from `cargo fuzz`. A -//! parser whose host process can be killed by a malicious DAG file -//! is unsafe to expose on a dag-processor that sits behind a -//! filesystem watcher, so this module wraps every entry point in -//! `std::panic::catch_unwind` and surfaces the panic as -//! [`ParseError::Internal`]. +//! `astral-sh/ruff`'s parser) can take down the host process on +//! adversarial DAG source in two distinct ways, and a dag-processor +//! that watches an attacker-writable folder is exposed to both: //! -//! When upstream fixes the panic class we can drop this shim, but -//! the cost (one allocation per parse on the panic path, zero on the -//! happy path) is small enough that we will keep it indefinitely as -//! defence-in-depth. +//! 1. **Stack overflow.** The parser descends one frame per grouping +//! construct (`([{`, dict/set, f-string field) *and* per +//! prefix-unary / right-associative / conditional operator, with no +//! upstream recursion limit (verified against `ParseOptions` in +//! 0.6.2). A pathologically nested input overflows the parse thread's +//! guard page, which Linux delivers as a fatal `SIGSEGV`. A +//! guard-page fault is **not** an unwinding panic, so +//! [`std::panic::catch_unwind`] / thread-`join` CANNOT intercept it. +//! 2. **Unwinding panic.** The vendored parser reaches an +//! `unreachable!()` (`expression.rs:1633`) on PEP 750 t-strings and a +//! handful of f-/t-string token-confusion shapes (the 2026-05-03 fuzz +//! wave, 7 distinct repros). That is a normal Rust panic. +//! +//! The defence is three layers, ported from the `FerroAir` dag-parser +//! (`ferroair-dag-parser::api`, FA1 / F-2026-06-08), which shares the +//! exact same 0.6.2 backend: +//! +//! * **Layer 1 — bracket pre-scan** ([`nesting_depth_exceeds`]): an +//! iterative byte scan that rejects grouping-delimiter nesting deeper +//! than [`MAX_PARSE_NESTING_DEPTH`] before the parser runs. +//! * **Layer 1b — lexer pre-scan** ([`lex_reject_reason`]): a single +//! iterative tokenizer pass that rejects t-strings (which the parser +//! panics on) and source whose expression-recursion depth +//! (`brackets + operator-run + per-line right-recursion + indent`) +//! would exceed [`MAX_PARSE_RECURSION`] — this catches the *non*-bracket +//! recursion vectors (`not`/`await`/`~`/`-` chains, `a**b**c`, +//! `a if b else …`, `lambda: lambda:`, deep statement indent) that a +//! bracket-only cap misses entirely. +//! * **Layer 2 — large dedicated stack** ([`PARSE_STACK_SIZE`]): the +//! parse *and* the AST walk run on a 128 MiB thread so the numeric cap, +//! never the (possibly ~2 MiB) caller stack, is the binding limit; the +//! thread's `join()` also folds any unwinding panic into +//! [`ParseError::Internal`]. +//! +//! This is not a claim of bulletproof input handling — the true +//! guarantee is exactly those three layers: a numeric depth cap that +//! sits far above any real DAG and far below the overflow threshold, a +//! real-tokenizer reject pass with no byte-heuristic false positives, +//! and a dedicated stack that makes the cap (not the host stack) the +//! limit. use std::panic::AssertUnwindSafe; @@ -22,94 +55,69 @@ use ruff_python_parser::{ParseError as RuffParseError, Parsed}; use crate::common::ParseError; -/// Maximum bracket nesting depth we hand to `ruff_python_parser`. -/// -/// `catch_unwind` only catches Rust panics. A deeply-nested expression -/// such as `((((... 200×` triggers a recursive descent in the upstream -/// parser that overflows the thread stack, which Linux delivers as -/// `SIGSEGV` and aborts the process before the unwinder runs. The -/// `panic_safe` shim cannot recover from that, so we pre-screen for -/// pathological bracket nesting and reject it as a parse error before -/// the parser ever sees it. +/// Maximum grouping-delimiter nesting depth accepted before the source +/// is rejected WITHOUT invoking the recursive-descent parser. /// -/// 32 is the smallest cap that comfortably covers legitimate Airflow -/// DAG nesting (real DAGs almost never exceed depth 10, even with -/// nested `with DAG(...)` and `BashOperator(task_id=..., bash_command= -/// f"...")` shapes), while rejecting the fuzz corpus -/// (`crash-bd9f087c...` opens 47 brackets without closure and lex-paths -/// the parser into a 60+-frame recursive descent that exits via -/// SIGSEGV on the default thread stack). Independent stack-overflow -/// tests in `tests/parser_stack_safety.rs` confirm depth 32 stays -/// well under the observed overflow point. +/// `256` is chosen with measured headroom: across the entire vendored +/// Apache Airflow source tree the deepest legitimate file nests to 96 +/// (an AWS `SageMaker` example DAG with a deeply nested boto config); the +/// next deepest is 17. A cap of 256 clears the real-world maximum by +/// 2.6× while still rejecting the multi-hundred-deep adversarial inputs +/// the fuzz corpus accumulates. /// -/// # Upstream durable fix -/// -/// See `MAX_UNARY_OP_RUN` below for the same cross-reference to -/// astral-sh/ruff PR #24810 — the same parser-side recursion guard -/// covers the bracket-nesting recursion shape as well. -const MAX_BRACKET_DEPTH: usize = 32; +/// The byte scan never *under*-counts a real grouping delimiter (every +/// `([{` is a literal byte the scan sees), so any input that would +/// recurse past the cap is rejected; it may *over*-count brackets that +/// live inside string / comment literals, which only makes the guard +/// reject *more* (the safe direction). +const MAX_PARSE_NESTING_DEPTH: usize = 256; -/// Maximum run length of consecutive unary-prefix operator bytes (`~`, -/// `-`, `+`) we will accept, ignoring intervening ASCII whitespace. -/// -/// Each `~`, `-`, `+` at expression-start position becomes a -/// `UnaryOp(...)` AST node nested inside the next, and the upstream -/// `ruff_python_parser` builds that nesting via recursive descent. A -/// long run such as `~~~~~ ... x` (the 2026-05-09 fuzz artifact -/// `crash-ba2528b6...` carries 1961 consecutive `~`) overflows the -/// thread stack inside `ruff_python_parser`'s expression entry, which -/// Linux delivers as `SIGSEGV`. `catch_unwind` does NOT catch stack -/// overflow, so the production process aborts. Pre-screening the -/// input before the parser ever sees it is the only defence. -/// -/// 64 is well above any legitimate Python expression (real code never -/// chains more than two `--` / `~~` operators) and well below the -/// observed overflow threshold (between 1000 and 1500 on a 2 MiB -/// thread stack). The same operator chain via the `not` keyword is -/// recognised separately because it spans 3 bytes plus whitespace. -/// -/// # Upstream durable fix +/// Maximum recursive-expression depth (grouping delimiters + operator +/// recursion + statement indent) accepted before the source is rejected. /// -/// astral-sh/ruff PR #24810 ("Parser recursion limit", -/// , OPEN as of -/// 2026-05-09) introduces a `max_recursion_depth` parameter on the -/// parser (default 202, matching `CPython`'s `MAXSTACK`) that wraps every -/// recursive expression / statement / pattern entry — including the -/// `parse_lhs_expression` site that this `MAX_UNARY_OP_RUN` cap exists -/// to defend against. Once that PR merges and lands in a -/// `littrs-ruff-python-parser` release, this caller-side pre-screen -/// becomes redundant; we keep it as defence in depth (the cost is one -/// linear scan per parse, zero allocations on the happy path) and -/// because it also defends against the same recursion shape via -/// `BinOp` (e.g. `1+1+1+1+...`) which the upstream limit catches -/// equally well but at parse-time rather than pre-screen-time. +/// The bracket pre-scan above bounds `([{` nesting, but the vendored +/// parser also recurses with no grouping delimiter on prefix-unary +/// chains (`not not not …`, `- - - …`, `~~~…`, `await await …` — one +/// frame per operator), on the right-associative power operator +/// (`a ** b ** c ** …`), on conditional / lambda chains +/// (`a if b else …`, `lambda: lambda: …`), and at the *statement* level +/// on deeply nested compound statements (`if a:` / `for …:` …, one frame +/// per indentation level). None of these are caught by counting +/// brackets, yet each overflows even a 128 MiB stack within a realistic +/// input size. A stack-overflow abort is uncatchable, so it must be +/// bounded *before* the parser runs. /// -/// Tracking issue: astral-sh/ruff #22930. -const MAX_UNARY_OP_RUN: usize = 64; +/// `1024` clears any realistic DAG by a wide margin — across the +/// vendored Apache Airflow tree the deepest combined depth is well under +/// ~120 (the depth-96 `SageMaker` bracket nest) — while sitting far below +/// the ~100 k-frame depth at which the parser overflows. +const MAX_PARSE_RECURSION: usize = 1024; -/// Parse a Python module while isolating any panic from the upstream -/// parser as [`ParseError::Internal`], and any stack-overflow class -/// failure as [`ParseError::Parse`]. +/// Stack size for the dedicated parse / walk thread. A `tokio` worker or +/// `std::thread::spawn` thread defaults to only ~2 MiB, which a +/// legitimately deep DAG (the depth-96 `SageMaker` example) can already +/// exhaust under instrumentation. Parsing on a 128 MiB stack gives the +/// recursive descent ample headroom so that the depth cap above — never +/// a stack overflow — is what stops a deep input, regardless of how +/// small the *caller's* stack is. +const PARSE_STACK_SIZE: usize = 128 * 1024 * 1024; + +/// Parse a Python module, surfacing an upstream unwinding panic as +/// [`ParseError::Internal`] and a syntax error as [`ParseError::Parse`]. +/// +/// This is the *inner* parse: it assumes the caller has already run the +/// [`shield_parser_panic`] pre-scans (so pathologically nested input +/// never reaches it) and runs inside the shield's large stack. The +/// `catch_unwind` is retained as defence-in-depth so a direct call is +/// still panic-safe. pub fn parse_module_safely(source: &str) -> Result, ParseError> { - if let Some(depth) = max_bracket_depth_exceeds(source, MAX_BRACKET_DEPTH) { - return Err(ParseError::Parse(format!( - "input rejected: bracket nesting depth {depth} exceeds {MAX_BRACKET_DEPTH} \ - (would risk stack overflow in upstream parser)" - ))); - } - if let Some(run) = max_unary_op_run_exceeds(source, MAX_UNARY_OP_RUN) { - return Err(ParseError::Parse(format!( - "input rejected: unary-prefix operator chain length {run} exceeds {MAX_UNARY_OP_RUN} \ - (would risk stack overflow in upstream parser)" - ))); - } let result: Result, RuffParseError>, _> = std::panic::catch_unwind(AssertUnwindSafe(|| ruff_parser::parse_module(source))); match result { Ok(Ok(parsed)) => Ok(parsed), Ok(Err(e)) => Err(ParseError::Parse(e.to_string())), Err(panic_payload) => { - let msg = panic_message(&panic_payload); + let msg = panic_payload_message(panic_payload.as_ref()); Err(ParseError::Internal(format!( "ruff_python_parser panicked on input: {msg}" ))) @@ -117,76 +125,282 @@ pub fn parse_module_safely(source: &str) -> Result Option { +/// Iterative (non-recursive, overflow-proof) bracket pre-scan. Returns +/// the maximum grouping-delimiter nesting depth iff it exceeds `cap`. +/// +/// Counts opening brackets `(`, `[`, `{` and pairs them with `)`, `]`, +/// `}`. String literals and comments are NOT excluded — that would +/// require a real tokenizer; the resulting false positives are harmless +/// because `cap` is far above what real code uses. +fn nesting_depth_exceeds(source: &str, cap: usize) -> Option { let mut depth: usize = 0; + let mut max: usize = 0; for &b in source.as_bytes() { match b { b'(' | b'[' | b'{' => { - depth = depth.saturating_add(1); - if depth > limit { - return Some(depth); + depth += 1; + if depth > max { + max = depth; } } - b')' | b']' | b'}' => { - depth = depth.saturating_sub(1); - } + b')' | b']' | b'}' => depth = depth.saturating_sub(1), _ => {} } } - None + (max > cap).then_some(max) } -/// Walk `source` once and return the first run length that exceeds -/// `limit`, or `None` if every consecutive run of unary-prefix operator -/// bytes (`~`, `-`, `+`) — counting operator bytes only and skipping -/// ASCII whitespace within the run — stays within bounds. +/// Single iterative lexer pass that rejects source the vendored parser +/// cannot handle safely, returning a skip-file reason or `None`. /// -/// The scan does not differentiate between unary and binary uses of -/// `-` / `+` (e.g. `a - b - c - ...` would also count). That is -/// deliberate: real Python source never chains 64+ consecutive `-` / -/// `+` either (every additional operator means another `BinOp` -/// expression node, equally subject to recursive descent), so the -/// over-broad scan provides extra defence in depth without rejecting -/// legitimate code. The cap is enforced only on operator characters, -/// so identifier characters and digits between them break the run. -fn max_unary_op_run_exceeds(source: &str, limit: usize) -> Option { - let mut run: usize = 0; - for &b in source.as_bytes() { - match b { - b'~' | b'-' | b'+' => { - run = run.saturating_add(1); - if run > limit { - return Some(run); +/// The lexer is iterative (a mode *stack*, never recursion) so it cannot +/// itself overflow, and — being the real tokenizer — it classifies +/// `t"`/`{{`/operators-inside-strings correctly, so there are no +/// byte-heuristic false positives. Two rejections: +/// +/// 1. **t-strings** (`TStringStart`) — the parser PANICS on PEP 750 +/// t-strings (`unreachable!()` at `expression.rs:1633`) rather than +/// erroring. Real Airflow DAGs (Python <= 3.12) never use them and +/// the vendored parser cannot parse them anyway. +/// 2. **Excessive expression recursion** — a running depth of +/// `brackets + consecutive-operator-run + per-line right-recursive +/// operators + indent`. The consecutive-operator run resets at every +/// operand, so left-associative chains (`a+b+c…`, big list/dict +/// literals) stay at depth ~1 and are NOT rejected; only genuine +/// prefix / `**` / conditional / lambda recursion and statement +/// indent accumulate. +fn lex_reject_reason(source: &str) -> Option { + use ruff_python_ast::token::TokenKind as T; + use ruff_python_parser::Mode; + use ruff_python_parser::lexer::lex; + + let mut lexer = lex(source, Mode::Module); + let mut brackets: usize = 0; + // Consecutive operator tokens since the last operand/delimiter — + // captures prefix-unary chains; resets at any non-operator so + // left-associative binary chains do not accumulate. + let mut op_run: usize = 0; + // Right-recursive operators (`**`, conditional `if`/`else`, `lambda`) + // on the current logical line — these do NOT reduce at each operand, + // so they are counted per line and reset at a top-level newline. + let mut line_right_rec: usize = 0; + // Block-statement nesting depth (Indent − Dedent). Deeply nested + // compound statements (`if a:` / `for …:` / `def …:` …) recurse at + // the *statement* level — one frame per indentation level — + // independently of any expression nesting, so they must be bounded + // too. + let mut indent: usize = 0; + let mut max: usize = 0; + + loop { + let tk = lexer.next_token(); + match tk { + T::EndOfFile => break, + T::Indent => { + indent += 1; + op_run = 0; + } + T::Dedent => { + indent = indent.saturating_sub(1); + op_run = 0; + } + T::TStringStart => { + return Some( + "input rejected: Python 3.14 t-strings (PEP 750) are not supported by \ + the vendored parser" + .to_owned(), + ); + } + T::Lpar | T::Lsqb | T::Lbrace => { + brackets += 1; + op_run = 0; + } + T::Rpar | T::Rsqb | T::Rbrace => { + brackets = brackets.saturating_sub(1); + op_run = 0; + } + // Top-level statement boundary: the right-recursive run ends. + T::Newline => { + op_run = 0; + if brackets == 0 { + line_right_rec = 0; } } - // ASCII whitespace is permissive within a run so - // `~ ~ ~ ~ ... x` is rejected the same as `~~~~...x`. - b' ' | b'\t' | b'\r' | b'\n' => {} - _ => { - run = 0; + // Trivia: a non-logical newline (a physical line break INSIDE + // a bracket / implicit continuation) and a comment are NOT + // operands and must be transparent to the run counters. A + // prefix chain split across physical lines inside parens — + // `x = (\n not\n not\n … a\n)` — recurses one parser frame per + // `not` regardless of the line breaks; if the default arm + // below reset `op_run` on each `NonLogicalNewline` the chain + // would never accumulate and would overflow the parser (Codex + // DD R6, 2026-06-16). Skipping them here makes the consecutive + // run survive the line breaks, exactly like the operands that + // genuinely separate operators do not. + T::NonLogicalNewline | T::Comment => {} + // Operators that drive parser recursion. Counting a generous + // superset (incl. left-assoc binaries) only ever *over*-counts + // — the safe direction — and the per-operand reset keeps real + // binary chains shallow. + T::Not + | T::Minus + | T::Plus + | T::Tilde + | T::Star + | T::DoubleStar + | T::Await + | T::Lambda + | T::Slash + | T::Percent + | T::And + | T::Or + | T::If + | T::Else + // `yield` / `yield from` recurse one parser frame per keyword: + // `parse_yield[_from]_expression` parses another expression to + // its right, which can be another `yield`. `yield from yield + // from … x` (and `yield yield … x`) has NO operand between the + // keywords, so counting `Yield` + `From` in the consecutive-run + // makes the chain accumulate and be rejected (Codex DD R2, + // 2026-06-16). A single `yield from x` is op_run 2 — well clear. + | T::Yield + | T::From + // `async` recurses at the STATEMENT level via error recovery: + // `parse_async_statement` consumes a stray `async`, records the + // error, then calls `parse_statement` again — so `async async + // … def f(): pass` is one parser frame per `async` with no + // bracket / indent / operand between (Codex DD R3, 2026-06-16). + // A well-formed `async def` / `async for` / `async with` is + // op_run 1 (the next token is an operand/keyword that resets). + | T::Async => { + op_run += 1; + if matches!(tk, T::DoubleStar | T::If | T::Else | T::Lambda) { + line_right_rec += 1; + } } + // Any operand / other token ends a consecutive-operator run. + _ => op_run = 0, + } + max = max.max(brackets + op_run + line_right_rec + indent); + if max > MAX_PARSE_RECURSION { + return Some(format!( + "input rejected: expression recursion depth {max} exceeds the stack-safety \ + cap of {MAX_PARSE_RECURSION}" + )); } } None } -fn panic_message(payload: &Box) -> String { +/// Shield a parse + AST-walk against the two ways adversarial DAG source +/// can take down the host thread (see the module docs): stack overflow +/// (headed off before the parser runs by the two pre-scans, then given a +/// 128 MiB stack so the cap is the binding limit) and an unwinding panic +/// (captured by the parse thread's `join()` and folded into +/// [`ParseError::Internal`]). +/// +/// `f` should perform the *whole* unit of work that touches the AST +/// (parse + recursive visitor walk), so the walk is bounded by the same +/// large stack and depth caps as the parse — a deep tree that survives +/// the parser would otherwise overflow a recursive walker on the +/// caller's small stack. +/// +/// On either rejection the production posture is: log at `warn`, skip the +/// offending file, keep serving the remaining DAGs. Callers already treat +/// [`ParseError::Parse`] / [`ParseError::Internal`] this way. +pub fn shield_parser_panic(backend: &'static str, src: &str, f: F) -> Result +where + F: FnOnce() -> Result + Send, + T: Send, +{ + // Layer 1 — reject pathologically nested input before it can drive + // the recursive-descent parser into a fatal (uncatchable) overflow. + if let Some(depth) = nesting_depth_exceeds(src, MAX_PARSE_NESTING_DEPTH) { + tracing::warn!( + target: "ferro_airflow_dag_parser::shield", + backend, + depth, + cap = MAX_PARSE_NESTING_DEPTH, + "DAG source rejected: grouping-delimiter nesting depth exceeds the \ + stack-safety cap; skipping file" + ); + return Err(ParseError::Parse(format!( + "input rejected: bracket/brace nesting depth {depth} exceeds the \ + stack-safety cap of {MAX_PARSE_NESTING_DEPTH}" + ))); + } + + // Layer 1b — iterative lexer pass: reject t-strings (the parser + // panics on them) and source whose expression recursion depth + // (brackets + operator chains incl. non-bracket prefix / `**` / + // conditional recursion + indent) would overflow even the 128 MiB + // stack. The lexer cannot itself overflow. + if let Some(reason) = lex_reject_reason(src) { + tracing::warn!( + target: "ferro_airflow_dag_parser::shield", + backend, + %reason, + "DAG source rejected before parse (stack-safety / unsupported syntax); skipping file" + ); + return Err(ParseError::Parse(reason)); + } + + // Layer 2 — run the parse + walk on a large dedicated stack and fold + // any unwinding panic into a skip-file error. The scoped thread lets + // `f` borrow `src` without a copy; `join()` both captures the panic + // and guarantees PARSE_STACK_SIZE of headroom for the recursive + // descent regardless of the caller's stack. + let joined = std::thread::scope(|scope| { + std::thread::Builder::new() + .name("ferro-airflow-dag-parse".into()) + .stack_size(PARSE_STACK_SIZE) + .spawn_scoped(scope, f) + .map(std::thread::ScopedJoinHandle::join) + }); + + match joined { + // Parse + walk ran to completion — propagate its Result. + Ok(Ok(result)) => result, + // The thread panicked (unwinding) — fold into Internal. + Ok(Err(payload)) => { + let msg = panic_payload_message(payload.as_ref()); + tracing::warn!( + target: "ferro_airflow_dag_parser::shield", + backend, + payload = %msg, + "upstream python parser panicked; surfacing as ParseError::Internal" + ); + Err(ParseError::Internal(format!( + "ruff_python_parser panicked on input: {msg}" + ))) + } + // Thread spawn itself failed (OS thread / address-space limit) — + // surface as an internal error rather than panicking the caller. + Err(e) => Err(ParseError::Internal(format!( + "failed to spawn dag-parse thread: {e}" + ))), + } +} + +/// Best-effort extraction of a panic payload's text. Handles the two +/// common payload shapes (`&'static str`, `String`) and falls back to a +/// sentinel for anything else. +fn panic_payload_message(payload: &(dyn std::any::Any + Send)) -> String { payload .downcast_ref::<&'static str>() - .map(|s| (*s).to_string()) + .map(|s| (*s).to_owned()) .or_else(|| payload.downcast_ref::().cloned()) - .unwrap_or_else(|| "".to_string()) + .unwrap_or_else(|| "".to_owned()) } #[cfg(test)] mod tests { use super::*; + use crate::common::ExtractedDag; + + // ----------------------------------------------------------------- + // parse_module_safely — inner parse + catch_unwind + // ----------------------------------------------------------------- #[test] fn happy_path_returns_parsed() { @@ -200,86 +414,14 @@ mod tests { assert!(matches!(err, ParseError::Parse(_)), "got {err:?}"); } - #[test] - fn deep_bracket_nesting_rejected_before_parser() { - // 1000× `(` without closure — far above MAX_BRACKET_DEPTH. - // Pre-fix this would recursive-descend into the upstream - // parser and SIGSEGV on stack overflow; the shim now turns - // that into a graceful `ParseError::Parse`. - let src = "(".repeat(1000); - let err = parse_module_safely(&src).expect_err("must reject"); - match err { - ParseError::Parse(msg) => { - assert!( - msg.contains("bracket nesting depth"), - "unexpected message: {msg}" - ); - } - other => panic!("expected Parse, got {other:?}"), - } - } - - #[test] - fn moderate_nesting_passes_to_parser() { - // 16 nested calls — well within MAX_BRACKET_DEPTH (32). The - // upstream parser handles this; the test pins the cap is - // generous enough for real-world DAG shapes. - let src = "x = ".to_string() + &"(".repeat(16) + "1" + &")".repeat(16) + "\n"; - let _ = parse_module_safely(&src).expect("must parse"); - } - - #[test] - fn long_unary_op_chain_rejected_before_parser() { - // 2026-05-09 fuzz artifact `crash-ba2528b6...` carried 1961 - // consecutive `~` characters. Pre-fix, the upstream parser - // recursive-descended ~1500 frames and SIGSEGVed on the - // default 2 MiB thread stack — `catch_unwind` does not catch - // stack overflow, so the production process aborts. The - // pre-screen now turns that into a graceful `ParseError::Parse`. - let src = "~".repeat(1961) + "x"; - let err = parse_module_safely(&src).expect_err("must reject"); - match err { - ParseError::Parse(msg) => { - assert!( - msg.contains("unary-prefix operator chain"), - "unexpected message: {msg}" - ); - } - other => panic!("expected Parse, got {other:?}"), - } - } - - #[test] - fn moderate_unary_op_chain_passes_to_parser() { - // 8 nested negations (`--------x` = `Unary(-, Unary(-, ...))`). - // Far below MAX_UNARY_OP_RUN (64); the parser handles this - // and the cap stays out of the way for legitimate expressions. - let src = "x = ".to_string() + &"-".repeat(8) + "1\n"; - let _ = parse_module_safely(&src).expect("must parse"); - } - - #[test] - fn whitespace_within_unary_chain_still_rejected() { - // `~ ~ ~ ~ ... x` with 100 `~` separated by spaces — same - // recursion shape as `~~~~...x`, must be rejected. - let mut src = String::new(); - for _ in 0..100 { - src.push_str("~ "); - } - src.push('x'); - let err = parse_module_safely(&src).expect_err("must reject"); - assert!(matches!(err, ParseError::Parse(_)), "got {err:?}"); - } - /// Regression test for an upstream `littrs-ruff-python-parser` panic /// found by `cargo fuzz` (artifact /// `crash-11c5872116de631c61335e4a170f3d1acf82241f`). The bytes - /// trigger a panic at `parser/expression.rs:1633:25` in the - /// upstream parser; our shim must catch it and return + /// trigger a panic at `parser/expression.rs:1633:25` in the upstream + /// parser; the inner shim must catch it and return /// `ParseError::Internal` rather than aborting. #[test] - fn shim_catches_upstream_parser_panic() { - // Slice extracted from the fuzz artifact. + fn parse_module_safely_catches_upstream_parser_panic() { let crash_bytes: &[u8] = &[ 0x26, 0x26, 0x60, 0x08, 0x00, 0x31, 0x74, 0x27, 0x40, 0x0a, 0x00, 0x30, 0x74, 0x27, 0x10, 0x0a, 0x00, 0x30, 0x74, 0x27, 0x66, 0x22, 0x66, 0x27, 0x27, 0x66, 0x27, 0x66, @@ -292,21 +434,190 @@ mod tests { 0x7b, 0x10, 0x00, 0x7b, 0x3a, 0x2e, 0x00, 0x00, 0x00, 0x7b, 0x7b, 0x7b, 0x7b, 0x6b, ]; let Ok(src) = std::str::from_utf8(crash_bytes) else { - // The bytes happen to be valid UTF-8; if a future maintainer - // mutates the test material into invalid UTF-8 the shim - // signature is unchanged and we just skip. return; }; - // Three acceptable outcomes: - // - `Ok(_)`: upstream fixed the panic, shim is now a no-op. - // - `Err(ParseError::Parse(_))`: upstream now returns a - // graceful syntax error for the same input. - // - `Err(ParseError::Internal(_))`: shim caught the upstream - // panic and converted it. - // Anything else is a regression. match parse_module_safely(src) { Ok(_) | Err(ParseError::Parse(_) | ParseError::Internal(_)) => {} Err(other) => panic!("unexpected variant: {other:?}"), } } + + // ----------------------------------------------------------------- + // shield_parser_panic — large stack + join panic capture + // ----------------------------------------------------------------- + + #[test] + fn shield_returns_ok_when_closure_does_not_panic() { + let out = shield_parser_panic("test", "", || -> Result, ParseError> { + Ok(Vec::new()) + }) + .expect("no panic, no parse error"); + assert!(out.is_empty()); + } + + #[test] + fn shield_converts_static_str_panic_to_internal() { + let err = shield_parser_panic("test", "", || -> Result, ParseError> { + panic!("static-str payload"); + }) + .expect_err("must convert panic to error"); + match err { + ParseError::Internal(msg) => assert!(msg.contains("static-str payload")), + other => panic!("expected Internal, got {other:?}"), + } + } + + #[test] + fn shield_converts_string_panic_to_internal() { + let err = shield_parser_panic("test", "", || -> Result, ParseError> { + panic!("{}", String::from("owned-string payload")); + }) + .expect_err("must convert panic to error"); + match err { + ParseError::Internal(msg) => assert!(msg.contains("owned-string payload")), + other => panic!("expected Internal, got {other:?}"), + } + } + + #[test] + fn panic_payload_message_handles_unknown_shape() { + let payload: Box = Box::new(0_u32); + let msg = panic_payload_message(payload.as_ref()); + assert_eq!(msg, ""); + } + + // ----------------------------------------------------------------- + // nesting_depth_exceeds — bracket pre-scan boundary + // ----------------------------------------------------------------- + + #[test] + fn nesting_depth_exceeds_reports_depth_only_past_cap() { + assert_eq!(nesting_depth_exceeds("([{}])", 10), None); + assert_eq!(nesting_depth_exceeds("", 10), None); + // closers floor the running depth at 0 — unbalanced closers never + // report a phantom depth. + assert_eq!(nesting_depth_exceeds("))))]]]", 10), None); + let deep = "[".repeat(20); + assert_eq!(nesting_depth_exceeds(&deep, 10), Some(20)); + } + + #[test] + fn nesting_depth_exceeds_is_exact_at_the_cap_boundary() { + // Depth EXACTLY at the cap must be admitted (the comparison is + // strictly `>`, not `>=`); cap+1 must be rejected and report the + // true depth. Pins the boundary so a `>`/`>=` mutant is caught. + let cap = 8usize; + assert_eq!(nesting_depth_exceeds(&"(".repeat(cap), cap), None); + assert_eq!( + nesting_depth_exceeds(&"(".repeat(cap + 1), cap), + Some(cap + 1) + ); + // Each of the three delimiter classes counts, and the max is the + // peak simultaneous depth (so a balanced-then-reopened sequence + // reports the peak, not the sum). Pins the per-`+= 1` increment. + assert_eq!(nesting_depth_exceeds("[][][]", 1), None); // peak 1 + assert_eq!(nesting_depth_exceeds("{{{", 2), Some(3)); + assert_eq!(nesting_depth_exceeds("((( )))((( )))", 3), None); // peak 3 + } + + #[test] + fn nesting_depth_exceeds_at_production_cap() { + // The production cap (256) admits depth-256 and rejects 257. + assert_eq!( + nesting_depth_exceeds( + &"(".repeat(MAX_PARSE_NESTING_DEPTH), + MAX_PARSE_NESTING_DEPTH + ), + None + ); + assert_eq!( + nesting_depth_exceeds( + &"(".repeat(MAX_PARSE_NESTING_DEPTH + 1), + MAX_PARSE_NESTING_DEPTH + ), + Some(MAX_PARSE_NESTING_DEPTH + 1) + ); + } + + // ----------------------------------------------------------------- + // lex_reject_reason — non-bracket recursion vectors + t-strings + // ----------------------------------------------------------------- + + #[test] + fn lex_reject_catches_non_bracket_recursion_vectors() { + // Prefix-unary, right-assoc `**`, conditional, lambda, and deep + // statement indent recurse with NO bracket, overflowing even the + // 128 MiB stack. All must be rejected BEFORE the parser runs. + let prefix_not = format!("x = {}True", "not ".repeat(3000)); + let prefix_neg = format!("x = {}1", "-".repeat(3000)); + let prefix_inv = format!("x = {}1", "~".repeat(3000)); + let prefix_await = format!("x = {}y", "await ".repeat(3000)); + let pow_chain = format!("x = {}2", "2**".repeat(3000)); + let cond_chain = format!("x = {}1", "1 if a else ".repeat(2000)); + let lambda_chain = format!("f = {}1", "lambda: ".repeat(3000)); + // Deeply nested compound statements recurse at the statement level. + let mut nested_if = String::new(); + for i in 0..1200 { + for _ in 0..i { + nested_if.push(' '); + } + nested_if.push_str("if a:\n"); + } + for _ in 0..1200 { + nested_if.push(' '); + } + nested_if.push_str("pass\n"); + for (label, src) in [ + ("not-chain", &prefix_not), + ("neg-chain", &prefix_neg), + ("inv-chain", &prefix_inv), + ("await-chain", &prefix_await), + ("pow-chain", &pow_chain), + ("cond-chain", &cond_chain), + ("lambda-chain", &lambda_chain), + ("nested-if", &nested_if), + ] { + assert!( + lex_reject_reason(src).is_some(), + "{label} must be rejected by the recursion guard" + ); + } + } + + #[test] + fn lex_reject_flags_tstrings() { + let reason = lex_reject_reason("x = t\"hello {name}\"\n").expect("t-string rejected"); + assert!(reason.contains("t-strings"), "unexpected reason: {reason}"); + } + + #[test] + fn lex_reject_has_no_false_positives_on_real_expressions() { + // Left-associative chains, big literals, normal f-strings, and a + // deeply-but-legitimately nested config must NOT be rejected — the + // per-operand reset keeps them shallow. + let sum_chain = format!("total = {}0", "v + ".repeat(5000)); // left-assoc, iterative + let big_list = format!("xs = [{}]", "a + b, ".repeat(5000)); // 5k elements of sums + let fstring = "msg = f\"run {dag_id} at {ts} value={x + y}\"\n".to_owned(); + let nested = format!("cfg = {}1{}", "[".repeat(96), "]".repeat(96)); // depth-96 SageMaker + // A realistically deep DAG: nested with/for/if blocks (~12 levels). + let mut real_nest = String::new(); + for i in 0..12 { + real_nest.push_str(&" ".repeat(i)); + real_nest.push_str("with x:\n"); + } + real_nest.push_str(&" ".repeat(12)); + real_nest.push_str("pass\n"); + for (label, src) in [ + ("sum-chain", &sum_chain), + ("big-list", &big_list), + ("fstring", &fstring), + ("nested-96", &nested), + ("real-nested-blocks", &real_nest), + ] { + assert!( + lex_reject_reason(src).is_none(), + "{label} is a legitimate expression and must NOT be rejected" + ); + } + } } diff --git a/crates/ferro-airflow-dag-parser/src/ruff_impl.rs b/crates/ferro-airflow-dag-parser/src/ruff_impl.rs index d2365b6..c8d4e29 100644 --- a/crates/ferro-airflow-dag-parser/src/ruff_impl.rs +++ b/crates/ferro-airflow-dag-parser/src/ruff_impl.rs @@ -23,6 +23,24 @@ const DAG_NAMES: &[&str] = &["DAG"]; const DAG_DECORATOR_NAMES: &[&str] = &["dag"]; const SETTER_METHODS: &[&str] = &["set_upstream", "set_downstream"]; +/// Maximum depth the recursive AST walkers descend before truncating. +/// +/// `collect_shift_edges`, `terminal_task_ids`, `resolve_to_task_id`, +/// `stringify_expr`, and the decorator `inner_name` walk left-leaning +/// chains — `a >> b >> …` (shift), `a.b.c…` (attribute), `f()()…` +/// (call) — that the parser builds *iteratively*, so the resulting tree +/// can be far deeper than any expression the lexer-pass recursion cap in +/// `panic_safe.rs` bounds (that cap counts brackets / prefix-operator +/// runs / indent, none of which a left-leaning binary or trailer chain +/// accumulates). A deep-but-cap-admitted attribute chain +/// (`schedule=a.a.a…`, ~200 k deep) overflowed `stringify_expr` even on +/// the 128 MiB parse stack (Codex DD, 2026-06-16). Past this depth the +/// walk degrades gracefully — a sentinel string / dropped edge / `None` — +/// rather than aborting the process. `1024` clears any real DAG (the +/// longest realistic `>>` chain is a few hundred tasks) by a wide margin +/// and sits far below the overflow point. +const MAX_WALK_DEPTH: usize = 1024; + /// Parse `source` and return all DAGs found. /// /// # Errors @@ -32,6 +50,16 @@ const SETTER_METHODS: &[&str] = &["set_upstream", "set_downstream"]; /// `dag_id` / `task_id` literal violates Airflow's 250-char / safe-charset /// rule. pub fn extract_all(source: &str) -> Result, ParseError> { + // Run the whole parse + recursive AST walk inside the stack-safety + // shield: the two pre-scans reject pathologically nested input before + // the parser runs, and the 128 MiB dedicated stack bounds both the + // parser's recursive descent and this `Walker`'s recursion (the + // `>>` / attribute / call chains it walks would otherwise overflow a + // small caller stack on a deep-but-cap-admitted tree). + crate::panic_safe::shield_parser_panic("ruff", source, || extract_all_inner(source)) +} + +fn extract_all_inner(source: &str) -> Result, ParseError> { let parsed = parse_module_safely(source)?; let module: &ModModule = parsed.syntax(); let line_index = LineIndex::new(source); @@ -211,7 +239,7 @@ impl Walker<'_> { && (matches!(op, Operator::RShift | Operator::LShift)) && let Some(idx) = dag_idx { - self.collect_shift_edges(left, *op, right, idx); + self.collect_shift_edges(left, *op, right, idx, 0); return Ok(()); } if let Expr::Call(call) = expr @@ -219,9 +247,9 @@ impl Walker<'_> { && SETTER_METHODS.contains(&attr.as_str()) && let Some(idx) = dag_idx { - let lhs = self.resolve_to_task_id(value); + let lhs = self.resolve_to_task_id(value, 0); for arg in &call.arguments.args { - let rhs = self.resolve_to_task_id(arg); + let rhs = self.resolve_to_task_id(arg, 0); if let (Some(l), Some(r)) = (lhs.clone(), rhs) { if attr.as_str() == "set_downstream" { push_unique_edge(&mut self.dags[idx].deps_edges, (l.clone(), r)); @@ -234,9 +262,23 @@ impl Walker<'_> { Ok(()) } - fn collect_shift_edges(&mut self, left: &Expr, op: Operator, right: &Expr, dag_idx: usize) { - let left_terms = self.terminal_task_ids(left, op); - let right_terms = self.terminal_task_ids(right, op); + fn collect_shift_edges( + &mut self, + left: &Expr, + op: Operator, + right: &Expr, + dag_idx: usize, + depth: usize, + ) { + // Stack-safety: a `a >> b >> c >> …` spine recurses once per `>>`. + // The parser builds it iteratively, so the tree is far deeper than + // the lexer cap bounds; truncate past MAX_WALK_DEPTH rather than + // overflow the parse stack. + if depth > MAX_WALK_DEPTH { + return; + } + let left_terms = self.terminal_task_ids(left, op, depth); + let right_terms = self.terminal_task_ids(right, op, depth); for l in &left_terms { for r in &right_terms { let edge = if matches!(op, Operator::RShift) { @@ -255,30 +297,33 @@ impl Walker<'_> { }) = left && matches!(op2, Operator::RShift | Operator::LShift) { - self.collect_shift_edges(l2, *op2, r2, dag_idx); + self.collect_shift_edges(l2, *op2, r2, dag_idx, depth + 1); } } - fn terminal_task_ids(&self, expr: &Expr, parent_op: Operator) -> Vec { + fn terminal_task_ids(&self, expr: &Expr, parent_op: Operator, depth: usize) -> Vec { + if depth > MAX_WALK_DEPTH { + return Vec::new(); + } if let Expr::BinOp(ExprBinOp { left: _, op, right, .. }) = expr && matches!(op, Operator::RShift | Operator::LShift) { let _ = parent_op; - return self.terminal_task_ids(right, *op); + return self.terminal_task_ids(right, *op, depth + 1); } let mut out = Vec::new(); match expr { Expr::List(ast::ExprList { elts, .. }) | Expr::Tuple(ast::ExprTuple { elts, .. }) => { for elt in elts { - if let Some(id) = self.resolve_to_task_id(elt) { + if let Some(id) = self.resolve_to_task_id(elt, depth + 1) { out.push(id); } } } other => { - if let Some(id) = self.resolve_to_task_id(other) { + if let Some(id) = self.resolve_to_task_id(other, depth + 1) { out.push(id); } } @@ -286,7 +331,10 @@ impl Walker<'_> { out } - fn resolve_to_task_id(&self, expr: &Expr) -> Option { + fn resolve_to_task_id(&self, expr: &Expr, depth: usize) -> Option { + if depth > MAX_WALK_DEPTH { + return None; + } match expr { Expr::Name(ExprName { id, .. }) => self .task_aliases @@ -298,8 +346,10 @@ impl Walker<'_> { // receiver / callee. Resolving a bare `Name` callee picks // up `@task`-decorated functions registered as aliases of // themselves. - Expr::Call(call) => self.resolve_to_task_id(&call.func), - Expr::Attribute(ExprAttribute { value, .. }) => self.resolve_to_task_id(value), + Expr::Call(call) => self.resolve_to_task_id(&call.func, depth + 1), + Expr::Attribute(ExprAttribute { value, .. }) => { + self.resolve_to_task_id(value, depth + 1) + } _ => None, } } @@ -329,7 +379,7 @@ fn merge_dag_kwargs(dag: &mut ExtractedDag, call: &ExprCall) -> Result<(), Parse } } "schedule" | "schedule_interval" | "timetable" => { - dag.schedule = Some(stringify_expr(&kw.value)); + dag.schedule = Some(stringify_expr(&kw.value, 0)); } "default_args" => { dag.has_default_args = true; @@ -378,16 +428,22 @@ fn match_dag_decorator(expr: &Expr) -> Option> { } fn is_task_decorator(expr: &Expr) -> bool { - fn inner_name(expr: &Expr) -> Option<&str> { + // Stack-safety: a `@a()()()…`-style decorator recurses one frame per + // call into `inner_name`; truncate past MAX_WALK_DEPTH (such a + // decorator is not a task decorator anyway). + fn inner_name(expr: &Expr, depth: usize) -> Option<&str> { + if depth > MAX_WALK_DEPTH { + return None; + } match expr { Expr::Name(ExprName { id, .. }) => Some(id.as_str()), Expr::Attribute(ExprAttribute { attr, .. }) => Some(attr.as_str()), - Expr::Call(call) => inner_name(&call.func), + Expr::Call(call) => inner_name(&call.func, depth + 1), _ => None, } } matches!( - inner_name(expr), + inner_name(expr, 0), Some("task" | "task_group" | "setup" | "teardown" | "sensor" | "short_circuit") ) } @@ -411,7 +467,15 @@ fn constant_str(expr: &Expr) -> Option { None } -fn stringify_expr(expr: &Expr) -> String { +fn stringify_expr(expr: &Expr, depth: usize) -> String { + // Stack-safety: `a.b.c…` / `f()()…` recurse one frame per attribute / + // call. The parser builds these iteratively, so the chain can be + // hundreds of thousands deep within a few-hundred-KiB file and + // overflow even the 128 MiB parse stack (Codex DD, 2026-06-16). + // Truncate past MAX_WALK_DEPTH with a sentinel rather than abort. + if depth > MAX_WALK_DEPTH { + return "".to_string(); + } match expr { Expr::StringLiteral(ExprStringLiteral { value, .. }) => value.to_str().to_string(), Expr::NoneLiteral(_) => "None".to_string(), @@ -423,10 +487,10 @@ fn stringify_expr(expr: &Expr) -> String { }, Expr::Name(ExprName { id, .. }) => id.as_str().to_string(), Expr::Attribute(ExprAttribute { value, attr, .. }) => { - format!("{}.{}", stringify_expr(value), attr.as_str()) + format!("{}.{}", stringify_expr(value, depth + 1), attr.as_str()) } Expr::Call(call) => { - let func = stringify_expr(&call.func); + let func = stringify_expr(&call.func, depth + 1); format!("{func}(...)") } _ => "".to_string(), diff --git a/crates/ferro-airflow-dag-parser/tests/mutation_guard.rs b/crates/ferro-airflow-dag-parser/tests/mutation_guard.rs index b7784ea..34a03e4 100644 --- a/crates/ferro-airflow-dag-parser/tests/mutation_guard.rs +++ b/crates/ferro-airflow-dag-parser/tests/mutation_guard.rs @@ -29,7 +29,9 @@ use ferro_airflow_dag_parser::{ // --------------------------------------------------------------------------- fn dag_id_of(dag: &ExtractedDag) -> Option<&str> { - dag.dag_id.as_ref().map(ferro_airflow_dag_parser::DagId::as_str) + dag.dag_id + .as_ref() + .map(ferro_airflow_dag_parser::DagId::as_str) } fn task_id_strings(dag: &ExtractedDag) -> Vec<&str> { @@ -82,7 +84,10 @@ fn dag_id_one_over_max_len_is_rejected() { match err { ParseError::InvalidIdentifier { kind, reason, .. } => { assert_eq!(kind, "dag_id"); - assert!(reason.contains("251"), "reason should report length 251: {reason}"); + assert!( + reason.contains("251"), + "reason should report length 251: {reason}" + ); } other => panic!("expected InvalidIdentifier, got {other:?}"), } @@ -91,32 +96,35 @@ fn dag_id_one_over_max_len_is_rejected() { #[test] fn task_id_at_exact_max_len_is_accepted() { let id = "t".repeat(250); - let src = format!( - "with DAG(dag_id=\"d\"):\n x = BashOperator(task_id=\"{id}\")\n" - ); + let src = format!("with DAG(dag_id=\"d\"):\n x = BashOperator(task_id=\"{id}\")\n"); let dags = extract_all_static_dags(&src).expect("250-char task_id must be accepted"); assert_eq!(task_id_strings(&dags[0]), vec![id.as_str()]); } // =========================================================================== -// panic_safe.rs — bracket-depth and unary-op caps (reached via extractor) +// panic_safe.rs — stack-safety caps (reached via the public extractor) +// +// The guard is two-layer (see `src/panic_safe.rs`): an iterative bracket +// pre-scan with cap `MAX_PARSE_NESTING_DEPTH = 256`, and a lexer pass with +// the combined-recursion cap `MAX_PARSE_RECURSION = 1024` that also catches +// non-bracket recursion (`not`/`await`/`**`/conditional/`lambda`/indent). +// These integration tests pin the observable boundary through the public +// API so a `>`/`>=`/`==` mutation of either cap comparison dies. // =========================================================================== #[test] fn bracket_depth_exactly_at_cap_passes() { - // MAX_BRACKET_DEPTH is 32. A depth of exactly 32 must NOT be - // rejected (`depth > limit` is false at 32). The `>=` mutant would - // reject this; the `==` mutant would only reject at exactly 33 (so - // a depth-33 input below distinguishes it from `>`). - let src = format!("x = {}1{}\n", "(".repeat(32), ")".repeat(32)); - // Either parses cleanly or fails for a *non-bracket* reason — but it - // must not be rejected by the bracket pre-screen. + // Depth of exactly 256 (the bracket cap) must NOT be rejected + // (`depth > cap` is false at 256). The `>=` mutant would reject this. + let src = format!("x = {}1{}\n", "(".repeat(256), ")".repeat(256)); + // Either parses cleanly or fails for a *non-cap* reason — but it must + // not be rejected by the nesting pre-screen. match extract_all_static_dags(&src) { Ok(_) => {} Err(ParseError::Parse(msg)) => { assert!( - !msg.contains("bracket nesting depth"), - "depth-32 must not trip the bracket cap: {msg}" + !msg.contains("nesting depth"), + "depth-256 must not trip the nesting cap: {msg}" ); } Err(other) => panic!("unexpected error: {other:?}"), @@ -125,15 +133,14 @@ fn bracket_depth_exactly_at_cap_passes() { #[test] fn bracket_depth_one_over_cap_rejected() { - // Depth 33 = first depth strictly greater than 32. Kills `> -> ==` - // (which would only fire at exactly 33 — actually identical here, so - // we also assert the deeper case below) and `> -> >=`. - let src = format!("x = {}1{}\n", "(".repeat(33), ")".repeat(33)); - let err = extract_all_static_dags(&src).expect_err("depth 33 must be rejected"); + // Depth 257 = first depth strictly greater than 256. Kills `> -> >=` + // (which would admit 257) by requiring rejection here. + let src = format!("x = {}1{}\n", "(".repeat(257), ")".repeat(257)); + let err = extract_all_static_dags(&src).expect_err("depth 257 must be rejected"); match err { ParseError::Parse(msg) => assert!( - msg.contains("bracket nesting depth"), - "expected bracket-cap message: {msg}" + msg.contains("nesting depth") && msg.contains("256"), + "expected nesting-cap message: {msg}" ), other => panic!("expected Parse, got {other:?}"), } @@ -141,15 +148,15 @@ fn bracket_depth_one_over_cap_rejected() { #[test] fn bracket_depth_far_over_cap_rejected() { - // Depth 200 (>> 32). Kills `> -> ==` (the `==` variant only fires at - // exactly 33, so a depth-200 unbalanced opener would slip through - // it but is caught by the real `>` operator). - let src = "(".repeat(200); - let err = extract_all_static_dags(&src).expect_err("depth 200 must be rejected"); + // Depth 4096 (>> 256), unbalanced openers. Kills `> -> ==` (the `==` + // variant only fires at exactly 257, so a depth-4096 input would slip + // through it) — the real `>` operator rejects everything above 256. + let src = "(".repeat(4096); + let err = extract_all_static_dags(&src).expect_err("depth 4096 must be rejected"); match err { ParseError::Parse(msg) => assert!( - msg.contains("bracket nesting depth"), - "expected bracket-cap message: {msg}" + msg.contains("nesting depth"), + "expected nesting-cap message: {msg}" ), other => panic!("expected Parse, got {other:?}"), } @@ -157,18 +164,18 @@ fn bracket_depth_far_over_cap_rejected() { #[test] fn closing_brackets_reduce_depth_so_balanced_input_passes() { - // 40 balanced open/close *pairs* never nested deeper than 1. Kills + // 400 balanced open/close *pairs* never nested deeper than 1. Kills // the "delete match arm `)` `]` `}`" mutant: without the closing arm - // depth would climb to 40 ( > 32 ) and be wrongly rejected. + // depth would climb to 400 ( > 256 ) and be wrongly rejected. let mut src = String::from("x = "); - for _ in 0..40 { + for _ in 0..400 { src.push_str("(1)"); } src.push('\n'); match extract_all_static_dags(&src) { Ok(_) => {} Err(ParseError::Parse(msg)) => assert!( - !msg.contains("bracket nesting depth"), + !msg.contains("nesting depth"), "balanced brackets must not trip the cap: {msg}" ), Err(other) => panic!("unexpected error: {other:?}"), @@ -176,30 +183,47 @@ fn closing_brackets_reduce_depth_so_balanced_input_passes() { } #[test] -fn unary_op_run_exactly_at_cap_passes() { - // MAX_UNARY_OP_RUN is 64. A run of exactly 64 `-` must NOT be - // rejected (`run > limit` false at 64). Kills `> -> >=`. - let src = format!("x = {}1\n", "-".repeat(64)); +fn recursion_run_exactly_at_cap_passes() { + // A run of exactly 1024 `-` (combined recursion depth = 1024) must NOT + // be rejected (`max > cap` false at 1024). Kills `> -> >=`. + let src = format!("x = {}1\n", "-".repeat(1024)); match extract_all_static_dags(&src) { Ok(_) => {} Err(ParseError::Parse(msg)) => assert!( - !msg.contains("unary-prefix operator chain"), - "run-64 must not trip the unary cap: {msg}" + !msg.contains("recursion depth"), + "run-1024 must not trip the recursion cap: {msg}" ), Err(other) => panic!("unexpected error: {other:?}"), } } #[test] -fn unary_op_run_far_over_cap_rejected() { - // Run of 300 `~`. Kills both `> -> ==` (fires only at exactly 65) - // and `> -> >=`; the real operator rejects everything above 64. - let src = format!("{}x\n", "~".repeat(300)); - let err = extract_all_static_dags(&src).expect_err("run 300 must be rejected"); +fn recursion_run_far_over_cap_rejected() { + // Run of 2000 `~` (>> 1024). The lexer-pass recursion cap rejects it + // even though no bracket is present — the vector a bracket-only cap + // would miss entirely. + let src = format!("{}x\n", "~".repeat(2000)); + let err = extract_all_static_dags(&src).expect_err("run 2000 must be rejected"); + match err { + ParseError::Parse(msg) => assert!( + msg.contains("recursion depth"), + "expected recursion-cap message: {msg}" + ), + other => panic!("expected Parse, got {other:?}"), + } +} + +#[test] +fn keyword_not_chain_rejected_by_recursion_cap() { + // `not not not …` recurses one frame per `not` with NO bracket and NO + // single-byte operator — the exact shape the old bracket+byte caps + // missed. The lexer pass must reject it as a graceful skip-file error. + let src = format!("x = {}True\n", "not ".repeat(3000)); + let err = extract_all_static_dags(&src).expect_err("not-chain must be rejected"); match err { ParseError::Parse(msg) => assert!( - msg.contains("unary-prefix operator chain"), - "expected unary-cap message: {msg}" + msg.contains("recursion depth"), + "expected recursion-cap message: {msg}" ), other => panic!("expected Parse, got {other:?}"), } @@ -399,7 +423,10 @@ with contextlib.suppress(Exception): pass "; let dags = extract_all_static_dags(src).expect("parse"); - assert!(dags.is_empty(), "non-DAG context manager produced DAGs: {dags:?}"); + assert!( + dags.is_empty(), + "non-DAG context manager produced DAGs: {dags:?}" + ); } #[test] @@ -432,7 +459,10 @@ def helper(): pass "; let dags = extract_all_static_dags(src).expect("parse"); - assert!(dags.is_empty(), "non-dag decorator produced a DAG: {dags:?}"); + assert!( + dags.is_empty(), + "non-dag decorator produced a DAG: {dags:?}" + ); } #[test] @@ -566,7 +596,9 @@ with DAG(dag_id="cs"): "#; let markers = dynamic_markers_for(src); assert!( - markers.iter().any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), + markers + .iter() + .any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), "chain splat inside a DAG must be flagged: {markers:?}" ); } @@ -584,7 +616,9 @@ chain(*items) "; let markers = dynamic_markers_for(src); assert!( - !markers.iter().any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), + !markers + .iter() + .any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), "chain splat at module scope must NOT be flagged: {markers:?}" ); } @@ -602,7 +636,9 @@ with DAG(dag_id="cs2"): "#; let markers = dynamic_markers_for(src); assert!( - markers.iter().any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), + markers + .iter() + .any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), "attribute chain helper must be flagged: {markers:?}" ); } @@ -622,7 +658,9 @@ with DAG(dag_id="loopgen"): "#; let markers = dynamic_markers_for(src); assert!( - markers.iter().any(|m| matches!(m, DynamicMarker::ForLoopTaskGeneration { .. })), + markers + .iter() + .any(|m| matches!(m, DynamicMarker::ForLoopTaskGeneration { .. })), "operator constructed in a for-loop must be flagged: {markers:?}" ); } @@ -640,7 +678,9 @@ with DAG(dag_id="loopgen2"): "#; let markers = dynamic_markers_for(src); assert!( - !markers.iter().any(|m| matches!(m, DynamicMarker::ForLoopTaskGeneration { .. })), + !markers + .iter() + .any(|m| matches!(m, DynamicMarker::ForLoopTaskGeneration { .. })), "non-operator loop call must NOT flag: {markers:?}" ); } @@ -660,7 +700,9 @@ if os.environ.get("ENABLE"): "#; let markers = dynamic_markers_for(src); assert!( - markers.iter().any(|m| matches!(m, DynamicMarker::ImportTimeBranching { .. })), + markers + .iter() + .any(|m| matches!(m, DynamicMarker::ImportTimeBranching { .. })), "DAG under non-constant if must flag: {markers:?}" ); } @@ -678,7 +720,9 @@ if True: "#; let markers = dynamic_markers_for(src); assert!( - !markers.iter().any(|m| matches!(m, DynamicMarker::ImportTimeBranching { .. })), + !markers + .iter() + .any(|m| matches!(m, DynamicMarker::ImportTimeBranching { .. })), "DAG under constant `if True` must NOT flag branching: {markers:?}" ); } @@ -728,12 +772,16 @@ def q(): "#; let dyn_markers = dynamic_markers_for(dynamic); assert!( - dyn_markers.iter().any(|m| matches!(m, DynamicMarker::UnsupportedTaskFlow { .. })), + dyn_markers + .iter() + .any(|m| matches!(m, DynamicMarker::UnsupportedTaskFlow { .. })), "@task(expand=True) must flag: {dyn_markers:?}" ); let bare_markers = dynamic_markers_for(bare); assert!( - !bare_markers.iter().any(|m| matches!(m, DynamicMarker::UnsupportedTaskFlow { .. })), + !bare_markers + .iter() + .any(|m| matches!(m, DynamicMarker::UnsupportedTaskFlow { .. })), "bare @task must NOT flag UnsupportedTaskFlow: {bare_markers:?}" ); } @@ -756,7 +804,9 @@ def p(): "#; let markers = dynamic_markers_for(src); assert!( - markers.iter().any(|m| matches!(m, DynamicMarker::UnsupportedTaskFlow { .. })), + markers + .iter() + .any(|m| matches!(m, DynamicMarker::UnsupportedTaskFlow { .. })), "@task('positional') must flag UnsupportedTaskFlow: {markers:?}" ); } @@ -815,7 +865,9 @@ with DAG(dag_id="asg"): "#; let markers = dynamic_markers_for(src); assert!( - markers.iter().any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), + markers + .iter() + .any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), "chain splat on the RHS of an assignment must flag: {markers:?}" ); } @@ -835,7 +887,9 @@ with DAG(dag_id='ann_asg'): "; let markers = dynamic_markers_for(src); assert!( - markers.iter().any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), + markers + .iter() + .any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), "chain splat on the RHS of an annotated assignment must flag: {markers:?}" ); } @@ -882,7 +936,9 @@ chain(*items) "#; let markers = dynamic_markers_for(src); assert!( - !markers.iter().any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), + !markers + .iter() + .any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), "chain splat after the DAG block must NOT flag (context leaked): {markers:?}" ); } @@ -904,7 +960,9 @@ chain(*items) "#; let markers = dynamic_markers_for(src); assert!( - !markers.iter().any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), + !markers + .iter() + .any(|m| matches!(m, DynamicMarker::ChainSplat { .. })), "chain splat after the @dag function must NOT flag: {markers:?}" ); } @@ -960,7 +1018,10 @@ from airflow.models.baseoperator import chain with DAG(dag_id="ctx"): chain(*items) "#; - assert!(has_chain_splat(bare_dag), "bare DAG `with` must flag the splat"); + assert!( + has_chain_splat(bare_dag), + "bare DAG `with` must flag the splat" + ); } #[test] @@ -1013,7 +1074,10 @@ from airflow.models.baseoperator import chain def pipe(): chain(*items) "#; - assert!(has_chain_splat(dag_deco), "@dag decorator must open the context"); + assert!( + has_chain_splat(dag_deco), + "@dag decorator must open the context" + ); } #[test] @@ -1249,7 +1313,11 @@ def bare_pipeline(): pass "; let dags = extract_all_static_dags(src).expect("parse"); - assert_eq!(dags.len(), 1, "bare @dag must register exactly one DAG: {dags:?}"); + assert_eq!( + dags.len(), + 1, + "bare @dag must register exactly one DAG: {dags:?}" + ); assert_eq!(dag_id_of(&dags[0]), Some("bare_pipeline")); } @@ -1271,7 +1339,10 @@ def not_a_dag(): pass "; let dags = extract_all_static_dags(src).expect("parse"); - assert!(dags.is_empty(), "non-dag Name-call decorator produced a DAG: {dags:?}"); + assert!( + dags.is_empty(), + "non-dag Name-call decorator produced a DAG: {dags:?}" + ); } #[test] @@ -1290,7 +1361,10 @@ def celery_task(): pass "; let dags = extract_all_static_dags(src).expect("parse"); - assert!(dags.is_empty(), "non-dag attribute-call decorator produced a DAG: {dags:?}"); + assert!( + dags.is_empty(), + "non-dag attribute-call decorator produced a DAG: {dags:?}" + ); } #[test] diff --git a/crates/ferro-airflow-dag-parser/tests/stack_safety.rs b/crates/ferro-airflow-dag-parser/tests/stack_safety.rs new file mode 100644 index 0000000..f93c01d --- /dev/null +++ b/crates/ferro-airflow-dag-parser/tests/stack_safety.rs @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 +//! Regression coverage for FP5 — the fuzz-discovered stack-overflow denial-of-service +//! in the vendored `littrs-ruff-python-parser` 0.6.2 recursive-descent +//! parser (and in this crate's recursive AST walkers). +//! +//! `littrs-ruff-python-parser` descends one frame per grouping construct +//! AND per prefix-unary / right-associative / conditional / lambda +//! operator, with no upstream recursion limit. Sufficiently nested +//! attacker-controlled DAG source overflows the parse thread's stack, +//! which Linux delivers as a fatal `SIGSEGV` — an abort that +//! `catch_unwind` cannot intercept. The three-layer guard in +//! `src/panic_safe.rs` (bracket pre-scan, lexer-pass recursion cap, and a +//! 128 MiB dedicated parse/walk stack) turns every such input into a +//! graceful skip-file `Err`, never a process abort. +//! +//! Each `*_rejected` test below is non-vacuous: with the guard removed +//! (temporarily revert `shield_parser_panic` to a plain parse on the +//! caller stack), the matching input aborts this test binary with +//! `SIGSEGV` instead of returning an `Err`. The `*_passes` tests pin that +//! the guard does not reject legitimate deep-but-realistic DAG shapes. + +#![cfg(feature = "parser-ruff")] + +use std::fmt::Write as _; + +use ferro_airflow_dag_parser::{ + ParseError, detect_dynamic_markers, dynamic_markers_for, extract_all_static_dags, + extract_static_dag, +}; + +/// Drive `src` through every public AST entry point and assert each one +/// RETURNS (Ok or Err) rather than aborting the process. If the guard +/// regressed, the call site overflows the stack and the test binary dies +/// with SIGSEGV here — which is exactly the failure this pins against. +fn assert_all_paths_return(src: &str) { + let _ = extract_static_dag(src); + let _ = extract_all_static_dags(src); + let _ = detect_dynamic_markers(src); + let _ = dynamic_markers_for(src); +} + +/// Assert the static extractor rejects `src` with a `Parse` error whose +/// message mentions the stack-safety cap (recursion or nesting). +fn assert_cap_rejected(label: &str, src: &str) { + // Both AST paths must reject (and never abort). + match extract_all_static_dags(src) { + Err(ParseError::Parse(msg)) => assert!( + msg.contains("recursion depth") || msg.contains("nesting depth"), + "{label}: expected a stack-safety cap message, got: {msg}" + ), + other => panic!("{label}: expected ParseError::Parse (cap), got {other:?}"), + } + match detect_dynamic_markers(src) { + Err(ParseError::Parse(msg)) => assert!( + msg.contains("recursion depth") || msg.contains("nesting depth"), + "{label}: marker path expected a stack-safety cap message, got: {msg}" + ), + other => panic!("{label}: marker path expected ParseError::Parse (cap), got {other:?}"), + } +} + +// --------------------------------------------------------------------------- +// The five parser-recursion overflow shapes the bracket-only cap missed. +// --------------------------------------------------------------------------- + +#[test] +fn prefix_not_chain_rejected() { + // `not not not … True` — one parser frame per `not`, a keyword the + // old byte-scan never saw. + assert_cap_rejected("not-chain", &format!("x = {}True\n", "not ".repeat(4000))); +} + +#[test] +fn prefix_await_chain_rejected() { + // `await await … y` inside an async function — keyword prefix-unary. + let body = format!(" return {}y\n", "await ".repeat(4000)); + assert_cap_rejected("await-chain", &format!("async def f():\n{body}")); +} + +#[test] +fn prefix_byte_operator_chains_rejected() { + // `----…1`, `~~~…1`, `++++…1` — single-byte prefix-unary chains. + assert_cap_rejected("neg-chain", &format!("x = {}1\n", "-".repeat(4000))); + assert_cap_rejected("inv-chain", &format!("x = {}1\n", "~".repeat(4000))); + assert_cap_rejected("pos-chain", &format!("x = {}1\n", "+".repeat(4000))); +} + +#[test] +fn power_chain_rejected() { + // `a ** b ** c …` — the right-associative power operator recurses. + assert_cap_rejected("pow-chain", &format!("x = {}2\n", "2**".repeat(4000))); +} + +#[test] +fn conditional_chain_rejected() { + // `1 if a else 1 if a else …` — conditional-expression recursion. + assert_cap_rejected( + "cond-chain", + &format!("x = {}1\n", "1 if a else ".repeat(3000)), + ); +} + +#[test] +fn lambda_chain_rejected() { + // `lambda: lambda: …` — nested lambda bodies recurse. + assert_cap_rejected( + "lambda-chain", + &format!("f = {}1\n", "lambda: ".repeat(4000)), + ); +} + +#[test] +fn deep_statement_indent_rejected() { + // Deeply nested compound statements recurse at the *statement* level + // (one parser frame per indentation level) with no expression nesting. + let mut src = String::new(); + for i in 0..2000 { + for _ in 0..i { + src.push(' '); + } + src.push_str("if a:\n"); + } + for _ in 0..2000 { + src.push(' '); + } + src.push_str("pass\n"); + assert_cap_rejected("nested-if", &src); +} + +#[test] +fn deep_bracket_nesting_rejected() { + // The original bracket-nesting shape (still covered by Layer 1). + let src = format!("x = {}1{}\n", "[".repeat(4096), "]".repeat(4096)); + assert_cap_rejected("bracket-nest", &src); +} + +#[test] +fn yield_from_chain_rejected() { + // Codex DD R2 (2026-06-16): `yield from yield from … x` recurses one + // parser frame per `yield from` (`parse_yield_from_expression` parses + // another expression to its right). The lexer pass did not count + // `Yield`/`From`, so the chain bypassed the cap and overflowed the + // parser at ~50 k (550 KiB). Now `Yield` + `From` are counted. + assert_cap_rejected( + "yield-from-chain", + &format!( + "def g():\n yield from {}x\n", + "yield from ".repeat(50_000) + ), + ); +} + +#[test] +fn yield_chain_rejected() { + // `yield yield yield … x` = `yield (yield (yield … x))`, same parser + // recursion without the `from`. + assert_cap_rejected( + "yield-chain", + &format!("def g():\n yield {}x\n", "yield ".repeat(80_000)), + ); +} + +#[test] +fn async_keyword_chain_rejected() { + // Codex DD R3 (2026-06-16): `async async … def f(): pass` recurses one + // parser frame per `async` via error recovery (`parse_async_statement` + // consumes a stray `async`, records the error, then calls + // `parse_statement` again). No bracket / indent / operand between, so + // the lexer pass did not count it until `T::Async` was added. + assert_cap_rejected( + "async-chain", + &format!("{}def f():\n pass\n", "async ".repeat(90_000)), + ); +} + +#[test] +fn legitimate_async_not_rejected() { + // `T::Async` counting must NOT trip on real async statements. + for (label, src) in [ + ("async-def", "async def f():\n pass\n"), + ( + "async-for", + "async def f():\n async for x in y:\n pass\n", + ), + ( + "async-with", + "async def f():\n async with y as z:\n pass\n", + ), + ("async-await", "async def f():\n x = await h()\n"), + ] { + if let Err(ParseError::Parse(msg)) = extract_all_static_dags(src) { + assert!( + !msg.contains("recursion depth"), + "{label} must NOT hit the recursion cap: {msg}" + ); + } + } +} + +#[test] +fn legitimate_yield_and_import_not_rejected() { + // The `Yield`/`From` counting must NOT trip on real generators or + // `from … import` / `raise … from` (a single keyword per line resets + // at the operand). + for (label, src) in [ + ("single-yield-from", "def g():\n yield from gen()\n"), + ( + "many-yield-lines", + "def g():\n yield 1\n yield 2\n yield from h()\n", + ), + ( + "import-from", + "from airflow.operators.bash import BashOperator\nfrom a.b.c import d\n", + ), + ("raise-from", "def f():\n raise ValueError() from err\n"), + ] { + // Ok or a non-cap parse error is fine; only a recursion-cap + // rejection would be a false positive. + if let Err(ParseError::Parse(msg)) = extract_all_static_dags(src) { + assert!( + !msg.contains("recursion depth"), + "{label} must NOT hit the recursion cap: {msg}" + ); + } + } +} + +#[test] +fn implicit_continuation_prefix_chain_rejected() { + // Codex DD R6 (2026-06-16): a prefix chain split across physical lines + // INSIDE parens — `x = (\n not\n not\n … a\n)` — recurses one parser + // frame per `not`, but each physical line break is a + // `NonLogicalNewline` token. The lexer pass used to reset `op_run` on + // every unrecognised token (including `NonLogicalNewline`), so the run + // never accumulated and the chain overflowed the parser. Trivia + // (`NonLogicalNewline` / `Comment`) is now transparent to the run. + assert_cap_rejected( + "not-multiline-paren", + &format!("x = (\n{}a\n)\n", "not\n".repeat(50_000)), + ); + // The same with a comment after each operator (also trivia). + assert_cap_rejected( + "not-comment-multiline-paren", + &format!("x = (\n{}a\n)\n", "not # c\n".repeat(50_000)), + ); +} + +#[test] +fn legitimate_multiline_bracketed_expr_not_rejected() { + // A real multi-line expression inside parens/brackets (operands and + // commas between the operators reset the run) must NOT be rejected, + // even with comments interleaved. + for (label, src) in [ + ( + "sum-multiline", + format!("x = (\n{}0\n)\n", "a +\n".repeat(5000)), + ), + ( + "list-multiline", + format!("xs = [\n{}]\n", "a.b,\n".repeat(5000)), + ), + ( + "comment-heavy", + "x = (\n a # c1\n + b # c2\n + c # c3\n)\n".to_owned(), + ), + ] { + if let Err(ParseError::Parse(msg)) = extract_all_static_dags(&src) { + assert!( + !msg.contains("recursion depth"), + "{label} must NOT hit the recursion cap: {msg}" + ); + } + } +} + +#[test] +fn alternating_mixed_prefix_operators_rejected() { + // FP5 / fuzz Finding 2 (crash-0665b68…): the OLD guard counted only a + // consecutive run of a SINGLE prefix operator (`MAX_UNARY_OP_RUN = 64`), + // so `~not ~not …` — each individual run < 64 but the combined prefix + // nesting unbounded — slipped through and overflowed the parser. The + // new `op_run` counts a run across ALL mixed prefix operators and only + // resets at an operand/bracket/newline, so the alternation accumulates + // and is rejected. + let src = format!("x = {}True\n", "~not ".repeat(2000)); + assert_cap_rejected("alternating-prefix", &src); +} + +// --------------------------------------------------------------------------- +// t-strings: the parser PANICS on them — must be rejected pre-parse. +// --------------------------------------------------------------------------- + +#[test] +fn tstring_rejected_not_fatal() { + let src = "x = t\"hello {name}\"\n"; + match extract_static_dag(src) { + Err(ParseError::Parse(_)) => {} + other => panic!("t-string should be a Parse rejection, got {other:?}"), + } + assert_all_paths_return(src); + // A normal f-string must still parse fine (no false positive). + assert!(extract_static_dag("y = f\"hello {name}\"\n").is_ok()); +} + +// --------------------------------------------------------------------------- +// Recursive AST WALKERS in `ruff_impl` (NOT the parser): `>>`/`<<` shift +// chains, attribute chains, and call chains build deep left-leaning trees +// the parser produces *iteratively* — so they are far deeper than the +// lexer-pass recursion cap bounds and would overflow even the 128 MiB +// parse stack. `MAX_WALK_DEPTH` truncates each walker so it returns +// instead of aborting. Every N below is past the measured overflow point +// (`stringify_expr` aborted at ~200 k on the 128 MiB stack, Codex DD +// 2026-06-16), so these are non-vacuous: remove the depth guards and they +// abort the test binary with a stack overflow. +// --------------------------------------------------------------------------- + +#[test] +fn deep_shift_chain_in_dag_does_not_abort() { + // `a >> a >> a >> …` inside `with DAG(...)` drives `collect_shift_edges` + // to recurse once per `>>`. Must return, never abort. + let mut src = String::from("with DAG('d'):\n a"); + for _ in 0..600_000 { + src.push_str(" >> a"); + } + src.push('\n'); + let _ = extract_all_static_dags(&src); + let _ = extract_static_dag(&src); +} + +#[test] +fn deep_attribute_chain_schedule_does_not_abort() { + // `schedule=a.a.a.a…` drives `stringify_expr` to recurse once per + // attribute access (the exact Codex-DD-verified overflow). Must return. + let mut src = String::from("with DAG('d', schedule=a"); + for _ in 0..300_000 { + src.push_str(".a"); + } + src.push_str("):\n pass\n"); + match extract_all_static_dags(&src) { + Ok(_) | Err(_) => {} // either is fine; the point is it RETURNS + } + let _ = dynamic_markers_for(&src); +} + +#[test] +fn moderately_deep_call_chain_in_shift_returns() { + // `a >> f()()()…` exercises the `resolve_to_task_id` call-chain guard. + // Its frames are tiny, so the binding limit at extreme depth is the + // recursive AST *drop* (intrinsic to the ruff AST + Rust `Drop`, a + // documented residual — see `dd-pack/11-known-limitations.md`), not + // this walker. At a realistic-but-deep 100 k it returns cleanly. + let mut src = String::from("with DAG('d'):\n a >> f"); + for _ in 0..100_000 { + src.push_str("()"); + } + src.push('\n'); + let _ = extract_all_static_dags(&src); +} + +#[test] +fn marker_path_decorator_call_chain_does_not_abort() { + // Codex DD R8 (2026-06-16): the MARKER path's `match_dag_decorator` / + // `is_task_decorator_call` helpers also recurse through a decorator + // call chain (`@dag()()…` / `@task()()…`). They are now capped at + // `MAX_DECORATOR_CHAIN_DEPTH`, mirroring the static walker's + // `inner_name`. This drives the marker path past the cap and asserts + // it returns (the deepest extreme remains the documented multi-MB + // AST-drop residual, not these helpers). + let dag = format!("@dag{}\ndef p():\n pass\n", "()".repeat(4000)); + let _ = dynamic_markers_for(&dag); + let _ = detect_dynamic_markers(&dag); + let task = format!( + "@dag\ndef p():\n @task{}\n def t():\n pass\n", + "()".repeat(4000) + ); + let _ = dynamic_markers_for(&task); + let _ = detect_dynamic_markers(&task); +} + +// --------------------------------------------------------------------------- +// No false positives: legitimate deep-but-realistic DAG shapes must pass. +// --------------------------------------------------------------------------- + +#[test] +fn deepest_real_world_dag_depth_passes() { + // 96 is the deepest nesting measured across the entire vendored Apache + // Airflow source tree (an AWS SageMaker example DAG). It must parse, + // NOT be rejected by the stack-safety cap, so the "run Airflow example + // DAGs unmodified" contract holds. + let src = format!("value = {}1{}\n", "[".repeat(96), "]".repeat(96)); + match extract_all_static_dags(&src) { + Ok(_) => {} + Err(ParseError::Parse(msg)) => assert!( + !msg.contains("recursion depth") && !msg.contains("nesting depth"), + "depth-96 real-world DAG must not hit the cap: {msg}" + ), + Err(other) => panic!("unexpected error for depth-96 DAG: {other:?}"), + } +} + +#[test] +fn long_legit_task_chain_passes() { + // A real DAG that chains 200 tasks with `>>` and a left-associative + // arithmetic default — both stay shallow in the recursion metric and + // must NOT be rejected. + let mut src = String::from("with DAG('pipeline') as dag:\n"); + for i in 0..200 { + let _ = writeln!(src, " t{i} = BashOperator(task_id=\"t{i}\")"); + } + src.push_str(" t0"); + for i in 1..200 { + let _ = write!(src, " >> t{i}"); + } + src.push('\n'); + match extract_all_static_dags(&src) { + Ok(dags) => assert_eq!(dags.len(), 1, "the legit chained DAG must be recovered"), + Err(other) => panic!("legit 200-task chain must parse, got {other:?}"), + } +} + +#[test] +fn left_associative_arithmetic_chain_passes() { + // `v + v + v + … + 0` is a 5000-term left-assoc chain. The parser + // builds it iteratively and the recursion metric's per-operand reset + // keeps it at depth ~1 — it must NOT be rejected. + let src = format!("total = {}0\n", "v + ".repeat(5000)); + match extract_all_static_dags(&src) { + Ok(_) => {} + Err(ParseError::Parse(msg)) => assert!( + !msg.contains("recursion depth") && !msg.contains("nesting depth"), + "left-assoc arithmetic must not hit the cap: {msg}" + ), + Err(other) => panic!("unexpected error: {other:?}"), + } +} diff --git a/dd-pack/11-known-limitations.md b/dd-pack/11-known-limitations.md new file mode 100644 index 0000000..f537c6b --- /dev/null +++ b/dd-pack/11-known-limitations.md @@ -0,0 +1,84 @@ + +# ferro-airflow-dag-parser — known limitations + +## FP5 — parser stack-overflow DoS — ✅ CLOSED (v1.0.1, 2026-06-16) + +**Was:** parsing attacker-controlled Python DAG source could overflow the +vendored `littrs-ruff-python-parser` 0.6.2 recursive-descent parser and abort +the host process with a `SIGSEGV` (`catch_unwind` cannot intercept a guard-page +fault). The original pre-screen capped only bracket nesting (32) and a +*consecutive run of a single* prefix operator (64), so these vectors slipped +through and overflowed the parser: + +- `not not not …` / `await await …` keyword prefix chains, +- `~~~…` / `---…` / `+++…` byte prefix runs, +- right-associative `a ** b ** c …`, +- `a if b else …` conditional and `lambda: lambda: …` chains, +- deeply nested compound statements (`if a:` / `for …:` …, statement-level + recursion), +- **mixed** prefix-operator chains (`~not ~not …`) where each individual + consecutive run stayed under 64 but the combined nesting was unbounded — + the exact shape of fuzz Finding 2, `crash-0665b68…`, +- `yield from yield from … x` / `yield yield … x` (parser recurses per + `yield` keyword — Codex DD R2), +- `async async … def f(): pass` (the parser's error recovery for a stray + `async` recurses into `parse_statement` — Codex DD R3). + +**Fix (v1.0.1):** ported FerroAir's complete three-layer recursion guard +(`ferroair-dag-parser`, FA1) into `src/panic_safe.rs`: + +1. **Bracket pre-scan** — iterative byte scan, reject grouping-delimiter + nesting deeper than 256. +2. **Lexer pre-scan** — one iterative real-tokenizer pass tracking + `brackets + op_run + line_right_rec + indent` (cap 1024); `op_run` counts a + consecutive run across *all* mixed prefix operators / recursion-driving + keywords (`Yield`/`From`/`Async` included) and resets only at an + operand/bracket/newline (this is what closes the mixed-prefix Finding 2 and + the `yield`/`async` vectors). Also rejects PEP-750 t-strings (the parser + panics on them). +3. **Dedicated 128 MiB stack** — the parse **and** the AST walk run on a + 128 MiB thread so the numeric cap, not the caller's ~2 MiB stack, is the + binding limit; the thread's `join()` also folds any unwinding panic into + `ParseError::Internal`. + +The recursive AST walkers in `ruff_impl.rs` (`collect_shift_edges`, +`terminal_task_ids`, `resolve_to_task_id`, `stringify_expr`, decorator +`inner_name`) additionally truncate past `MAX_WALK_DEPTH = 1024`, so a deep +left-leaning `>>` / attribute / call chain that survives the parser (these are +left-associative, built iteratively, and not bounded by the lexer cap) cannot +overflow the walk either. This closed an additional Codex-DD-verified overflow: +`schedule=a.a.a…` (~200 k deep) overflowed `stringify_expr` even on the 128 MiB +stack; it now returns gracefully. + +Regression coverage: `tests/stack_safety.rs` (each `*_rejected` test is +non-vacuous — removing the guard aborts the test binary with a stack overflow), +`tests/mutation_guard.rs` (cap boundaries), unit tests in `panic_safe.rs`. + +This is **not** a claim of bulletproof input handling. The true guarantee is +exactly those three layers plus the walker depth cap: a numeric depth cap far +above any real DAG (deepest measured across the vendored Apache Airflow tree is +96) and far below the overflow threshold, a real-tokenizer reject pass with no +byte-heuristic false positives, and a dedicated stack that makes the cap the +binding limit. + +### Residual (honest) — deep left-leaning AST construction / drop + +A single left-leaning chain of **hundreds of thousands** of attribute / call / +shift trailers (`a.a.a…`, `f()()…`, `a >> a >> …`) requires a **multi-MB** +source file and builds a correspondingly deep `Box`-linked AST. Constructing or +**dropping** such an AST recurses (intrinsic to the ruff AST + Rust `Drop`), so +it can still overflow the 128 MiB parse stack at extreme depth (measured: +`~2 M` call trailers ≈ a 4 MiB file). This is: + +- a **different mechanism** from FP5 (it is the recursive AST `Drop`, not the + parser's recursive descent), and not a defect introduced by this crate; +- **shared with the upstream `ferroair-dag-parser`** (same vendored AST), where + it is likewise bounded by the 128 MiB stack and accepted; +- **not cheaply closable pre-parse** — the lexer cannot distinguish a single + N-deep chain (overflow) from N shallow siblings in a flat literal + (`[a.b, c.d, …]`, safe) without parsing, so a token-count cap would + false-positive on large flat DAG literals. + +The realistic FP5 parser-recursion shapes (each well under 4 KiB) and the +Codex-DD-verified `stringify_expr` walk overflow are fully closed; the residual +requires a multi-MB single-chain file and is bounded by the 128 MiB stack. diff --git a/dd-pack/fuzz-campaign-2026-06-08.md b/dd-pack/fuzz-campaign-2026-06-08.md index 35b9eb6..7c8a357 100644 --- a/dd-pack/fuzz-campaign-2026-06-08.md +++ b/dd-pack/fuzz-campaign-2026-06-08.md @@ -33,7 +33,22 @@ coordinate field now rejects whole-segment `.`/`..` components (new tests + the crash input tracked as a permanent known-crash seed. Verified: crashes before the fix, passes after. -## Finding 2 — airflow `dynamic_markers` stack-overflow (TRIAGED — deferred to ferro-air) +## Finding 2 — airflow `dynamic_markers` stack-overflow — ✅ CLOSED (v1.0.1, 2026-06-16) + +> **Closure (2026-06-16, ferro-airflow-dag-parser v1.0.1):** the source fix is +> no longer deferred. FerroAir's complete three-layer recursion guard was +> ported into `panic_safe.rs` — bracket pre-scan (cap 256) + a single +> real-tokenizer pass bounding `brackets + op_run + line_right_rec + indent` +> (cap 1024, with `op_run` counting a run across *all* mixed prefix operators +> so the alternating `-`/`not`/`[` shape below no longer slips through) + +> parse/walk on a dedicated 128 MiB stack. The recursive AST walkers also +> truncate past depth 1024. The exact crash seed (`crash-0665b68…`) now returns +> gracefully. Regression tests: `tests/stack_safety.rs`. Honest residual +> (deep-chain AST drop on multi-MB input) is recorded in +> `dd-pack/11-known-limitations.md` §FP5. The triage below is retained for the +> historical record. + +### (historical triage) Input: ~1500 bytes alternating ~30-char runs of unary `-` with `not` keywords and `[`. `dynamic_markers_for` → `dynamic_markers.rs:157 parse_module_safely(source)?` DOES diff --git a/fuzz/known-crash/airflow_dag_extract/FINDING.md b/fuzz/known-crash/airflow_dag_extract/FINDING.md index ddfc9c7..c0c619c 100644 --- a/fuzz/known-crash/airflow_dag_extract/FINDING.md +++ b/fuzz/known-crash/airflow_dag_extract/FINDING.md @@ -1,5 +1,18 @@ # airflow_dag_extract — 7 crash inputs, 1 upstream bug +> **Status (2026-06-16, ferro-airflow-dag-parser v1.0.1): ✅ HANDLED.** All 7 +> crash inputs here hit the upstream `expression.rs:1633` **t-/f-string panic** +> (an *unwinding* panic, not a stack overflow). They are now (a) rejected +> pre-parse by the lexer pass (PEP-750 t-strings) where applicable, and (b) +> otherwise caught by the shield's `catch_unwind` / thread-`join` and folded +> into `ParseError::Internal` — never a process abort. The separate +> **stack-overflow** recursion DoS (FP5 / fuzz Finding 2, including the +> `not`/`-`/`[` mixed-prefix `crash-0665b68…` seed) is also closed in v1.0.1 +> by the ported three-layer recursion guard; see +> `dd-pack/11-known-limitations.md` §FP5 and `dd-pack/fuzz-campaign-2026-06-08.md` +> Finding 2. The OOM `*-falsepos-*` seeds in this directory remain benign +> records (libFuzzer RSS-accounting artifacts, not real OOMs). + **Date**: 2026-05-03 (extended ad-hoc fuzz wave, post-SSD-install) **Target**: `ferro-protocols/fuzz/fuzz_targets/airflow_dag_extract.rs` **Time-to-first-crash**: 35 seconds (cold-start with 15,530-file corpus) diff --git a/fuzz/known-crash/airflow_dag_extract/oom-c51874bb-1ms-cumulative-rss-falsepos-2026-06-14 b/fuzz/known-crash/airflow_dag_extract/oom-c51874bb-1ms-cumulative-rss-falsepos-2026-06-14 new file mode 100644 index 0000000..013599a Binary files /dev/null and b/fuzz/known-crash/airflow_dag_extract/oom-c51874bb-1ms-cumulative-rss-falsepos-2026-06-14 differ