From 74f630a41d3189171cff7e4b93a2a3c8bae66675 Mon Sep 17 00:00:00 2001 From: Sam Crauwels Date: Mon, 13 Apr 2026 10:25:57 +0200 Subject: [PATCH 1/8] fix(molecule): unbreak elasticsearch_diagnostics under ansible-core 2.19 The scenario reached restart_and_verify_elasticsearch.yml via include_tasks with an absolute path, so it ran outside any role context. In 2.19 that stopped working because the included file resolves {{ role_path }} lazily, and role_path no longer leaks across play boundaries from the earlier include_role. Switching both callers to include_role with tasks_from re-establishes the role context and drops the env-lookup path construction along the way. --- molecule/elasticsearch_diagnostics/converge.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/molecule/elasticsearch_diagnostics/converge.yml b/molecule/elasticsearch_diagnostics/converge.yml index ae0013e..4d0461e 100644 --- a/molecule/elasticsearch_diagnostics/converge.yml +++ b/molecule/elasticsearch_diagnostics/converge.yml @@ -36,8 +36,9 @@ line: "bogus.nonexistent.setting: true" - name: Attempt restart with bad config (should fail with diagnostics) - ansible.builtin.include_tasks: - file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml" + ansible.builtin.include_role: + name: oddly.elasticstack.elasticsearch + tasks_from: restart_and_verify_elasticsearch.yml - name: This should not be reached ansible.builtin.fail: @@ -72,5 +73,6 @@ seconds: 5 - name: Restart Elasticsearch with restored config - ansible.builtin.include_tasks: - file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml" + ansible.builtin.include_role: + name: 
oddly.elasticstack.elasticsearch + tasks_from: restart_and_verify_elasticsearch.yml From 7a046affef42b17f016d3db75ab3e0f52e4aa098 Mon Sep 17 00:00:00 2001 From: Sam Crauwels Date: Mon, 13 Apr 2026 10:26:04 +0200 Subject: [PATCH 2/8] fix(ci): stop collect-diagnostics flagging unrelated OOMs The OOM report was crying wolf because the collector pulled logs from every running container on the Incus host and the OOM detection itself scanned the shared kernel dmesg ring buffer without a time filter. A single matrix job therefore inherited both its concurrent siblings' logs and weeks of historical host-level OOMs, and the summary lit up red on runs that had no memory issue. Now the container discovery and the cgroup memory report both filter by MOLECULE_RUN_SUFFIX so we only look at containers this job created, and the OOM summary parses the bracketed timestamps on dmesg and ES log lines and drops anything older than three hours. --- .../actions/collect-diagnostics/action.yml | 57 +++++++++++++++---- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/.github/actions/collect-diagnostics/action.yml b/.github/actions/collect-diagnostics/action.yml index 28dc326..e7a5964 100644 --- a/.github/actions/collect-diagnostics/action.yml +++ b/.github/actions/collect-diagnostics/action.yml @@ -28,9 +28,16 @@ runs: -i "$SSH_KEY" "root@${INCUS_HOST}" "$@" 2>/dev/null || true } - # Discover running Incus containers managed by molecule - containers=$(ssh_host \ + # Discover running Incus containers managed by molecule. + # Filter by MOLECULE_RUN_SUFFIX so concurrent matrix jobs on the same + # Incus host don't cross-contaminate this job's diagnostics. 
+ all_containers=$(ssh_host \ "incus list --format csv -c n,s | grep ',RUNNING' | cut -d, -f1" || true) + if [ -n "${MOLECULE_RUN_SUFFIX:-}" ]; then + containers=$(echo "$all_containers" | grep -- "${MOLECULE_RUN_SUFFIX}$" || true) + else + containers="$all_containers" + fi for name in $containers; do dir="$diag/service-logs/$name" @@ -105,13 +112,16 @@ runs: } > "$dir/container-resources.txt" 2>&1 || true done - # Container memory report with cgroup OOM events (from Incus host) - ssh_host bash -s << 'MEMSCRIPT' \ + # Container memory report with cgroup OOM events (from Incus host). + # Filter by run suffix to skip concurrent matrix jobs' containers. + ssh_host bash -s "${MOLECULE_RUN_SUFFIX:-}" << 'MEMSCRIPT' \ > "$diag/container-memory-report.txt" 2>&1 || true + suffix="$1" printf "%-50s %10s %10s %10s %10s %10s %s\n" \ "CONTAINER" "CURRENT" "PEAK" "ANON" "FILE" "LIMIT" "OOM_EVENTS" echo "---" for c in $(incus list -f csv -c ns 2>/dev/null | grep ",RUNNING" | cut -d, -f1 | sort); do + [ -n "$suffix" ] && [[ "$c" != *"$suffix" ]] && continue cgdir="/sys/fs/cgroup/lxc.payload.${c}" [ -f "$cgdir/memory.current" ] || continue current=$(($(cat "$cgdir/memory.current") / 1048576)) @@ -147,6 +157,12 @@ runs: report="$diag/OOM-REPORT.txt" found_oom=false + # LXC containers share the host kernel ring buffer and some log + # paths, so dmesg/ES log OOM entries can predate this job by days. + # Ignore anything older than this cutoff to avoid false positives. + since_epoch=$(date -u -d '3 hours ago' +%s) + since_iso=$(date -u -d '3 hours ago' +%Y-%m-%dT%H:%M:%S) + { echo "========================================" echo " OOM / Memory Pressure Detection Report" @@ -178,26 +194,47 @@ runs: done < "$diag/container-memory-report.txt" fi - # Check ES logs for OutOfMemoryError + # Check ES logs for OutOfMemoryError (recent entries only). + # ES log prefix is [YYYY-MM-DDTHH:MM:SS,mmm]; ISO timestamps compare + # lexicographically so string >= works. POSIX awk (mawk-compatible). 
for eslog in "$diag"/service-logs/*/elasticsearch.log; do [ -f "$eslog" ] || continue container=$(basename "$(dirname "$eslog")") - if grep -q "OutOfMemoryError" "$eslog" 2>/dev/null; then + recent=$(awk -v since="$since_iso" ' + /OutOfMemoryError/ { + if (match($0, /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]:[0-9][0-9]:[0-9][0-9]/)) { + ts = substr($0, RSTART+1, 19) + if (ts >= since) print + } + }' "$eslog") + if [ -n "$recent" ]; then echo echo "[ES OOM] $container — OutOfMemoryError in elasticsearch.log:" - grep -A2 "OutOfMemoryError" "$eslog" | head -10 + echo "$recent" | head -10 found_oom=true fi done - # Check dmesg for kernel OOM killer + # Check dmesg for kernel OOM killer (recent entries only). + # dmesg -T prefixes lines with a ctime-like bracketed timestamp; + # parse it and compare to the cutoff to drop host-level history. for dlog in "$diag"/service-logs/*/dmesg.log; do [ -f "$dlog" ] || continue container=$(basename "$(dirname "$dlog")") - if grep -qi "oom-killer\|out of memory\|killed process" "$dlog" 2>/dev/null; then + recent=$(awk -v cutoff="$since_epoch" ' + { line = tolower($0) } + line ~ /oom-killer|out of memory|killed process/ { + if (match($0, /^\[[^]]+\]/)) { + ts = substr($0, RSTART+1, RLENGTH-2) + cmd = "date -d \"" ts "\" +%s 2>/dev/null" + if ((cmd | getline epoch) > 0 && (epoch+0) >= (cutoff+0)) print + close(cmd) + } + }' "$dlog") + if [ -n "$recent" ]; then echo echo "[KERNEL OOM] $container — OOM killer in dmesg:" - grep -i "oom-killer\|out of memory\|killed process" "$dlog" | head -10 + echo "$recent" | head -10 found_oom=true fi done From 98e532ce01d72e702f18f2d7d643b06afb542a0f Mon Sep 17 00:00:00 2001 From: Sam Crauwels Date: Mon, 13 Apr 2026 10:32:19 +0200 Subject: [PATCH 3/8] fix(ci): fall back to upstream galaxy when cache host is unavailable Dependabot pull requests do not inherit repo secrets, so INCUS_HOST expands to an empty string and the internal collection cache URL collapses to http://:8082/..., which 
the install step then fails on with a DNS error. Every dependabot PR has been bouncing off Test Linting as a result. Each of the four workflows that install collections from the cache now exports CACHE_HOST via env and switches to installing the same pinned community.general, community.crypto and ansible.posix versions from upstream Galaxy when CACHE_HOST is empty. Scheduled and manually-triggered runs continue to use the internal cache unchanged. --- .github/workflows/molecule.yml | 14 +++++++++- .../workflows/test_elasticsearch_upgrade.yml | 28 +++++++++++++++++-- .github/workflows/test_full_stack.yml | 14 +++++++++- .github/workflows/test_linting.yml | 17 ++++++++++- 4 files changed, 68 insertions(+), 5 deletions(-) diff --git a/.github/workflows/molecule.yml b/.github/workflows/molecule.yml index 9985193..11fbb74 100644 --- a/.github/workflows/molecule.yml +++ b/.github/workflows/molecule.yml @@ -81,10 +81,22 @@ jobs: mkdir -p "$RUNNER_TEMP/ssh-cp" - name: Install collection + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} run: | mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME - ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Set up SSH key for molecule run: | diff --git 
a/.github/workflows/test_elasticsearch_upgrade.yml b/.github/workflows/test_elasticsearch_upgrade.yml index 8d4679b..dc78035 100644 --- a/.github/workflows/test_elasticsearch_upgrade.yml +++ b/.github/workflows/test_elasticsearch_upgrade.yml @@ -86,10 +86,22 @@ jobs: mkdir -p "$RUNNER_TEMP/ssh-cp" - name: Install collection + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} run: | mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME - ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Set up SSH key for molecule run: | @@ -189,10 +201,22 @@ jobs: mkdir -p "$RUNNER_TEMP/ssh-cp" - name: Install collection + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} run: | mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME - ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + 
"http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Set up SSH key for molecule run: | diff --git a/.github/workflows/test_full_stack.yml b/.github/workflows/test_full_stack.yml index f64d1d5..7665fef 100644 --- a/.github/workflows/test_full_stack.yml +++ b/.github/workflows/test_full_stack.yml @@ -114,10 +114,22 @@ jobs: mkdir -p "$RUNNER_TEMP/ssh-cp" - name: Install collection + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} run: | mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME - ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Set up SSH key for molecule run: | diff --git a/.github/workflows/test_linting.yml b/.github/workflows/test_linting.yml index 766e16e..a5e3b27 100644 --- a/.github/workflows/test_linting.yml +++ b/.github/workflows/test_linting.yml @@ -42,7 +42,22 @@ jobs: SSL_CERT_FILE: /etc/ssl/certs/ca-certificates.crt - name: Install Ansible collection dependencies. 
- run: ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz + env: + CACHE_HOST: ${{ secrets.INCUS_HOST }} + run: | + # Fall back to upstream Galaxy when the cache host secret is not + # available (e.g. Dependabot PRs, which do not inherit repo secrets). + if [ -n "$CACHE_HOST" ]; then + ansible-galaxy collection install \ + "http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \ + "http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz" + else + ansible-galaxy collection install \ + community.general:12.3.0 \ + community.crypto:3.1.1 \ + ansible.posix:2.1.0 + fi - name: Lint code (yamllint). run: | From 3b528d0ed95afe0c543a0eb56699e4defc89611b Mon Sep 17 00:00:00 2001 From: Sam Crauwels Date: Mon, 13 Apr 2026 10:32:30 +0200 Subject: [PATCH 4/8] chore(deps): pin ansible-core below 2.19 until role_path usages catch up The previous <2.21 ceiling pulled in 2.19.8, which tightened variable scoping so that role_path no longer leaks across play boundaries from an earlier include_role. The elasticsearch_diagnostics scenario tripped over this, and there are roughly fifty other {{ role_path }}/.. references in production role tasks whose behaviour under 2.19 we have not yet validated end-to-end. Capping at <2.19 buys time to convert those to include_role with tasks_from (or equivalent) before letting 2.19 through again. 
--- requirements-test.txt | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 7d7ad35..175d7bc 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,9 @@ -ansible-core>=2.18,<2.21 +# Pin below 2.19 until this collection catches up with the variable-scoping +# changes in that release: role_path no longer leaks across play boundaries, +# which breaks include_tasks callers that rely on {{ role_path }}/.. paths +# (the elasticsearch_diagnostics molecule scenario hit this). Revisit once +# the full matrix is known to pass on 2.19. +ansible-core>=2.18,<2.19 ansible-lint molecule pytest From 8eca9371f56a303546983eada73a0aafc11515e9 Mon Sep 17 00:00:00 2001 From: Sam Crauwels Date: Mon, 13 Apr 2026 10:32:36 +0200 Subject: [PATCH 5/8] chore(molecule): add placeholder default scenario to silence log noise Molecule 26 globs molecule/default/molecule.yml for shared-state discovery on every invocation and emits a CRITICAL line when the file is missing, which turns up in every molecule destroy step in CI and makes the logs harder to scan when something actually goes wrong. Every workflow passes -s <scenario>, so this placeholder is never executed; it only exists so the glob succeeds. --- molecule/default/molecule.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 molecule/default/molecule.yml diff --git a/molecule/default/molecule.yml b/molecule/default/molecule.yml new file mode 100644 index 0000000..fe17cf3 --- /dev/null +++ b/molecule/default/molecule.yml @@ -0,0 +1,12 @@ +--- +# Placeholder scenario. Molecule 26 globs molecule/default/molecule.yml on +# every run to discover shared state, and emits a CRITICAL line when it is +# absent. Every CI invocation passes -s <scenario>, so this scenario is +# never executed; the file only exists to silence that log noise.
+prerun: false +driver: + name: default +platforms: + - name: placeholder +provisioner: + name: ansible From da2348620a8caf635d9d1d2c38231c08b4831ae8 Mon Sep 17 00:00:00 2001 From: Sam Crauwels Date: Mon, 13 Apr 2026 10:38:55 +0200 Subject: [PATCH 6/8] fix(molecule): wait for logstash output content, not just file creation The custom_pipeline verify task was waiting on the output file path only, but logstash opens the file when the pipeline starts and the generator events still have to flow through batch and flush before anything is written. On slower runners (rockylinux9 + release 8 in particular, where the whole job took 17m against 2-4m for the other forty-one combos) the slurp ran before the first event had been flushed, so the assertion saw an empty file and the scenario failed intermittently. Waiting on search_regex: processed_by removes the race: the task only returns once actual content has been written. I bumped the timeout to 120s since we are now legitimately waiting on pipeline work rather than a path appearing in the filesystem. --- molecule/logstash_custom_pipeline/verify.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/molecule/logstash_custom_pipeline/verify.yml b/molecule/logstash_custom_pipeline/verify.yml index 2fc03b9..01af0dd 100644 --- a/molecule/logstash_custom_pipeline/verify.yml +++ b/molecule/logstash_custom_pipeline/verify.yml @@ -60,8 +60,13 @@ - name: Wait for Logstash to process events ansible.builtin.wait_for: path: /var/log/logstash/custom-output.log - timeout: 60 - msg: "Logstash custom output file not created" + # Wait for actual content, not just file creation: Logstash opens the + # output file when the pipeline starts but the generator events still + # need to flow through batch and flush, which is noticeably slower on + # resource-constrained runners. 
+ search_regex: processed_by + timeout: 120 + msg: "Logstash custom output did not contain processed events within 120s" - name: Check output file has data ansible.builtin.slurp: From bfb3ef2973519b427dd86d65d2e8b37a48f60393 Mon Sep 17 00:00:00 2001 From: Sam Crauwels Date: Mon, 13 Apr 2026 10:38:55 +0200 Subject: [PATCH 7/8] chore(ci): move collect-diagnostics upload to upload-artifact@v6 v4 runs on Node.js 20, which GitHub Actions is forcing off the runner on 2nd June 2026 and removing entirely on 16th September. v5 only added preliminary Node.js 24 support and still runs on Node.js 20 by default, so it would not clear the warning; v6 is the first release that runs on Node.js 24. It requires Actions Runner 2.327.1 or newer and is otherwise API-compatible for our call sites; no workflow changes needed. The deprecation warning shows up in every failed matrix job right now, which is more log noise we can remove ahead of the cutoff. --- .github/actions/collect-diagnostics/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/collect-diagnostics/action.yml b/.github/actions/collect-diagnostics/action.yml index e7a5964..1d596cb 100644 --- a/.github/actions/collect-diagnostics/action.yml +++ b/.github/actions/collect-diagnostics/action.yml @@ -265,7 +265,7 @@ runs: echo "::endgroup::" - name: Upload diagnostics - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 with: name: ${{ inputs.artifact-name }} path: /tmp/molecule-diagnostics/ From 575ba499081e928689f5e51ced32614fe65d4afc Mon Sep 17 00:00:00 2001 From: Sam Crauwels Date: Mon, 13 Apr 2026 11:34:52 +0200 Subject: [PATCH 8/8] fix(ci): exclude placeholder molecule scenario from coverage --- scripts/check-ci-coverage.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/check-ci-coverage.sh b/scripts/check-ci-coverage.sh index 33cc41b..7754177 100755 --- a/scripts/check-ci-coverage.sh +++ b/scripts/check-ci-coverage.sh @@ -10,7 +10,7 @@ MOLECULE_DIR="$REPO_ROOT/molecule" EXIT_CODE=0 # Scenarios that are not standalone tests (utility dirs, shared includes) -EXCLUDED_SCENARIOS="shared plugins" +EXCLUDED_SCENARIOS="default shared
plugins" echo "=== Molecule scenario CI coverage check ===" echo