Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 48 additions & 11 deletions .github/actions/collect-diagnostics/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,16 @@ runs:
-i "$SSH_KEY" "root@${INCUS_HOST}" "$@" 2>/dev/null || true
}

# Discover running Incus containers managed by molecule
containers=$(ssh_host \
# Discover running Incus containers managed by molecule.
# Filter by MOLECULE_RUN_SUFFIX so concurrent matrix jobs on the same
# Incus host don't cross-contaminate this job's diagnostics.
all_containers=$(ssh_host \
"incus list --format csv -c n,s | grep ',RUNNING' | cut -d, -f1" || true)
if [ -n "${MOLECULE_RUN_SUFFIX:-}" ]; then
containers=$(echo "$all_containers" | grep -- "${MOLECULE_RUN_SUFFIX}$" || true)
else
containers="$all_containers"
fi

for name in $containers; do
dir="$diag/service-logs/$name"
Expand Down Expand Up @@ -105,13 +112,16 @@ runs:
} > "$dir/container-resources.txt" 2>&1 || true
done

# Container memory report with cgroup OOM events (from Incus host)
ssh_host bash -s << 'MEMSCRIPT' \
# Container memory report with cgroup OOM events (from Incus host).
# Filter by run suffix to skip concurrent matrix jobs' containers.
ssh_host bash -s "${MOLECULE_RUN_SUFFIX:-}" << 'MEMSCRIPT' \
> "$diag/container-memory-report.txt" 2>&1 || true
suffix="$1"
printf "%-50s %10s %10s %10s %10s %10s %s\n" \
"CONTAINER" "CURRENT" "PEAK" "ANON" "FILE" "LIMIT" "OOM_EVENTS"
echo "---"
for c in $(incus list -f csv -c ns 2>/dev/null | grep ",RUNNING" | cut -d, -f1 | sort); do
[ -n "$suffix" ] && [[ "$c" != *"$suffix" ]] && continue
cgdir="/sys/fs/cgroup/lxc.payload.${c}"
[ -f "$cgdir/memory.current" ] || continue
current=$(($(cat "$cgdir/memory.current") / 1048576))
Expand Down Expand Up @@ -147,6 +157,12 @@ runs:
report="$diag/OOM-REPORT.txt"
found_oom=false

# LXC containers share the host kernel ring buffer and some log
# paths, so dmesg/ES log OOM entries can predate this job by days.
# Ignore anything older than this cutoff to avoid false positives.
since_epoch=$(date -u -d '3 hours ago' +%s)
since_iso=$(date -u -d '3 hours ago' +%Y-%m-%dT%H:%M:%S)

{
echo "========================================"
echo " OOM / Memory Pressure Detection Report"
Expand Down Expand Up @@ -178,26 +194,47 @@ runs:
done < "$diag/container-memory-report.txt"
fi

# Check ES logs for OutOfMemoryError
# Check ES logs for OutOfMemoryError (recent entries only).
# ES log prefix is [YYYY-MM-DDTHH:MM:SS,mmm]; ISO timestamps compare
# lexicographically so string >= works. POSIX awk (mawk-compatible).
for eslog in "$diag"/service-logs/*/elasticsearch.log; do
[ -f "$eslog" ] || continue
container=$(basename "$(dirname "$eslog")")
if grep -q "OutOfMemoryError" "$eslog" 2>/dev/null; then
recent=$(awk -v since="$since_iso" '
/OutOfMemoryError/ {
if (match($0, /\[[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]T[0-9][0-9]:[0-9][0-9]:[0-9][0-9]/)) {
ts = substr($0, RSTART+1, 19)
if (ts >= since) print
}
}' "$eslog")
if [ -n "$recent" ]; then
echo
echo "[ES OOM] $container — OutOfMemoryError in elasticsearch.log:"
grep -A2 "OutOfMemoryError" "$eslog" | head -10
echo "$recent" | head -10
found_oom=true
fi
done

# Check dmesg for kernel OOM killer
# Check dmesg for kernel OOM killer (recent entries only).
# dmesg -T prefixes lines with a ctime-like bracketed timestamp;
# parse it and compare to the cutoff to drop host-level history.
for dlog in "$diag"/service-logs/*/dmesg.log; do
[ -f "$dlog" ] || continue
container=$(basename "$(dirname "$dlog")")
if grep -qi "oom-killer\|out of memory\|killed process" "$dlog" 2>/dev/null; then
recent=$(awk -v cutoff="$since_epoch" '
{ line = tolower($0) }
line ~ /oom-killer|out of memory|killed process/ {
if (match($0, /^\[[^]]+\]/)) {
ts = substr($0, RSTART+1, RLENGTH-2)
cmd = "date -d \"" ts "\" +%s 2>/dev/null"
if ((cmd | getline epoch) > 0 && (epoch+0) >= (cutoff+0)) print
close(cmd)
}
}' "$dlog")
Comment on lines +224 to +233
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Handle plain dmesg output in the new cutoff parser.

Lines 55-57 still fall back to raw dmesg, but this awk block only accepts human-readable timestamps that date -d can parse. On hosts where dmesg -T is unavailable, every raw [12345.678] OOM line is dropped here, so kernel OOMs disappear from the report. Please either capture a parseable timestamp at collection time or record enough host-boot metadata to translate raw seconds before applying the cutoff.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In @.github/actions/collect-diagnostics/action.yml around lines 224 - 233, The
awk block that computes "recent" from "$dlog" drops raw dmesg lines like
"[12345.678]" because it only handles human-readable timestamps; update the
logic that builds "recent" to also accept raw dmesg timestamps by obtaining the
system boot epoch (e.g., btime from /proc/stat) and passing it into the awk
invocation so the parser can recognize /^\[[0-9]+\.[0-9]+\]/, extract the
seconds-since-boot, add btime to produce an epoch, compare that to "cutoff", and
still keep the existing human-readable timestamp branch; ensure you reference
the same variables used in the diff ("recent", "cutoff", "dlog") when wiring the
btime value into awk so kernel OOM lines aren’t dropped.

if [ -n "$recent" ]; then
echo
echo "[KERNEL OOM] $container — OOM killer in dmesg:"
grep -i "oom-killer\|out of memory\|killed process" "$dlog" | head -10
echo "$recent" | head -10
found_oom=true
fi
done
Expand Down Expand Up @@ -228,7 +265,7 @@ runs:
echo "::endgroup::"

- name: Upload diagnostics
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v5
with:
name: ${{ inputs.artifact-name }}
path: /tmp/molecule-diagnostics/
Expand Down
14 changes: 13 additions & 1 deletion .github/workflows/molecule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,22 @@ jobs:
mkdir -p "$RUNNER_TEMP/ssh-cp"

- name: Install collection
env:
CACHE_HOST: ${{ secrets.INCUS_HOST }}
run: |
mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE
cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME
ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz
if [ -n "$CACHE_HOST" ]; then
ansible-galaxy collection install \
"http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \
"http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \
"http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz"
else
ansible-galaxy collection install \
community.general:12.3.0 \
community.crypto:3.1.1 \
ansible.posix:2.1.0
fi

- name: Set up SSH key for molecule
run: |
Expand Down
28 changes: 26 additions & 2 deletions .github/workflows/test_elasticsearch_upgrade.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,22 @@ jobs:
mkdir -p "$RUNNER_TEMP/ssh-cp"

- name: Install collection
env:
CACHE_HOST: ${{ secrets.INCUS_HOST }}
run: |
mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE
cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME
ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz
if [ -n "$CACHE_HOST" ]; then
ansible-galaxy collection install \
"http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \
"http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \
"http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz"
else
ansible-galaxy collection install \
community.general:12.3.0 \
community.crypto:3.1.1 \
ansible.posix:2.1.0
fi

- name: Set up SSH key for molecule
run: |
Expand Down Expand Up @@ -189,10 +201,22 @@ jobs:
mkdir -p "$RUNNER_TEMP/ssh-cp"

- name: Install collection
env:
CACHE_HOST: ${{ secrets.INCUS_HOST }}
run: |
mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE
cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME
ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz
if [ -n "$CACHE_HOST" ]; then
ansible-galaxy collection install \
"http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \
"http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \
"http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz"
else
ansible-galaxy collection install \
community.general:12.3.0 \
community.crypto:3.1.1 \
ansible.posix:2.1.0
fi

- name: Set up SSH key for molecule
run: |
Expand Down
14 changes: 13 additions & 1 deletion .github/workflows/test_full_stack.yml
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,22 @@ jobs:
mkdir -p "$RUNNER_TEMP/ssh-cp"

- name: Install collection
env:
CACHE_HOST: ${{ secrets.INCUS_HOST }}
run: |
mkdir -p $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE
cp -a "$GITHUB_WORKSPACE" $ANSIBLE_COLLECTIONS_PATH/ansible_collections/$COLLECTION_NAMESPACE/$COLLECTION_NAME
ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz
if [ -n "$CACHE_HOST" ]; then
ansible-galaxy collection install \
"http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \
"http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \
"http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz"
else
ansible-galaxy collection install \
community.general:12.3.0 \
community.crypto:3.1.1 \
ansible.posix:2.1.0
fi

- name: Set up SSH key for molecule
run: |
Expand Down
17 changes: 16 additions & 1 deletion .github/workflows/test_linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,22 @@ jobs:
SSL_CERT_FILE: /etc/ssl/certs/ca-certificates.crt

- name: Install Ansible collection dependencies.
run: ansible-galaxy collection install http://${{ secrets.INCUS_HOST }}:8082/collections/community-general-12.3.0.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/community-crypto-3.1.1.tar.gz http://${{ secrets.INCUS_HOST }}:8082/collections/ansible-posix-2.1.0.tar.gz
env:
CACHE_HOST: ${{ secrets.INCUS_HOST }}
run: |
# Fall back to upstream Galaxy when the cache host secret is not
# available (e.g. Dependabot PRs, which do not inherit repo secrets).
if [ -n "$CACHE_HOST" ]; then
ansible-galaxy collection install \
"http://${CACHE_HOST}:8082/collections/community-general-12.3.0.tar.gz" \
"http://${CACHE_HOST}:8082/collections/community-crypto-3.1.1.tar.gz" \
"http://${CACHE_HOST}:8082/collections/ansible-posix-2.1.0.tar.gz"
else
ansible-galaxy collection install \
community.general:12.3.0 \
community.crypto:3.1.1 \
ansible.posix:2.1.0
fi

- name: Lint code (yamllint).
run: |
Expand Down
12 changes: 12 additions & 0 deletions molecule/default/molecule.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# Placeholder scenario. Molecule 26 globs molecule/default/molecule.yml on
# every run to discover shared state, and emits a CRITICAL line when it is
# absent. Every CI invocation passes -s <scenario>, so this scenario is
# never executed; the file only exists to silence that log noise.
prerun: false
driver:
  name: default
platforms:
  - name: placeholder
provisioner:
  name: ansible
Comment on lines +2 to +12
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

molecule/default is incomplete and currently breaks CI coverage checks.

The run is failing because this new scenario has no verify.yml. Please add a minimal molecule/default/verify.yml (or explicitly exclude this placeholder scenario from the coverage checker).

Minimal fix (new file)
+++ b/molecule/default/verify.yml
+---
+- name: Placeholder verify
+  hosts: all
+  gather_facts: false
+  tasks:
+    - name: Placeholder assertion
+      ansible.builtin.assert:
+        that:
+          - true

As per coding guidelines, "molecule/**: Verify molecule scenarios follow project conventions: prefer extending existing verify.yml over creating new scenarios (each adds ~10 min CI). Check that verify assertions are meaningful and will catch regressions."

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Placeholder scenario. Molecule 26 globs molecule/default/molecule.yml on
# every run to discover shared state, and emits a CRITICAL line when it is
# absent. Every CI invocation passes -s <scenario>, so this scenario is
# never executed; the file only exists to silence that log noise.
prerun: false
driver:
  name: default
platforms:
  - name: placeholder
provisioner:
  name: ansible
---
- name: Placeholder verify
  hosts: all
  gather_facts: false
  tasks:
    - name: Placeholder assertion
      ansible.builtin.assert:
        that:
          - true
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@molecule/default/molecule.yml` around lines 2 - 12, Add a minimal Molecule
verify playbook for the placeholder scenario so CI coverage passes: create
molecule/default/verify.yml that either includes/extends the project's shared
verify playbook or contains a tiny Ansible play that runs the standard
idempotence/connection checks (e.g., a hosts: all play with a simple assert or
ping task), ensuring it aligns with the existing convention of reusing the
shared verify logic rather than duplicating heavy flows; reference the
placeholder scenario (platform name "placeholder", prerun: false, driver name
"default") when adding the verify.yml so the coverage checker recognizes and
validates this scenario.

10 changes: 6 additions & 4 deletions molecule/elasticsearch_diagnostics/converge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@
line: "bogus.nonexistent.setting: true"

- name: Attempt restart with bad config (should fail with diagnostics)
ansible.builtin.include_tasks:
file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml"
ansible.builtin.include_role:
name: oddly.elasticstack.elasticsearch
tasks_from: restart_and_verify_elasticsearch.yml

- name: This should not be reached
ansible.builtin.fail:
Expand Down Expand Up @@ -72,5 +73,6 @@
seconds: 5

- name: Restart Elasticsearch with restored config
ansible.builtin.include_tasks:
file: "{{ lookup('env', 'ANSIBLE_COLLECTIONS_PATH') | default(lookup('env', 'HOME') + '/.ansible/collections', true) }}/ansible_collections/oddly/elasticstack/roles/elasticsearch/tasks/restart_and_verify_elasticsearch.yml"
ansible.builtin.include_role:
name: oddly.elasticstack.elasticsearch
tasks_from: restart_and_verify_elasticsearch.yml
9 changes: 7 additions & 2 deletions molecule/logstash_custom_pipeline/verify.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,13 @@
- name: Wait for Logstash to process events
ansible.builtin.wait_for:
path: /var/log/logstash/custom-output.log
timeout: 60
msg: "Logstash custom output file not created"
# Wait for actual content, not just file creation: Logstash opens the
# output file when the pipeline starts but the generator events still
# need to flow through batch and flush, which is noticeably slower on
# resource-constrained runners.
search_regex: processed_by
timeout: 120
msg: "Logstash custom output did not contain processed events within 120s"

- name: Check output file has data
ansible.builtin.slurp:
Expand Down
7 changes: 6 additions & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
ansible-core>=2.18,<2.21
# Pin below 2.19 until this collection catches up with the variable-scoping
# changes in that release: role_path no longer leaks across play boundaries,
# which breaks include_tasks callers that rely on {{ role_path }}/.. paths
# (the elasticsearch_diagnostics molecule scenario hit this). Revisit once
# the full matrix is known to pass on 2.19.
ansible-core>=2.18,<2.19
ansible-lint
molecule
pytest
Expand Down
2 changes: 1 addition & 1 deletion scripts/check-ci-coverage.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ MOLECULE_DIR="$REPO_ROOT/molecule"
EXIT_CODE=0

# Scenarios that are not standalone tests (utility dirs, shared includes)
EXCLUDED_SCENARIOS="shared plugins"
EXCLUDED_SCENARIOS="default shared plugins"

echo "=== Molecule scenario CI coverage check ==="
echo
Expand Down
Loading