Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions tests/storage/linstor/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@
import pytest

import functools
import json
import logging
import os

import lib.commands as commands
from lib.common import safe_split

# explicit import for package-scope fixtures
from pkgfixtures import pool_with_saved_yum_state
Expand All @@ -16,6 +18,7 @@
if TYPE_CHECKING:
from lib.host import Host
from lib.pool import Pool
from lib.vm import VM

GROUP_NAME = 'linstor_group'
STORAGE_POOL_NAME = f'{GROUP_NAME}/thin_device'
Expand Down Expand Up @@ -136,3 +139,56 @@ def vm_on_linstor_sr(host, linstor_sr, vm_ref):
yield vm
logging.info("<< Destroy VM")
vm.destroy(verify=True)

@pytest.fixture(scope='function')
def host_and_corrupted_vdi_on_linstor_sr(host, linstor_sr, vm_ref):
    """Import a VM on the LINSTOR SR and corrupt the LV backing one of its VDIs.

    Yields a ``(vm, vdi_host, volume_name)`` tuple where `vdi_host` is the pool
    member that physically holds the corrupted LV and `volume_name` is the
    LINSTOR volume name of the VDI. The VM is destroyed on teardown.
    """
    vm: VM = host.import_vm(vm_ref, sr_uuid=linstor_sr.uuid)
    pool: Pool = host.pool
    master: Host = pool.master
    # First VDI of the imported VM that resides on the SR under test.
    vdi_uuid: str = next(
        uuid for uuid in vm.vdi_uuids()
        if pool.get_vdi_sr_uuid(uuid) == linstor_sr.uuid
    )

    def get_vdi_volume_name_from_linstor() -> str:
        # The LINSTOR KV dump maps keys shaped like ".../<uuid>/volume-name"
        # to volume names; find the entry matching our VDI uuid.
        result = master.ssh([
            "linstor-kv-tool",
            "--dump-volumes",
            "-g",
            f"xcp-sr-{GROUP_NAME}_thin_device"
        ])
        volumes = json.loads(result)
        for k, v in volumes.items():
            path = safe_split(k, "/")
            if len(path) < 4:
                continue
            uuid = path[2]
            data_type = path[3]
            if uuid == vdi_uuid and data_type == "volume-name":
                return v
        raise FileNotFoundError(f"Could not find matching linstor volume for `{vdi_uuid}`")

    def get_vdi_host(path: str) -> Host:
        # The LV device node only exists on hosts holding a replica of the
        # volume; return the first such host.
        for h in pool.hosts:
            result = h.ssh(["test", "-e", path], simple_output=False, check=False)
            if result.returncode == 0:
                return h
        raise FileNotFoundError(f"Could not find matching host for `{vdi_uuid}`")

    try:
        volume_name = get_vdi_volume_name_from_linstor()
        lv_path = f"/dev/{GROUP_NAME}/{volume_name}_00000"
        vdi_host = get_vdi_host(lv_path)
        # Fix: log the host actually being corrupted. The original logged
        # `host` (the fixture's import host), which is not necessarily the
        # host holding the LV being overwritten.
        logging.info(f"[{vdi_host}]: corrupting `{lv_path}`")
        vdi_host.ssh([
            "dd",
            "if=/dev/urandom",
            f"of={lv_path}",
            "bs=4096",
            # Lower values seems to go undetected sometimes
            "count=10000"  # ~40MB
        ])
        yield vm, vdi_host, volume_name
    finally:
        logging.info("<< Destroy corrupted VDI")
        vm.destroy(verify=True)
79 changes: 79 additions & 0 deletions tests/storage/linstor/test_linstor_sr.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pytest

import json
import logging
import time

Expand Down Expand Up @@ -52,6 +53,29 @@ def test_create_and_destroy_sr(self, pool_with_linstor, provisioning_type, stora
vm.destroy(verify=True)
sr.destroy(verify=True)


def get_drbd_status(host, resource):
    """Return the DRBD status of `resource` on `host`, parsed from JSON."""
    logging.debug(f"[{host}] Fetching DRBD status for resource `{resource}`...")
    raw_status = host.ssh(["drbdsetup", "status", resource, "--json"])
    return json.loads(raw_status)

def get_corrupted_resources(host, resource):
    """Collect (resource_name, peer_name, out_of_sync) for every peer device
    of `resource` that reports out-of-sync blocks, as seen from `host`."""
    corrupted = []
    for res in get_drbd_status(host, resource):
        for conn in res.get("connections", []):
            for peer in conn.get("peer_devices", []):
                out_of_sync = peer.get("out-of-sync", 0)
                if out_of_sync > 0:
                    corrupted.append((
                        res.get("name", ""),
                        conn.get("name", ""),
                        out_of_sync,
                    ))
    return corrupted

def wait_sync(host, resource):
    """Block until DRBD reports `resource` as fully synchronized on `host`."""
    logging.info(f"[{host}] Waiting for DRBD sync on resource `{resource}`...")
    command = ["drbdadm", "wait-sync", resource]
    host.ssh(command)


@pytest.mark.usefixtures("linstor_sr")
class TestLinstorSR:
@pytest.mark.quicktest
Expand Down Expand Up @@ -88,6 +112,61 @@ def test_snapshot(self, vm_on_linstor_sr):
finally:
vm.shutdown(verify=True)

    @pytest.mark.small_vm
    def test_resynchronization(self, host_and_corrupted_vdi_on_linstor_sr):
        """End-to-end check that a corrupted DRBD replica can be detected and repaired.

        The fixture corrupts the LV backing a VDI on `host`. This test runs
        `drbdadm verify` from a healthy peer, confirms the corruption is
        reported, invalidates the corrupted replica so it resynchronizes,
        then boots the VM to confirm the VDI is usable again.
        """
        (vm, host, resource_name) = host_and_corrupted_vdi_on_linstor_sr
        hostname = host.hostname()

        try:
            # Elect a pool host whose replica of the resource is UpToDate:
            # verification and repair must be driven from a healthy peer.
            other_host = next(
                next(h for h in host.pool.hosts if h.hostname() == conn.get("name", ""))
                for res in get_drbd_status(host, resource_name)
                for conn in res.get("connections", [])
                for peer in conn.get("peer_devices", [])
                if peer.get("peer-disk-state", "") == "UpToDate"
            )
            logging.info(f"Elected `{other_host}` as peer for verification and repair")
        except StopIteration:
            pytest.fail("Could not find an UpToDate peer host")

        corrupted = None
        max_attempts = 3
        # Attempting several times since testing revealed `drbdadm verify` can be flaky
        for attempt in range(1, max_attempts + 1):
            logging.info(f"`drbdadm verify` attempt {attempt}/{max_attempts}")
            logging.info(f"[{other_host}] Running DRBD verify for `{resource_name}`...")
            # `<resource>:<peer-hostname>/0` restricts the verify to the
            # connection toward the corrupted host, volume 0.
            other_host.ssh(["drbdadm", "verify", f"{resource_name}:{hostname}/0"])
            wait_sync(other_host, resource_name)

            corrupted_resources = get_corrupted_resources(other_host, resource_name)
            if not corrupted_resources:
                logging.warning(f"No corrupted resources found on attempt #{attempt}")
                continue
            # Only accept corruption reported against the expected
            # resource/peer pair; anything else is ignored.
            for res_name, peer_name, out_of_sync in corrupted_resources:
                if res_name == resource_name and peer_name == hostname:
                    corrupted = (res_name, peer_name, out_of_sync)
            if corrupted:
                break
        if not corrupted:
            pytest.fail(f"Failed to identify corrupted resource after {max_attempts} attempts")

        logging.info(f"Invalidating remote resource `{resource_name}`...")
        # NOTE(review): with `--reset-bitmap=no` only the blocks flagged
        # out-of-sync by the preceding verify should be resynced, not the
        # whole device — confirm against the drbdadm manual.
        other_host.ssh([
            "drbdadm", "invalidate-remote",
            f"{resource_name}:{hostname}/0",
            "--reset-bitmap=no"
        ])
        wait_sync(other_host, resource_name)
        if get_corrupted_resources(other_host, resource_name):
            pytest.fail("Corrupted resource did not get fixed")

        # Finally make sure the repaired VDI still holds a bootable, usable VM.
        vm.start(on=host.uuid)
        try:
            vm.wait_for_os_booted()
            vm.test_snapshot_on_running_vm()
        finally:
            vm.shutdown(verify=True)

# *** tests with reboots (longer tests).

@pytest.mark.reboot
Expand Down