Commit 40c7b6b

Add backend validation and migration for failOnContentCheck (#2844)

Fixes #2838

- Ensure `failOnContentCheck` cannot be set in the crawlconfig POST and PATCH endpoints unless a browser profile is also set; otherwise return 400 with detail `fail_on_content_check_requires_profile`
- Add a migration that unsets `failOnContentCheck` for any workflows in the database that currently have the flag enabled without a profile configured
1 parent 59c11ab commit 40c7b6b
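
From a client's perspective, the new check means a POST that enables failOnContentCheck without a browser profile is rejected with HTTP 400. Below is a minimal sketch of that behavior, assuming a running Browsertrix backend; API_PREFIX, ORG_ID, and AUTH_HEADERS are placeholders, not part of this commit.

import requests

# Placeholders -- substitute a real deployment URL, org ID, and auth token.
API_PREFIX = "https://btrix.example.com/api"
ORG_ID = "<org-uuid>"
AUTH_HEADERS = {"Authorization": "Bearer <access-token>"}

# Try to create a workflow with failOnContentCheck enabled but no profileid.
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs/",
    headers=AUTH_HEADERS,
    json={
        "runNow": False,
        "name": "No-profile workflow",
        "config": {
            "seeds": [{"url": "https://example.com/"}],
            "failOnContentCheck": True,
        },
    },
)

# As of this commit, the backend rejects the request.
assert r.status_code == 400
assert r.json()["detail"] == "fail_on_content_check_requires_profile"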

File tree

6 files changed: +237 -6 lines changed


backend/btrixcloud/crawlconfigs.py

Lines changed: 28 additions & 0 deletions
@@ -259,6 +259,11 @@ async def add_crawl_config(
         # ensure profile is valid, if provided
         if profileid:
             await self.profiles.get_profile(profileid, org)
+        else:
+            if config_in.config and config_in.config.failOnContentCheck:
+                raise HTTPException(
+                    status_code=400, detail="fail_on_content_check_requires_profile"
+                )
 
         # ensure proxyId is valid and available for org
         if config_in.proxyId:
@@ -525,6 +530,29 @@ async def update_crawl_config(
                 status_code=400, detail="use_one_of_seeds_or_seedfile"
             )
 
+        # If adding failOnContentCheck, ensure a profile is set
+        if update.config and update.config.failOnContentCheck:
+            if (orig_crawl_config.profileid and update.profileid == "") or (
+                not orig_crawl_config.profileid and not update.profileid
+            ):
+                raise HTTPException(
+                    status_code=400, detail="fail_on_content_check_requires_profile"
+                )
+
+        # If unsetting profile, ensure failOnContentCheck is also not set
+        if orig_crawl_config.profileid and update.profileid == "":
+            if update.config and update.config.failOnContentCheck:
+                raise HTTPException(
+                    status_code=400, detail="fail_on_content_check_requires_profile"
+                )
+            if orig_crawl_config.config.failOnContentCheck:
+                if not update.config or (
+                    update.config and update.config.failOnContentCheck is not False
+                ):
+                    raise HTTPException(
+                        status_code=400, detail="fail_on_content_check_requires_profile"
+                    )
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
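
The PATCH-time branches above are easier to reason about as a single invariant: after the update is applied, a workflow must not end up with failOnContentCheck enabled while no profile is set. Below is a condensed sketch of that invariant as a hypothetical helper (illustration only, not part of the commit; it checks the resulting state rather than mirroring each branch, so it would also flag pre-existing violations that the migration below cleans up).

from typing import Optional


def violates_content_check_rule(
    orig_profileid: Optional[str],
    orig_fail_on_check: bool,
    update_profileid: Optional[str],
    update_fail_on_check: Optional[bool],
) -> bool:
    """Return True if the PATCH would leave failOnContentCheck enabled without a profile."""
    # profileid semantics in the update: "" unsets the profile, None leaves it unchanged
    if update_profileid == "":
        effective_profile = None
    elif update_profileid is not None:
        effective_profile = update_profileid
    else:
        effective_profile = orig_profileid

    # failOnContentCheck semantics: None (not provided) leaves the original value unchanged
    if update_fail_on_check is not None:
        effective_flag = update_fail_on_check
    else:
        effective_flag = orig_fail_on_check

    return bool(effective_flag) and not effective_profile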

backend/btrixcloud/db.py

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@
 ) = PageOps = BackgroundJobOps = FileUploadOps = CrawlLogOps = object
 
 
-CURR_DB_VERSION = "0050"
+CURR_DB_VERSION = "0051"
 
 
 # ============================================================================
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+"""
+Migration 0051 - Ensure failOnContentCheck is not set for workflows without profiles
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0051"
+
+
+# pylint: disable=duplicate-code
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+    async def migrate_up(self) -> None:
+        """Perform migration up.
+
+        Unset failOnContentCheck for workflows that don't have a profile set
+        """
+        crawl_configs_mdb = self.mdb["crawl_configs"]
+
+        try:
+            await crawl_configs_mdb.update_many(
+                {"profileid": None, "config.failOnContentCheck": True},
+                {"$set": {"config.failOnContentCheck": False}},
+            )
+        # pylint: disable=broad-exception-caught
+        except Exception as err:
+            print(
+                f"Error unsetting failOnContentCheck for configs without profiles: {err}",
+                flush=True,
+            )
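
To preview which workflows the migration will touch, the same filter it uses can be run against the crawl_configs collection directly. A sketch using pymongo, assuming direct access to the backend's MongoDB; the connection URI and database name are placeholders.

from pymongo import MongoClient

# Placeholder connection details -- substitute the deployment's MongoDB URI and database name.
client = MongoClient("mongodb://localhost:27017")
crawl_configs = client["browsertrix"]["crawl_configs"]

# Same filter the migration uses: flag enabled but no profile configured.
query = {"profileid": None, "config.failOnContentCheck": True}

print("workflows to be fixed:", crawl_configs.count_documents(query))
for doc in crawl_configs.find(query, {"_id": 1, "name": 1}):
    print(doc["_id"], doc.get("name"))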

backend/test/conftest.py

Lines changed: 33 additions & 1 deletion
@@ -332,7 +332,10 @@ def sample_crawl_data():
     return {
         "runNow": False,
         "name": "Test Crawl",
-        "config": {"seeds": [{"url": "https://example-com.webrecorder.net/"}], "extraHops": 1},
+        "config": {
+            "seeds": [{"url": "https://example-com.webrecorder.net/"}],
+            "extraHops": 1,
+        },
         "tags": ["tag1", "tag2"],
     }
 
@@ -822,6 +825,35 @@ def profile_2_id(admin_auth_headers, default_org_id, profile_browser_2_id):
     time.sleep(5)
 
 
+@pytest.fixture(scope="session")
+def profile_2_config_id(admin_auth_headers, default_org_id, profile_2_id):
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/profiles/{profile_2_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["id"] == profile_2_id
+
+    # Use profile in a workflow
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json={
+            "runNow": False,
+            "name": "Profile 2 Test Crawl",
+            "description": "Crawl using browser profile",
+            "config": {
+                "seeds": [{"url": "https://webrecorder.net/"}],
+                "exclude": "community",
+            },
+            "profileid": profile_2_id,
+        },
+    )
+    data = r.json()
+    return data["id"]
+
+
 @pytest.fixture(scope="session")
 def seed_file_id(crawler_auth_headers, default_org_id):
     with open(os.path.join(curr_dir, "data", "seedfile.txt"), "rb") as fh:

backend/test/test_crawlconfigs.py

Lines changed: 134 additions & 0 deletions
@@ -993,3 +993,137 @@ def test_delete_seed_file_in_use_crawlconfig(
     )
     assert r.status_code == 200
     assert r.json()["id"] == seed_file_id
+
+
+def test_add_crawl_config_fail_on_content_check_no_profile(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    # Ensure we're not able to set failOnContentCheck on a new crawlconfig
+    # if a profile is not also set
+    sample_crawl_data["config"]["failOnContentCheck"] = True
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "fail_on_content_check_requires_profile"
+
+    # Ensure an empty string doesn't count as a profile being set
+    sample_crawl_data["profileid"] = ""
+    sample_crawl_data["config"]["failOnContentCheck"] = True
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "fail_on_content_check_requires_profile"
+
+
+def test_update_crawl_config_fail_on_content_check_no_profile(
+    crawler_auth_headers, default_org_id
+):
+    # Ensure we're not able to update an existing config with no profile to
+    # enable failOnContentCheck
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_admin_crawl_cid}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "failOnContentCheck": True,
+            }
+        },
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "fail_on_content_check_requires_profile"
+
+
+def test_update_crawl_config_remove_profile_with_fail_on_content_check(
+    crawler_auth_headers, default_org_id, profile_2_config_id
+):
+    # Ensure removing a profile fails validation if failOnContentCheck is set
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "profileid": "",
+            "config": {
+                "failOnContentCheck": True,
+            },
+        },
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "fail_on_content_check_requires_profile"
+
+
+def test_update_crawl_config_fail_on_content_check_with_profile(
+    crawler_auth_headers, default_org_id, profile_2_config_id
+):
+    # Ensure we are able to update a config to enable failOnContentCheck
+    # if a profile is set for the config
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "failOnContentCheck": True,
+            }
+        },
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["settings_changed"] == True
+    assert data["metadata_changed"] == False
+
+
+def test_update_crawl_config_remove_profile_no_fail_on_content_check(
+    crawler_auth_headers, default_org_id, profile_2_id, profile_2_config_id
+):
+    # First remove failOnContentCheck
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "failOnContentCheck": False,
+            }
+        },
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["settings_changed"] == True
+    assert data["metadata_changed"] == False
+
+    # Now we should be able to remove the profile without getting an error
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/",
+        headers=crawler_auth_headers,
+        json={"profileid": ""},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["settings_changed"] == True
+    assert data["metadata_changed"] == False
+
+    # Add the profile and failOnContentCheck back
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/",
+        headers=crawler_auth_headers,
+        json={"profileid": profile_2_id, "config": {"failOnContentCheck": True}},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["settings_changed"] == True
+    assert data["metadata_changed"] == False
+
+    # Now check removing them both together
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{profile_2_config_id}/",
+        headers=crawler_auth_headers,
+        json={"profileid": "", "config": {"failOnContentCheck": False}},
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["settings_changed"] == True
+    assert data["metadata_changed"] == False

backend/test/test_filter_sort_results.py

Lines changed: 4 additions & 4 deletions
The expected counts below each increase by one to account for the additional workflow created (and then modified) by the new failOnContentCheck tests.

@@ -23,8 +23,8 @@ def test_get_config_by_modified_by(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs?modifiedBy={crawler_userid}",
         headers=crawler_auth_headers,
     )
-    assert len(r.json()["items"]) == 9
-    assert r.json()["total"] == 9
+    assert len(r.json()["items"]) == 10
+    assert r.json()["total"] == 10
 
 
 def test_get_configs_by_first_seed(
@@ -362,9 +362,9 @@ def test_sort_crawl_configs(
         headers=crawler_auth_headers,
     )
     data = r.json()
-    assert data["total"] == 15
+    assert data["total"] == 16
     items = data["items"]
-    assert len(items) == 15
+    assert len(items) == 16
 
     last_created = None
     for config in items:
