Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 71 additions & 3 deletions ooniapi/services/ooniprobe/src/ooniprobe/prio.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,17 @@
```
"""

from typing import List, Tuple
from functools import lru_cache
import random
from typing import Annotated, Dict, List, Tuple
import logging

from fastapi import Depends
from pydantic import BaseModel

from .common.clickhouse_utils import query_click
from .common.metrics import timer
from .dependencies import ClickhouseDep

from clickhouse_driver import Client as Clickhouse
import sqlalchemy as sa
Expand Down Expand Up @@ -118,7 +124,7 @@ def fetch_prioritization_rules(clickhouse_db: Clickhouse, cc: str) -> tuple:
def generate_test_list(
clickhouse: Clickhouse,
country_code: str,
category_codes: List,
category_codes: List[str] | None,
probe_asn: int,
limit: int,
debug: bool,
Expand Down Expand Up @@ -151,7 +157,69 @@ def generate_test_list(
out.append(i)
if len(out) >= limit:
break

if debug:
return out, entries, prio_rules
return out, (), ()


class CTZ(BaseModel):
    # One row of the citizenlab table as used by the failover test list:
    # a URL together with the category code it is stored under.
    url: str
    category_code: str


def failover_fetch_citizenlab_data(clickhouse: Clickhouse) -> Dict[str, List[CTZ]]:
    """
    Load the citizenlab rows stored with cc = 'ZZ', grouped by category code.

    The result is the failover data set: it is meant to be fetched once at
    startup time and reused afterwards.

    Returns a dict mapping each category code to the list of CTZ entries
    found for it.
    """
    log.info("Started failover_fetch_citizenlab_data")

    sql = (
        "SELECT category_code, url\n"
        "FROM citizenlab\n"
        "WHERE cc = 'ZZ'\n"
    )

    by_category: Dict[str, List[CTZ]] = {}
    for row in query_click(clickhouse, sql, {}, query_prio=1):
        code = row["category_code"]
        entry = CTZ(url=row["url"], category_code=code)
        if code not in by_category:
            by_category[code] = []
        by_category[code].append(entry)

    # The count logged here is the number of categories, not of URLs.
    summary = "Fetch done: %d" % len(by_category)
    log.info(summary)
    return by_category


@lru_cache
def failover_test_lists_cache(clickhouse: ClickhouseDep):
    """
    Memoised wrapper around failover_fetch_citizenlab_data.

    Used as the FastAPI dependency behind FailoverTestListDep.
    """
    # NOTE(review): lru_cache keys on the `clickhouse` argument, so the stored
    # result is only reused when the same (hashable) client object is passed
    # again - confirm ClickhouseDep does not hand out a new client per request.
    failover_items = failover_fetch_citizenlab_data(clickhouse)
    return failover_items


# FastAPI dependency alias: a route parameter annotated with it receives the
# value returned by failover_test_lists_cache.
FailoverTestListDep = Annotated[
    Dict[str, List[CTZ]],
    Depends(failover_test_lists_cache),
]


def failover_generate_test_list(
failover_test_items: Dict[str, List[CTZ]],
category_codes: List[str] | None,
limit: int,
):
if not category_codes:
category_codes = list(failover_test_items.keys())

candidates: List[CTZ] = []
for catcode in category_codes:
if catcode not in failover_test_items:
continue
new = failover_test_items[catcode]
candidates.extend(new)

limit = min(limit, len(candidates))
selected = random.sample(candidates, k=limit)
out = [
dict(category_code=entry.category_code, url=entry.url, country_code="XX")
for entry in selected
]
return out
10 changes: 7 additions & 3 deletions ooniapi/services/ooniprobe/src/ooniprobe/routers/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ async def receive_measurement(
log.error(
f"[Try {t+1}/{N_RETRIES}] Error trying to send measurement to the fastpath. Error: {exc}"
)
sleep_time = random.uniform(0, min(3, 0.3 * 2 ** t))
sleep_time = random.uniform(0, min(3, 0.3 * 2**t))
await asyncio.sleep(sleep_time)

Metrics.SEND_FASTPATH_FAILURE.inc()
Expand Down Expand Up @@ -274,9 +274,13 @@ def compare_probe_msmt_cc_asn(
Metrics.PROBE_CC_ASN_MATCH.inc()
elif db_probe_cc != cc:
log.error(f"db_cc != cc: {db_probe_cc} != {cc}")
Metrics.PROBE_CC_ASN_NO_MATCH.labels(mismatch="cc", reported=cc, detected=db_probe_cc).inc()
Metrics.PROBE_CC_ASN_NO_MATCH.labels(
mismatch="cc", reported=cc, detected=db_probe_cc
).inc()
elif db_asn != asn:
log.error(f"db_asn != asn: {db_asn} != {asn}")
Metrics.PROBE_CC_ASN_NO_MATCH.labels(mismatch="asn", reported=asn, detected=db_asn).inc()
Metrics.PROBE_CC_ASN_NO_MATCH.labels(
mismatch="asn", reported=asn, detected=db_asn
).inc()
except Exception:
pass
102 changes: 98 additions & 4 deletions ooniapi/services/ooniprobe/src/ooniprobe/routers/v1/probe_services.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import logging
from datetime import datetime, timezone, timedelta
import time
from typing import List, Optional, Any, Dict, Tuple, Optional
from typing import Annotated, List, Optional, Any, Dict, Tuple
import random

import geoip2
import geoip2.errors
from fastapi import APIRouter, Depends, HTTPException, Response, Request
from fastapi import APIRouter, Depends, HTTPException, Query, Response, Request
from prometheus_client import Counter, Info, Gauge
from pydantic import Field

Expand All @@ -21,8 +21,8 @@
from ...common.routers import BaseModel
from ...common.auth import create_jwt, decode_jwt, jwt
from ...common.config import Settings
from ...common.utils import setnocacheresponse
from ...prio import generate_test_list
from ...common.utils import setnocacheresponse, setcacheresponse
from ...prio import FailoverTestListDep, failover_generate_test_list, generate_test_list

router = APIRouter(prefix="/v1")

Expand Down Expand Up @@ -63,6 +63,10 @@ class Metrics:
"geoip_asn_differs", "There's a mismatch between reported ASN and observed ASN"
)

TEST_LIST_URLS_COUNT = Gauge(
"test_list_urls_count", "How many urls were generated for a test list"
)


class ProbeLogin(BaseModel):
# Allow None username and password
Expand Down Expand Up @@ -590,3 +594,93 @@ def random_web_test_helpers(th_list: List[str]) -> List[Dict]:
for th_addr in th_list:
out.append({"address": th_addr, "type": "https"})
return out


class TestListUrlsMeta(BaseModel):
    # Metadata part of the /test-list/urls response. list_test_urls fills
    # only `count` with a computed value (the number of results); the other
    # fields are set to fixed placeholder values there.
    count: int
    current_page: int
    limit: int
    next_url: str
    pages: int


class TestListUrlsResult(BaseModel):
    # A single entry of the `results` list in TestListUrlsResponse.
    category_code: str
    country_code: str
    url: str


class TestListUrlsResponse(BaseModel):
    """
    URL test list
    """

    # Response body of GET /test-list/urls: metadata plus the URL entries.
    metadata: TestListUrlsMeta
    results: List[TestListUrlsResult]


@router.get("/test-list/urls")
def list_test_urls(
    clickhouse: ClickhouseDep,
    failover_test_items: FailoverTestListDep,
    response: Response,
    category_codes: Annotated[
        str | None,
        Query(
            description="Comma separated list of URL categories, all uppercase",
            # Anchored on both ends: the unanchored `[A-Z,]*` can match the
            # empty string, so it accepted every value.
            pattern=r"^[A-Z,]*$",
        ),
    ] = None,
    country_code: Annotated[
        str,
        Query(
            description="Two letter, uppercase country code",
            min_length=2,
            max_length=2,
            alias="probe_cc",
        ),
    ] = "ZZ",
    limit: Annotated[
        int, Query(description="Maximum number of URLs to return", le=9999)
    ] = -1,
    debug: Annotated[
        bool,
        Query(
            description="Include measurement counts and priority",
        ),
    ] = False,
) -> TestListUrlsResponse | Dict[str, Any]:
    """
    Generate test URL list with prioritization
    """
    # Normalise the query parameters. If anything goes wrong here the
    # endpoint answers with an empty JSON object and a no-cache response.
    try:
        country_code = country_code.upper()
        category_codes_list = category_codes.split(",") if category_codes else None
        # -1 is the "no limit requested" default; it maps to the maximum.
        if limit == -1:
            limit = 9999
    except Exception as e:
        log.error(e, exc_info=True)
        setnocacheresponse(response)
        return {}

    try:
        # probe_asn is passed as 0; the two extra return values (debug
        # details) are not part of the response and are dropped.
        test_items, _1, _2 = generate_test_list(
            clickhouse, country_code, category_codes_list, 0, limit, debug
        )
    except Exception as e:
        log.error(e, exc_info=True)
        # failover_generate_test_list runs without any database interaction
        test_items = failover_generate_test_list(
            failover_test_items, category_codes_list, limit
        )

    # TODO: remove current_page / next_url / pages ?
    Metrics.TEST_LIST_URLS_COUNT.set(len(test_items))
    out = TestListUrlsResponse(
        metadata=TestListUrlsMeta(
            count=len(test_items), current_page=-1, limit=-1, next_url="", pages=1
        ),
        results=[TestListUrlsResult(**item) for item in test_items],
    )
    setcacheresponse("1s", response)
    return out
40 changes: 34 additions & 6 deletions ooniapi/services/ooniprobe/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest
import shutil
import os
import json
from urllib.request import urlopen

from fastapi.testclient import TestClient
Expand All @@ -12,6 +13,7 @@
from clickhouse_driver import Client as ClickhouseClient

from ooniprobe.common.config import Settings
from ooniprobe.common.clickhouse_utils import insert_click
from ooniprobe.common.dependencies import get_settings
from ooniprobe.dependencies import get_s3_client
from ooniprobe.main import app
Expand Down Expand Up @@ -75,10 +77,11 @@ def client_with_bad_settings():

JWT_ENCRYPTION_KEY = "super_secure"


@pytest.fixture(scope="session")
def fixture_path():
"""
Directory for this fixtures used to store temporary data, will be
Directory for this fixtures used to store temporary data, will be
deleted after the tests are finished
"""
FIXTURE_PATH = Path(os.path.dirname(os.path.realpath(__file__))) / "data"
Expand All @@ -90,6 +93,7 @@ def fixture_path():
except FileNotFoundError:
pass


@pytest.fixture()
def geoip_db_dir(fixture_path):
ooni_tempdir = fixture_path / "geoip"
Expand All @@ -107,7 +111,9 @@ def client(clickhouse_server, test_settings, geoip_db_dir):


@pytest.fixture
def test_settings(alembic_migration, docker_ip, docker_services, geoip_db_dir, fastpath_server):
def test_settings(
alembic_migration, docker_ip, docker_services, geoip_db_dir, fastpath_server
):
port = docker_services.port_for("clickhouse", 9000)
yield make_override_get_settings(
postgresql_url=alembic_migration,
Expand All @@ -116,7 +122,7 @@ def test_settings(alembic_migration, docker_ip, docker_services, geoip_db_dir, f
clickhouse_url=f"clickhouse://test:test@{docker_ip}:{port}",
geoip_db_dir=geoip_db_dir,
collector_id="1",
fastpath_url=fastpath_server
fastpath_url=fastpath_server,
)


Expand Down Expand Up @@ -149,6 +155,7 @@ def clickhouse_server(docker_ip, docker_services):
def clickhouse_db(clickhouse_server):
yield ClickhouseClient.from_url(clickhouse_server)


class S3ClientMock:

def __init__(self) -> None:
Expand All @@ -157,9 +164,11 @@ def __init__(self) -> None:
def upload_fileobj(self, Fileobj, Bucket: str, Key: str):
self.files.append(f"{Bucket}/{Key}")


def get_s3_client_mock() -> S3ClientMock:
    """Return a newly created S3ClientMock."""
    mock_client = S3ClientMock()
    return mock_client


@pytest.fixture(scope="session")
def fastpath_server(docker_ip, docker_services):
port = docker_services.port_for("fakepath", 80)
Expand All @@ -169,9 +178,28 @@ def fastpath_server(docker_ip, docker_services):
)
yield url

def is_fastpath_running(url: str) -> bool:
    """
    Report whether opening `url` answers with HTTP status 200.

    Any error raised while opening the URL (connection refused, malformed
    URL, HTTP error status, ...) is reported as False instead of being
    propagated.
    """
    try:
        # The context manager closes the response, so repeated probing does
        # not leave connections open.
        with urlopen(url) as resp:
            return resp.status == 200
    except Exception:
        # Deliberately broad, but unlike a bare `except:` it lets
        # KeyboardInterrupt and SystemExit through.
        return False


@pytest.fixture
def load_url_priorities(clickhouse_db):
    """
    Insert the rows of tests/fixtures/data/url_priorities_us.json into the
    url_priorities table through `clickhouse_db`.
    """
    fixture_file = Path("tests/fixtures/data") / "url_priorities_us.json"
    with fixture_file.open("r") as fh:
        rows = json.load(fh)

    # Without an explicit value 'sign' is created as 0, which causes a db
    # error; force it to 1 on every row.
    for row in rows:
        row["sign"] = 1

    query = "INSERT INTO url_priorities (sign, category_code, cc, domain, url, priority) VALUES"
    insert_click(clickhouse_db, query, rows)
3 changes: 1 addition & 2 deletions ooniapi/services/ooniprobe/tests/fakepath/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,4 @@

@app.get("/")
def health():
    # Health-check route of the fake fastpath app: no payload, just None.
    return None
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ INSERT INTO citizenlab VALUES ('www.ushmm.org', 'https://www.ushmm.org/', 'ZZ',
INSERT INTO citizenlab VALUES ('www.cabofrio.rj.gov.br', 'http://www.cabofrio.rj.gov.br/', 'BR', 'CULTR');
INSERT INTO citizenlab VALUES ('ncac.org', 'http://ncac.org/', 'ZZ', 'NEWS');
INSERT INTO citizenlab VALUES ('ncac.org', 'https://ncac.org/', 'ZZ', 'NEWS');
INSERT INTO citizenlab VALUES ('ncacd.org', 'https://ncacd.org/', 'ZZ', 'NEWS');
INSERT INTO citizenlab VALUES ('www.facebook.com','http://www.facebook.com/saakashvilimikheil','ge','NEWS');
INSERT INTO citizenlab VALUES ('www.facebook.com','http://www.facebook.com/somsakjeam/videos/1283095981743678/','th','POLR');
INSERT INTO citizenlab VALUES ('www.facebook.com','https://www.facebook.com/','ZZ','GRP');
Expand All @@ -17,8 +18,7 @@ INSERT INTO citizenlab VALUES ('facebook.com','https://facebook.com/watch','jo',
INSERT INTO citizenlab VALUES ('twitter.com','http://twitter.com/ghonim','kw','POLR');
INSERT INTO citizenlab VALUES ('twitter.com','http://twitter.com/ghonim','so','POLR');
INSERT INTO citizenlab VALUES ('twitter.com','https://twitter.com/','ZZ','GRP');
INSERT INTO citizenlab VALUES ('twitter.com','https://twitter.com/funny','ZZ','HUMR');

-- get_measurement_meta integ tests
INSERT INTO jsonl (report_id, input, s3path, linenum) VALUES ('20210709T004340Z_webconnectivity_MY_4818_n1_YCM7J9mGcEHds2K3', 'https://www.backtrack-linux.org/', 'raw/20210709/00/MY/webconnectivity/2021070900_MY_webconnectivity.n0.2.jsonl.gz', 35)


Loading