From 9fbc8b7bb1959894fc6f50c72159bc162c9ce924 Mon Sep 17 00:00:00 2001 From: vladsaveliev Date: Wed, 18 Sep 2024 22:53:46 +0200 Subject: [PATCH 1/2] Split visits and downloads into two apps. Support pip compile. Add pydantic-settings --- README.md | 48 ++++-- app/app_downloads.py | 119 +++++++++++++ app/{main.py => app_visits.py} | 204 +++++++++-------------- app/{downloads/daily.py => downloads.py} | 13 +- app/downloads/__init__.py | 0 app/settings.py | 12 ++ app/utils.py | 2 + pyproject.toml | 29 +++- requirements-dev.txt | 186 ++++++++++++++++++++- requirements.txt | 180 ++++++++++++++++++-- 10 files changed, 626 insertions(+), 167 deletions(-) create mode 100644 app/app_downloads.py rename app/{main.py => app_visits.py} (78%) rename app/{downloads/daily.py => downloads.py} (98%) delete mode 100644 app/downloads/__init__.py create mode 100644 app/settings.py create mode 100644 app/utils.py diff --git a/README.md b/README.md index 7afde16..f93b782 100644 --- a/README.md +++ b/README.md @@ -10,22 +10,23 @@ Currently, there are the following endpoints that are used: ### `/version` - Information about the latest available release - - MultiQC uses this to print a log message advising if the current version is out of date, with information about how to upgrade. + - MultiQC uses this to print a log message advising if the current version is out of date, with information about + how to upgrade. - _[Planned]_: Broadcast messages - - Can be used to announce arbitrary information, such as critical changes. - - No usage currently anticipated, this is mostly a future-proofing tool. + - Can be used to announce arbitrary information, such as critical changes. + - No usage currently anticipated, this is mostly a future-proofing tool. - _[Planned]_: Module-specific warnings - - Warnings scoped to module and MultiQC version - - Will allow MultiQC to notify end users via the log if the module that they are running has serious bugs or errors. + - Warnings scoped to module and MultiQC version + - Will allow MultiQC to notify end users via the log if the module that they are running has serious bugs or errors. ### `/downloads` - MultiQC package downloads across multiple sources, and, when available, different versions: - - [PyPI](https://pypi.org/project/multiqc) (additionally, split by version) - - [BioConda](https://bioconda.github.io/recipes/multiqc) (additionally, split by version) - - [DockerHub](https://hub.docker.com/r/ewels/multiqc) - - [GitHub clones](https://github.com/ewels/MultiQC/graphs/traffic) - - [BioContainers (AWS mirror)](https://api.us-east-1.gallery.ecr.aws/getRepositoryCatalogData) + - [PyPI](https://pypi.org/project/multiqc) (additionally, split by version) + - [BioConda](https://bioconda.github.io/recipes/multiqc) (additionally, split by version) + - [DockerHub](https://hub.docker.com/r/ewels/multiqc) + - [GitHub clones](https://github.com/ewels/MultiQC/graphs/traffic) + - [BioContainers (AWS mirror)](https://api.us-east-1.gallery.ecr.aws/getRepositoryCatalogData) ## Logged metrics @@ -40,11 +41,15 @@ Currently, it reports: - _[Planned]_: Installation method (pip|conda|docker|unknown) - _[Planned]_: CI environment (GitHub Actions|none) -No identifying information is collected. No IPs are logged, no information about what MultiQC is being used for or where, no sample data or metadata is transferred. All code in both MultiQC and this API is open source and can be inspected. +No identifying information is collected. No IPs are logged, no information about what MultiQC is being used for or +where, no sample data or metadata is transferred. All code in both MultiQC and this API is open source and can be +inspected. -This version check can be disabled by adding `no_version_check: true` to your MultiQC config (see [docs](https://multiqc.info/docs/getting_started/config/#checks-for-new-versions)). +This version check can be disabled by adding `no_version_check: true` to your MultiQC config ( +see [docs](https://multiqc.info/docs/getting_started/config/#checks-for-new-versions)). -The request uses a very short timeout (2 seconds) and fails silently if MultiQC has no internet connection or an unexpected response is returned. +The request uses a very short timeout (2 seconds) and fails silently if MultiQC has no internet connection or an +unexpected response is returned. ## Production deployment @@ -56,6 +61,8 @@ ghcr.io/multiqc/apimultiqcinfo:latest ## Development +### Local build + > **Note:** > These instructions are intended for local development work, not a production deployment. @@ -74,10 +81,23 @@ docker compose up The API should now be available at -I recommend using something like [Postcode](https://marketplace.visualstudio.com/items?itemName=rohinivsenthil.postcode) (VSCode extension) or [httpie](https://httpie.io/) or similar. +I recommend using something +like [Postcode](https://marketplace.visualstudio.com/items?itemName=rohinivsenthil.postcode) (VSCode extension) +or [httpie](https://httpie.io/) or similar. When you're done, Ctrl+C to exit, then lean up: ```bash docker compose down ``` + +### Dependencies + +To add a dependency, add it to the `pyproject.toml` file and then compile the requirements: + +```sh +uv pip compile pyproject.toml -o requirements.txt +uv pip compile pyproject.toml --extra dev -o requirements-dev.txt +``` + + diff --git a/app/app_downloads.py b/app/app_downloads.py new file mode 100644 index 0000000..8bf070c --- /dev/null +++ b/app/app_downloads.py @@ -0,0 +1,119 @@ +import logging + +import datetime +from typing import cast + +import uvicorn + +from fastapi import BackgroundTasks, FastAPI, HTTPException, status +from fastapi.responses import PlainTextResponse +from fastapi.routing import APIRoute +from fastapi_utilities import repeat_every +from sqlalchemy.exc import ProgrammingError + +from app import __version__, db + +logger = logging.getLogger("multiqc_app_downloads") + +logger.info("Starting MultiQC API download scraping service") + +# Add timestamp to the uvicorn logger +for h in logging.getLogger("uvicorn.access").handlers: + h.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")) + +app = FastAPI( + title="MultiQC API", + description="MultiQC API service, providing run-time information about available " "updates.", + version=__version__, + license_info={ + "name": "Source code available under the MIT Licence", + "url": "https://github.com/MultiQC/api.multiqc.info/blob/main/LICENSE", + }, +) + +db.create_db_and_tables() + + +@app.get("/") +async def index(_: BackgroundTasks): + """ + Root endpoint for the API. + Returns a list of available endpoints. + """ + routes = [cast(APIRoute, r) for r in app.routes] + return { + "message": "Welcome to the MultiQC downloads scraping service", + "available_endpoints": [ + {"path": route.path, "name": route.name} for route in routes if route.name != "swagger_ui_redirect" + ], + } + + +@app.get("/health") +async def health(): + """ + Health check endpoint. Checks if the visits table contains records + in the past 15 minutes. + """ + try: + visits = db.get_visit_stats(start=datetime.datetime.now() - datetime.timedelta(minutes=15)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + if not visits: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="No recent visits found") + return PlainTextResponse(content=str(len(visits))) + + +@repeat_every( + seconds=60 * 60 * 24, # every day + wait_first=True, + logger=logger, +) +async def update_downloads(): + """ + Repeated task to update the daily download statistics + """ + _update_download_stats() + + +@app.post("/update_downloads") +async def update_downloads_endpoint(background_tasks: BackgroundTasks): + """ + Endpoint to manually update the daily download statistics + """ + try: + background_tasks.add_task(_update_download_stats) + msg = "Queued updating the download stats in the DB" + logger.info(msg) + return PlainTextResponse(content=msg) + except Exception as e: + msg = f"Failed to update the download stats: {e}" + raise HTTPException(status_code=status.INTERNAL_SERVER_ERROR, detail=msg) + + +def _update_download_stats(): + """ + Update the daily download statistics in the database + """ + try: + existing_downloads = db.get_download_stats() + except ProgrammingError: + logger.error("The table does not exist, will create and populate with historical data") + existing_downloads = [] + if len(existing_downloads) == 0: # first time, populate historical data + logger.info("Collecting historical downloads data...") + df = daily.collect_daily_download_stats() + logger.info(f"Adding {len(df)} historical entries to the table...") + db.insert_download_stats(df) + logger.info(f"Successfully populated {len(df)} historical entries") + else: # recent days only + n_days = 4 + logger.info(f"Updating downloads data for the last {n_days} days...") + df = daily.collect_daily_download_stats(days=n_days) + logger.info(f"Adding {len(df)} recent entries to the table. Will update existing " f"entries at the same date") + db.insert_download_stats(df) + logger.info(f"Successfully updated {len(df)} new daily download statistics") + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/app/main.py b/app/app_visits.py similarity index 78% rename from app/main.py rename to app/app_visits.py index 3b9d1e4..bbb5db9 100644 --- a/app/main.py +++ b/app/app_visits.py @@ -1,6 +1,7 @@ import logging +from contextlib import asynccontextmanager -from typing import List, Dict, Optional +from typing import List, Dict, Optional, cast from pathlib import Path @@ -13,33 +14,26 @@ from enum import Enum from os import getenv +from pydantic import HttpUrl import pandas as pd import plotly.express as px from fastapi import BackgroundTasks, FastAPI, HTTPException, status from fastapi.responses import HTMLResponse, PlainTextResponse, Response +from fastapi.routing import APIRoute from fastapi_utilities import repeat_every from github import Github from plotly.graph_objs import Layout -from sqlalchemy.exc import ProgrammingError from app import __version__, db, models -from app.downloads import daily - +from app.utils import strtobool logger = logging.getLogger("multiqc_api") logger.info("Starting MultiQC API service") - -app = FastAPI( - title="MultiQC API", - description="MultiQC API service, providing run-time information about available updates.", - version=__version__, - license_info={ - "name": "Source code available under the MIT Licence", - "url": "https://github.com/MultiQC/api.multiqc.info/blob/main/LICENSE", - }, -) +# Add timestamp to the uvicorn logger +for h in logging.getLogger("uvicorn.access").handlers: + h.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")) def get_latest_release() -> models.LatestRelease: @@ -53,35 +47,45 @@ def get_latest_release() -> models.LatestRelease: return models.LatestRelease( version=release.tag_name, release_date=release.published_at.date(), - html_url=release.html_url, + html_url=HttpUrl(release.html_url), ) -app.latest_release = get_latest_release() +@asynccontextmanager +async def lifespan(_: FastAPI): + yield + # Summarize when the app receives a shutdown signal. + logger.info("Shutdown called, summarizing visits...") + _summarize_visits() + logger.info("Complete, now ready to shut down") -@app.on_event("startup") -async def startup(): - # Add timestamp to the uvicorn logger - logger = logging.getLogger("uvicorn.access") - for h in logger.handlers: - h.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")) +app = FastAPI( + title="MultiQC API", + description="MultiQC API service, providing run-time information about available " "" "" "" "" "" "" "updates.", + version=__version__, + license_info={ + "name": "Source code available under the MIT Licence", + "url": "https://github.com/MultiQC/api.multiqc.info/blob/main/LICENSE", + }, + lifespan=lifespan, +) + +# Sync latest version tag using GitHub API +latest_release = get_latest_release() - # Initialise the DB and tables on server startup - db.create_db_and_tables() - # Sync latest version tag using GitHub API - app.latest_release = get_latest_release() +db.create_db_and_tables() -@app.on_event("startup") @repeat_every(seconds=15 * 60) # every 15 minutes def update_version(): """Sync latest version tag using GitHub API""" - app.latest_release = get_latest_release() + global latest_release + latest_release = get_latest_release() # Fields to store per visit -visit_fieldnames = [ +VISIT_FIELDNAMES = [ "version_multiqc", "version_python", "operating_system", @@ -91,7 +95,8 @@ def update_version(): "is_ci", ] -# Thread-safe in-memory buffer to accumulate recent visits before writing to the CSV file +# Thread-safe in-memory buffer to accumulate recent visits before writing to the CSV +# file visit_buffer: List[Dict[str, str]] = [] visit_buffer_lock = Lock() @@ -122,22 +127,7 @@ async def version( is_conda=is_conda, is_ci=is_ci, ) - return models.VersionResponse(latest_release=app.latest_release) - - -@app.get("/health") -async def health(): - """ - Health check endpoint. Checks if the visits table contains records - in the past 15 minutes. - """ - try: - visits = db.get_visit_stats(start=datetime.datetime.now() - datetime.timedelta(minutes=15)) - except Exception as e: - raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) - if not visits: - raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="No recent visits found") - return PlainTextResponse(content=str(len(visits))) + return models.VersionResponse(latest_release=latest_release) def _log_visit( @@ -185,20 +175,31 @@ def _persist_visits(verbose=False) -> Optional[Response]: with open(CSV_FILE_PATH, mode="r") as file: n_visits_file = sum(1 for _ in file) if not visit_buffer: - return PlainTextResponse(content=f"No new visits to persist. File contains {n_visits_file} entries") + return PlainTextResponse( + content=f"No new visits to persist. File contains {n_visits_file} " + f"" + f"" + f"" + f"" + f"" + f"" + f"entries" + ) logger.debug( - f"Appending {len(visit_buffer)} visits to {CSV_FILE_PATH} that currently contains {n_visits_file} visits" + f"Appending {len(visit_buffer)} visits to {CSV_FILE_PATH} that " + f"currently contains {n_visits_file} visits" ) with open(CSV_FILE_PATH, mode="a") as file: - writer: csv.DictWriter = csv.DictWriter(file, fieldnames=["timestamp"] + visit_fieldnames) + writer: csv.DictWriter = csv.DictWriter(file, fieldnames=["timestamp"] + VISIT_FIELDNAMES) writer.writerows(visit_buffer) if verbose: with open(CSV_FILE_PATH, mode="r") as file: n_visits_file = sum(1 for _ in file) msg = ( - f"Successfully persisted {len(visit_buffer)} visits to {CSV_FILE_PATH}, " + f"Successfully persisted {len(visit_buffer)} visits to " + f"{CSV_FILE_PATH}, " f"file now contains {n_visits_file} entries" ) logger.debug(msg) @@ -211,7 +212,6 @@ def _persist_visits(verbose=False) -> Optional[Response]: return None -@app.on_event("startup") @repeat_every( seconds=10, wait_first=True, @@ -236,7 +236,7 @@ def _summarize_visits(interval="5min") -> Response: df = pd.read_csv( CSV_FILE_PATH, sep=",", - names=["timestamp"] + visit_fieldnames, + names=["timestamp"] + VISIT_FIELDNAMES, dtype="string", na_filter=False, # prevent empty strings from converting to nan or ) @@ -244,10 +244,6 @@ def _summarize_visits(interval="5min") -> Response: df["end"] = df["start"] + pd.to_timedelta(interval) df["start"] = df["start"].dt.strftime("%Y-%m-%d %H:%M") df["end"] = df["end"].dt.strftime("%Y-%m-%d %H:%M") - - def strtobool(val) -> bool: - return str(val).lower() in ("y", "yes", "t", "true", "on", "1") - df["is_docker"] = df["is_docker"].apply(strtobool) df["is_singularity"] = df["is_singularity"].apply(strtobool) df["is_conda"] = df["is_conda"].apply(strtobool) @@ -255,14 +251,16 @@ def strtobool(val) -> bool: df = df.drop(columns=["timestamp"]) # Summarize visits per user per time interval - interval_summary = df.groupby(["start", "end"] + visit_fieldnames).size().reset_index(name="count") + interval_summary = df.groupby(["start", "end"] + VISIT_FIELDNAMES).size().reset_index(name="count") if len(interval_summary) == 0: msg = "No new visits to summarize" logger.info(msg) return PlainTextResponse(content=msg) logger.info( - f"Summarizing {len(df)} visits in {CSV_FILE_PATH} and writing {len(interval_summary)} rows to the DB" + f"Summarizing {len(df)} visits in {CSV_FILE_PATH} and writ" + f"ing " + f"{len(interval_summary)} rows to the DB" ) try: db.insert_visit_stats(interval_summary) @@ -274,13 +272,13 @@ def strtobool(val) -> bool: content=msg, ) else: - msg = f"Successfully summarized {len(df)} visits to {len(interval_summary)} per-interval entries" + msg = f"Successfully summarized {len(df)} visits" f" to " + f"{len(interval_summary)} per-interval entries" logger.info(msg) open(CSV_FILE_PATH, "w").close() # Clear the CSV file on successful write return PlainTextResponse(content=msg) -@app.on_event("startup") @repeat_every( seconds=10 * 60 * 1, # every 10 minutes wait_first=True, @@ -293,53 +291,6 @@ async def summarize_visits(): return _summarize_visits() -@app.on_event("shutdown") -async def shutdown_event(): - """ - Summarize when the app receives a shutdown signal. - """ - logger.info("Shutdown called, summarizing visits...") - _summarize_visits() - logger.info("Complete, now ready to shut down") - - -def _update_download_stats(): - """ - Update the daily download statistics in the database - """ - try: - existing_downloads = db.get_download_stats() - except ProgrammingError: - logger.error("The table does not exist, will create and populate with historical data") - existing_downloads = [] - if len(existing_downloads) == 0: # first time, populate historical data - logger.info("Collecting historical downloads data...") - df = daily.collect_daily_download_stats() - logger.info(f"Adding {len(df)} historical entries to the table...") - db.insert_download_stats(df) - logger.info(f"Successfully populated {len(df)} historical entries") - else: # recent days only - n_days = 4 - logger.info(f"Updating downloads data for the last {n_days} days...") - df = daily.collect_daily_download_stats(days=n_days) - logger.info(f"Adding {len(df)} recent entries to the table. Will update existing entries at the same date") - db.insert_download_stats(df) - logger.info(f"Successfully updated {len(df)} new daily download statistics") - - -@app.on_event("startup") -@repeat_every( - seconds=60 * 60 * 24, # every day - wait_first=True, - logger=logger, -) -async def update_downloads(): - """ - Repeated task to update the daily download statistics. - """ - _update_download_stats() - - @app.post("/persist_visits") async def persist_visits_endpoint(): try: @@ -360,18 +311,6 @@ async def summarize_visits_endpoint(): raise HTTPException(status_code=status.INTERNAL_SERVER_ERROR, detail=msg) -@app.post("/update_downloads") -async def update_downloads_endpoint(background_tasks: BackgroundTasks): - try: - background_tasks.add_task(_update_download_stats) - msg = "Queued updating the download stats in the DB" - logger.info(msg) - return PlainTextResponse(content=msg) - except Exception as e: - msg = f"Failed to update the download stats: {e}" - raise HTTPException(status_code=status.INTERNAL_SERVER_ERROR, detail=msg) - - if os.getenv("ENVIRONMENT") == "DEV": @app.post("/clean_visits_csv_file") @@ -410,20 +349,21 @@ async def version_legacy(background_tasks: BackgroundTasks, v: str = ""): is_conda="", is_ci="", ) - return app.latest_release.version + return latest_release.version @app.get("/") -async def index(background_tasks: BackgroundTasks): +async def index(_: BackgroundTasks): """ Root endpoint for the API. Returns a list of available endpoints. """ + routes = [cast(APIRoute, r) for r in app.routes] return { - "message": "Welcome to the MultiQC service API", + "message": "Welcome to the MultiQC service", "available_endpoints": [ - {"path": route.path, "name": route.name} for route in app.routes if route.name != "swagger_ui_redirect" + {"path": route.path, "name": route.name} for route in routes if route.name != "swagger_ui_redirect" ], } @@ -476,7 +416,8 @@ async def plot_usage( # Plot histogram of df.count per interval from df.start logger.debug( - f"Plotting usage data, color by: {category.name if category else None}, start: {start}, " + f"Plotting usage data, color by: {category.name if category else None}, " + f"start: {start}, " f"end: {end}, interval: {interval.value}, limit: {limit}, format: {format.name}" ) fig = px.histogram( @@ -536,5 +477,20 @@ def plotly_image_response(plot, format: PlotlyImageFormats = PlotlyImageFormats. return Response(content=plot) +@app.get("/health") +async def health(): + """ + Health check endpoint. Checks if the visits table contains records + in the past 15 minutes. + """ + try: + visits = db.get_visit_stats(start=datetime.datetime.now() - datetime.timedelta(minutes=15)) + except Exception as e: + raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=str(e)) + if not visits: + raise HTTPException(status_code=status.HTTP_503_SERVICE_UNAVAILABLE, detail="No recent visits found") + return PlainTextResponse(content=str(len(visits))) + + if __name__ == "__main__": uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/app/downloads/daily.py b/app/downloads.py similarity index 98% rename from app/downloads/daily.py rename to app/downloads.py index f926255..bf71598 100644 --- a/app/downloads/daily.py +++ b/app/downloads.py @@ -10,9 +10,10 @@ Stores data in a CSV file that can be sent to a database. -Can be re-run regularly to update the data, so only new data will be added to +Can be re-run regularly to update the data, so only new data will be added to an existing CSV file. """ + import logging import json @@ -26,14 +27,14 @@ import pypistats import requests from github import Github -from dotenv import load_dotenv + +from settings import settings # Load environment variables from the .env file -load_dotenv() logger = logging.getLogger("multiqc_api") -SOURCES_DIR = Path(__file__).parent / "sources" +SOURCES_DIR = Path(__file__).parent / "downloads" / "sources" # Whether we can write back daily.csv and other pulled stats to keep under version control. # Usually the code dir is not writable in the container environment. SOURCES_IS_WRITABLE = os.access(SOURCES_DIR, os.W_OK) @@ -314,7 +315,7 @@ def get_github_prs(days: int | None = None): """ Daily and total PRs and contributors. """ - g = Github(os.environ["GITHUB_TOKEN"]) + g = Github(settings.github_token) repo = g.get_repo("MultiQC/MultiQC") entries = [] for pr in repo.get_pulls(state="all", sort="created", direction="asc"): @@ -337,7 +338,7 @@ def get_github_prs(days: int | None = None): df["author"] = df["author"].apply(lambda x: x if x != 0 else []) # Collect the number of new contributors per day - contributors = set() + contributors: set[str] = set() entries = [] for date, row in df.iterrows(): authors = set(row.author) diff --git a/app/downloads/__init__.py b/app/downloads/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/settings.py b/app/settings.py new file mode 100644 index 0000000..a1b008e --- /dev/null +++ b/app/settings.py @@ -0,0 +1,12 @@ +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + model_config = SettingsConfigDict(env_file=".env", extra="ignore") + + github_token: str + database_url: str + environment: str = "dev" + + +settings = Settings() diff --git a/app/utils.py b/app/utils.py new file mode 100644 index 0000000..f3875c9 --- /dev/null +++ b/app/utils.py @@ -0,0 +1,2 @@ +def strtobool(val) -> bool: + return str(val).lower() in ("y", "yes", "t", "true", "on", "1") diff --git a/pyproject.toml b/pyproject.toml index 8ec4e20..42bf35d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,12 +5,33 @@ build-backend = "setuptools.build_meta" [project] name = "api.multiqc.info" version = "2023.0" -dependencies = {file = ["requirements.txt"]} -optional-dependencies = {dev = { file = ["requirements-dev.txt"] }} +dependencies = [ + "fastapi", + "fastapi-utilities", + "uvicorn", + "sqlmodel", + "pydantic", + "pydantic-settings", + "pymysql[rsa]", + "psycopg2-binary", + "types-requests", + # visits + "pandas", + "plotly", + # downloads + "gitpython", + "PyGithub", + "pypistats", +] +optional-dependencies = { dev = [ + "pre-commit", + "ruff", + "mypy", +] } requires-python = ">=3.12" authors = [ - {name = "Phil Ewels", email = "phil.ewels@seqera.io"}, - {name = "Vlad Savelyev", email = "vladislav.savelyev@seqera.io"}, + { name = "Phil Ewels", email = "phil.ewels@seqera.io" }, + { name = "Vlad Savelyev", email = "vladislav.savelyev@seqera.io" }, ] [tool.ruff] diff --git a/requirements-dev.txt b/requirements-dev.txt index c0e2644..28fb68a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,181 @@ -pre-commit -pytest -flake8 -flake8-docstrings -pytest-cov +# This file was autogenerated by uv v0.1.4 via the following command: +# uv pip compile pyproject.toml --extra dev -o requirements-dev.txt +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # starlette +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via + # cryptography + # pynacl +cfgv==3.4.0 + # via pre-commit +chardet==5.2.0 + # via mbstrdecoder +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via uvicorn +click-spinner==0.1.10 + # via fastapi-utilities +croniter==1.4.1 + # via fastapi-utilities +cryptography==43.0.1 + # via + # pyjwt + # pymysql +dataproperty==1.0.1 + # via + # pytablewriter + # tabledata +deprecated==1.2.14 + # via pygithub +distlib==0.3.8 + # via virtualenv +dominate==2.9.1 + # via pytablewriter +fastapi==0.115.0 + # via fastapi-utilities +fastapi-utilities==0.2.0 +filelock==3.16.1 + # via virtualenv +h11==0.14.0 + # via + # httpcore + # uvicorn +httpcore==1.0.5 + # via httpx +httpx==0.27.2 + # via pypistats +identify==2.6.1 + # via pre-commit +idna==3.10 + # via + # anyio + # httpx + # requests +mbstrdecoder==1.1.3 + # via + # dataproperty + # pytablewriter + # typepy +mypy==1.11.2 +mypy-extensions==1.0.0 + # via mypy +nodeenv==1.9.1 + # via pre-commit +numpy==2.1.1 + # via pandas +packaging==24.1 + # via + # plotly + # typepy +pandas==2.2.2 +pathvalidate==3.2.1 + # via pytablewriter +platformdirs==4.3.6 + # via + # pypistats + # virtualenv +plotly==5.24.1 +pre-commit==3.8.0 +prettytable==3.11.0 + # via pypistats +psycopg2-binary==2.9.9 +pycparser==2.22 + # via cffi +pydantic==2.9.2 + # via + # fastapi + # fastapi-utilities + # pydantic-settings + # sqlmodel +pydantic-core==2.23.4 + # via pydantic +pydantic-settings==2.5.2 +pygithub==2.4.0 +pyjwt==2.9.0 + # via pygithub +pymysql==1.1.1 +pynacl==1.5.0 + # via pygithub +pypistats==1.6.0 +pytablewriter==1.2.0 + # via pypistats +python-dateutil==2.9.0.post0 + # via + # croniter + # pandas + # pypistats + # typepy +python-dotenv==1.0.1 + # via pydantic-settings +python-slugify==8.0.4 + # via pypistats +pytz==2024.2 + # via + # pandas + # typepy +pyyaml==6.0.2 + # via pre-commit +requests==2.32.3 + # via pygithub +ruff==0.6.5 +setuptools==75.1.0 + # via pytablewriter +six==1.16.0 + # via python-dateutil +sniffio==1.3.1 + # via + # anyio + # httpx +sqlalchemy==2.0.35 + # via + # fastapi-utilities + # sqlmodel +sqlmodel==0.0.22 +starlette==0.38.5 + # via fastapi +tabledata==1.3.3 + # via pytablewriter +tcolorpy==0.1.6 + # via pytablewriter +tenacity==9.0.0 + # via plotly +termcolor==2.4.0 + # via pypistats +text-unidecode==1.3 + # via python-slugify +typepy==1.3.2 + # via + # dataproperty + # pytablewriter + # tabledata +typing-extensions==4.12.2 + # via + # fastapi + # mypy + # pydantic + # pydantic-core + # pygithub + # sqlalchemy +tzdata==2024.1 + # via pandas +urllib3==2.2.3 + # via + # pygithub + # requests +uvicorn==0.30.6 +virtualenv==20.26.5 + # via pre-commit +wcwidth==0.2.13 + # via prettytable +wrapt==1.16.0 + # via deprecated diff --git a/requirements.txt b/requirements.txt index 08f196f..408fa62 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,166 @@ -fastapi -fastapi-utilities -kaleido -pandas -plotly -PyGithub -gitpython -pymysql[rsa] -sqlmodel -psycopg2-binary -uvicorn -python-dotenv -pypistats -logzio-python-handler +# This file was autogenerated by uv v0.1.4 via the following command: +# uv pip compile pyproject.toml -o requirements.txt +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # starlette +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests +cffi==1.17.1 + # via + # cryptography + # pynacl +chardet==5.2.0 + # via mbstrdecoder +charset-normalizer==3.3.2 + # via requests +click==8.1.7 + # via uvicorn +click-spinner==0.1.10 + # via fastapi-utilities +croniter==1.4.1 + # via fastapi-utilities +cryptography==43.0.1 + # via + # pyjwt + # pymysql +dataproperty==1.0.1 + # via + # pytablewriter + # tabledata +deprecated==1.2.14 + # via pygithub +dominate==2.9.1 + # via pytablewriter +fastapi==0.115.0 + # via fastapi-utilities +fastapi-utilities==0.2.0 +gitdb==4.0.11 + # via gitpython +gitpython==3.1.43 +h11==0.14.0 + # via + # httpcore + # uvicorn +httpcore==1.0.5 + # via httpx +httpx==0.27.2 + # via pypistats +idna==3.10 + # via + # anyio + # httpx + # requests +mbstrdecoder==1.1.3 + # via + # dataproperty + # pytablewriter + # typepy +numpy==2.1.1 + # via pandas +packaging==24.1 + # via + # plotly + # typepy +pandas==2.2.2 +pathvalidate==3.2.1 + # via pytablewriter +platformdirs==4.3.6 + # via pypistats +plotly==5.24.1 +prettytable==3.11.0 + # via pypistats +psycopg2-binary==2.9.9 +pycparser==2.22 + # via cffi +pydantic==2.9.2 + # via + # fastapi + # fastapi-utilities + # pydantic-settings + # sqlmodel +pydantic-core==2.23.4 + # via pydantic +pydantic-settings==2.5.2 +pygithub==2.4.0 +pyjwt==2.9.0 + # via pygithub +pymysql==1.1.1 +pynacl==1.5.0 + # via pygithub +pypistats==1.6.0 +pytablewriter==1.2.0 + # via pypistats +python-dateutil==2.9.0.post0 + # via + # croniter + # pandas + # pypistats + # typepy +python-dotenv==1.0.1 + # via pydantic-settings +python-slugify==8.0.4 + # via pypistats +pytz==2024.2 + # via + # pandas + # typepy +requests==2.32.3 + # via pygithub +setuptools==75.1.0 + # via pytablewriter +six==1.16.0 + # via python-dateutil +smmap==5.0.1 + # via gitdb +sniffio==1.3.1 + # via + # anyio + # httpx +sqlalchemy==2.0.35 + # via + # fastapi-utilities + # sqlmodel +sqlmodel==0.0.22 +starlette==0.38.5 + # via fastapi +tabledata==1.3.3 + # via pytablewriter +tcolorpy==0.1.6 + # via pytablewriter +tenacity==9.0.0 + # via plotly +termcolor==2.4.0 + # via pypistats +text-unidecode==1.3 + # via python-slugify +typepy==1.3.2 + # via + # dataproperty + # pytablewriter + # tabledata +types-requests==2.32.0.20240914 +typing-extensions==4.12.2 + # via + # fastapi + # pydantic + # pydantic-core + # pygithub + # sqlalchemy +tzdata==2024.1 + # via pandas +urllib3==2.2.3 + # via + # pygithub + # requests + # types-requests +uvicorn==0.30.6 +wcwidth==0.2.13 + # via prettytable +wrapt==1.16.0 + # via deprecated From bb0ef6e5d2c631710dcd38b5fe782fbd0b13fee6 Mon Sep 17 00:00:00 2001 From: vladsaveliev Date: Wed, 18 Sep 2024 23:20:59 +0200 Subject: [PATCH 2/2] Use asyncio instead of repeated tasks --- app/app_downloads.py | 34 ++++++++++++++++----------- app/app_visits.py | 56 +++++++++++++++++++++----------------------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/app/app_downloads.py b/app/app_downloads.py index 8bf070c..e90f07c 100644 --- a/app/app_downloads.py +++ b/app/app_downloads.py @@ -1,6 +1,8 @@ +import asyncio import logging import datetime +from contextlib import asynccontextmanager from typing import cast import uvicorn @@ -8,7 +10,6 @@ from fastapi import BackgroundTasks, FastAPI, HTTPException, status from fastapi.responses import PlainTextResponse from fastapi.routing import APIRoute -from fastapi_utilities import repeat_every from sqlalchemy.exc import ProgrammingError from app import __version__, db @@ -21,8 +22,25 @@ for h in logging.getLogger("uvicorn.access").handlers: h.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")) + +@asynccontextmanager +async def lifespan(_: FastAPI): + asyncio.create_task(update_downloads()) + + yield + + +async def update_downloads(): + """ + Repeated task to update the daily download statistics + """ + while True: + _update_download_stats() + await asyncio.sleep(60 * 60 * 24) # 24 hours + + app = FastAPI( - title="MultiQC API", + title="MultiQC download scraper service", description="MultiQC API service, providing run-time information about available " "updates.", version=__version__, license_info={ @@ -64,18 +82,6 @@ async def health(): return PlainTextResponse(content=str(len(visits))) -@repeat_every( - seconds=60 * 60 * 24, # every day - wait_first=True, - logger=logger, -) -async def update_downloads(): - """ - Repeated task to update the daily download statistics - """ - _update_download_stats() - - @app.post("/update_downloads") async def update_downloads_endpoint(background_tasks: BackgroundTasks): """ diff --git a/app/app_visits.py b/app/app_visits.py index bbb5db9..c01c401 100644 --- a/app/app_visits.py +++ b/app/app_visits.py @@ -1,3 +1,4 @@ +import asyncio import logging from contextlib import asynccontextmanager @@ -20,7 +21,6 @@ from fastapi import BackgroundTasks, FastAPI, HTTPException, status from fastapi.responses import HTMLResponse, PlainTextResponse, Response from fastapi.routing import APIRoute -from fastapi_utilities import repeat_every from github import Github from plotly.graph_objs import Layout @@ -53,6 +53,10 @@ def get_latest_release() -> models.LatestRelease: @asynccontextmanager async def lifespan(_: FastAPI): + asyncio.create_task(update_version()) + asyncio.create_task(persist_visits()) + asyncio.create_task(summarize_visits()) + yield # Summarize when the app receives a shutdown signal. logger.info("Shutdown called, summarizing visits...") @@ -62,7 +66,7 @@ async def lifespan(_: FastAPI): app = FastAPI( title="MultiQC API", - description="MultiQC API service, providing run-time information about available " "" "" "" "" "" "" "updates.", + description="MultiQC API service, providing run-time information about available " "" "" "" "" "" "" "" "updates.", version=__version__, license_info={ "name": "Source code available under the MIT Licence", @@ -77,11 +81,27 @@ async def lifespan(_: FastAPI): db.create_db_and_tables() -@repeat_every(seconds=15 * 60) # every 15 minutes -def update_version(): +async def update_version(): + """Sync latest version tag using GitHub API""" + while True: + await asyncio.sleep(15 * 60) # every 15 minutes + global latest_release + latest_release = get_latest_release() + + +async def persist_visits(): """Sync latest version tag using GitHub API""" - global latest_release - latest_release = get_latest_release() + while True: + await asyncio.sleep(10) # every 10 seconds + _persist_visits(verbose=True) + + +async def summarize_visits(): + """Repeated task to summarize visits.""" + while True: + await asyncio.sleep(10 * 60) # every 10 minutes + _summarize_visits() + _persist_visits(verbose=True) # Fields to store per visit @@ -212,15 +232,6 @@ def _persist_visits(verbose=False) -> Optional[Response]: return None -@repeat_every( - seconds=10, - wait_first=True, - logger=logger, -) -async def persist_visits(): - return _persist_visits(verbose=True) - - def _summarize_visits(interval="5min") -> Response: """ Summarize visits from the CSV file and write to the database @@ -272,25 +283,12 @@ def _summarize_visits(interval="5min") -> Response: content=msg, ) else: - msg = f"Successfully summarized {len(df)} visits" f" to " - f"{len(interval_summary)} per-interval entries" + msg = f"Successfully summarized {len(df)} visits to {len(interval_summary)} per-interval entries" logger.info(msg) open(CSV_FILE_PATH, "w").close() # Clear the CSV file on successful write return PlainTextResponse(content=msg) -@repeat_every( - seconds=10 * 60 * 1, # every 10 minutes - wait_first=True, - logger=logger, -) -async def summarize_visits(): - """ - Repeated task to summarize visits. - """ - return _summarize_visits() - - @app.post("/persist_visits") async def persist_visits_endpoint(): try: