Add models, config, .flake8

beeldengeluid · Mar 19, 2024 · 1d5ff27 · 1d5ff27
1 parent a92569d
commit 1d5ff27
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 1 deletion.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,58 @@
+# use .flake8 until we can move this config to pyproject.toml (not possible yet as of 27/02/2024, according to the issue below)
+# https://github.com/PyCQA/flake8/issues/234
+
+[flake8]
+select =
+    # B: bugbear warnings
+    B,
+
+    # B950: bugbear max-linelength warning
+    # as suggested in the black docs
+    # https://github.com/psf/black/blob/d038a24ca200da9dacc1dcb05090c9e5b45b7869/docs/the_black_code_style/current_style.md#line-length
+    B950,
+
+    # C: currently only C901, mccabe code complexity
+    C,
+
+    # E: pycodestyle errors
+    E,
+
+    # F: flake8 codes for pyflakes
+    F,
+
+    # W: pycodestyle warnings
+    W,
+
+extend-ignore =
+    # E203: pycodestyle's "whitespace before ',', ';' or ':'" error
+    # ignored as suggested in the black docs
+    # https://github.com/psf/black/blob/d038a24ca200da9dacc1dcb05090c9e5b45b7869/docs/the_black_code_style/current_style.md#slices
+    E203,
+
+    # E501: pycodestyle's "line too long (82 > 79 characters)" error
+    # ignored in favor of B950 as suggested in the black docs
+    # https://github.com/psf/black/blob/d038a24ca200da9dacc1dcb05090c9e5b45b7869/docs/the_black_code_style/current_style.md#line-length
+    E501,
+
+    # W503: pycodestyle's "line break before binary operator" warning
+    W503,
+
+# set max-line-length to be black compatible, as suggested in the black docs
+# https://github.com/psf/black/blob/d038a24ca200da9dacc1dcb05090c9e5b45b7869/docs/the_black_code_style/current_style.md#line-length
+max-line-length = 88
+
+# set max cyclomatic complexity for mccabe plugin
+max-complexity = 10
+
+# show the total number of errors, set exit code to 1 if the total is not zero
+count = True
+
+# show the source generating each error or warning
+show-source = True
+
+# print the number of occurrences of each error/warning code
+statistics = True
+
+exclude = 
+    .venv
+    misc
diff --git a/.github/workflows/_deploy.yml b/.github/workflows/_deploy.yml
@@ -1,4 +1,4 @@
-name: Deploy dane-visual-feature-extraction-worker to ghcr
+name: Deploy dane-whisper-asr-worker to ghcr
 
 on:
   workflow_call:

diff --git a/config/config.yml b/config/config.yml
@@ -0,0 +1,34 @@
+RABBITMQ:
+    HOST: dane-rabbitmq-api.default.svc.cluster.local
+    PORT: 5672
+    EXCHANGE: DANE-exchange
+    RESPONSE_QUEUE: DANE-response-queue
+    USER: guest # change this for production mode
+    PASSWORD: guest # change this for production mode
+ELASTICSEARCH:
+    HOST:
+        - elasticsearch
+    PORT: 9200
+    USER: '' # change this for production mode
+    PASSWORD: '' # change this for production mode
+    SCHEME: http
+    INDEX: dane-index-k8s
+FILE_SYSTEM:
+    BASE_MOUNT: data # data when running locally
+    INPUT_DIR: input-files
+    OUTPUT_DIR: output-files
+INPUT:
+    TEST_INPUT_PATH: testsource__testcarrier/inputfile.wav
+    S3_ENDPOINT_URL: https://s3-host
+    MODEL: s3://bucket/model
+    DELETE_ON_COMPLETION: False
+OUTPUT:
+    DELETE_ON_COMPLETION: True
+    TRANSFER_ON_COMPLETION: True
+    S3_ENDPOINT_URL: https://s3-host
+    S3_BUCKET: bucket-name  # bucket reserved for 1 type of output
+    S3_FOLDER_IN_BUCKET: folder  # folder within the bucket
+WHISPER_ASR_SETTINGS:
+    WORD_TIMESTAMPS: True
+DANE_DEPENDENCIES:
+    - input-generating-worker
diff --git a/models.py b/models.py
@@ -0,0 +1,35 @@
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional, TypedDict
+from dane.provenance import Provenance
+
+
+# returned by callback()
+class CallbackResponse(TypedDict):
+    state: int
+    message: str
+
+
+# These are the types of output this worker (possibly) provides (depending on configuration)
+class OutputType(Enum):
+    # name of the output type; it only needs to be a meaningful name, there are no
+    # other restrictions (as far as I understand)
+    TRANSCRIPT = "transcript"
+    PROVENANCE = "provenance"  # produced by provenance.py
+
+
+@dataclass
+class WhisperASRInput:
+    state: int  # HTTP status code
+    message: str  # error/success message
+    source_id: str = ""  # <program ID>__<carrier ID>
+    input_file_path: str = ""  # where the audio was downloaded from
+    provenance: Optional[Provenance] = None  # mostly: how long did it take to download
+
+
+@dataclass
+class WhisperASROutput:
+    state: int  # HTTP status code
+    message: str  # error/success message
+    output_file_path: str = ""  # where to store the text file
+    provenance: Optional[Provenance] = None  # audio extraction provenance