CogStack
diff --git a/‎.github/workflows/main.yml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/main.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb
Lines changed: 216 additions & 65 deletions b/‎medcat/2_train_model/2_supervised_training/meta_annotation_training.ipynb
Lines changed: 216 additions & 65 deletions
@@ -9,10 +9,10 @@ on:
 jobs:
   native-py:
 
-    runs-on: ubuntu-20.04
+    runs-on: ubuntu-24.04
     strategy:
       matrix:
-        python-version: [ '3.8', '3.9', '3.10', '3.11' ]
+        python-version: [ '3.9', '3.10', '3.11', '3.12' ]
       max-parallel: 4
 
     steps:
 
@@ -13,8 +13,7 @@
     "from medcat.cat import CAT\n",
     "from medcat.meta_cat import MetaCAT\n",
     "from medcat.config_meta_cat import ConfigMetaCAT\n",
-    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBPE, TokenizerWrapperBERT\n",
-    "from tokenizers import ByteLevelBPETokenizer"
+    "from medcat.tokenizers.meta_cat_tokenizers import TokenizerWrapperBERT"
    ]
   },
   {
@@ -31,82 +30,234 @@
   },
   {
    "cell_type": "markdown",
-   "id": "5d0606ec",
+   "id": "f310cef3",
    "metadata": {},
    "source": [
-    "# Set parameters"
+    "### Load the model pack with MetaCATs\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "dd7a2e97",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# relative path to working_with_cogstack folder\n",
-    "_rel_path = os.path.join(\"..\", \"..\", \"..\")\n",
-    "# absolute path to working_with_cogstack folder\n",
-    "base_path = os.path.abspath(_rel_path)\n",
-    "# Load mct export\n",
-    "ann_dir = os.path.join(base_path, \"data\", \"medcattrainer_export\")\n",
-    "\n",
-    "mctrainer_export_path = ann_dir + \"\"  # name of your mct export\n",
-    "\n",
+    "model_pack = '<enter path to the model pack>' # .zip model pack location \n",
+    "mctrainer_export = \"<enter mct export location>\"  # name of your mct export"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "921d5e9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# Load model\n",
-    "model_dir = os.path.join(base_path, \"models\", \"modelpack\")\n",
-    "modelpack = '' # name of modelpack\n",
-    "model_pack_path = os.path.join(model_dir, modelpack)\n",
-    "     #output_modelpack = model_dir + f\"{today}_trained_model\"\n",
+    "cat = CAT.load_model_pack(model_pack)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "b205d51b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are: 3 meta cat models in this model pack.\n"
+     ]
+    }
+   ],
+   "source": [
     "\n",
-    "# will be used to date the trained model\n",
-    "today = str(date.today())\n",
-    "today = today.replace(\"-\",\"\")\n",
+    "# Check what meta cat models are in this model pack.\n",
+    "print(f'There are: {len(cat._meta_cats)} meta cat models in this model pack.')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "31d7632a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\n",
+      "  \"Category Name\": \"Temporality\",\n",
+      "  \"Description\": \"No description\",\n",
+      "  \"Classes\": {\n",
+      "    \"Past\": 0,\n",
+      "    \"Recent\": 1,\n",
+      "    \"Future\": 2\n",
+      "  },\n",
+      "  \"Model\": \"bert\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(cat._meta_cats[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e9180c4c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\n",
+      "  \"Category Name\": \"Presence\",\n",
+      "  \"Description\": \"No description\",\n",
+      "  \"Classes\": {\n",
+      "    \"Hypothetical (N/A)\": 1,\n",
+      "    \"Not present (False)\": 0,\n",
+      "    \"Present (True)\": 2\n",
+      "  },\n",
+      "  \"Model\": \"bert\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(cat._meta_cats[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "275ca9ff",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{\n",
+      "  \"Category Name\": \"Experiencer\",\n",
+      "  \"Description\": \"No description\",\n",
+      "  \"Classes\": {\n",
+      "    \"Family\": 1,\n",
+      "    \"Other\": 0,\n",
+      "    \"Patient\": 2\n",
+      "  },\n",
+      "  \"Model\": \"bert\"\n",
+      "}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(cat._meta_cats[2])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3047b1d9",
+   "metadata": {},
+   "source": [
+    "<b> NOTE: </b> \n",
+    " The name for the classification task can vary. E.g.: the Category Name for 'Experiencer' can be 'Subject', because it was configured and annotated that way in MedCATTrainer, but the model expects 'Experiencer'.\n",
+    " \n",
+    " To accommodate this, we have a list that stores the alternative names. This attribute can be found under `mc.config.general.alternative_category_names`, where `mc` is a MetaCAT model (e.g. `cat._meta_cats[0]`).\n",
     "\n",
-    "# Initialise meta_ann models\n",
-    "if model_pack_path[-4:] == '.zip':\n",
-    "    base_dir_meta_models = model_pack_path[:-4]\n",
-    "else:\n",
-    "    base_dir_meta_models = model_pack_path\n",
+    "E.g. for Experiencer, it will be pre-loaded as alternative_category_names = ['Experiencer','Subject']\n",
     "\n",
-    "# Iterate through the meta_models contained in the model\n",
-    "meta_model_names = [] # These Meta_annotation tasks should correspond to the ones labelled in the mcttrainer export\n",
-    "for dirpath, dirnames, filenames in os.walk(base_dir_meta_models):\n",
-    "    for dirname in dirnames:\n",
-    "        if dirname.startswith('meta_'):\n",
-    "            meta_model_names.append(dirname[5:])"
+    "Set this list so that, during training / fine-tuning, the model is aware of the alternative names for its category."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ca00fb0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(cat._meta_cats[0].config.general.alternative_category_names)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "35aa5605",
+   "id": "5dba296c",
    "metadata": {},
    "source": [
-    "Before you run the next section please double check that the model meta_annotation names matches to those specified in the mct export.\n",
-    "\n"
+    "💡 In case you are using older modelpacks, the above field will be empty. In that case, please run the following code:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92e41964",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Only run in case the above output is an empty list\n",
+    "category_name_mapping = [[\"Presence\"],[\"Temporality\",\"Time\"],[\"Experiencer\",\"Subject\"]]\n",
+    "lookup = {item: group for group in category_name_mapping for item in group}\n",
+    "\n",
+    "for general in (meta_model.config.general for meta_model in cat._meta_cats):\n",
+    "    general.alternative_category_names = lookup.get(general.category_name, [general.category_name])  # unmapped category: keep its own name, never None"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "8bf6f5c3",
+   "id": "12e91f77",
    "metadata": {},
    "source": [
-    "Depending on the model pack you have, please run the LSTM model or BERT model section. <br>\n",
-    "If you are unsure, use this section to check the model type."
+    "<b> NOTE: </b> \n",
+    " The name for the classes can vary too. Some sites may have trained a MetaCAT model for the same task, but called a class value a slightly different name.\n",
+    " \n",
+    " E.g: For the Presence task, the class name can be 'Not present (False)' or 'False'\n",
+    " \n",
+    " To accommodate this, we have a mapping that stores the alternative names. This attribute can be found under `mc.config.general.alternative_class_names`, where `mc` is a MetaCAT model (e.g. `cat._meta_cats[0]`).\n",
+    "\n",
+    " E.g. for Presence, it will be pre-loaded as alternative_class_names = [[\"Hypothetical (N/A)\",\"Hypothetical\"],[\"Not present (False)\",\"False\"],[\"Present (True)\",\"True\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5f6b06e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(cat._meta_cats[0].config.general.alternative_class_names)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c97c986",
+   "metadata": {},
+   "source": [
+    "💡 In case you are using older modelpacks, the above field will be empty. In that case, please run the following code:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2933f7e1",
+   "id": "0fdfae70",
    "metadata": {},
    "outputs": [],
    "source": [
-    "for meta_model in meta_model_names:\n",
-    "    config_file = os.path.join(base_dir_meta_models,\"meta_\"+meta_model,\"config.json\")\n",
-    "    with open(config_file, 'r') as jfile:\n",
-    "        config_dict = json.load(jfile)\n",
-    "    print(f\"Model used for meta_{meta_model}:\",config_dict['model']['model_name'])"
+    "# Only run in case the above output is an empty list\n",
+    "class_name_mapping =  {\n",
+    "    \"Temporality\": [[\"Past\"], [\"Recent\", \"Present\"], [\"Future\"]],\n",
+    "    \"Time\": [[\"Past\"], [\"Recent\", \"Present\"], [\"Future\"]],\n",
+    "    \"Experiencer\": [[\"Family\"], [\"Other\"], [\"Patient\"]],\n",
+    "    \"Subject\": [[\"Family\"], [\"Other\"], [\"Patient\"]],\n",
+    "    \"Presence\": [[\"Hypothetical (N/A)\", \"Hypothetical\"], [\"Not present (False)\", \"False\"], [\"Present (True)\", \"True\"]]\n",
+    "}\n",
+    "\n",
+    "for general in (meta_model.config.general for meta_model in cat._meta_cats):\n",
+    "    general.alternative_class_names = class_name_mapping.get(general.category_name, general.alternative_class_names)  # unmapped category: leave as loaded instead of raising KeyError"
    ]
   },
   {
@@ -124,30 +275,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for meta_model in meta_model_names:\n",
-    "    \n",
-    "    # load the meta_model\n",
-    "    mc = MetaCAT.load(save_dir_path=os.path.join(base_dir_meta_models,\"meta_\"+meta_model))\n",
+    "# Train the first meta cat model - 'Temporality' Task.\n",
+    "meta_cat = cat._meta_cats[0]\n",
     "\n",
-    "    # changing parameters\n",
-    "    mc.config.train['nepochs'] = 15\n",
+    "# to overwrite the existing model, resave the fine-tuned model with the same model pack dir\n",
+    "meta_cat_task = meta_cat.config.general.category_name\n",
+    "model_pack_dir = '<enter path to meta model pack>'\n",
+    "save_dir_path = os.path.join(model_pack_dir,\"meta_\"+ meta_cat_task)\n",
     "\n",
-    "    save_dir_path= \"test_meta_\"+meta_model # Where to save the meta_model and results. \n",
-    "    #Ideally this should replace the meta_models inside the modelpack\n",
+    "# to save the new model elsewhere, uncomment the below line\n",
+    "#save_dir_path= \"test_meta_\"+meta_cat_task # Where to save the meta_model and results. \n",
     "\n",
-    "    # train the meta_model\n",
-    "    results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
-    "    \n",
-    "    # Save results\n",
-    "    json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
+    "# train the meta_model\n",
+    "results = meta_cat.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n",
+    "\n",
+    "with open(os.path.join(save_dir_path,'meta_'+meta_cat_task+'_results.json'), 'w') as results_file:  # Save results; closing the file flushes the JSON to disk\n",
+    "    json.dump(results['report'], results_file)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "ab23e424",
    "metadata": {},
    "source": [
-    "## If you dont have the model packs, and are training from scratch"
+    "## If you don't have the model packs, and are training from scratch\n",
+    "<b>⚠️ This is very rare; it is recommended to always start from a model pack and then fine-tune it.</b>"
    ]
   },
   {
@@ -167,23 +319,22 @@
     "\n",
     "tokenizer = TokenizerWrapperBERT.load(\"\", config.model['model_variant'])\n",
     "\n",
-    "save_dir_path= \"test_meta\" # Where to save the meta_model and results. \n",
-    "#Ideally this should replace the meta_models inside the modelpack\n",
+    "save_dir_path= \"test_meta_\" + meta_cat_task # Where to save the meta_model and results. \n",
     "\n",
     "# Initialise and train meta_model\n",
     "mc = MetaCAT(tokenizer=tokenizer, embeddings=None, config=config)\n",
-    "results = mc.train_from_json(mctrainer_export_path, save_dir_path=save_dir_path)\n",
+    "results = mc.train_from_json(mctrainer_export, save_dir_path=save_dir_path)\n",
     "\n",
     "# Save results\n",
-    "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_'+meta_model+'_results.json'), 'w'))"
+    "json.dump(results['report'], open(os.path.join(save_dir_path,'meta_' + meta_cat_task+'_results.json'), 'w'))"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python [conda env:cattrainer]",
    "language": "python",
-   "name": "python3"
+   "name": "conda-env-cattrainer-py"
   },
   "language_info": {
    "codemirror_mode": {
@@ -195,7 +346,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.8"
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,