From 6bea5151bca9a8f332c60459a7a52323f7cf95b6 Mon Sep 17 00:00:00 2001 From: rashika Date: Fri, 22 Mar 2024 18:08:56 -0400 Subject: [PATCH] Fixed the plot.ipynb so that best prediction file per group could be accurately selected and used --- src/plot.ipynb | 56 ++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/src/plot.ipynb b/src/plot.ipynb index ccde901..08b4c1d 100644 --- a/src/plot.ipynb +++ b/src/plot.ipynb @@ -44,6 +44,8 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Map column names to full names (for axis labels)\n", @@ -51,11 +53,7 @@ "\n", "# Map ontology namespaces to full names (for plot titles)\n", "ontology_dict = {'biological_process': 'BPO', 'molecular_function': 'MFO', 'cellular_component': 'CCO'}" - ], - "metadata": { - "collapsed": false - }, - "execution_count": null + ] }, { "cell_type": "code", @@ -88,7 +86,7 @@ " else:\n", " df['is_baseline'].fillna(False, inplace=True)\n", " # print(methods)\n", - "df = df.drop(columns='filename').set_index(['group', 'label', 'ns', 'tau'])\n", + "df = df.set_index(['group', 'label', 'ns', 'filename','tau'])\n", "df" ] }, @@ -105,6 +103,8 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Assign colors based on group\n", @@ -113,9 +113,7 @@ "df['colors'] = pd.factorize(df['colors'])[0]\n", "df['colors'] = df['colors'].apply(lambda x: cmap.colors[x % len(cmap.colors)])\n", "df" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "code", @@ -152,6 +150,8 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Add first last points to precision and recall curves to improve APS calculation\n", @@ -165,47 +165,41 @@ "if metric.startswith('f') and add_extreme_points:\n", " df_methods = df_methods.reset_index().groupby(['group', 'label', 'ns'], as_index=False).apply(add_points).set_index(['group', 'label', 'ns'])\n", "df_methods" - ], - "metadata": { - "collapsed": false - }, - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Filter the dataframe for the best method and threshold\n", "df_best = df.loc[index_best, ['cov', 'colors'] + cols + [metric]]\n", "df_best" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Calculate average precision score \n", "if metric.startswith('f'):\n", " df_best['aps'] = df_methods.groupby(level=['group', 'label', 'ns'])[[cols[0], cols[1]]].apply(lambda x: (x[cols[0]].diff(-1).shift(1) * x[cols[1]]).sum())\n", "df_best" - ], - "metadata": { - "collapsed": false - }, - "execution_count": null + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ "# Calculate the max coverage across all thresholds\n", "df_best['max_cov'] = df_methods.groupby(level=['group', 'label', 'ns'])['cov'].max()\n", "df_best" - ], - "metadata": {}, - "execution_count": null + ] }, { "cell_type": "code", @@ -247,7 +241,7 @@ "\n", " # Iterate methods\n", " for i, (index, row) in enumerate(df_g.sort_values(by=[metric, 'max_cov'], ascending=[False if metric.startswith('f') else True, False]).iterrows()):\n", - " data = df_methods.loc[index[:-1]]\n", + " data = df_methods.loc[index[:-2]]\n", " \n", " # Precision-recall or mi-ru curves\n", " ax.plot(data[cols[0]], data[cols[1]], color=row['colors'], label=row['label'], lw=2, zorder=500-i)\n", @@ -282,12 +276,10 @@ }, { "cell_type": "code", + "execution_count": null, + "metadata": {}, "outputs": [], - "source": [], - "metadata": { - "collapsed": false - }, - "execution_count": null + "source": [] } ], "metadata": { @@ -306,7 +298,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.10.9" } }, "nbformat": 4,