Scripts first post

NCBI-Hackathons · Sep 10, 2018 · d1396ad · d1396ad
1 parent baa7d21
commit d1396ad
Show file tree

Hide file tree

Showing 10 changed files with 8,398 additions and 0 deletions.
diff --git a/01_Text_wrangling.ipynb b/01_Text_wrangling.ipynb
diff --git a/02_Run_APIs.ipynb b/02_Run_APIs.ipynb
diff --git a/02_Run_APIs.py b/02_Run_APIs.py
diff --git a/03_Fuzzy_match.ipynb b/03_Fuzzy_match.ipynb
diff --git a/04_Machine_learning_classification.ipynb b/04_Machine_learning_classification.ipynb
diff --git a/05_Chart_the_trends.ipynb b/05_Chart_the_trends.ipynb
@@ -0,0 +1,293 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Part 5. Chart the trends\n",
+    "App to analyze web-site search logs (internal search)<br>\n",
+    "**This script:** Biggest Movers / Percent change charts<br>\n",
+    "Authors: [email protected], <br>\n",
+    "Last modified: 2018-09-09\n",
+    "\n",
+    "\n",
+    "## Script contents\n",
+    "\n",
+    "1. Start-up / What to put into place, where\n",
+    "2. Load and clean a subset of data\n",
+    "3. Put stats into a form that matplotlib can consume, and export the data\n",
+    "4. Biggest movers bar chart - Percent change in search frequency\n",
+    "\n",
+    "\n",
+    "## FIXMEs\n",
+    "\n",
+    "Things Dan wrote for Dan; modify as needed. There are more FIXMEs in context.\n",
+    "\n",
+    "* [ ] \n",
+    "\n",
+    "\n",
+    "## RESOURCES\n",
+    "\n",
+    "- Partly based on code from Mueller-Guido 2017, Visualize_coefficients, p 341.\n",
+    "- https://stackoverflow.com/questions/tagged/matplotlib\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 1. Start-up / What to put into place, where\n",
+    "# ============================================\n",
+    "\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import os\n",
+    "\n",
+    "from matplotlib.colors import ListedColormap\n",
+    "\n",
+    "\n",
+    "# Set working directory\n",
+    "# Defaults to the original author's location; set the WEBDS_UTIL_DIR\n",
+    "# environment variable to run the notebook from a different checkout.\n",
+    "os.chdir(os.environ.get('WEBDS_UTIL_DIR', '/Users/wendlingd/Projects/webDS/_util'))\n",
+    "\n",
+    "localDir = '05_Chart_the_trends_files/' # Different than others, see about changing\n",
+    "# Later cells write Excel files into localDir, so make sure it exists\n",
+    "os.makedirs(localDir, exist_ok=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 2. Load and clean a subset of data\n",
+    "# ===================================\n",
+    "\n",
+    "logAfterFuzzyMatch = pd.read_excel('03_Fuzzy_match_files/logAfterFuzzyMatch.xlsx')\n",
+    "\n",
+    "# Limit to off-LAN, NLM Home\n",
+    "# na=False: a row with a missing StaffYN or Referrer counts as a non-match,\n",
+    "# so the boolean masks never contain NaN.\n",
+    "df1 = logAfterFuzzyMatch.loc[logAfterFuzzyMatch['StaffYN'].str.contains('N', na=False)]\n",
+    "searchfor = ['www.nlm.nih.gov$', 'www.nlm.nih.gov/$']\n",
+    "df1 = df1[df1.Referrer.str.contains('|'.join(searchfor), na=False)]\n",
+    "\n",
+    "'''\n",
+    "# If you want to remove unparsed\n",
+    "df1 = df1[df1.SemanticGroup.str.contains(\"Unparsed\") == False]\n",
+    "df1 = df1[df1.preferredTerm.str.contains(\"PubMed strategy, citation, unclear, etc.\") == False]\n",
+    "'''\n",
+    "\n",
+    "\n",
+    "# reduce cols\n",
+    "df2 = df1[['Timestamp', 'preferredTerm', 'SemanticTypeName', 'SemanticGroup']]\n",
+    "\n",
+    "# Get nan count, remove nan rows\n",
+    "Unassigned = df2['preferredTerm'].isnull().sum()\n",
+    "# .copy() so the Month assignment below operates on an independent frame\n",
+    "df2 = df2.dropna(subset=['Timestamp', 'preferredTerm', 'SemanticTypeName', 'SemanticGroup']).copy()\n",
+    "\n",
+    "# Limit to May and June and assign month name\n",
+    "# Lower bounds are inclusive so a search logged exactly at midnight on the\n",
+    "# first of the month is not lost.\n",
+    "df2.loc[(df2['Timestamp'] >= '2018-05-01 00:00:00') & (df2['Timestamp'] < '2018-06-01 00:00:00'), 'Month'] = 'May'\n",
+    "df2.loc[(df2['Timestamp'] >= '2018-06-01 00:00:00') & (df2['Timestamp'] < '2018-07-01 00:00:00'), 'Month'] = 'June'\n",
+    "# Rows outside May/June are left with Month = NaN (not \"\"), so test for NaN to drop them\n",
+    "df2 = df2[df2['Month'].notnull()]\n",
+    "\n",
+    "\n",
+    "\n",
+    "\n",
+    "'''\n",
+    "--------------------------\n",
+    "IN CASE YOU COMPLETE CYCLE AND THEN SEE THAT LABELS SHOULD BE SHORTENED\n",
+    "\n",
+    "# Shorten names if needed\n",
+    "df2['preferredTerm'] = df2['preferredTerm'].str.replace('National Center for Biotechnology Information', 'NCBI')\n",
+    "df2['preferredTerm'] = df2['preferredTerm'].str.replace('Samples of Formatted Refs J Articles', 'Formatted Refs Authors J Articles')\n",
+    "df2['preferredTerm'] = df2['preferredTerm'].str.replace('Formatted References for Authors of Journal Articles', 'Formatted Refs J Articles')\n",
+    "\n",
+    "dobby = df2.loc[df2['preferredTerm'].str.contains('Formatted') == True]\n",
+    "dobby = df2.loc[df2['preferredTerm'].str.contains('Biotech') == True]\n",
+    "\n",
+    "writer = pd.ExcelWriter('03_Fuzzy_match_files/logAfterFuzzyMatch.xlsx')\n",
+    "df2.to_excel(writer,'logAfterFuzzyMatch')\n",
+    "# df2.to_excel(writer,'Sheet2')\n",
+    "writer.save()\n",
+    "'''\n",
+    "\n",
+    "# Save the May/June subset alongside this notebook's other outputs.\n",
+    "# Writing it back to 03_Fuzzy_match_files/logAfterFuzzyMatch.xlsx would replace\n",
+    "# the input read at the top of this cell with a reduced-column file, and the\n",
+    "# cell could then not be re-run (StaffYN / Referrer would be gone).\n",
+    "df2.to_excel(localDir + 'logAfterFuzzyMatch_MayJune.xlsx', sheet_name='logAfterFuzzyMatch')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Count number of unique preferredTerm\n",
+    "\n",
+    "def _term_counts(month_rows, count_col):\n",
+    "    '''Return one row per preferredTerm, with its number of rows in count_col.'''\n",
+    "    per_term = month_rows.groupby('preferredTerm').size()\n",
+    "    return pd.DataFrame({count_col: per_term}).reset_index()\n",
+    "\n",
+    "\n",
+    "# Split the log by month, then count searches per term within each month\n",
+    "May = df2.loc[df2['Month'].str.contains('May') == True]\n",
+    "June = df2.loc[df2['Month'].str.contains('June') == True]\n",
+    "MayCounts = _term_counts(May, 'MayCount')\n",
+    "JuneCounts = _term_counts(June, 'JuneCount')\n",
+    "\n",
+    "# Keep only terms searched at least 10 times in the month\n",
+    "MayCounts = MayCounts[MayCounts['MayCount'] >= 10]\n",
+    "JuneCounts = JuneCounts[JuneCounts['JuneCount'] >= 10]\n",
+    "\n",
+    "# Inner join: a term must pass the threshold in BOTH months to be kept\n",
+    "df3 = pd.merge(MayCounts, JuneCounts, how='inner', on='preferredTerm')\n",
+    "\n",
+    "# Each term's share (in percent) of the retained searches for that month\n",
+    "MayTotal = df3['MayCount'].sum()\n",
+    "JuneTotal = df3['JuneCount'].sum()\n",
+    "df3['MayPercent'] = df3['MayCount'] / MayTotal * 100\n",
+    "df3['JunePercent'] = df3['JuneCount'] / JuneTotal * 100\n",
+    "\n",
+    "# PercentChange is the June share minus the May share (percentage points)\n",
+    "df3['PercentChange'] = df3['JunePercent'] - df3['MayPercent']\n",
+    "\n",
+    "# Prep for next phase\n",
+    "\n",
+    "PercentChangeData = df3\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 3. Put stats into form that matplotlib can consume and export data\n",
+    "# ===================================================================\n",
+    "\n",
+    "# Order terms from biggest drop to biggest rise, with a fresh 0..n-1 index\n",
+    "PercentChangeData = (PercentChangeData\n",
+    "                     .sort_values(by='PercentChange', ascending=True)\n",
+    "                     .reset_index(drop=True))\n",
+    "\n",
+    "# The 20 rows at the top of the sorted frame (largest decreases) ...\n",
+    "negative_values = PercentChangeData.head(20)\n",
+    "\n",
+    "# ... and the 20 rows at the bottom (largest increases)\n",
+    "positive_values = (PercentChangeData.tail(20)\n",
+    "                   .sort_values(by='PercentChange', ascending=True)\n",
+    "                   .reset_index(drop=True))\n",
+    "\n",
+    "# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.\n",
+    "# Row labels are kept as-is rather than renumbered, matching what append produced.\n",
+    "interesting_values = pd.concat([negative_values, positive_values])\n",
+    "\n",
+    "\n",
+    "# Write out full file and chart file\n",
+    "# Passing the path straight to to_excel writes and closes the workbook in one\n",
+    "# step; ExcelWriter.save() is not available in pandas 2.0+.\n",
+    "PercentChangeData.to_excel(localDir + 'PercentChangeData.xlsx', sheet_name='PercentChangeData')\n",
+    "interesting_values.to_excel(localDir + 'interesting_values.xlsx', sheet_name='interesting_values')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 4. Biggest movers bar chart - Percent change in search frequency\n",
+    "# =================================================================\n",
+    "'''\n",
+    "Re-start:\n",
+    "interesting_values = pd.read_excel(localDir + 'interesting_values.xlsx')\n",
+    "'''\n",
+    "\n",
+    "\n",
+    "# Percent change chart\n",
+    "# Red bars for terms whose share fell, blue bars for terms whose share rose\n",
+    "cm = ListedColormap(['#0000aa', '#ff2020'])\n",
+    "colors = [cm(1) if c < 0 else cm(0)\n",
+    "          for c in interesting_values.PercentChange]\n",
+    "\n",
+    "fig, ax = plt.subplots()\n",
+    "# Plot as a Series indexed by term so the color list is applied bar by bar\n",
+    "interesting_values.set_index('preferredTerm')['PercentChange'].plot(\n",
+    "    kind='bar',\n",
+    "    color=colors,\n",
+    "    fontsize=10,\n",
+    "    ax=ax) # figsize=(30, 10),\n",
+    "\n",
+    "# Thin double rule between the two halves of the chart; derived from the\n",
+    "# row count, which gives 19.4 / 19.6 for the usual 20 + 20 rows\n",
+    "divider = len(interesting_values) / 2 - 0.5\n",
+    "ax.axvline(x=divider - 0.1, linewidth=.5, color='gray')\n",
+    "ax.axvline(x=divider + 0.1, linewidth=.5, color='gray')\n",
+    "\n",
+    "fig.subplots_adjust(bottom=0.4)\n",
+    "ax.set_ylabel(\"Percent change in search frequency\")\n",
+    "ax.set_xlabel(\"Standardized topic name from UMLS+\")\n",
+    "plt.xticks(rotation=60, ha=\"right\", fontsize=9)\n",
+    "fig.suptitle('Biggest movers - How June site searches were different from the past', fontsize=16, fontweight='bold')\n",
+    "ax.set_title('NLM Home page, classify-able search terms only. In June use of the terms on the left\\ndropped the most, and use of the terms on the right rose the most, compared to May.', fontsize=10)\n",
+    "plt.show()\n",
+    "\n",
+    "# How June was different than May\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Outlier check\n",
+    "# =================================================================\n",
+    "'''\n",
+    "Why did Bibliographic Entity increase by 4%?\n",
+    "'''\n",
+    "\n",
+    "# Pull the log rows whose preferredTerm begins with \"Biblio\" so they can be eyeballed\n",
+    "biblio_mask = logAfterFuzzyMatch['preferredTerm'].str.startswith(\"Biblio\") == True\n",
+    "huh = logAfterFuzzyMatch[biblio_mask]\n",
+    "# huh = huh.groupby('preferredTerm').size()\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}