Skip to content

Commit

Permalink
Scripts first post
Browse files Browse the repository at this point in the history
  • Loading branch information
wendlingd authored Sep 10, 2018
1 parent baa7d21 commit d1396ad
Show file tree
Hide file tree
Showing 10 changed files with 8,398 additions and 0 deletions.
3,274 changes: 3,274 additions & 0 deletions 01_Text_wrangling.ipynb

Large diffs are not rendered by default.

1,077 changes: 1,077 additions & 0 deletions 02_Run_APIs.ipynb

Large diffs are not rendered by default.

983 changes: 983 additions & 0 deletions 02_Run_APIs.py

Large diffs are not rendered by default.

530 changes: 530 additions & 0 deletions 03_Fuzzy_match.ipynb

Large diffs are not rendered by default.

906 changes: 906 additions & 0 deletions 04_Machine_learning_classification.ipynb

Large diffs are not rendered by default.

293 changes: 293 additions & 0 deletions 05_Chart_the_trends.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Part 5. Chart the trends\n",
"App to analyze web-site search logs (internal search)<br>\n",
"**This script:** Biggest Movers / Percent change charts<br>\n",
"Authors: [email protected], <br>\n",
"Last modified: 2018-09-09\n",
"\n",
"\n",
"## Script contents\n",
"\n",
"1. Start-up / What to put into place, where\n",
"2. Load and clean a subset of data\n",
"3. Put stats into a form that matplotlib can consume, and export the data\n",
"4. Biggest movers bar chart - Percent change in search frequency\n",
"\n",
"\n",
"## FIXMEs\n",
"\n",
"Things Dan wrote for Dan; modify as needed. There are more FIXMEs in context.\n",
"\n",
"* [ ] \n",
"\n",
"\n",
"## RESOURCES\n",
"\n",
"- Partly based on code from Mueller-Guido 2017, Visualize_coefficients, p 341.\n",
"- https://stackoverflow.com/questions/tagged/matplotlib\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 1. Start-up / What to put into place, where\n",
"# ============================================\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import os\n",
"\n",
"from matplotlib.colors import ListedColormap\n",
"\n",
"\n",
"# Set working directory. The location is machine-specific, so it can be\n",
"# overridden with the WEBDS_UTIL_DIR environment variable; the original\n",
"# hard-coded path remains the default so existing set-ups keep working.\n",
"os.chdir(os.environ.get('WEBDS_UTIL_DIR', '/Users/wendlingd/Projects/webDS/_util'))\n",
"\n",
"# Folder (relative to the working directory) that receives this script's exports.\n",
"localDir = '05_Chart_the_trends_files/' # Different than others, see about changing\n",
"\n",
"# Create the export folder up front so the Excel writes later in the notebook\n",
"# cannot fail merely because the folder is missing.\n",
"os.makedirs(localDir, exist_ok=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 2. Load and clean a subset of data\n",
"# ===================================\n",
"\n",
"logAfterFuzzyMatch = pd.read_excel('03_Fuzzy_match_files/logAfterFuzzyMatch.xlsx')\n",
"\n",
"# Limit to off-LAN, NLM Home.\n",
"# na=False counts rows with a missing StaffYN or Referrer as non-matches; a mask\n",
"# that contains NaN cannot be used for boolean indexing.\n",
"df1 = logAfterFuzzyMatch.loc[logAfterFuzzyMatch['StaffYN'].str.contains('N', na=False)]\n",
"searchfor = ['www.nlm.nih.gov$', 'www.nlm.nih.gov/$']\n",
"df1 = df1[df1.Referrer.str.contains('|'.join(searchfor), na=False)]\n",
"\n",
"# If you want to remove unparsed, uncomment:\n",
"# df1 = df1[df1.SemanticGroup.str.contains('Unparsed') == False]\n",
"# df1 = df1[df1.preferredTerm.str.contains('PubMed strategy, citation, unclear, etc.') == False]\n",
"\n",
"# Reduce cols and remove rows with a nan in any of them. Unassigned keeps the\n",
"# number of rows that had no preferredTerm, counted before they are dropped.\n",
"keepCols = ['Timestamp', 'preferredTerm', 'SemanticTypeName', 'SemanticGroup']\n",
"df2 = df1[keepCols]\n",
"Unassigned = df2['preferredTerm'].isnull().sum()\n",
"# .copy() gives an independent frame, so adding the Month column below does not\n",
"# trigger SettingWithCopyWarning.\n",
"df2 = df2.dropna(subset=keepCols).copy()\n",
"\n",
"# Limit to May and June and assign month name.\n",
"# Lower bounds are inclusive so that a search logged exactly at midnight on the\n",
"# first day of a month is kept instead of falling between the two ranges.\n",
"df2['Month'] = None\n",
"df2.loc[(df2['Timestamp'] >= '2018-05-01 00:00:00') & (df2['Timestamp'] < '2018-06-01 00:00:00'), 'Month'] = 'May'\n",
"df2.loc[(df2['Timestamp'] >= '2018-06-01 00:00:00') & (df2['Timestamp'] < '2018-07-01 00:00:00'), 'Month'] = 'June'\n",
"# Rows outside both ranges have a null Month (not an empty string), so they are\n",
"# removed with notnull(); comparing against '' would keep every row.\n",
"df2 = df2.loc[df2['Month'].notnull()]\n",
"\n",
"# --------------------------\n",
"# IN CASE YOU COMPLETE CYCLE AND THEN SEE THAT LABELS SHOULD BE SHORTENED\n",
"# Uncomment as needed, then re-run from here:\n",
"# df2['preferredTerm'] = df2['preferredTerm'].str.replace('National Center for Biotechnology Information', 'NCBI')\n",
"# df2['preferredTerm'] = df2['preferredTerm'].str.replace('Samples of Formatted Refs J Articles', 'Formatted Refs Authors J Articles')\n",
"# df2['preferredTerm'] = df2['preferredTerm'].str.replace('Formatted References for Authors of Journal Articles', 'Formatted Refs J Articles')\n",
"# dobby = df2.loc[df2['preferredTerm'].str.contains('Formatted') == True]\n",
"# dobby = df2.loc[df2['preferredTerm'].str.contains('Biotech') == True]\n",
"\n",
"# Save the cleaned May/June subset.\n",
"# It goes into this script's own folder: writing it over\n",
"# 03_Fuzzy_match_files/logAfterFuzzyMatch.xlsx (the file loaded at the top of\n",
"# this cell) would replace the full log with a subset that no longer has the\n",
"# StaffYN and Referrer columns, and this cell would then fail on its next run.\n",
"# Passing the path straight to to_excel avoids ExcelWriter.save(), which newer\n",
"# pandas releases no longer provide.\n",
"df2.to_excel(localDir + 'logSubsetMayJune.xlsx', sheet_name='logAfterFuzzyMatch')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Tally how often each preferredTerm was searched in each month\n",
"\n",
"# May: one row per term with its number of May searches; terms searched fewer\n",
"# than 10 times are dropped (the share arithmetic below would turn some of them\n",
"# into exponential notation).\n",
"May = df2.loc[df2['Month'].str.contains('May') == True]\n",
"MayCounts = May.groupby('preferredTerm').size().reset_index(name='MayCount')\n",
"MayCounts = MayCounts.loc[MayCounts['MayCount'].ge(10)]\n",
"\n",
"# June: same treatment\n",
"June = df2.loc[df2['Month'].str.contains('June') == True]\n",
"JuneCounts = June.groupby('preferredTerm').size().reset_index(name='JuneCount')\n",
"JuneCounts = JuneCounts.loc[JuneCounts['JuneCount'].ge(10)]\n",
"\n",
"# Inner join: only terms that survived the cut-off in BOTH months remain\n",
"df3 = MayCounts.merge(JuneCounts, on='preferredTerm', how='inner')\n",
"\n",
"# Each term's share (in percent) of that month's remaining searches\n",
"MayTotal = df3['MayCount'].sum()\n",
"df3['MayPercent'] = df3['MayCount'].div(MayTotal).mul(100)\n",
"\n",
"JuneTotal = df3['JuneCount'].sum()\n",
"df3['JunePercent'] = df3['JuneCount'].div(JuneTotal).mul(100)\n",
"\n",
"# PercentChange is June's share minus May's share, i.e. a difference between\n",
"# two percentages\n",
"df3['PercentChange'] = df3['JunePercent'].sub(df3['MayPercent'])\n",
"\n",
"# Hand over to the next phase under the name it expects\n",
"PercentChangeData = df3\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 3. Put stats into form that matplotlib can consume and export data\n",
"# ===================================================================\n",
"\n",
"# Number of terms to show on each side of the chart\n",
"TOP_N = 20\n",
"\n",
"# Smallest change first, biggest last, with a fresh 0..n-1 index\n",
"PercentChangeData = PercentChangeData.sort_values(by='PercentChange', ascending=True).reset_index(drop=True)\n",
"\n",
"# With fewer than 2 * TOP_N terms, head() and tail() would overlap and the same\n",
"# term would be charted on both sides, so each side is shrunk to fit.\n",
"n_each = min(TOP_N, len(PercentChangeData) // 2)\n",
"\n",
"# Lowest values (left side of the chart) and highest values (right side);\n",
"# tail() of the sorted frame is already in ascending order.\n",
"negative_values = PercentChangeData.head(n_each)\n",
"positive_values = PercentChangeData.tail(n_each).reset_index(drop=True)\n",
"\n",
"# pd.concat is used because DataFrame.append is not available in newer pandas.\n",
"interesting_values = pd.concat([negative_values, positive_values])\n",
"\n",
"\n",
"# Write out full file and chart file.\n",
"# Passing the path straight to to_excel avoids ExcelWriter.save(), which newer\n",
"# pandas releases no longer provide.\n",
"PercentChangeData.to_excel(localDir + 'PercentChangeData.xlsx', sheet_name='PercentChangeData')\n",
"interesting_values.to_excel(localDir + 'interesting_values.xlsx', sheet_name='interesting_values')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 4. Biggest movers bar chart - Percent change in search frequency\n",
"# =================================================================\n",
"# Re-start (skips the earlier cells):\n",
"# interesting_values = pd.read_excel(localDir + 'interesting_values.xlsx')\n",
"\n",
"\n",
"# Percent change chart\n",
"# One colour per bar: red where the value is below zero, blue otherwise.\n",
"colors = ['#ff2020' if c < 0 else '#0000aa'\n",
"          for c in interesting_values.PercentChange]\n",
"\n",
"# The bars are drawn with matplotlib's Axes.bar, which accepts one colour per\n",
"# bar. NOTE(review): recent pandas appears to apply a colour list passed to\n",
"# DataFrame.plot(kind='bar') per column rather than per bar - confirm.\n",
"fig, ax = plt.subplots()\n",
"positions = list(range(len(interesting_values)))\n",
"ax.bar(positions, interesting_values['PercentChange'].values, width=0.5, color=colors)\n",
"ax.set_xticks(positions)\n",
"ax.set_xticklabels(interesting_values['preferredTerm'], rotation=60, ha='right', fontsize=9)\n",
"ax.tick_params(axis='y', labelsize=10)\n",
"\n",
"# Thin double rule between the two halves of the chart. interesting_values is\n",
"# built from an equal number of head() and tail() rows, so the boundary sits\n",
"# midway between the two middle bars (19.5 for the usual 20 + 20 bars).\n",
"boundary = len(interesting_values) / 2 - 0.5\n",
"ax.axvline(x=boundary - 0.1, linewidth=.5, color='gray')\n",
"ax.axvline(x=boundary + 0.1, linewidth=.5, color='gray')\n",
"\n",
"# Leave room at the bottom for the rotated term labels\n",
"fig.subplots_adjust(bottom=0.4)\n",
"ax.set_ylabel('Percent change in search frequency')\n",
"ax.set_xlabel('Standardized topic name from UMLS+')\n",
"fig.suptitle('Biggest movers - How June site searches were different from the past', fontsize=16, fontweight='bold')\n",
"ax.set_title('NLM Home page, classify-able search terms only. In June use of the terms on the left\\ndropped the most, and use of the terms on the right rose the most, compared to May.', fontsize=10)\n",
"plt.show()\n",
"\n",
"# How June was different than May\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Outlier check\n",
"# =================================================================\n",
"# Question to answer: why did Bibliographic Entity increase by 4%?\n",
"\n",
"# Retrieve every record whose preferredTerm begins with 'Biblio' so the rows\n",
"# can be eyeballed; rows without a preferredTerm compare unequal to True and\n",
"# are therefore left out.\n",
"starts_with_biblio = logAfterFuzzyMatch['preferredTerm'].str.startswith('Biblio') == True\n",
"huh = logAfterFuzzyMatch[starts_with_biblio]\n",
"# huh = huh.groupby('preferredTerm').size()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit d1396ad

Please sign in to comment.