-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
8,398 additions
and
0 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,293 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# Part 5. Chart the trends\n", | ||
"App to analyze web-site search logs (internal search)<br>\n", | ||
"**This script:** Biggest Movers / Percent change charts<br>\n", | ||
"Authors: [email protected], <br>\n", | ||
"Last modified: 2018-09-09\n", | ||
"\n", | ||
"\n", | ||
"## Script contents\n", | ||
"\n", | ||
"1. Start-up / What to put into place, where\n", | ||
"2. Load and clean a subset of data\n", | ||
"3. Put stats into form that matplotlib can consume and export data\n", | ||
"4. Biggest movers bar chart - Percent change in search frequency\n", | ||
"\n", | ||
"\n", | ||
"## FIXMEs\n", | ||
"\n", | ||
"Things Dan wrote for Dan; modify as needed. There are more FIXMEs in context.\n", | ||
"\n", | ||
"* [ ] \n", | ||
"\n", | ||
"\n", | ||
"## RESOURCES\n", | ||
"\n", | ||
"- Partly based on code from Mueller-Guido 2017, Visualize_coefficients, p 341.\n", | ||
"- https://stackoverflow.com/questions/tagged/matplotlib\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# 1. Start-up / What to put into place, where\n", | ||
"# ============================================\n", | ||
"\n", | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"import matplotlib.pyplot as plt\n", | ||
"import os\n", | ||
"\n", | ||
"from matplotlib.colors import ListedColormap\n", | ||
"\n", | ||
"\n", | ||
"# Set working directory\n", | ||
"os.chdir('/Users/wendlingd/Projects/webDS/_util')\n", | ||
"\n", | ||
"localDir = '05_Chart_the_trends_files/' # Different than others, see about changing\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# 2. Load and clean a subset of data\n", | ||
"# ===================================\n", | ||
"\n", | ||
"logAfterFuzzyMatch = pd.read_excel('03_Fuzzy_match_files/logAfterFuzzyMatch.xlsx')\n", | ||
"\n", | ||
"# Limit to off-LAN, NLM Home\n", | ||
"df1 = logAfterFuzzyMatch.loc[logAfterFuzzyMatch['StaffYN'].str.contains('N') == True]\n", | ||
"searchfor = ['www.nlm.nih.gov$', 'www.nlm.nih.gov/$']\n", | ||
"df1 = df1[df1.Referrer.str.contains('|'.join(searchfor))]\n", | ||
"\n", | ||
"'''\n", | ||
"# If you want to remove unparsed\n", | ||
"df1 = df1[df1.SemanticGroup.str.contains(\"Unparsed\") == False]\n", | ||
"df1 = df1[df1.preferredTerm.str.contains(\"PubMed strategy, citation, unclear, etc.\") == False]\n", | ||
"'''\n", | ||
"\n", | ||
"\n", | ||
"# reduce cols\n", | ||
"df2 = df1[['Timestamp', 'preferredTerm', 'SemanticTypeName', 'SemanticGroup']]\n", | ||
"\n", | ||
"# Get nan count, remove nan rows\n", | ||
"Unassigned = df2['preferredTerm'].isnull().sum()\n", | ||
"df2 = df2[~pd.isnull(df2['Timestamp'])]\n", | ||
"df2 = df2[~pd.isnull(df2['preferredTerm'])]\n", | ||
"df2 = df2[~pd.isnull(df2['SemanticTypeName'])]\n", | ||
"df2 = df2[~pd.isnull(df2['SemanticGroup'])]\n", | ||
"\n", | ||
"# Limit to May and June and assign month name\n", | ||
"df2.loc[(df2['Timestamp'] > '2018-05-01 00:00:00') & (df2['Timestamp'] < '2018-06-01 00:00:00'), 'Month'] = 'May'\n", | ||
"df2.loc[(df2['Timestamp'] > '2018-06-01 00:00:00') & (df2['Timestamp'] < '2018-07-01 00:00:00'), 'Month'] = 'June'\n", | ||
"df2 = df2.loc[(df2['Month'] != \"\")]\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"'''\n", | ||
"--------------------------\n", | ||
"IN CASE YOU COMPLETE CYCLE AND THEN SEE THAT LABELS SHOULD BE SHORTENED\n", | ||
"\n", | ||
"# Shorten names if needed\n", | ||
"df2['preferredTerm'] = df2['preferredTerm'].str.replace('National Center for Biotechnology Information', 'NCBI')\n", | ||
"df2['preferredTerm'] = df2['preferredTerm'].str.replace('Samples of Formatted Refs J Articles', 'Formatted Refs Authors J Articles')\n", | ||
"df2['preferredTerm'] = df2['preferredTerm'].str.replace('Formatted References for Authors of Journal Articles', 'Formatted Refs J Articles')\n", | ||
"\n", | ||
"dobby = df2.loc[df2['preferredTerm'].str.contains('Formatted') == True]\n", | ||
"dobby = df2.loc[df2['preferredTerm'].str.contains('Biotech') == True]\n", | ||
"\n", | ||
"writer = pd.ExcelWriter('03_Fuzzy_match_files/logAfterFuzzyMatch.xlsx')\n", | ||
"df2.to_excel(writer,'logAfterFuzzyMatch')\n", | ||
"# df2.to_excel(writer,'Sheet2')\n", | ||
"writer.save()\n", | ||
"'''\n", | ||
"\n", | ||
"writer = pd.ExcelWriter('03_Fuzzy_match_files/logAfterFuzzyMatch.xlsx')\n", | ||
"df2.to_excel(writer,'logAfterFuzzyMatch')\n", | ||
"# df2.to_excel(writer,'Sheet2')\n", | ||
"writer.save()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"# Count number of unique preferredTerm\n", | ||
"\n", | ||
"# May counts\n", | ||
"May = df2.loc[df2['Month'].str.contains('May') == True]\n", | ||
"MayCounts = May.groupby('preferredTerm').size()\n", | ||
"MayCounts = pd.DataFrame({'MayCount':MayCounts})\n", | ||
"# MayCounts = MayCounts.sort_values(by='timesSearched', ascending=False)\n", | ||
"MayCounts = MayCounts.reset_index()\n", | ||
"\n", | ||
"# June counts\n", | ||
"June = df2.loc[df2['Month'].str.contains('June') == True]\n", | ||
"JuneCounts = June.groupby('preferredTerm').size()\n", | ||
"JuneCounts = pd.DataFrame({'JuneCount':JuneCounts})\n", | ||
"# JuneCounts = JuneCounts.sort_values(by='timesSearched', ascending=False)\n", | ||
"JuneCounts = JuneCounts.reset_index()\n", | ||
"\n", | ||
"\n", | ||
"# Remove rows with a count less than 10; next code would make some exponential.\n", | ||
"MayCounts = MayCounts[MayCounts['MayCount'] >= 10]\n", | ||
"JuneCounts = JuneCounts[JuneCounts['JuneCount'] >= 10]\n", | ||
"\n", | ||
"# Join, removing terms not searched in BOTH months \n", | ||
"df3 = pd.merge(MayCounts, JuneCounts, how='inner', on='preferredTerm')\n", | ||
"\n", | ||
"# Assign the percentage of that month's search share\n", | ||
"# MayPercent\n", | ||
"df3['MayPercent'] = \"\"\n", | ||
"MayTotal = df3.MayCount.sum()\n", | ||
"df3['MayPercent'] = df3.MayCount / MayTotal * 100\n", | ||
"\n", | ||
"# JunePercent\n", | ||
"df3['JunePercent'] = \"\"\n", | ||
"JuneTotal = df3.JuneCount.sum()\n", | ||
"df3['JunePercent'] = df3.JuneCount / JuneTotal * 100\n", | ||
"\n", | ||
"# Assign Percent Change\n", | ||
"df3['PercentChange'] = \"\"\n", | ||
"df3['PercentChange'] = df3.JunePercent - df3.MayPercent\n", | ||
"\n", | ||
"# Prep for next phase\n", | ||
"\n", | ||
"PercentChangeData = df3\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# 3. Put stats into form that matplotlib can consume and export data\n", | ||
"# ===================================================================\n", | ||
"\n", | ||
"PercentChangeData = PercentChangeData.sort_values(by='PercentChange', ascending=True)\n", | ||
"PercentChangeData = PercentChangeData.reset_index()\n", | ||
"PercentChangeData.drop(['index'], axis=1, inplace=True) \n", | ||
" \n", | ||
"negative_values = PercentChangeData.head(20)\n", | ||
"\n", | ||
"positive_values = PercentChangeData.tail(20)\n", | ||
"positive_values = positive_values.sort_values(by='PercentChange', ascending=True)\n", | ||
"positive_values = positive_values.reset_index()\n", | ||
"positive_values.drop(['index'], axis=1, inplace=True) \n", | ||
"\n", | ||
"interesting_values = negative_values.append([positive_values])\n", | ||
"\n", | ||
"\n", | ||
"# Write out full file and chart file\n", | ||
"\n", | ||
"writer = pd.ExcelWriter(localDir + 'PercentChangeData.xlsx')\n", | ||
"PercentChangeData.to_excel(writer,'PercentChangeData')\n", | ||
"# df2.to_excel(writer,'Sheet2')\n", | ||
"writer.save()\n", | ||
"\n", | ||
"writer = pd.ExcelWriter(localDir + 'interesting_values.xlsx')\n", | ||
"interesting_values.to_excel(writer,'interesting_values')\n", | ||
"# df2.to_excel(writer,'Sheet2')\n", | ||
"writer.save()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# 4. Biggest movers bar chart - Percent change in search frequency\n", | ||
"# =================================================================\n", | ||
"'''\n", | ||
"Re-start:\n", | ||
"interesting_values = pd.read_excel(localDir + 'interesting_values.xlsx')\n", | ||
"'''\n", | ||
"\n", | ||
"\n", | ||
"# Percent change chart\n", | ||
"cm = ListedColormap(['#0000aa', '#ff2020'])\n", | ||
"colors = [cm(1) if c < 0 else cm(0)\n", | ||
" for c in interesting_values.PercentChange]\n", | ||
"ax = interesting_values.plot(x='preferredTerm', y='PercentChange',\n", | ||
" kind='bar', \n", | ||
" color=colors,\n", | ||
" fontsize=10) # figsize=(30, 10), \n", | ||
"ax.set_xlabel(\"preferredTerm\")\n", | ||
"ax.set_ylabel(\"Percent change for June\")\n", | ||
"ax.legend_.remove()\n", | ||
"plt.axvline(x=19.4, linewidth=.5, color='gray')\n", | ||
"plt.axvline(x=19.6, linewidth=.5, color='gray')\n", | ||
"plt.subplots_adjust(bottom=0.4)\n", | ||
"plt.ylabel(\"Percent change in search frequency\")\n", | ||
"plt.xlabel(\"Standardized topic name from UMLS+\")\n", | ||
"plt.xticks(rotation=60, ha=\"right\", fontsize=9)\n", | ||
"plt.suptitle('Biggest movers - How June site searches were different from the past', fontsize=16, fontweight='bold')\n", | ||
"plt.title('NLM Home page, classify-able search terms only. In June use of the terms on the left\\ndropped the most, and use of the terms on the right rose the most, compared to May.', fontsize=10)\n", | ||
"plt.show()\n", | ||
"\n", | ||
"# How June was different than May\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Outlier check\n", | ||
"# =================================================================\n", | ||
"'''\n", | ||
"Why did Bibliographic Entity increase by 4%?\n", | ||
"'''\n", | ||
"\n", | ||
"huh = logAfterFuzzyMatch[logAfterFuzzyMatch.preferredTerm.str.startswith(\"Biblio\") == True] # retrieve records to eyeball\n", | ||
"# huh = huh.groupby('preferredTerm').size()\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.5" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.