Skip to content

Commit

Permalink
Merge pull request #41 from PioneersHub/speaker_company_statistics
Browse files Browse the repository at this point in the history
add speaker statistics
  • Loading branch information
FinkeNils authored Jan 9, 2025
2 parents ef8be6b + 4e15b24 commit 0a9cd6c
Showing 1 changed file with 145 additions and 1 deletion.
146 changes: 145 additions & 1 deletion notebooks/pyconde-pydata-darmstadt-2025/10_submission_stats_v1.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,150 @@
"spkrs_df = speakers_as_df(spkrs, with_questions=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Speaker Statistics\n",
"\n",
"Determine the number of speakers per company."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Group spkrs_df by \"Q: Company / Institute\" and count the number of submission and return a dataframe\n",
"company_counts = spkrs_df.groupby('Q: Company / Institute').size().reset_index(name='count')\n",
"\n",
"# Sort the dataframe by the count of submissions\n",
"company_counts = company_counts.sort_values('count', ascending=False)\n",
"\n",
"# rename \"Q: Company / Institute\" to \"company_name\"\n",
"company_counts = company_counts.rename(columns={'Q: Company / Institute': 'company_name'})\n",
"\n",
"# company_counts.to_csv('company_counts.csv', index=False)\n",
"company_counts.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from difflib import get_close_matches\n",
"\n",
"# Step 0: Create a dictionary to map similar names to a consistent name.\n",
"# Each name is fuzzy-matched against all other names, in three forms.\n",
"# Later assignments overwrite earlier ones, so the precedence is:\n",
"# original name > name with 'GmbH' appended > name with 'GmbH' stripped.\n",
"company_names = company_counts['company_name'].unique()\n",
"company_name_map = {}\n",
"for company_name in company_names:\n",
"    company_names_exclude_name = [name for name in company_names if name != company_name]\n",
"\n",
"    base_name = company_name.strip()\n",
"\n",
"    # do lookup without the company form: strip the 'GmbH' suffix AND the\n",
"    # whitespace in front of it (plain [:-4] would leave a trailing space)\n",
"    if base_name.endswith('GmbH'):\n",
"        stripped_name = base_name[:-4].strip()\n",
"        matches = get_close_matches(stripped_name, company_names_exclude_name, n=1, cutoff=.8)  # Adjust the cutoff as needed\n",
"        if matches:\n",
"            company_name_map[company_name] = matches[0]\n",
"\n",
"    # do lookup with the company form appended — only for names that do not\n",
"    # already carry it (check base_name, not a mutated working variable)\n",
"    if not base_name.endswith('GmbH'):\n",
"        matches = get_close_matches(f'{base_name} GmbH', company_names_exclude_name, n=1, cutoff=.8)  # Adjust the cutoff as needed\n",
"        if matches:\n",
"            company_name_map[company_name] = matches[0]\n",
"\n",
"    # do lookup with the original company name (takes precedence over the\n",
"    # form-normalized lookups above)\n",
"    matches = get_close_matches(company_name, company_names_exclude_name, n=1, cutoff=.8)  # Adjust the cutoff as needed\n",
"    if matches:\n",
"        company_name_map[company_name] = matches[0]\n",
"\n",
"company_name_map"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from collections import defaultdict\n",
"\n",
"# Step 1: Build an adjacency list to represent the graph\n",
"adjacency_list = defaultdict(set)\n",
"for key, value in company_name_map.items():\n",
" adjacency_list[key].add(value)\n",
" adjacency_list[value].add(key)\n",
"\n",
"# Step 2: Perform a DFS or BFS to find all connected components\n",
"def find_groups(adjacency_list):\n",
" visited = set()\n",
" groups = []\n",
"\n",
" def dfs(node, group):\n",
" visited.add(node)\n",
" group.append(node)\n",
" for neighbor in adjacency_list[node]:\n",
" if neighbor not in visited:\n",
" dfs(neighbor, group)\n",
"\n",
" for node in adjacency_list:\n",
" if node not in visited:\n",
" group = []\n",
" dfs(node, group)\n",
" groups.append(group)\n",
" \n",
" return groups\n",
"\n",
"# Step 3: Get the grouped names\n",
"company_name_groups = find_groups(adjacency_list)\n",
"\n",
"# Step 4: for each key in company_name_map find its group in company_name_groups\n",
"company_name_group_map = {}\n",
"for key, value in company_name_map.items():\n",
" for group in company_name_groups:\n",
" if key in group:\n",
" company_name_group_map[key] = group\n",
" break\n",
"\n",
"# Step 5: Assign the group names if available\n",
"company_counts['company_name_grouping_arr'] = company_counts['company_name'].apply(lambda x: company_name_group_map.get(x, [x]))\n",
"\n",
"company_counts['company_name_grouping_str'] = company_counts['company_name_grouping_arr'].apply(lambda x: '[' + ', '.join(x) + ']')\n",
"\n",
"company_counts.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Aggregate submission counts per merged company group\n",
"# (keyed by the 'company_name_grouping_str' column) and show them\n",
"# sorted from most to least submissions.\n",
"company_counts_by_group = (\n",
"    company_counts\n",
"    .groupby('company_name_grouping_str')['count']\n",
"    .sum()\n",
"    .reset_index()\n",
"    .sort_values('count', ascending=False)\n",
")\n",
"\n",
"# company_counts_by_group.to_csv('company_counts_by_group.csv', index=False)\n",
"company_counts_by_group"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Submission Statistics\n",
"Determine number of speakers/submissions per track and main track."
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -289,7 +433,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
"version": "3.10.16"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 0a9cd6c

Please sign in to comment.