Skip to content

[Project-Week-6] Elvira Hänni #9

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4,726 changes: 4,726 additions & 0 deletions your-project/.ipynb_checkpoints/2-checkpoint. Cleaning

Large diffs are not rendered by default.

4,733 changes: 4,733 additions & 0 deletions your-project/.ipynb_checkpoints/2. Cleaning-checkpoint.ipynb

Large diffs are not rendered by default.

2,771 changes: 2,771 additions & 0 deletions your-project/.ipynb_checkpoints/3-checkpoint.Statistics

Large diffs are not rendered by default.

3,396 changes: 3,396 additions & 0 deletions your-project/.ipynb_checkpoints/3.Statistics-checkpoint.ipynb

Large diffs are not rendered by default.

5,500 changes: 5,500 additions & 0 deletions your-project/.ipynb_checkpoints/BernScrapping-checkpoint.ipynb

Large diffs are not rendered by default.

4,939 changes: 4,939 additions & 0 deletions your-project/.ipynb_checkpoints/LuzernScrapping -checkpoint.ipynb

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions your-project/.ipynb_checkpoints/Untitled-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 2
}
5,269 changes: 5,269 additions & 0 deletions your-project/.ipynb_checkpoints/ZurichScrapping-checkpoint.ipynb

Large diffs are not rendered by default.

3,772 changes: 3,772 additions & 0 deletions your-project/.ipynb_checkpoints/comparis_scrapping-checkpoint.ipynb

Large diffs are not rendered by default.

366 changes: 366 additions & 0 deletions your-project/.ipynb_checkpoints/linkedin scraping-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,366 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import math\n",
"import time\n",
"from selenium import webdriver\n",
"from selenium.webdriver.common.keys import Keys\n",
"from selenium.webdriver.common.action_chains import ActionChains\n",
"from selenium.webdriver.chrome.options import Options\n",
"from getpass import getpass\n",
"import csv"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"wdpath = \"C:/Users/haenni/Documents/GitHub/Project-Week-4/your-project/chromedriver\"\n",
"\n",
"\n",
"driver = webdriver.Chrome(wdpath)\n",
"\n",
"#driver = webdriver.Chrome()\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"driver.get(\"https://www.comparis.ch/immobilien/default\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Enter your email:········\n"
]
}
],
"source": [
"# Ask for the e-mail address without echoing it; the next cell sends my_email to the username field.\n",
"my_email = getpass(\"Enter your email:\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"username.send_keys(my_email)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"password = driver.find_element_by_name(\"session_password\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Enter your password········\n"
]
}
],
"source": [
"my_password = getpass(\"Enter your password\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"password.send_keys(my_password)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"log_in_button = driver.find_element_by_class_name(\"sign-in-form__submit-btn\")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"log_in_button.click()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"job_button = driver.find_element_by_id(\"jobs-nav-item\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"job_button.click()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"job_search = driver.find_elements_by_class_name(\"jobs-search-box__text-input\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Enter job title: data analyst\n"
]
}
],
"source": [
"job_title = input(\"Enter job title: \")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"job_search[0].send_keys(job_title)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Enter location:luzern\n"
]
}
],
"source": [
"job_city = input(\"Enter location:\")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"job_search[2].send_keys(job_city)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"job_search[2].send_keys(Keys.ENTER)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# Paging limit: first token of the \"t-12\" element's text, with thousands commas removed.\n",
"# int() already yields a whole number, so no rounding step is needed.\n",
"n_xpaths = int(driver.find_element_by_class_name(\"t-12\").text.split()[0].replace(',', ''))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"n_links = 0"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"url = driver.current_url + '&start='"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Visit every result page and save its job links. The scroll, collect and save\n",
"# steps run inside the page loop so that every page is scraped, not only the last.\n",
"csv_path = job_title.replace(' ', '_') + \"_\" + job_city.replace(' ', '_') + \"_links.csv\"\n",
"card_class = 'job-card-search__link-wrapper'\n",
"\n",
"# The '&start=' offset advances in steps of 25. Assuming offsets are 0-based,\n",
"# the last page that can hold results starts below n_xpaths.\n",
"for i in range(0, n_xpaths, 25):\n",
"    driver.get(url + str(i))\n",
"    time.sleep(1.32)\n",
"\n",
"    # Send END twice to the first job card to scroll the list down\n",
"    # (presumably this makes the page render the remaining cards - confirm).\n",
"    for _ in range(2):\n",
"        cards = driver.find_elements_by_class_name(card_class)\n",
"        if not cards:\n",
"            break  # nothing to scroll on a page without job cards\n",
"        cards[0].send_keys(Keys.END)\n",
"        time.sleep(1)\n",
"\n",
"    jobs_raw = driver.find_elements_by_class_name(card_class)\n",
"    hrefs = [job.get_attribute('href') for job in jobs_raw]\n",
"    # Keep the first 45 characters of each link; drop duplicates and cards without an href.\n",
"    job_links = list({href[:45] for href in hrefs if href})\n",
"    n_links += len(job_links)\n",
"\n",
"    print(f\"\\nPage {i // 25 + 1}: {len(job_links)} links scraped ({n_links} total)\")\n",
"\n",
"    if job_links:\n",
"        # Mode 'a' creates the file when it is missing, so one branch covers every page.\n",
"        with open(csv_path, 'a', newline='') as myfile:\n",
"            wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)\n",
"            wr.writerow(job_links)\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"You scraped 15 data analyst job links in luzern. It's gonna be stored in data_analyst_luzern_links.csv\n",
"\n",
"Closing driver\n"
]
}
],
"source": [
"print(f\"\\nYou scraped {n_links} {job_title.replace('_', ' ')} job links in {job_city}. It\\'s gonna be stored in {csv_path}\") \n",
"\n",
"print('\\nClosing driver')\n",
"driver.quit()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading