#!/usr/bin/env python
# coding: utf-8
# In[1]:
# Import the libraries required for extraction
import re
import requests
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup
# In[2]:
# Browser-like request headers sent with every HTTP request
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
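# In[ ]:
# Optional sketch: the requests below are made with verify=False, so urllib3 emits an
# InsecureRequestWarning for every call. Assuming the urllib3 package that ships as a
# dependency of requests, this silences that warning.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)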
# In[3]:
# Base URL of the FiercePharma website
base_website = "https://fiercepharma.com/"
# Search path suffix
ssuf = "search-results/"
# Keyword to search for across the FiercePharma website.
# Note: multi-word terms must be URL-encoded, e.g.:
#   2 terms: "Breast Cancer" as "breast%20cancer"
#   3 terms: "Lung Cancer Treatment" as "lung%20cancer%20treatment", and so on.
sterm = "nsclc"
# Filter query-string suffix
fsuf = "?filter="
# Type of filter to apply ("published_date" or "relevance"), followed by the page parameter
filter_type = "published_date&page="
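# In[ ]:
# A minimal sketch of building the encoded search term programmatically instead of typing
# the %20s by hand. "lung cancer treatment" and example_sterm are illustrative only and are
# not used by the scraping code below; this relies on urllib.parse from the standard library.
from urllib.parse import quote
example_sterm = quote("lung cancer treatment")  # -> 'lung%20cancer%20treatment'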
# In[4]:
# Initialise the page counter and the lists that will hold the scraped fields
page = 1
lst_art_ttl = []
lst_nkeywords = []
lst_art_cat = []
lst_auth = []
lst_pub_date = []
lst_art_full = []
# In[5]:
# Main code block: scrape the FiercePharma search results for the given search conditions.
# The idea is to keep paging until a results page no longer contains the results counter,
# at which point the loop breaks.
while page:
    temp_web = base_website + ssuf + sterm + fsuf + filter_type + str(page)
    # Check for a response from the website; if the request fails, move on to the next page
    try:
        response = requests.get(temp_web, headers=hdr, verify=False)
    except requests.RequestException:
        print("Error fetching a request for page: " + str(page))
        page = page + 1
        continue
    # Parse the HTML of the search-results page with BeautifulSoup
    data = BeautifulSoup(response.text, 'html.parser')
    # If the "results-showing-counter" div is missing, there are no more results: break the loop
    if data.find("div", {"class": "results-showing-counter"}):
        # Get all the article links present on the page
        tag_links_list = data.select("h2 a")
        links_list = []
        for i in tag_links_list:
            links_list.append("https://fiercepharma.com" + i.attrs["href"])
        # Loop through all the extracted links of the page to fetch information
        for index, tweb in enumerate(links_list):
            tweb_res = requests.get(tweb, headers=hdr, verify=False)
            temp_data = BeautifulSoup(tweb_res.text, 'html.parser')
            # Collect the names/properties of every meta tag on the article page
            check_list_meta = []
            for tag in temp_data.find_all("meta"):
                if tag.get('name') is not None:
                    check_list_meta.append(tag.get('name'))
                if tag.get('property') is not None:
                    check_list_meta.append(tag.get('property'))
            # If an expected meta tag is missing, record 'NA' so the lists stay aligned
            # if 'parsely-author' not in check_list_meta:
            #     lst_auth.append('NA')
            if 'parsely-pub-date' not in check_list_meta:
                lst_pub_date.append('NA')
            if 'news_keywords' not in check_list_meta:
                lst_nkeywords.append('NA')
            if 'og:title' not in check_list_meta:
                lst_art_ttl.append('NA')
            auth_li = []
            # Extract the title and other related information from the meta tags
            for tag in temp_data.find_all("meta"):
                # Keywords extraction
                if tag.get("name") == 'news_keywords':
                    lst_nkeywords.append(tag.get("content"))
                # Title extraction
                if tag.get("property") == 'og:title':
                    lst_art_ttl.append(tag.get("content"))
                # Author name extraction (currently disabled)
                # if tag.get("name") == 'parsely-author':
                #     auth_li.append(tag.get("content"))
                # Publication date extraction
                if tag.get("name") == 'parsely-pub-date':
                    lst_pub_date.append(tag.get('content'))
            # lst_auth.append(auth_li)
            # Get the article body associated with the page
            try:
                if (temp_data.find('div', {'property': 'schema:articleBody'}) is None
                        and temp_data.find('div', {'class': 'node__content'}) is None):
                    lst_art_full.append('NA')
                elif temp_data.find('div', {'property': 'schema:articleBody'}):
                    text_para = temp_data.find('div', {'property': 'schema:articleBody'})
                    lst_art_full.append(list(text_para.findChildren("p", recursive=True)))
                else:
                    text_para = temp_data.find('div', {'class': 'node__content'})
                    lst_art_full.append(list(text_para.findChildren("p", recursive=True)))
            except Exception:
                print("Error while fetching the textual content. Check the website structure")
                lst_art_full.append('NA')
            # Get the article category (taxonomy term) associated with the page
            try:
                cat_li = temp_data.find('div', {'class': 'taxonomy-term vocabulary-taxonomy'})
                if cat_li is None:
                    lst_art_cat.append('NA')
                else:
                    cl2 = str(cat_li.findChildren("h2", recursive=True)[0])
                    lst_art_cat.append(re.findall("<div>(.*?)</div>", cl2))
            except Exception:
                print("Some error while fetching the text category from the page")
                lst_art_cat.append('NA')
            print("Page " + str(page) + ": " + str(lst_art_ttl[-1]))
        page = page + 1
    else:
        print("Search complete...")
        break
# In[6]:
# Sanity check: all lists should have the same length before building the DataFrame
len(lst_art_ttl), len(lst_art_full), len(lst_art_cat), len(lst_pub_date), len(lst_nkeywords)
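# In[ ]:
# Optional defensive sketch: if the lists ever end up with unequal lengths (for example
# after a request error), pad the shorter ones with 'NA' so the DataFrame construction
# below does not raise a length-mismatch error. This is an assumption about failure
# handling, not part of the scraping logic above.
all_lists = (lst_art_ttl, lst_art_full, lst_art_cat, lst_pub_date, lst_nkeywords)
max_len = max(len(lst) for lst in all_lists)
for lst in all_lists:
    lst.extend(['NA'] * (max_len - len(lst)))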
# In[8]:
# Assemble the scraped fields into a DataFrame
search_df = pd.DataFrame()
search_df['Article_Title'] = lst_art_ttl
search_df['Article_Story'] = lst_art_full
# search_df['Article_Author'] = lst_auth
search_df['Article_Cat'] = lst_art_cat
search_df['Article_Pub_Date'] = lst_pub_date
search_df['Article_Keywords'] = lst_nkeywords
# In[10]:
search_df.shape
# In[11]:
# Build an output filename from the search term (should match sterm) and today's date
search_term = 'nsclc'
filename = "fiercepharma_search_" + search_term + "_" + str(datetime.today().strftime('%Y_%m_%d')) + ".csv"
filename
# In[12]:
# Save the results to CSV
search_df.to_csv(filename, index=False)
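# In[ ]:
# A minimal follow-up sketch: Article_Story holds lists of <p> Tag objects (or 'NA'),
# which serialise awkwardly to CSV. Assuming that structure, this converts each entry
# to plain text; Article_Text is an illustrative column name, not part of the original output.
search_df['Article_Text'] = search_df['Article_Story'].apply(
    lambda paras: ' '.join(p.get_text(strip=True) for p in paras)
    if isinstance(paras, list) else paras)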