Commit 1938eee

Beginning working on merging data
1 parent: 2c184b4

4 files changed: +156 -102 lines

article_scrape.py (+13 -5)

@@ -1,5 +1,7 @@
+import newspaper.article
 from newspaper import Article
 import nltk
+import time
 
 
 # create a function that will summarize the article based on inputted url
@@ -33,15 +35,21 @@ def summarize_article(url):
     top_image = str(article.top_image)
 
     # grab all the images within the article
-    image_string = "All Images: "
-    for image in article.images:
-        image_string += "\n\t" + image
+    # image_string = "All Images: "
+    # for image in article.images:
+    #     image_string += "\n\t" + image
     # print(image_string)
 
-    # print("A Quick Article Summary")
-    # print("----------------------------")
+    print("A Quick Article Summary")
+    print("----------------------------")
     print(article.summary)
 
+    print("Link to Original Article")
+    print("----------------------------")
+    print(url)
+    print()
+
     # grab the article summary
     article_summary = article.summary
 
+    return author, formatted_date, top_image, article_summary
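
Note (sketch, not part of this commit): with the new return statement, a caller can unpack the article metadata directly instead of relying on the prints. The URL below is a hypothetical placeholder.

    # sketch only; hypothetical URL
    author, formatted_date, top_image, article_summary = summarize_article(
        "https://www.aljazeera.com/news/example-article")
    print(author, formatted_date)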

main.py (+7 -21)

@@ -1,5 +1,5 @@
-from article_scrape import *
 from news_extract import *
+from merge_extracts import *
 import time
 
 print("Hi, welcome to Nuickly a fully functional news aggregator! \n"
@@ -8,31 +8,17 @@
       "their favorite news, be provided with links to the actual article, \n"
       "and be shown a quick summary what the article is about. ")
 print("------------------------------------------------------------------------------------------------------")
-print("To get started, please input a news agency you would to view! \n Al Jazeera \t BBC \t USA Today")
+print("To get started, please input a topic space you would to view! \n News \t Sports \t Economy")
 user_agency = input("Type here: ")
-user_topic = ""
-if(user_agency.lower() == "Al Jazeera".lower()):
-    print("Which topic space would you like to view? \n News \t Sports \t Economy")
-    user_topic = input("Type here: ")
-elif(user_agency.lower() == "BBC".lower()):
-    print("Which topic space would you like to view? \n News \t Sports \t Economy")
-    user_topic = input("Type here: ")
-elif(user_agency.lower() == "USA Today".lower()):
-    print("Which topic space would you like to view? \n News \t Sports \t Money")
-    user_topic = input("Type here: ")
 
 print("------------------------------------------------------------------------------------------------------")
 print("Scraping articles...")
-links = get_content(user_agency, user_topic)
+input_links_alj, input_links_usa = get_content(user_agency)
 time.sleep(2)
 print()
 
-print("Preparing summaries...")
-time.sleep(2)
-author = ""
-formatted_date = ""
-top_image = ""
-article_summary = ""
-for i in links:
+for i in input_links_alj:
     summarize_article(i)
-    print("Link to article: " + i)
+
+for j in input_links_usa:
+    summarize_article(j)
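
Note (sketch, not part of this commit): main.py now loops over both link lists but does not yet call the new merge(). Assuming merge() keeps the signature introduced in merge_extracts.py below, wiring it in could look like:

    # sketch only
    total_authors, total_dates, total_top_images, total_article_summaries, total_links = merge(
        input_links_alj, input_links_usa)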

merge_extracts.py (new file, +57)

@@ -0,0 +1,57 @@
+from article_scrape import *
+from news_extract import*
+
+
+# create function that will merge lists into an dataframe
+def merge(input_links_alj, input_links_usa):
+
+    # grab all the information from Al Jazeera
+    authors_alj = []
+    formatted_dates_alj = []
+    top_images_alj = []
+    article_summaries_alj = []
+    links_alj = []
+    for i in input_links_alj:
+        author, formatted_date, top_images, article_summary = summarize_article(i)
+        authors_alj.append(author)
+        formatted_dates_alj.append(formatted_date)
+        top_images_alj.append(top_images)
+        article_summaries_alj.append(article_summary)
+        links_alj.append(i)
+
+    # grab all the information from BBC
+    # authors_bbc = []
+    # formatted_dates_bbc = []
+    # top_images_bbc = []
+    # article_summaries_bbc = []
+    # links_bbc = []
+    # for i in input_links_bbc:
+    #     author, formatted_date, top_images, article_summary = summarize_article(i)
+    #     authors_bbc.append(author)
+    #     formatted_dates_bbc.append(formatted_date)
+    #     top_images_bbc.append(top_images)
+    #     article_summaries_bbc.append(article_summary)
+    #     links_bbc.append(i)
+
+    # grab all the information from USA Today
+    authors_usa = []
+    formatted_dates_usa = []
+    top_images_usa = []
+    article_summaries_usa = []
+    links_usa = []
+    for i in input_links_usa:
+        author, formatted_date, top_images, article_summary = summarize_article(i)
+        authors_usa.append(author)
+        formatted_dates_usa.append(formatted_date)
+        top_images_usa.append(top_images)
+        article_summaries_usa.append(article_summary)
+        links_usa.append(i)
+
+    total_authors = authors_alj + authors_usa
+    total_dates = formatted_dates_alj + formatted_dates_usa
+    total_top_images = top_images_alj + top_images_usa
+    total_article_summaries = article_summaries_alj + article_summaries_usa
+    total_links = links_alj + links_usa
+
+    return total_authors, total_dates, total_top_images, total_article_summaries, total_links
+
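
Note (sketch, not part of this commit): merge() returns five parallel lists rather than the dataframe its comment mentions. A minimal follow-up step, assuming pandas were added as a dependency (it is not imported anywhere in this commit) and using a hypothetical helper name:

    # sketch only; pandas and merge_to_dataframe are not part of this commit
    import pandas as pd

    def merge_to_dataframe(input_links_alj, input_links_usa):
        authors, dates, images, summaries, links = merge(input_links_alj, input_links_usa)
        # one row per article, one column per merged list
        return pd.DataFrame({
            "author": authors,
            "date": dates,
            "top_image": images,
            "summary": summaries,
            "link": links,
        })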

news_extract.py (+79 -76)

@@ -2,142 +2,145 @@
 from bs4 import BeautifulSoup
 
 
-def get_content(user_agency, user_topic):
+def get_content(user_topic):
     # determine the correct url for the given agency and topic
-    url = ""
-    if(user_agency.lower() == "Al Jazeera".lower()):
-        if(user_topic.lower() == "News".lower()):
-            url = "https://www.aljazeera.com/news/"
-        elif(user_topic.lower() == "Sports".lower()):
-            url = "https://www.aljazeera.com/sports/"
-        elif(user_topic.lower() == "Economy".lower()):
-            url = "https://www.aljazeera.com/economy/"
-    elif(user_agency.lower() == "BBC".lower()):
-        if (user_topic.lower() == "News".lower()):
-            url = "https://www.bbc.com/news"
-        elif (user_topic.lower() == "Sports".lower()):
-            url = "https://www.bbc.com/sport"
-        elif (user_topic.lower() == "Economy".lower()):
-            url = "https://www.bbc.com/news/business/economy"
-    elif (user_agency.lower() == "USA Today".lower()):
-        if (user_topic.lower() == "News".lower()):
-            url = "https://www.usatoday.com/news/"
-        elif (user_topic.lower() == "Sports".lower()):
-            url = "https://www.usatoday.com/sports/"
-        elif (user_topic.lower() == "Money".lower()):
-            url = "https://www.usatoday.com/money/"
+    url_alj = ""
+    # url_bbc = ""
+    url_usa = ""
+    if(user_topic.lower() == "News".lower()):
+        url_alj = "https://www.aljazeera.com/news/"
+        # url_bbc = "https://www.bbc.com/news"
+        url_usa = "https://www.usatoday.com/news/"
+    elif(user_topic.lower() == "Sports".lower()):
+        url_alj = "https://www.aljazeera.com/sports/"
+        # url_bbc = "https://www.bbc.com/sport"
+        url_usa = "https://www.usatoday.com/sports/"
+    elif(user_topic.lower() == "Economy".lower()):
+        url_alj = "https://www.aljazeera.com/economy/"
+        # url_bbc = "https://www.bbc.com/news/business/economy"
+        url_usa = "https://www.usatoday.com/money/"
 
-    page = requests.get(url)
-    page_soup = BeautifulSoup(page.content, 'html.parser')
+    page_alj = requests.get(url_alj)
+    # page_bbc = requests.get(url_bbc)
+    page_usa = requests.get(url_usa)
+    page_alj_soup = BeautifulSoup(page_alj.content, 'html.parser')
+    # page_bbc_soup = BeautifulSoup(page_bbc.content, 'html.parser')
+    page_usa_soup = BeautifulSoup(page_usa.content, 'html.parser')
     # print(page_soup)
 
     # depending on what news agencies to parse, the container will hold different items
-    content = []
-    links = []
-    if(url == "https://www.aljazeera.com/news/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    content_alj = []
+    # content_bbc = []
+    content_usa = []
+    links_alj = []
+    # links_bbc = []
+    links_usa = []
+
+    if(url_alj == "https://www.aljazeera.com/news/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com/news"
         # so lets append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
 
-    if (url == "https://www.aljazeera.com/sports/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    if (url_alj == "https://www.aljazeera.com/sports/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com"
         # so lets append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
 
-    if (url == "https://www.aljazeera.com/economy/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    if (url_alj == "https://www.aljazeera.com/economy/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com"
         # so lets append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
-
-    if (url == "https://www.bbc.com/news"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
-            content.append(i.get('href'))
+    """
+    if (url_bbc == "https://www.bbc.com/news"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         #so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.bbc.com/sport"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link sp-o-link-split__anchor gel-double-pica-bold"}):
-            content.append(i.get('href'))
+    if (url_bbc == "https://www.bbc.com/sport"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link sp-o-link-split__anchor gel-double-pica-bold"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         #so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
         #print(links)
 
-    if (url == "https://www.bbc.com/news/business/economy"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
-            content.append(i.get('href'))
+    if (url_bbc == "https://www.bbc.com/news/business/economy"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         #so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
-        # print(links)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
+        # print(links)
+    """
 
-    if (url == "https://www.usatoday.com/news/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a gnt_lbl_pm gnt_m_flm_a__pm"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/news/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a gnt_lbl_pm gnt_m_flm_a__pm"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_usa:
+            links_usa.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.usatoday.com/money/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/money/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # we see that the first index is actually link to the "the Daily money"
         # delete the first index
-        del content[0]
+        del content_usa[0]
         # we also see that some empty strings have been found, remove them
-        finalized_content = list(filter(None, content))
+        finalized_content = list(filter(None, content_usa))
         # so finalized content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
         for j in finalized_content:
-            links.append(prefix + j)
+            links_usa.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.usatoday.com/sports/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/sports/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # we can see that some empty strings have been found, remove them
-        finalized_content = list(filter(None, content))
+        finalized_content = list(filter(None, content_usa))
         # so content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
         for j in finalized_content:
-            links.append(prefix + j)
+            links_usa.append(prefix + j)
         # print(links)
 
-    return links
+    return links_alj, links_usa
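
Note (sketch, not part of this commit): the Al Jazeera and USA Today branches repeat the same fetch, collect hrefs, and prefix pattern. A hypothetical helper that captures that pattern (the name and signature are illustrative only):

    # sketch only; collect_links is a hypothetical refactor, not in this commit
    import requests
    from bs4 import BeautifulSoup

    def collect_links(url, css_class, prefix):
        # fetch the section page, pull every matching anchor's href,
        # drop empty hrefs, and prepend the site prefix to build absolute links
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        hrefs = (a.get('href') for a in soup.find_all('a', {"class": css_class}))
        return [prefix + h for h in hrefs if h]

    # e.g. links_alj = collect_links("https://www.aljazeera.com/news/",
    #                                "u-clickable-card__link",
    #                                "https://www.aljazeera.com")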
