 from bs4 import BeautifulSoup
 
 
-def get_content(user_agency, user_topic):
+def get_content(user_topic):
     # determine the correct url for the given agency and topic
-    url = ""
-    if(user_agency.lower() == "Al Jazeera".lower()):
-        if(user_topic.lower() == "News".lower()):
-            url = "https://www.aljazeera.com/news/"
-        elif(user_topic.lower() == "Sports".lower()):
-            url = "https://www.aljazeera.com/sports/"
-        elif(user_topic.lower() == "Economy".lower()):
-            url = "https://www.aljazeera.com/economy/"
-    elif(user_agency.lower() == "BBC".lower()):
-        if (user_topic.lower() == "News".lower()):
-            url = "https://www.bbc.com/news"
-        elif (user_topic.lower() == "Sports".lower()):
-            url = "https://www.bbc.com/sport"
-        elif (user_topic.lower() == "Economy".lower()):
-            url = "https://www.bbc.com/news/business/economy"
-    elif (user_agency.lower() == "USA Today".lower()):
-        if (user_topic.lower() == "News".lower()):
-            url = "https://www.usatoday.com/news/"
-        elif (user_topic.lower() == "Sports".lower()):
-            url = "https://www.usatoday.com/sports/"
-        elif (user_topic.lower() == "Money".lower()):
-            url = "https://www.usatoday.com/money/"
+    url_alj = ""
+    # url_bbc = ""
+    url_usa = ""
+    if(user_topic.lower() == "News".lower()):
+        url_alj = "https://www.aljazeera.com/news/"
+        # url_bbc = "https://www.bbc.com/news"
+        url_usa = "https://www.usatoday.com/news/"
+    elif(user_topic.lower() == "Sports".lower()):
+        url_alj = "https://www.aljazeera.com/sports/"
+        # url_bbc = "https://www.bbc.com/sport"
+        url_usa = "https://www.usatoday.com/sports/"
+    elif(user_topic.lower() == "Economy".lower()):
+        url_alj = "https://www.aljazeera.com/economy/"
+        # url_bbc = "https://www.bbc.com/news/business/economy"
+        url_usa = "https://www.usatoday.com/money/"
 
-    page = requests.get(url)
-    page_soup = BeautifulSoup(page.content, 'html.parser')
+    page_alj = requests.get(url_alj)
+    # page_bbc = requests.get(url_bbc)
+    page_usa = requests.get(url_usa)
+    page_alj_soup = BeautifulSoup(page_alj.content, 'html.parser')
+    # page_bbc_soup = BeautifulSoup(page_bbc.content, 'html.parser')
+    page_usa_soup = BeautifulSoup(page_usa.content, 'html.parser')
     # print(page_soup)
 
     # depending on which news agency we parse, the container will hold different items
-    content = []
-    links = []
-    if(url == "https://www.aljazeera.com/news/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    content_alj = []
+    # content_bbc = []
+    content_usa = []
+    links_alj = []
+    # links_bbc = []
+    links_usa = []
+
+    if(url_alj == "https://www.aljazeera.com/news/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com/news"
         # so let's append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
 
-    if (url == "https://www.aljazeera.com/sports/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    if (url_alj == "https://www.aljazeera.com/sports/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com"
         # so let's append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
 
-    if (url == "https://www.aljazeera.com/economy/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    if (url_alj == "https://www.aljazeera.com/economy/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com"
         # so let's append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
-
-    if (url == "https://www.bbc.com/news"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
-            content.append(i.get('href'))
+    """
+    if (url_bbc == "https://www.bbc.com/news"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         # so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.bbc.com/sport"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link sp-o-link-split__anchor gel-double-pica-bold"}):
-            content.append(i.get('href'))
+    if (url_bbc == "https://www.bbc.com/sport"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link sp-o-link-split__anchor gel-double-pica-bold"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         # so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.bbc.com/news/business/economy"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
-            content.append(i.get('href'))
+    if (url_bbc == "https://www.bbc.com/news/business/economy"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         # so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
-        # print(links)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
+        # print(links)
+    """
 
-    if (url == "https://www.usatoday.com/news/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a gnt_lbl_pm gnt_m_flm_a__pm"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/news/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a gnt_lbl_pm gnt_m_flm_a__pm"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_usa:
+            links_usa.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.usatoday.com/money/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/money/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # we see that the first index is actually a link to "The Daily Money"
         # delete the first index
-        del content[0]
+        del content_usa[0]
         # we also see that some empty strings have been found, remove them
-        finalized_content = list(filter(None, content))
+        finalized_content = list(filter(None, content_usa))
         # so finalized_content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
         for j in finalized_content:
-            links.append(prefix + j)
+            links_usa.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.usatoday.com/sports/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/sports/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # we can see that some empty strings have been found, remove them
-        finalized_content = list(filter(None, content))
+        finalized_content = list(filter(None, content_usa))
         # so content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
         for j in finalized_content:
-            links.append(prefix + j)
+            links_usa.append(prefix + j)
         # print(links)
 
-    return links
+    return links_alj, links_usa
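
A minimal usage sketch of the refactored function (not part of the commit above; it assumes the call runs in the same module as get_content, that a supported topic string such as "News", "Sports", or "Economy" is passed, and that the HTTP requests succeed, since the function itself has no error handling):

    if __name__ == "__main__":
        # unpack the two lists the refactored function now returns
        # (Al Jazeera links first, USA Today links second; BBC is commented out)
        aljazeera_links, usatoday_links = get_content("Sports")
        print(len(aljazeera_links), "Al Jazeera links found")
        print(len(usatoday_links), "USA Today links found")
        # show a few of the collected article URLs
        for link in aljazeera_links[:5]:
            print(link)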