Commit 1938eee

Beginning working on merging data
1 parent: 2c184b4

4 files changed: +156 -102 lines

article_scrape.py (+13 -5)

@@ -1,5 +1,7 @@
+import newspaper.article
 from newspaper import Article
 import nltk
+import time
 
 
 # create a function that will summarize the article based on inputted url
@@ -33,15 +35,21 @@ def summarize_article(url):
     top_image = str(article.top_image)
 
     # grab all the images within the article
-    image_string = "All Images: "
-    for image in article.images:
-        image_string += "\n\t" + image
+    # image_string = "All Images: "
+    # for image in article.images:
+    #     image_string += "\n\t" + image
     # print(image_string)
 
-    # print("A Quick Article Summary")
-    # print("----------------------------")
+    print("A Quick Article Summary")
+    print("----------------------------")
     print(article.summary)
 
+    print("Link to Original Article")
+    print("----------------------------")
+    print(url)
+    print()
+
     # grab the article summary
     article_summary = article.summary
 
+    return author, formatted_date, top_image, article_summary
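
Note (sketch, not part of this commit): with the new return statement, a caller can unpack the article metadata directly instead of relying on the prints. The URL below is a hypothetical placeholder.

    # sketch only; hypothetical URL
    author, formatted_date, top_image, article_summary = summarize_article(
        "https://www.aljazeera.com/news/example-article")
    print(author, formatted_date)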

main.py (+7 -21)

@@ -1,5 +1,5 @@
-from article_scrape import *
 from news_extract import *
+from merge_extracts import *
 import time
 
 print("Hi, welcome to Nuickly a fully functional news aggregator! \n"
@@ -8,31 +8,17 @@
       "their favorite news, be provided with links to the actual article, \n"
       "and be shown a quick summary what the article is about. ")
 print("------------------------------------------------------------------------------------------------------")
-print("To get started, please input a news agency you would to view! \n Al Jazeera \t BBC \t USA Today")
+print("To get started, please input a topic space you would to view! \n News \t Sports \t Economy")
 user_agency = input("Type here: ")
-user_topic = ""
-if(user_agency.lower() == "Al Jazeera".lower()):
-    print("Which topic space would you like to view? \n News \t Sports \t Economy")
-    user_topic = input("Type here: ")
-elif(user_agency.lower() == "BBC".lower()):
-    print("Which topic space would you like to view? \n News \t Sports \t Economy")
-    user_topic = input("Type here: ")
-elif(user_agency.lower() == "USA Today".lower()):
-    print("Which topic space would you like to view? \n News \t Sports \t Money")
-    user_topic = input("Type here: ")
 
 print("------------------------------------------------------------------------------------------------------")
 print("Scraping articles...")
-links = get_content(user_agency, user_topic)
+input_links_alj, input_links_usa = get_content(user_agency)
 time.sleep(2)
 print()
 
-print("Preparing summaries...")
-time.sleep(2)
-author = ""
-formatted_date = ""
-top_image = ""
-article_summary = ""
-for i in links:
+for i in input_links_alj:
     summarize_article(i)
-    print("Link to article: " + i)
+
+for j in input_links_usa:
+    summarize_article(j)
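
Note (sketch, not part of this commit): main.py now loops over both link lists but does not yet call the new merge(). Assuming merge() keeps the signature introduced in merge_extracts.py below, wiring it in could look like:

    # sketch only
    total_authors, total_dates, total_top_images, total_article_summaries, total_links = merge(
        input_links_alj, input_links_usa)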

merge_extracts.py (new file, +57)

@@ -0,0 +1,57 @@
+from article_scrape import *
+from news_extract import*
+
+
+# create function that will merge lists into an dataframe
+def merge(input_links_alj, input_links_usa):
+
+    # grab all the information from Al Jazeera
+    authors_alj = []
+    formatted_dates_alj = []
+    top_images_alj = []
+    article_summaries_alj = []
+    links_alj = []
+    for i in input_links_alj:
+        author, formatted_date, top_images, article_summary = summarize_article(i)
+        authors_alj.append(author)
+        formatted_dates_alj.append(formatted_date)
+        top_images_alj.append(top_images)
+        article_summaries_alj.append(article_summary)
+        links_alj.append(i)
+
+    # grab all the information from BBC
+    # authors_bbc = []
+    # formatted_dates_bbc = []
+    # top_images_bbc = []
+    # article_summaries_bbc = []
+    # links_bbc = []
+    # for i in input_links_bbc:
+    #     author, formatted_date, top_images, article_summary = summarize_article(i)
+    #     authors_bbc.append(author)
+    #     formatted_dates_bbc.append(formatted_date)
+    #     top_images_bbc.append(top_images)
+    #     article_summaries_bbc.append(article_summary)
+    #     links_bbc.append(i)
+
+    # grab all the information from USA Today
+    authors_usa = []
+    formatted_dates_usa = []
+    top_images_usa = []
+    article_summaries_usa = []
+    links_usa = []
+    for i in input_links_usa:
+        author, formatted_date, top_images, article_summary = summarize_article(i)
+        authors_usa.append(author)
+        formatted_dates_usa.append(formatted_date)
+        top_images_usa.append(top_images)
+        article_summaries_usa.append(article_summary)
+        links_usa.append(i)
+
+    total_authors = authors_alj + authors_usa
+    total_dates = formatted_dates_alj + formatted_dates_usa
+    total_top_images = top_images_alj + top_images_usa
+    total_article_summaries = article_summaries_alj + article_summaries_usa
+    total_links = links_alj + links_usa
+
+    return total_authors, total_dates, total_top_images, total_article_summaries, total_links
+
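
Note (sketch, not part of this commit): merge() returns five parallel lists rather than the dataframe its comment mentions. A minimal follow-up step, assuming pandas were added as a dependency (it is not imported anywhere in this commit) and using a hypothetical helper name:

    # sketch only; pandas and merge_to_dataframe are not part of this commit
    import pandas as pd

    def merge_to_dataframe(input_links_alj, input_links_usa):
        authors, dates, images, summaries, links = merge(input_links_alj, input_links_usa)
        # one row per article, one column per merged list
        return pd.DataFrame({
            "author": authors,
            "date": dates,
            "top_image": images,
            "summary": summaries,
            "link": links,
        })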

news_extract.py (+79 -76)

@@ -2,142 +2,145 @@
 from bs4 import BeautifulSoup
 
 
-def get_content(user_agency, user_topic):
+def get_content(user_topic):
     # determine the correct url for the given agency and topic
-    url = ""
-    if(user_agency.lower() == "Al Jazeera".lower()):
-        if(user_topic.lower() == "News".lower()):
-            url = "https://www.aljazeera.com/news/"
-        elif(user_topic.lower() == "Sports".lower()):
-            url = "https://www.aljazeera.com/sports/"
-        elif(user_topic.lower() == "Economy".lower()):
-            url = "https://www.aljazeera.com/economy/"
-    elif(user_agency.lower() == "BBC".lower()):
-        if (user_topic.lower() == "News".lower()):
-            url = "https://www.bbc.com/news"
-        elif (user_topic.lower() == "Sports".lower()):
-            url = "https://www.bbc.com/sport"
-        elif (user_topic.lower() == "Economy".lower()):
-            url = "https://www.bbc.com/news/business/economy"
-    elif (user_agency.lower() == "USA Today".lower()):
-        if (user_topic.lower() == "News".lower()):
-            url = "https://www.usatoday.com/news/"
-        elif (user_topic.lower() == "Sports".lower()):
-            url = "https://www.usatoday.com/sports/"
-        elif (user_topic.lower() == "Money".lower()):
-            url = "https://www.usatoday.com/money/"
+    url_alj = ""
+    # url_bbc = ""
+    url_usa = ""
+    if(user_topic.lower() == "News".lower()):
+        url_alj = "https://www.aljazeera.com/news/"
+        # url_bbc = "https://www.bbc.com/news"
+        url_usa = "https://www.usatoday.com/news/"
+    elif(user_topic.lower() == "Sports".lower()):
+        url_alj = "https://www.aljazeera.com/sports/"
+        # url_bbc = "https://www.bbc.com/sport"
+        url_usa = "https://www.usatoday.com/sports/"
+    elif(user_topic.lower() == "Economy".lower()):
+        url_alj = "https://www.aljazeera.com/economy/"
+        # url_bbc = "https://www.bbc.com/news/business/economy"
+        url_usa = "https://www.usatoday.com/money/"
 
-    page = requests.get(url)
-    page_soup = BeautifulSoup(page.content, 'html.parser')
+    page_alj = requests.get(url_alj)
+    # page_bbc = requests.get(url_bbc)
+    page_usa = requests.get(url_usa)
+    page_alj_soup = BeautifulSoup(page_alj.content, 'html.parser')
+    # page_bbc_soup = BeautifulSoup(page_bbc.content, 'html.parser')
+    page_usa_soup = BeautifulSoup(page_usa.content, 'html.parser')
     # print(page_soup)
 
     # depending on what news agencies to parse, the container will hold different items
-    content = []
-    links = []
-    if(url == "https://www.aljazeera.com/news/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    content_alj = []
+    # content_bbc = []
+    content_usa = []
+    links_alj = []
+    # links_bbc = []
+    links_usa = []
+
+    if(url_alj == "https://www.aljazeera.com/news/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com/news"
         # so lets append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
 
-    if (url == "https://www.aljazeera.com/sports/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    if (url_alj == "https://www.aljazeera.com/sports/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com"
         # so lets append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
 
-    if (url == "https://www.aljazeera.com/economy/"):
-        for i in page_soup.find_all('a', {"class": "u-clickable-card__link"}):
-            content.append(i.get('href'))
+    if (url_alj == "https://www.aljazeera.com/economy/"):
+        for i in page_alj_soup.find_all('a', {"class": "u-clickable-card__link"}):
+            content_alj.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.aljazeera.com"
         # so lets append that to the beginning of them
         prefix = "https://www.aljazeera.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_alj:
+            links_alj.append(prefix + j)
         # looks like it worked
         # print(links)
-
-    if (url == "https://www.bbc.com/news"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
-            content.append(i.get('href'))
+    """
+    if (url_bbc == "https://www.bbc.com/news"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         #so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.bbc.com/sport"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link sp-o-link-split__anchor gel-double-pica-bold"}):
-            content.append(i.get('href'))
+    if (url_bbc == "https://www.bbc.com/sport"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link sp-o-link-split__anchor gel-double-pica-bold"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         #so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
         #print(links)
 
-    if (url == "https://www.bbc.com/news/business/economy"):
-        for i in page_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
-            content.append(i.get('href'))
+    if (url_bbc == "https://www.bbc.com/news/business/economy"):
+        for i in page_bbc_soup.find_all('a', {"class": "gs-c-promo-heading gs-o-faux-block-link__overlay-link gel-pica-bold nw-o-link-split__anchor"}):
+            content_bbc.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.bbc.com"
         #so append that to the beginning of them
         prefix = "https://www.bbc.com"
-        for j in content:
-            links.append(prefix + j)
-        # print(links)
+        for j in content_bbc:
+            links_bbc.append(prefix + j)
+        # print(links)
+    """
 
-    if (url == "https://www.usatoday.com/news/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a gnt_lbl_pm gnt_m_flm_a__pm"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/news/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a gnt_lbl_pm gnt_m_flm_a__pm"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # so content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
-        for j in content:
-            links.append(prefix + j)
+        for j in content_usa:
+            links_usa.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.usatoday.com/money/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/money/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # we see that the first index is actually link to the "the Daily money"
         # delete the first index
-        del content[0]
+        del content_usa[0]
         # we also see that some empty strings have been found, remove them
-        finalized_content = list(filter(None, content))
+        finalized_content = list(filter(None, content_usa))
         # so finalized content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
         for j in finalized_content:
-            links.append(prefix + j)
+            links_usa.append(prefix + j)
         # print(links)
 
-    if (url == "https://www.usatoday.com/sports/"):
-        for i in page_soup.find_all('a', {"class": "gnt_m_flm_a"}):
-            content.append(i.get('href'))
+    if (url_usa == "https://www.usatoday.com/sports/"):
+        for i in page_usa_soup.find_all('a', {"class": "gnt_m_flm_a"}):
+            content_usa.append(i.get('href'))
         # print(content)
         # we can see that some empty strings have been found, remove them
-        finalized_content = list(filter(None, content))
+        finalized_content = list(filter(None, content_usa))
         # so content holds all the paths to the links after "https://www.usatoday.com"
         prefix = "https://www.usatoday.com"
         for j in finalized_content:
-            links.append(prefix + j)
+            links_usa.append(prefix + j)
         # print(links)
 
-    return links
+    return links_alj, links_usa
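
Note (sketch, not part of this commit): the Al Jazeera and USA Today branches repeat the same fetch, collect hrefs, and prefix pattern. A hypothetical helper that captures that pattern (the name and signature are illustrative only):

    # sketch only; collect_links is a hypothetical refactor, not in this commit
    import requests
    from bs4 import BeautifulSoup

    def collect_links(url, css_class, prefix):
        # fetch the section page, pull every matching anchor's href,
        # drop empty hrefs, and prepend the site prefix to build absolute links
        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
        hrefs = (a.get('href') for a in soup.find_all('a', {"class": css_class}))
        return [prefix + h for h in hrefs if h]

    # e.g. links_alj = collect_links("https://www.aljazeera.com/news/",
    #                                "u-clickable-card__link",
    #                                "https://www.aljazeera.com")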
