-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch.py
More file actions
129 lines (114 loc) · 5.47 KB
/
fetch.py
File metadata and controls
129 lines (114 loc) · 5.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import requests
import csv
from timeit import default_timer as timer
from ordered_map import OrderedMap
from unordered_map import unordered_map
class Article:
    """A single New York Times article record.

    Plain data holder: every constructor argument is stored as an
    attribute.  Note that the ``keywords`` argument is deliberately kept
    under the singular attribute name ``keyword`` — that is the name the
    rest of this file reads (e.g. the CSV writer).
    """

    def __init__(self, title="", year="", month="", url="", keywords=""):
        self.url = url
        self.month = month
        self.year = year
        self.title = title
        self.keyword = keywords
def writeArticlesToRawCSV(array, filename="nyt_data.csv"):  # never used again since we only needed to do it once
    """Write a list of Article-like objects to a raw CSV file.

    Each object in ``array`` must expose ``title``, ``year``, ``month``,
    ``url`` and ``keyword`` (an iterable of strings, flattened into one
    cell with ", ").

    array:    iterable of article objects.
    filename: output path; the default preserves the original
              hard-coded "nyt_data.csv" so existing callers are unchanged.
    """
    col_headers = ["Title", "Year", "Month", "Url", "Keyword"]
    with open(filename, 'w', encoding='utf-8', newline='') as csvfile:
        # creating a csv writer object
        csvwriter = csv.writer(csvfile)
        # writing the header row first
        csvwriter.writerow(col_headers)
        for article in array:
            # one data row per article; keyword list joined into a single cell
            row = [article.title, article.year, article.month,
                   article.url, ", ".join(article.keyword)]
            csvwriter.writerow(row)
# def getArticlesFromAPI(array, startYear, endYear): # never used again since we only needed to do it once
# # api_key use here
#
# # iterating through entire new york times archive api
# for i in range(startYear, endYear):
# for j in range(1, 13):
# year = i
# month = j
#
# #url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={api_key}'
#
# # API call
# response = requests.get(url)
#
# # checking success code
# if response.status_code == 200:
# # parsing json file
# data = response.json()
#
# # checking for response and docs inside the data
# if 'response' in data and 'docs' in data['response']:
#
# for article_data in data['response']['docs']:
# # checking for publish dates aka checking if the month of articles is empty
# if 'pub_date' not in article_data:
# continue
#
# # titles, url, time
# title = article_data['headline']['main']
# url = article_data['web_url']
# # time structure is "year-month-day-hours:minutes:seconds+0000"
# # ex: 2019-01-01T05:00:00+0000"
# # T is just a string to represent time
#
# time = article_data['pub_date']
# year_pub = time[0:4]
# month_pub = time[6:7]
#
# # extracting keywords
# keywords = [article_data.get('keywords', [])[0]['value']] if article_data.get(
# 'keywords') else []
# if not keywords:
# continue
#
# array.append(Article(title, year_pub, month_pub, url, keywords))
#
# # for article_info in array:
# # print("Title:", article_info.title)
# # print("URL:", article_info.url)
# # print("Date Published:", article_info.year, "-", article_info.month)
# # print("Keywords:", ", ".join(article_info.keyword))
# # print("\n" + "=" * 50 + "\n")
# else:
# print("Error: 'response' or 'docs' keys not found in the API response.")
# else:
# # Print an error message if the request was not successful
# print(f"Error: {response.status_code}, {response.text}")
#
# return array
def getArticlesFromMapsAndInsertToCSV(keyword, startYear, endYear, unorderedMap,
                                      orderedMap, filePath='formatted_nyt_data.csv'):
    """Time a keyword lookup in both maps and write per-year usage to a CSV.

    Output layout:
      row 1 -> the two lookup times [unordered, ordered], rounded to 11 dp
      row 2 -> column headers ["Year", "Usage"]
      rows 3+ -> one [year, count] row per year with articles inside
                 the inclusive [startYear, endYear] range

    keyword:       keyword looked up in both maps (must exist in each).
    startYear:     first year (int) counted, inclusive.
    endYear:       last year (int) counted, inclusive.
    unorderedMap:  keyword -> list of Article-like objects; timed only.
    orderedMap:    keyword -> list of Article-like objects; timed and used.
    filePath:      output CSV path; default preserves the original
                   hard-coded name so existing callers are unchanged.
    """
    # Time the unordered-map lookup; the result is discarded because we
    # only need one copy of the data and the ordered map supplies it below.
    startTimeUnordered = timer()
    unorderedMap[keyword]
    endTimeUnordered = timer()
    UnorderedElapsed = endTimeUnordered - startTimeUnordered

    # Time the ordered-map lookup and keep its result for counting.
    startTimeOrdered = timer()
    dataList = orderedMap[keyword]
    endTimeOrdered = timer()
    OrderedElapsed = endTimeOrdered - startTimeOrdered

    # Count keyword usages per year, keeping only years inside the range.
    usageMap = {}  # year (as stored on the article) -> usage count
    for article in dataList:
        year = article.year
        if startYear <= int(year) <= endYear:
            usageMap[year] = usageMap.get(year, 0) + 1

    # Flatten the map into [year, count] rows for the CSV writer.
    formattedList = [[year, count] for year, count in usageMap.items()]

    formatColumns = ["Year", "Usage"]
    formattedRunTimes = [round(UnorderedElapsed, 11), round(OrderedElapsed, 11)]
    with open(filePath, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(formattedRunTimes)  # row 1: lookup timings
        writer.writerow(formatColumns)      # row 2: headers
        writer.writerows(formattedList)     # remaining rows: data