-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfetch.py
More file actions
129 lines (114 loc) · 5.47 KB
/
fetch.py
File metadata and controls
129 lines (114 loc) · 5.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import requests
import csv
from timeit import default_timer as timer
from ordered_map import OrderedMap
from unordered_map import unordered_map
class Article:
    """A single New York Times article record.

    Plain data holder: every constructor argument is stored as an
    attribute.  Note that the ``keywords`` argument is deliberately kept
    under the singular attribute name ``keyword`` — that is the name the
    rest of this file reads (e.g. the CSV writer).
    """

    def __init__(self, title="", year="", month="", url="", keywords=""):
        self.url = url
        self.month = month
        self.year = year
        self.title = title
        self.keyword = keywords
def writeArticlesToRawCSV(array, filename="nyt_data.csv"):  # never used again since we only needed to do it once
    """Write a list of Article-like objects to a raw CSV file.

    Each object in ``array`` must expose ``title``, ``year``, ``month``,
    ``url`` and ``keyword`` (an iterable of strings, flattened into one
    cell with ", ").

    array:    iterable of article objects.
    filename: output path; the default preserves the original
              hard-coded "nyt_data.csv" so existing callers are unchanged.
    """
    col_headers = ["Title", "Year", "Month", "Url", "Keyword"]
    with open(filename, 'w', encoding='utf-8', newline='') as csvfile:
        # creating a csv writer object
        csvwriter = csv.writer(csvfile)
        # writing the header row first
        csvwriter.writerow(col_headers)
        for article in array:
            # one data row per article; keyword list joined into a single cell
            row = [article.title, article.year, article.month,
                   article.url, ", ".join(article.keyword)]
            csvwriter.writerow(row)
# def getArticlesFromAPI(array, startYear, endYear): # never used again since we only needed to do it once
# # api_key use here
#
# # iterating through entire new york times archive api
# for i in range(startYear, endYear):
# for j in range(1, 13):
# year = i
# month = j
#
# #url = f'https://api.nytimes.com/svc/archive/v1/{year}/{month}.json?api-key={api_key}'
#
# # API call
# response = requests.get(url)
#
# # checking success code
# if response.status_code == 200:
# # parsing json file
# data = response.json()
#
# # checking for response and docs inside the data
# if 'response' in data and 'docs' in data['response']:
#
# for article_data in data['response']['docs']:
# # checking for publish dates aka checking if the month of articles is empty
# if 'pub_date' not in article_data:
# continue
#
# # titles, url, time
# title = article_data['headline']['main']
# url = article_data['web_url']
# # time structure is "year-month-day-hours:minutes:seconds+0000"
# # ex: 2019-01-01T05:00:00+0000"
# # T is just a string to represent time
#
# time = article_data['pub_date']
# year_pub = time[0:4]
# month_pub = time[6:7]
#
# # extracting keywords
# keywords = [article_data.get('keywords', [])[0]['value']] if article_data.get(
# 'keywords') else []
# if not keywords:
# continue
#
# array.append(Article(title, year_pub, month_pub, url, keywords))
#
# # for article_info in array:
# # print("Title:", article_info.title)
# # print("URL:", article_info.url)
# # print("Date Published:", article_info.year, "-", article_info.month)
# # print("Keywords:", ", ".join(article_info.keyword))
# # print("\n" + "=" * 50 + "\n")
# else:
# print("Error: 'response' or 'docs' keys not found in the API response.")
# else:
# # Print an error message if the request was not successful
# print(f"Error: {response.status_code}, {response.text}")
#
# return array
def getArticlesFromMapsAndInsertToCSV(keyword, startYear, endYear, unorderedMap,
                                      orderedMap, filePath='formatted_nyt_data.csv'):
    """Time a keyword lookup in both maps and write per-year usage to a CSV.

    Output layout:
      row 1 -> the two lookup times [unordered, ordered], rounded to 11 dp
      row 2 -> column headers ["Year", "Usage"]
      rows 3+ -> one [year, count] row per year with articles inside
                 the inclusive [startYear, endYear] range

    keyword:       keyword looked up in both maps (must exist in each).
    startYear:     first year (int) counted, inclusive.
    endYear:       last year (int) counted, inclusive.
    unorderedMap:  keyword -> list of Article-like objects; timed only.
    orderedMap:    keyword -> list of Article-like objects; timed and used.
    filePath:      output CSV path; default preserves the original
                   hard-coded name so existing callers are unchanged.
    """
    # Time the unordered-map lookup; the result is discarded because we
    # only need one copy of the data and the ordered map supplies it below.
    startTimeUnordered = timer()
    unorderedMap[keyword]
    endTimeUnordered = timer()
    UnorderedElapsed = endTimeUnordered - startTimeUnordered

    # Time the ordered-map lookup and keep its result for counting.
    startTimeOrdered = timer()
    dataList = orderedMap[keyword]
    endTimeOrdered = timer()
    OrderedElapsed = endTimeOrdered - startTimeOrdered

    # Count keyword usages per year, keeping only years inside the range.
    usageMap = {}  # year (as stored on the article) -> usage count
    for article in dataList:
        year = article.year
        if startYear <= int(year) <= endYear:
            usageMap[year] = usageMap.get(year, 0) + 1

    # Flatten the map into [year, count] rows for the CSV writer.
    formattedList = [[year, count] for year, count in usageMap.items()]

    formatColumns = ["Year", "Usage"]
    formattedRunTimes = [round(UnorderedElapsed, 11), round(OrderedElapsed, 11)]
    with open(filePath, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(formattedRunTimes)  # row 1: lookup timings
        writer.writerow(formatColumns)      # row 2: headers
        writer.writerows(formattedList)     # remaining rows: data