-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrottenTomatoesAPI.py
More file actions
140 lines (111 loc) · 4.96 KB
/
rottenTomatoesAPI.py
File metadata and controls
140 lines (111 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
from io import StringIO
from StringIO import StringIO
import sys
import re
import string
import json
import requests
from rottentomatoes import RT
import time
#----------------------------------------------------------------------------------------------
# Looks up movie reviews from RT id
# - Input: TR id and api key
# - Output: prints up to 100 reviews for the movie.
#----------------------------------------------------------------------------------------------
def getAllReviews(api_key, movie_id):
url = 'http://api.rottentomatoes.com/api/public/v1.0/movies/%s/reviews.json' % movie_id
#these are the "get parameters" for the first 50 reviews (ie. pages 1-50)
options = {'review_type': 'all','page_limit': 50, 'page': 1, 'apikey': api_key}
data = requests.get(url, params=options).text
data = json.loads(data) # load a json string into a collection of lists and dicts
if 'reviews' in data:
# print the reviews
reviews = data['reviews']
for record in reviews:
record["movieID"] = movie_id
print json.dumps(record, indent=2)
if len(reviews) >= 50:
#these are the "get parameters" for the second 50 reviews (ie. pages 50-100)
options = {'review_type': 'all','page_limit': 50, 'page': 50, 'apikey': api_key}
data = requests.get(url, params=options).text
data = json.loads(data) # load a json string into a collection of lists and dicts
# print the reviews
reviews = data['reviews']
for record in reviews:
record["movieID"] = movie_id
print json.dumps(record, indent=2)
else:
f1=open('./moviesNotFound.txt', 'a')
temp = 'No reviews: ' + str(movie_id) + '\n'
f1.write(temp)
#----------------------------------------------------------------------------------------------
# Create an array of IMBD movie ids from the file passes in
#----------------------------------------------------------------------------------------------
def createMovieArray(movies):
# create an array to store the imbd ids
movieIDs = []
# get each of the lines in the input file
for x in movies:
movies = re.split('\r',x)
# for each of the lines get the id
for movie in movies:
record = re.split('\t',movie)
movieID = record[2]
# since the ids range from 5-7 digits add 0s infront to make them all 7 digits
if len(movieID) == 5:
movieID = '00' + movieID
elif len(movieID) == 6:
movieID = '0' + movieID
# add movie imbd id to the list
movieIDs.append(movieID)
# return the list of imbd ids
return movieIDs
#----------------------------------------------------------------------------------------------
# Loosely based on code from http://nbviewer.ipython.org/github/xbsd/content/blob/master/HW3_solutions.ipynb
# - Queries the RT movie_alias API. Returns the RT id associated with an IMDB ID
# - When RT id is returned it prints it to imbd-rottenTomato.txt
# - Makes a call to get reviews for the RT id
# ----------------------------------------------------------------------------------------------
def rt_id_by_imdb(imdbIDs, api_key):
url = 'http://api.rottentomatoes.com/api/public/v1.0/movie_alias.json'
# create new array to store ids
rtID = []
# for each of the IMBD ids look up RT id & get reviews
for movieID in imdbIDs:
# added a .2 second timeout so that we don't exceed the max 5 calls per second
time.sleep(.2)
# these are the "get parameters"
# http://api.rottentomatoes.com/api/public/v1.0/movie_alias.json
params = dict(id=movieID, type='imdb', apikey=api_key)
r = requests.get(url, params=params).text
r = json.loads(r)
if 'id' in r:
# add RT id to the array
rtID.append(r['id'])
# write imbd & RT id to logging file
f1=open('./imbd-rottenTomato.txt', 'a')
temp = movieID + ',' + str(r['id']) + '\n'
f1.write(temp)
# get reviews for the RT id
getAllReviews(api_key, r['id'])
elif r['error'] == 'Could not find a movie with the specified id':
f1=open('./moviesNotFound.txt', 'a')
f1.write(movieID + '\n')
else:
print r
return
return rtID
#----------------------------------------------------------------------------------------------
# main
#----------------------------------------------------------------------------------------------
def main():
# pass in api key as argument 1
api_key = sys.argv[1]
#pass in file with IMDB id as argument 2 (ie. lens dataset)
lensData = open(sys.argv[2])
# creates a list of IMBD ids from the lens data
movieIDs = createMovieArray(lensData)
# looks up the rotten tomatoes id for each of the IMBD ids & prints reviews
rt_id_by_imdb(movieIDs, api_key)
if __name__ == '__main__':
main()