-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdataRetrieval.py
143 lines (126 loc) · 3.92 KB
/
dataRetrieval.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
import csv
import requests
import json
import re
from xlrd import open_workbook
import pandas as pd # to merge the omdb data with the tomato data that we pull from the website using GET requests.
# data to extract from the api call:
# 1. tomatoUserRating
# 2. tomatoRating
# 3. tomatoReviews
# 4. tomatoFresh
# 5. tomatoRotten
# 6. tomatoUserMeter
# 7. tomatoUserReviews
# 8. imdbVotes
# 9. Metascore
# all the genres: set([u'Sci-Fi', u'Crime', u'Romance', u'Animation', u'Music', u'Comedy', u'War', u'genres', u'Horror', u'Film-Noir', u'Adventure', u'News', u'Reality-TV', u'Thriller', u'Western', u'Mystery', u'Short', u'Drama', u'Action', u'Documentary', u'Musical', u'History', u'Family', u'Fantasy', u'Game-Show', u'Sport', u'Biography'])
# Placeholder genre separator. The module-level code at the bottom of this file
# reassigns `sep` from getCharacter() before passing it to getGenre()/splitGenre().
sep = '!'
def mergeCSV():
    """Join ``movie_data.csv`` with ``tomatoData.csv`` on ``movie_title``.

    Both files are read from the current working directory. The join is
    pandas' default inner merge, so only titles present in both files are
    kept.

    Returns:
        The merged ``DataFrame``. Earlier versions built it and then dropped
        it, leaving callers with nothing to use; callers that ignore the
        return value are unaffected.
    """
    print("starting to merge")
    a = pd.read_csv("movie_data.csv")
    b = pd.read_csv("tomatoData.csv")
    merged = a.merge(b, on='movie_title')
    print("both files merged successfully")
    return merged
def makeAPIcall(movieTitle):
    """Look up one title on the OMDb API and return the decoded JSON reply.

    Args:
        movieTitle: Title to search for; sent as the ``t`` query parameter
            together with ``tomatoes=true``.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.RequestException: on connection problems, including the
            request exceeding the timeout below.
        ValueError: if the response body is not valid JSON.
    """
    r = requests.get(
        "http://www.omdbapi.com/",
        # Handing the query to requests as a mapping gets the title
        # percent-encoded, so characters such as '&', '#' or '+' no longer
        # corrupt the query string.
        params={"t": movieTitle, "tomatoes": "true"},
        # Without a timeout a stalled connection blocks the whole run.
        timeout=10,
    )
    return json.loads(r.content)
def _buildTomatoRow(movieTitle, keyList):
    """Return the output row for one title, or None when the API reply is empty."""
    jsonResponse = makeAPIcall(movieTitle)
    if not jsonResponse:
        return None
    data = [movieTitle]
    for item in keyList:
        if item in jsonResponse:
            value = jsonResponse[item]
            # format() copes with non-string JSON values (numbers, null),
            # which plain '+' concatenation does not.
            print("The {0} for {1} is {2}".format(item, movieTitle, value))
        else:
            # Keep the column position even when the reply lacks this key,
            # so later values do not slide into the wrong column.
            value = ''
        data.append(value)
    return data


def getData():
    """Fetch the rating figures for every title in ``movie_data.csv``.

    The title is taken from column 11 of each input row, trimmed, and reduced
    to letters, digits, spaces, newlines and dots before being passed to
    ``makeAPIcall``. For every non-empty reply one row is written to
    ``tomatoData.csv``: the cleaned title followed by the values of the nine
    keys in ``keyList``, in that order, with an empty field for any key the
    reply does not contain.

    A failure while handling a single row (short row, network error, bad
    JSON, ...) is reported and that row is skipped; the remaining rows are
    still processed. Both files are closed when the function returns.
    """
    import sys

    keyList = [
        'tomatoUserRating',
        'tomatoRating',
        'tomatoReviews',
        'tomatoFresh',
        'tomatoRotten',
        'tomatoUserMeter',
        'tomatoUserReviews',
        'imdbVotes',
        'Metascore',
    ]
    # Compiled once; raw string so the backslashes reach the regex engine as written.
    titleJunk = re.compile(r'[^a-zA-Z0-9 \n\.]')

    def _openCsv(path, mode):
        # The csv module wants binary files on Python 2 and text files
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            return open(path, mode + 'b')
        return open(path, mode, newline='')

    with _openCsv('tomatoData.csv', 'w') as ofile, \
            _openCsv('movie_data.csv', 'r') as infile:
        writer = csv.writer(ofile, delimiter=',')
        try:
            for row in csv.reader(infile):
                try:
                    movieTitle = titleJunk.sub('', row[11].strip())
                    print("dealing with movie " + movieTitle + "\n")
                    data = _buildTomatoRow(movieTitle, keyList)
                    if data is None:
                        continue
                    print("--------------------")
                    print(data)
                    writer.writerow(data)
                except Exception as error:
                    # Best effort per row: report the problem and move on
                    # instead of silently abandoning every remaining title.
                    print("ENCOUNTERED AN EXCEPTION", error)
        except (csv.Error, UnicodeError) as error:
            # Raised by the reader itself when the input file is malformed.
            print("ENCOUNTERED AN EXCEPTION", error)
    print("closed the file")
def getGenre(sep):
    """Collect the distinct genre names found in ``movie_data.xls``.

    Every sheet of the workbook is scanned; the text in column 26 of each
    data row is split on ``sep`` and the non-empty pieces are added to the
    result.

    Args:
        sep: Separator between genre names inside one cell.

    Returns:
        A set of genre names. If the workbook cannot be opened or read, the
        problem is printed and whatever was collected so far (possibly an
        empty set) is returned, so the result can always be handed to
        ``splitGenre``.
    """
    genres = set()
    try:
        workbook = open_workbook('movie_data.xls')
        for s in workbook.sheets():
            # Start at row 1: row 0 holds the column titles, and scanning it
            # added the header text itself as a bogus genre. splitGenre
            # skips the same row.
            for row in range(1, s.nrows):
                value = s.cell(row, 26).value
                try:
                    array = value.split(sep)
                except AttributeError:
                    # Cell does not hold text (e.g. a number); nothing to split.
                    continue
                # Blank pieces come from empty cells or doubled separators.
                genres.update(item for item in array if item)
    except Exception as e:
        print("exception while splitting the genres", e)
    return genres
def getIndexOfGenre(item, genresList):
    """Return the position of ``item`` inside ``genresList``.

    The lookup is delegated to the sequence's own ``index`` method, so for a
    list a missing ``item`` raises ``ValueError``.
    """
    position = genresList.index(item)
    return position
def splitGenre(sep, genresSet):
    """Write one 0/1 indicator row per movie to ``genres.csv``.

    For every data row (row 0 is skipped) of every sheet in
    ``movie_data.xls`` the genres text in column 26 is split on ``sep``.
    The output row holds one flag per known genre (1 if the movie has it,
    0 otherwise) followed by the value of column 0 of that row.

    Args:
        sep: Separator between genre names inside one cell.
        genresSet: The known genre names; each becomes one output column.

    Flag columns follow the sorted order of ``genresSet``. No header row is
    written, so a stable order is the only way to tell the columns apart;
    plain set iteration order can differ from one run to the next.

    Problems while reading the workbook are printed and end the run early;
    ``genres.csv`` is closed in every case.
    """
    import sys

    genresList = sorted(genresSet)
    # One dictionary lookup per genre instead of a linear list search.
    columnOf = {}
    for position, genre in enumerate(genresList):
        columnOf.setdefault(genre, position)

    def _openCsv(path, mode):
        # The csv module wants binary files on Python 2 and text files
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            return open(path, mode + 'b')
        return open(path, mode, newline='')

    # Starts at 1, presumably so the reported total lines up with sheet.nrows,
    # which also counts the skipped header row.
    counter = 1
    with _openCsv('genres.csv', 'w') as ofile:
        writer = csv.writer(ofile, delimiter=',')
        try:
            workbook = open_workbook('movie_data.xls')
            for sheet in workbook.sheets():
                for row in range(1, sheet.nrows):
                    name = sheet.cell(row, 0).value
                    genres = sheet.cell(row, 26).value
                    print(name, "-->", genres)
                    convertedList = [0] * len(genresList)
                    try:
                        genreListFromRow = genres.split(sep)
                    except AttributeError:
                        # Cell does not hold text; treat it as "no genres"
                        # rather than aborting the whole export.
                        genreListFromRow = []
                    for item in genreListFromRow:
                        index = columnOf.get(item)
                        if index is not None:
                            convertedList[index] = 1
                    convertedList.append(name)
                    print(name, convertedList)
                    writer.writerow(convertedList)
                    counter = counter + 1
                print("total of " + str(counter) + " rows" + "sheet nrows " + str(sheet.nrows))
        except Exception as exception:
            print("encountered an exception while splitting the genre", exception)
def getCharacter():
    """Return the genre separator, read out of a sample genres string.

    Returns:
        The single character ``'|'``.
    """
    # Named `sample` rather than `str` so the builtin is not shadowed.
    sample = "Action|Adventure|Fantasy|Sci-Fi"
    # "Action" is six characters long, so index 6 is the separator after it.
    return sample[6]
# Script body: derive the separator, collect the genre names from the
# workbook, print them, then write the per-movie indicator rows.
# NOTE(review): these statements run on import as well as on direct execution;
# consider moving them under an `if __name__ == "__main__":` guard.
sep = getCharacter()
# Earlier pipeline stages, currently disabled:
# getData()
# mergeCSV()
genres = getGenre(sep)
print(genres)
splitGenre(sep, genres)