-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_parsing.py
95 lines (79 loc) · 2.11 KB
/
data_parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# %%
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import warnings
import ast
import re
warnings.filterwarnings('ignore')
'''
INSPECT CSVS
'''
# %%
# Ratings table, indexed by its `timestamp` column.
ratings = pd.read_csv('data/ratings.csv', index_col='timestamp')
print(ratings.shape)
ratings.head(5)
# %%
# Partition ratings into batch-layer input and speed-layer input
# (pre-2018 and post-2018). Export lines are kept commented out for reference:
# ratings[ratings.index <= 1514846688].to_csv(path_or_buf='ratings_batch_input.csv')
# ratings[ratings.index > 1514846688].to_csv(path_or_buf='ratings_stream.csv')
# %%
# Movies table, indexed by `movieId`.
movies = pd.read_csv('data/movies.csv', index_col='movieId')
print(movies.shape)
movies.head(5)
# %%
# Movie metadata, indexed by `imdb_id`.
# Shape and columns are printed explicitly: a bare expression that is not the
# last line of a cell is evaluated and silently discarded.
metadata = pd.read_csv('data/movies_metadata.csv', index_col='imdb_id')
print(metadata.shape)
print(metadata.columns)
metadata.head(5)
# %%
# Credits table, indexed by `id`.
credits = pd.read_csv('data/credits.csv', index_col='id')
print(credits.shape)
credits.head(5)
# %%
# Links table, indexed by `imdbId`.
links = pd.read_csv('data/links.csv', index_col='imdbId')
print(links.shape)
links.head(5)
'''
DATA PARSING
update "movies" csv to indicate each genre type as a one-hot-encoded boolean column
also add "year" column for future table joins
'''
# %%
# Find the total set of genres: every movie's `genres` field is a
# '|'-separated string, so split each one and pool the pieces into one set.
gset = {
    label
    for genre_field in movies['genres']
    for label in genre_field.split('|')
}
sorted(gset)
# %%
# Add one boolean column per genre, initialised to False for every movie.
# The '(no genres listed)' placeholder gets no column of its own; movies
# carrying it simply keep False in every genre column.
# `discard` is used instead of `remove` so the cell does not raise KeyError
# when the placeholder is absent (e.g. the cell is re-run on the same set).
gset.discard('(no genres listed)')
# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter runs, which would otherwise shuffle the column order.
for g in sorted(gset):
    movies[g] = False
# %%
# Second pass over the movies: set the matching genre column to True for
# every genre a movie is tagged with, skipping the "no genres" placeholder.
for movie_id, genre_field in movies['genres'].items():
    for label in genre_field.split('|'):
        if label == '(no genres listed)':
            continue
        movies.at[movie_id, label] = True
# %%
# Extract the release year from the title into a string column `year`.
# The year is read from a four-digit "(YYYY)" group at the very end of the
# title (trailing whitespace is tolerated). The anchor matters: an unanchored
# search returns the first parenthesised number, so a title such as
# "(500) Days of Summer (2009)" would be given the year "500".
# Titles without such a suffix (or missing titles) get the placeholder 'na'.
year_pattern = r'\((\d{4})\)\s*$'
movies['year'] = (
    movies['title']
    .str.extract(year_pattern, expand=False)
    .fillna('na')
)
# %% count movies without a year (they are filtered out in the next cell)
movies.loc[movies['year'] == 'na'].shape
# %%
# Keep only the movies whose year was found, and drop the raw '|'-separated
# `genres` string column from the result.
has_year = movies['year'] != 'na'
movie_genres = movies.loc[has_year].drop(columns='genres')
movie_genres.head()
# movie_genres.to_csv(path_or_buf='movie_genres.csv')
# %%
# Load the genre table from disk, indexed by `movieId`.
# NOTE(review): presumably this is the file produced by the commented-out
# `movie_genres.to_csv(...)` export - confirm it exists before running.
genres = pd.read_csv('movie_genres.csv', index_col='movieId')
print(genres.shape)
genres.head()