I am using the MovieLens dataset for this. There are two data files included in the dataset: movies.csv (movie IDs, titles, and genres) and ratings.csv (user IDs, movie IDs, ratings, and timestamps).
Both files are imported into separate dataframes.
import pandas as pd
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import numpy as np
# Import movies
movies = pd.read_csv('../input/movie-lens-small-latest-dataset/movies.csv')
# Import Ratings
ratings = pd.read_csv('../input/movie-lens-small-latest-dataset/ratings.csv')
Let's preview the dataframes.
movies.head()
 | movieId | title | genres
---|---|---|---
0 | 1 | Toy Story (1995) | Adventure|Animation|Children|Comedy|Fantasy |
1 | 2 | Jumanji (1995) | Adventure|Children|Fantasy |
2 | 3 | Grumpier Old Men (1995) | Comedy|Romance |
3 | 4 | Waiting to Exhale (1995) | Comedy|Drama|Romance |
4 | 5 | Father of the Bride Part II (1995) | Comedy |
ratings.head()
 | userId | movieId | rating | timestamp
---|---|---|---|---
0 | 1 | 1 | 4.0 | 964982703 |
1 | 1 | 3 | 4.0 | 964981247 |
2 | 1 | 6 | 4.0 | 964982224 |
3 | 1 | 47 | 5.0 | 964983815 |
4 | 1 | 50 | 5.0 | 964982931 |
Let's analyze the data first to understand what we are working with.
To start the analysis, let's take a subset of users and check their likings.
We pick two genres from the movie list and filter the dataset to get each user's average rating for those two genres. Let's pick Horror and Thriller.
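The helper functions in this post are not defined inline. Here is a minimal sketch of what `get_genre_ratings` plausibly does, inferred from how it is called below: for each genre, average each user's ratings of movies tagged with that genre.

def get_genre_ratings(ratings, movies, genres, column_names):
    # For each genre, average each user's ratings of movies in that genre
    genre_ratings = pd.DataFrame()
    for genre in genres:
        genre_movies = movies[movies['genres'].str.contains(genre)]
        avg_votes = (ratings[ratings['movieId'].isin(genre_movies['movieId'])]
                     .loc[:, ['userId', 'rating']]
                     .groupby('userId')['rating'].mean().round(2))
        genre_ratings = pd.concat([genre_ratings, avg_votes], axis=1)
    genre_ratings.columns = column_names
    return genre_ratings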
ratings_twogenre = get_genre_ratings(ratings, movies, ['Horror', 'Thriller'], ['avg_horror_rating', 'avg_thriller_rating'])
ratings_twogenre.head()
userId | avg_horror_rating | avg_thriller_rating
---|---|---
1 | 3.47 | 4.15 |
2 | 3.00 | 3.70 |
3 | 4.69 | 4.14 |
4 | 4.25 | 3.55 |
5 | 3.00 | 3.56 |
Now that we have a filtered dataframe showing average ratings for the two genres, let's refine the data a bit more to keep only users who liked one genre more than the other.
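`bias_genre_rating_dataset` is not shown in this post either. A plausible sketch, assuming the first limit bounds the "low" genre average (with a small extra margin on the first genre, which matches the output below) and the second bounds the "high" one:

def bias_genre_rating_dataset(genre_ratings, score_limit_1, score_limit_2):
    # Keep users who rate one genre noticeably lower than the other
    col_1, col_2 = genre_ratings.columns[0], genre_ratings.columns[1]
    biased = genre_ratings[
        ((genre_ratings[col_1] < score_limit_1 - 0.2) & (genre_ratings[col_2] > score_limit_2)) |
        ((genre_ratings[col_2] < score_limit_1) & (genre_ratings[col_1] > score_limit_2))]
    # to_records() materializes the user index into an 'index' column, as seen below
    return pd.DataFrame(biased.rename_axis(None).to_records())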
refined_dataset_twogenre = bias_genre_rating_dataset(ratings_twogenre, 3.5, 2.5)
refined_dataset_twogenre.head()
 | index | avg_horror_rating | avg_thriller_rating
---|---|---|---
0 | 2 | 3.00 | 3.70 |
1 | 5 | 3.00 | 3.56 |
2 | 6 | 3.26 | 3.54 |
3 | 7 | 4.00 | 3.43 |
4 | 9 | 1.80 | 2.55 |
Let's draw a scatter plot to understand the data distribution across users.
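`draw_scatterplot` is a small plotting helper; a minimal sketch:

def draw_scatterplot(x_data, x_label, y_data, y_label):
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.scatter(x_data, y_data, s=30)
    plt.show()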
draw_scatterplot(refined_dataset_twogenre['avg_horror_rating'],'Avg Horror rating', refined_dataset_twogenre['avg_thriller_rating'], 'Avg Thriller rating')
Based on the view above, we can see clear boundaries between users' ratings. Let's try to get clusters from this data using K-Means.
X = refined_dataset_twogenre[['avg_horror_rating','avg_thriller_rating']].values
from sklearn.cluster import KMeans
kmeans_two_genre = KMeans(n_clusters=2, random_state=0)
predictions1 = kmeans_two_genre.fit_predict(X)
Let's plot the clusters to have a better view.
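`draw_clusters` can be sketched the same way, coloring each point by its predicted cluster (the cmap default is my assumption; a later call in the post passes cmap explicitly):

def draw_clusters(biased_dataset, predictions, cmap='viridis'):
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.set_xlabel('Avg Horror rating')
    ax.set_ylabel('Avg Thriller rating')
    ax.scatter(biased_dataset['avg_horror_rating'],
               biased_dataset['avg_thriller_rating'],
               c=predictions, s=20, cmap=cmap)
    plt.show()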
draw_clusters(refined_dataset_twogenre, predictions1)
So here we can see the data being divided into two groups/clusters of users, split by their relative preference for the two genres.
Let's try to break the data into one more cluster.
kmeans_two_genre1 = KMeans(n_clusters=3, random_state=1)
predictions2 = kmeans_two_genre1.fit_predict(X)
Let's see how it looks now.
draw_clusters(refined_dataset_twogenre, predictions2)
Now we see one more group added, so the new clustering splits the users into three preference groups.
Let's add one more cluster and see the effect.
kmeans_two_genre2 = KMeans(n_clusters=4, random_state=1)
predictions3 = kmeans_two_genre2.fit_predict(X)
So how does it look now?
draw_clusters(refined_dataset_twogenre, predictions3)
So we can keep adding clusters and refining the groups in our dataset. The clusters divide the users based on their likings (from the ratings). But what is the optimum value for K, the number of clusters? Let's figure that out.
To get a good value for K, let's perform the above steps for a range of K values and plot a quality score for each. We will use the Elbow Method, with the silhouette score as the metric, to choose the K value.
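`clustering_errors` isn't shown either; since the plot below is labeled with the silhouette score, a reasonable sketch is:

from sklearn.metrics import silhouette_score

def clustering_errors(k, data):
    # Fit K-Means with k clusters and score the result by silhouette
    kmeans = KMeans(n_clusters=k).fit(data)
    predictions = kmeans.predict(data)
    return silhouette_score(data, predictions)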
list_of_k = range(2, len(X), 5)  # silhouette needs at least 2 and at most len(X)-1 clusters
# Calculate the score for each value of K above
errors_list = [clustering_errors(k, X) for k in list_of_k]
Now let's plot the scores to get a visual. Good K values are where the score (Y axis) has relatively high values.
# Plot the each value of K vs. the silhouette score at that value
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_xlabel('Value of K')
ax.set_ylabel('Score (higher is better)')
ax.plot(list_of_k, errors_list)
# Ticks and grid
xticks = np.arange(min(list_of_k), max(list_of_k)+1, 5.0)
ax.set_xticks(xticks, minor=False)
ax.set_xticks(xticks, minor=True)
ax.xaxis.grid(True, which='both')
yticks = np.arange(round(min(errors_list), 2), max(errors_list), .05)
ax.set_yticks(yticks, minor=False)
ax.set_yticks(yticks, minor=True)
ax.yaxis.grid(True, which='both')
From the graph, possible K values include 12, 32, and 57, among others. After these the score dips considerably, so we won't go further.
Let's pick K as 12 and perform the predictions.
kmeans_two_genre3 = KMeans(n_clusters=12, random_state=6)
# Cluster the dataset into 12 groups
predictions4 = kmeans_two_genre3.fit_predict(X)
Let's see how it looks now.
draw_clusters(refined_dataset_twogenre, predictions4, cmap='Accent')
Let's add Fantasy as another genre and perform an analysis similar to the one above.
refined_dataset_3genre = get_genre_ratings(ratings, movies,
['Horror', 'Thriller', 'Fantasy'],
['avg_horror_rating', 'avg_thriller_rating', 'avg_fantasy_rating'])
refined_dataset_3genre = bias_genre_rating_dataset(refined_dataset_3genre, 3.5, 2.5).dropna()
refined_dataset_3genre.head()
 | index | avg_horror_rating | avg_thriller_rating | avg_fantasy_rating
---|---|---|---|---
1 | 5 | 3.00 | 3.56 | 4.14 |
2 | 6 | 3.26 | 3.54 | 3.54 |
3 | 7 | 4.00 | 3.43 | 3.07 |
4 | 9 | 1.80 | 2.55 | 5.00 |
5 | 10 | 1.75 | 3.08 | 3.44 |
X_fantasy = refined_dataset_3genre[['avg_horror_rating', 'avg_thriller_rating', 'avg_fantasy_rating']].values
With the new dataset, let's do a prediction using 12 clusters.
kmeans_three_genre1 = KMeans(n_clusters=12)
predictions_1_1 = kmeans_three_genre1.fit_predict(X_fantasy)
Let's see how this looks with 3 genres.
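`draw_clusters_3d` presumably extends the 2-D plot with a third axis; a minimal sketch using matplotlib's 3-D projection (matplotlib ≥ 3.2):

def draw_clusters_3d(biased_dataset, predictions):
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(projection='3d')
    ax.set_xlabel('Avg Horror rating')
    ax.set_ylabel('Avg Thriller rating')
    ax.set_zlabel('Avg Fantasy rating')
    ax.scatter(biased_dataset['avg_horror_rating'],
               biased_dataset['avg_thriller_rating'],
               biased_dataset['avg_fantasy_rating'],
               c=predictions, s=20)
    plt.show()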
draw_clusters_3d(refined_dataset_3genre, predictions_1_1)
Now we can see how the clusters have changed. As the input data grows, the clusters become more refined. We won't add any more genres for now.
Now that we have seen how we can cluster based on genres, let's change our approach and build the clusters based on users' ratings of the movies.
First we transform the input data so that it's easier to view and analyze the ratings across users and movies. We will build a pivot table showing users and their ratings for each movie.
titles_df = pd.merge(ratings, movies[['movieId', 'title']], on='movieId' )
ratings_users = pd.pivot_table(titles_df, index='userId', columns= 'title', values='rating')
ratings_users.iloc[:5]
userId | '71 (2014) | 'Hellboy': The Seeds of Creation (2004) | 'Round Midnight (1986) | 'Salem's Lot (2004) | 'Til There Was You (1997) | 'Tis the Season for Love (2015) | 'burbs, The (1989) | 'night Mother (1986) | (500) Days of Summer (2009) | *batteries not included (1987) | ... | Zulu (2013) | [REC] (2007) | [REC]² (2009) | [REC]³ 3 Génesis (2012) | anohana: The Flower We Saw That Day - The Movie (2013) | eXistenZ (1999) | xXx (2002) | xXx: State of the Union (2005) | ¡Three Amigos! (1986) | À nous la liberté (Freedom for Us) (1931)
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 4.0 | NaN |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 9719 columns
We can see that the majority of the ratings are NaN, which is understandable because not all users have rated all movies. So let's sort the data to bring the most-rated movies (and the most active users) to the front.
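`sort_by_rating_density` and the `get_most_rated_movies` helper used a bit later are also not defined in the post. A plausible sketch that orders movies by rating count and users by how many of those movies they rated (the `get_users_who_rate_the_most` name is mine, internal to this sketch):

def get_most_rated_movies(user_movie_ratings, max_number_of_movies):
    # Order movie columns by how many users rated them, keep the top N
    counts = user_movie_ratings.count().sort_values(ascending=False)
    return user_movie_ratings[counts.index[:max_number_of_movies]]

def get_users_who_rate_the_most(most_rated_movies, max_number_of_users):
    # Order user rows by how many of these movies they rated, keep the top N
    counts = most_rated_movies.count(axis=1).sort_values(ascending=False)
    return most_rated_movies.loc[counts.index[:max_number_of_users]]

def sort_by_rating_density(user_movie_ratings, n_movies, n_users):
    most_rated = get_most_rated_movies(user_movie_ratings, n_movies)
    return get_users_who_rate_the_most(most_rated, n_users)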
# shrinking the dataset for better visualization
num_movies = 30
num_users = 18
most_rated_sorted = sort_by_rating_density(ratings_users, num_movies, num_users)
most_rated_sorted.head()
userId | Forrest Gump (1994) | Shawshank Redemption, The (1994) | Pulp Fiction (1994) | Silence of the Lambs, The (1991) | Matrix, The (1999) | Star Wars: Episode IV - A New Hope (1977) | Jurassic Park (1993) | Braveheart (1995) | Terminator 2: Judgment Day (1991) | Schindler's List (1993) | ... | Star Wars: Episode VI - Return of the Jedi (1983) | Godfather, The (1972) | Fugitive, The (1993) | Batman (1989) | Saving Private Ryan (1998) | Lord of the Rings: The Two Towers, The (2002) | Lord of the Rings: The Return of the King, The (2003) | Aladdin (1992) | Fargo (1996) | Sixth Sense, The (1999)
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
413 | 5.0 | 5.0 | 5.0 | 4.0 | 5.0 | 5.0 | 4.0 | 5.0 | 5.0 | 4.0 | ... | 5.0 | 5.0 | 5.0 | 4.0 | 5.0 | 5.0 | 4.0 | 4.0 | 5.0 | 3.0 |
589 | 5.0 | 4.5 | 4.5 | 3.5 | 4.0 | 5.0 | 4.0 | 4.0 | 4.5 | 5.0 | ... | 4.5 | 5.0 | 4.0 | 3.5 | 4.0 | 5.0 | 4.5 | 4.0 | 4.0 | 3.5 |
473 | 3.0 | 5.0 | 4.0 | 4.5 | 4.5 | 4.0 | 4.5 | 3.0 | 4.0 | 5.0 | ... | 4.0 | 5.0 | 5.0 | 4.0 | 3.0 | 5.0 | 5.0 | 4.0 | 4.0 | 5.0 |
479 | 5.0 | 5.0 | 4.0 | 4.5 | 5.0 | 4.5 | 5.0 | 5.0 | 4.5 | 5.0 | ... | 3.5 | 5.0 | 3.5 | 4.5 | 4.5 | 4.5 | 4.0 | 4.0 | 4.0 | 4.0 |
67 | 3.5 | 3.0 | 2.0 | 3.5 | 4.5 | 5.0 | 3.5 | 2.5 | 3.5 | 4.0 | ... | 5.0 | 4.0 | 4.5 | 4.0 | 4.0 | 4.0 | 4.5 | 3.5 | 2.5 | 2.5 |
5 rows × 30 columns
Now we have a good view of the ratings. There are still some NaNs, but we can manage. Let's visualize this on a heatmap to identify the rating clusters.
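`draw_movies_heatmap` can be sketched with imshow, where NaN cells render blank (the axis_labels flag matches a call later in the post):

def draw_movies_heatmap(most_rated_movies, axis_labels=True):
    fig, ax = plt.subplots(figsize=(15, 4))
    heatmap = ax.imshow(most_rated_movies, interpolation='nearest', vmin=0, vmax=5, aspect='auto')
    if axis_labels:
        ax.set_xticks(np.arange(most_rated_movies.shape[1]))
        ax.set_xticklabels(most_rated_movies.columns, rotation=90, fontsize=8)
        ax.set_yticks(np.arange(most_rated_movies.shape[0]))
        ax.set_yticklabels(most_rated_movies.index, fontsize=8)
    else:
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    fig.colorbar(heatmap, ax=ax, label='Rating')
    plt.show()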
draw_movies_heatmap(most_rated_sorted)
This gives a visual of how users rated the movies. The white cells are where a user didn't rate that movie. We handle this next.
For the next steps, to keep performance reasonable for this post, let me filter the data and work with a smaller dataset. In the actual API that I will be deploying for the Recommender app, I will use the full dataset.
ratings_df_subset = pd.pivot_table(titles_df, index='userId', columns= 'title', values='rating')
filtered_most_rated = get_most_rated_movies(ratings_df_subset, 2000)
But the problem remains that there are NaN values in the dataset. To get around this, I will convert the dataset to a sparse CSR matrix.
# Replace all nulls with 0 so the matrix can be stored sparsely
tmpmovies = filtered_most_rated.copy()
tmpmovies = tmpmovies.fillna(0)
# Convert each column to a pandas SparseArray and assemble a sparse dataframe
dtcols = filtered_most_rated.columns
tmpdict = {}
for v in dtcols:
    tmpdict[v] = pd.arrays.SparseArray(tmpmovies[v])
sparseFrame = pd.DataFrame(tmpdict)
# Finally, convert the sparse dataframe to a SciPy CSR matrix for scikit-learn
sparse_ratings = csr_matrix(sparseFrame)
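As a quick sanity check (the exact numbers depend on the subset), we can confirm the CSR matrix only stores the non-zero ratings:

print('shape:', sparse_ratings.shape)
print('stored (non-zero) ratings:', sparse_ratings.nnz)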
Now that we have the sparse matrix for the ratings, let's perform some predictions.
We will identify clusters based on the sparse ratings matrix. As before, let's first score a range of K values to pick one.
new_k_values = range(2, 100+1, 5)
sparse_errors_k = [clustering_errors(k, sparse_ratings) for k in new_k_values]
fig, ax = plt.subplots(figsize=(16, 6))
ax.set_xlabel('number of clusters')
ax.set_ylabel('Score (higher is better)')
ax.plot(new_k_values, sparse_errors_k)
xticks = np.arange(min(new_k_values), max(new_k_values)+1, 5.0)
ax.set_xticks(xticks, minor=False)
ax.set_xticks(xticks, minor=True)
ax.xaxis.grid(True, which='both')
yticks = np.arange(round(min(sparse_errors_k), 2), max(sparse_errors_k), .05)
ax.set_yticks(yticks, minor=False)
ax.set_yticks(yticks, minor=True)
ax.yaxis.grid(True, which='both')
I know the plot above is a bit inconclusive about which value to take for K. Let's select one of 2, 7, 12, or 17.
Let's take K as 12.
# algorithm='full' is plain Lloyd's K-Means, which supports sparse input (renamed 'lloyd' in newer scikit-learn)
predictions_sparse_1 = KMeans(n_clusters=12, algorithm='full').fit_predict(sparse_ratings)
Let's visualize some of the clusters from above.
predict_cluster = pd.concat([filtered_most_rated.reset_index(), pd.DataFrame({'group':predictions_sparse_1})], axis=1)
predict_cluster.head()
 | index | Forrest Gump (1994) | Shawshank Redemption, The (1994) | Pulp Fiction (1994) | Silence of the Lambs, The (1991) | Matrix, The (1999) | Star Wars: Episode IV - A New Hope (1977) | Jurassic Park (1993) | Braveheart (1995) | Terminator 2: Judgment Day (1991) | ... | Rush (2013) | Badlands (1973) | Thinner (1996) | Nine to Five (a.k.a. 9 to 5) (1980) | Horrible Bosses 2 (2014) | Fall, The (2006) | Saturday Night Fever (1977) | Hedwig and the Angry Inch (2000) | All Dogs Go to Heaven 2 (1996) | group
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 4.0 | NaN | 3.0 | 4.0 | 5.0 | 5.0 | 4.0 | 4.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
1 | 1 | NaN | 3.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 |
2 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 |
3 | 3 | NaN | NaN | 1.0 | 5.0 | 1.0 | 5.0 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 |
4 | 4 | NaN | 3.0 | 5.0 | NaN | NaN | NaN | NaN | 4.0 | 3.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 5 |
5 rows × 2002 columns
The group column shows which cluster the user belongs to based on their ratings. Now that we have the clustered dataset, let's see what kind of predictions we can make from it.
Let's pick a cluster group to analyze.
cluster_id = 5
newnum_users = 70
newnum_movies = 300
cluster_1 = predict_cluster[predict_cluster.group == cluster_id].drop(['index', 'group'], axis=1)
cluster_1 = sort_by_rating_density(cluster_1, newnum_movies, newnum_users)
draw_movies_heatmap(cluster_1, axis_labels=False)
This is how the ratings table looks for the cluster:
cluster_1.fillna('').head()
 | Batman (1989) | Pulp Fiction (1994) | Apollo 13 (1995) | True Lies (1994) | Dances with Wolves (1990) | Braveheart (1995) | Fugitive, The (1993) | Forrest Gump (1994) | Batman Forever (1995) | Shawshank Redemption, The (1994) | ... | Perfect World, A (1993) | Cable Guy, The (1996) | City of Lost Children, The (Cité des enfants perdus, La) (1995) | Mulholland Falls (1996) | Space Jam (1996) | Crow: City of Angels, The (1996) | Ghost and the Darkness, The (1996) | Spy Hard (1996) | All Dogs Go to Heaven 2 (1996) | Groundhog Day (1993)
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
1 | 3 | 2 | 4 | 4 | 5 | 5 | 5 | 5 | 3 | 5 | ... | 3 | 5 | 4 | |||||||
23 | 3 | 4 | 4 | 4 | 4 | 5 | 4 | 4 | 3 | 5 | ... | ||||||||||
83 | 3 | 3 | 3 | 3 | 4 | 3 | 4 | 3 | 5 | ... | |||||||||||
62 | 3 | 5 | 5 | 4 | 5 | 4 | 4 | 5 | 4 | ... | |||||||||||
43 | 3 | 2 | 4 | 3 | 4 | 5 | 3 | 5 | 3 | 3 | ... | 5 |
5 rows × 300 columns
The blank cells are where users didn't rate that specific movie. We can take the average of the ratings from the other users in the same cluster to predict the rating for a blank cell. Let me demonstrate. First, let me pick a movie.
picked_movie = 'Braveheart (1995)'
For any user in this cluster whose cell is blank for this movie, the predicted rating will be:
cluster_1[picked_movie].mean()
4.339285714285714
Now that we have identified clusters of users based on the similar ratings they gave to the movies, we can use the cluster information to recommend movies to other users in the same cluster. Looking at the mean rating of each movie within a specific cluster tells us that cluster's taste.
cluster_1.mean().head(10)
Batman (1989)                       3.308333
Pulp Fiction (1994)                 3.931034
Apollo 13 (1995)                    3.929825
True Lies (1994)                    3.584746
Dances with Wolves (1990)           3.870370
Braveheart (1995)                   4.339286
Fugitive, The (1993)                4.189655
Forrest Gump (1994)                 4.372727
Batman Forever (1995)               3.120000
Shawshank Redemption, The (1994)    4.415094
dtype: float64
Now we can recommend movies to a specific user in a specific cluster. The method of recommending the movies will be:
1. Pick a user and find the movies they have not rated yet.
2. For those movies, look up the mean rating given by the cluster.
3. Recommend the unrated movies with the highest mean ratings.
Let's see how it works.
cluster_1.fillna('').head()
 | Batman (1989) | Pulp Fiction (1994) | Apollo 13 (1995) | True Lies (1994) | Dances with Wolves (1990) | Braveheart (1995) | Fugitive, The (1993) | Forrest Gump (1994) | Batman Forever (1995) | Shawshank Redemption, The (1994) | ... | Perfect World, A (1993) | Cable Guy, The (1996) | City of Lost Children, The (Cité des enfants perdus, La) (1995) | Mulholland Falls (1996) | Space Jam (1996) | Crow: City of Angels, The (1996) | Ghost and the Darkness, The (1996) | Spy Hard (1996) | All Dogs Go to Heaven 2 (1996) | Groundhog Day (1993)
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
1 | 3 | 2 | 4 | 4 | 5 | 5 | 5 | 5 | 3 | 5 | ... | 3 | 5 | 4 | |||||||
23 | 3 | 4 | 4 | 4 | 4 | 5 | 4 | 4 | 3 | 5 | ... | ||||||||||
83 | 3 | 3 | 3 | 3 | 4 | 3 | 4 | 3 | 5 | ... | |||||||||||
62 | 3 | 5 | 5 | 4 | 5 | 4 | 4 | 5 | 4 | ... | |||||||||||
43 | 3 | 2 | 4 | 3 | 4 | 5 | 3 | 5 | 3 | 3 | ... | 5 |
5 rows × 300 columns
Let's pick user id 83 and recommend movies to that user.
user_id = 83
# The user's row in the cluster; NaN marks the movies they haven't rated
sel_user_ratings = cluster_1.loc[user_id, :]
# Keep only the unrated movies
all_unrated = sel_user_ratings[sel_user_ratings.isnull()]
# Align the cluster's mean ratings with those movies (column 0 holds the means)
mean_ratings = pd.concat([all_unrated, cluster_1.mean()], axis=1, join='inner').loc[:, 0]
# The highest-rated unseen movies become the recommendations
mean_ratings.sort_values(ascending=False)[:5]
Three Colors: Blue (Trois couleurs: Bleu) (1993)           5.000000
Wallace & Gromit: The Best of Aardman Animation (1996)     5.000000
Saving Private Ryan (1998)                                 4.750000
Kids (1995)                                                4.750000
Three Colors: White (Trzy kolory: Bialy) (1994)            4.666667
Name: 0, dtype: float64
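Putting the steps above together, here is a small helper that recommends the top-N unseen movies for any user in a cluster (the `recommend_movies` name is mine, not from the original post):

def recommend_movies(cluster, user_id, n=5):
    # Movies this user has not rated yet
    user_ratings = cluster.loc[user_id, :]
    unrated = user_ratings[user_ratings.isnull()]
    # Fill the gaps with the cluster's average rating and take the top N
    cluster_means = cluster.mean()[unrated.index]
    return cluster_means.sort_values(ascending=False)[:n]

recommend_movies(cluster_1, 83)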