from surprise import Dataset, Reader, NMF, SVD
from surprise.model_selection import train_test_split
import pandas as pd

# Load the MovieLens 10M dataset from the specified file paths.
# The .dat files use the multi-character separator '::', which is only
# supported by pandas' pure-Python parser engine.
ratings_df = pd.read_csv(
    'new-ml10m/ml-10M100K/ratings.dat',
    sep='::',
    header=None,
    names=['userId', 'movieId', 'rating', 'timestamp'],
    engine='python',
)
movies_df = pd.read_csv(
    'new-ml10m/ml-10M100K/movies.dat',
    sep='::',
    header=None,
    names=['movieId', 'title', 'genres'],
    engine='python',
)

# Step 1: Split the ratings by movie popularity.
# value_counts() sorts by descending frequency, so head(200) yields the 200
# most-rated movies.
movie_counts = ratings_df['movieId'].value_counts()
top_200_movies = movie_counts.head(200).index.tolist()

# Compute the membership mask once and reuse it for both halves of the split.
is_top_200 = ratings_df['movieId'].isin(top_200_movies)
top200_df = ratings_df[is_top_200]        # ratings of the 200 most-rated movies
remaining_data = ratings_df[~is_top_200]  # ratings of every other movie

# Define a Reader object to parse the data.
# MovieLens 10M ratings use half-star increments, so the lowest possible rating
# is 0.5 rather than 1; Surprise uses this scale when clipping predictions.
reader = Reader(rating_scale=(0.5, 5))

# Load the data into Surprise Datasets:
#   train_data -> ratings of the non-top-200 movies (used for the global model)
#   test_data  -> ratings of the top-200 movies
train_data = Dataset.load_from_df(remaining_data[['userId', 'movieId', 'rating']], reader)
test_data = Dataset.load_from_df(top200_df[['userId', 'movieId', 'rating']], reader)

# train_test_split returns (Trainset, list_of_test_ratings). Only the first
# element is kept in both calls, so BOTH names below are Surprise Trainset
# objects built from a random 99% of the corresponding ratings; the remaining
# 1% is discarded.
# NOTE(review): despite its name, `testset` is a Trainset, not a list of test
# ratings. If the 1% hold-out is not needed, build_full_trainset() would keep
# all ratings.
trainset, _ = train_test_split(train_data, test_size=0.01)
testset, _ = train_test_split(test_data, test_size=0.01)

print(testset)
print(trainset)

# Step 2: Create an SVD model with a 5-dimensional embedding
model = SVD(n_factors=5, biased=True, verbose=True)  # Set the number of factors to 5

# Train the model on the training data (ratings of the non-top-200 movies)
model.fit(trainset)

# Get user and item (movie) embeddings.
# Row i of each matrix belongs to Surprise *inner* id i, which is NOT the raw
# MovieLens id; translate with trainset.to_inner_uid / trainset.to_inner_iid.
user_embeddings = model.pu  # User embeddings
item_embeddings = model.qi  # Item embeddings

# Print the shape of user and movie embeddings
print("Shape of user embeddings:", user_embeddings.shape)
print("Shape of movie embeddings:", item_embeddings.shape)

# Example: Get the embedding for a specific user (e.g., user with raw ID 1).
# to_inner_uid raises ValueError when the user has no rating in the trainset;
# in that case no embedding exists and None is stored instead.
user_id = 1
try:
    user_embedding = user_embeddings[trainset.to_inner_uid(user_id)]
except ValueError:
    user_embedding = None

# Example: Get the embedding for a specific movie (e.g., movie with raw ID 1).
# The trainset only contains movies outside the top 200, so a popular movie may
# be absent; to_inner_iid raises ValueError for unknown items.
movie_id = 1
try:
    movie_embedding = item_embeddings[trainset.to_inner_iid(movie_id)]
except ValueError:
    movie_embedding = None

# Now you have 5-dimensional embeddings for the users and movies present in the
# trainset, i.e. learned from the ratings of movies outside the top 200.

print(user_embeddings.shape)
print(item_embeddings.shape)

from sklearn.cluster import KMeans
import numpy as np

# Cluster the users by their global SVD embeddings.
# 'user_embeddings' is the (number_of_users, embedding_dimension) matrix taken
# from the fitted SVD model; row i belongs to Surprise inner user id i.

# Number of clusters (groups) to form
num_clusters = 1000

# Create a K-Means clustering model (fixed seed for reproducible clusters)
kmeans = KMeans(n_clusters=num_clusters, random_state=0)

# Fit the model to user embeddings
user_clusters = kmeans.fit_predict(user_embeddings)

# 'user_clusters[i]' is the cluster label of the user with inner id i.
# K-Means does not balance cluster sizes, so clusters may differ widely in the
# number of users they contain.

# Print the cluster label of each user. The position in 'user_clusters' is an
# inner id, so it is translated back to the raw MovieLens user id for display.
for inner_uid, cluster_label in enumerate(user_clusters):
    user_id = trainset.to_raw_uid(inner_uid)
    print(f"User ID: {user_id}, Cluster Label: {cluster_label}")

# Per-cluster results, stored as parallel lists: entry k of every list belongs
# to the k-th cluster that was kept. Clusters whose users did not rate every
# top movie are skipped, so k is not the cluster label.
cluster_user_embeddings = []  # NMF user factors, shape (n_cluster_users, 5)
cluster_item_embeddings = []  # NMF item factors, shape (n_top_movies, 5)
cluster_ratings = []          # dense ratings, shape (n_cluster_users, n_top_movies)
cluster_ratings_mask = []     # True where cluster_ratings holds an observed rating

n_top_movies = len(top_200_movies)

# 'user_clusters' is indexed by Surprise inner user id (row order of the global
# embeddings), while top200_df['userId'] holds raw MovieLens ids. Build the
# inner -> raw translation once so each cluster can select the right users.
inner_to_raw_user_id = np.array(
    [trainset.to_raw_uid(inner_uid) for inner_uid in range(trainset.n_users)]
)

# Loop through each cluster
for cluster_label in range(num_clusters):
    print(cluster_label)
    # Inner ids of the users in the current cluster, and their raw ids
    cluster_users_indices = np.where(user_clusters == cluster_label)[0]
    cluster_raw_user_ids = inner_to_raw_user_id[cluster_users_indices]

    # Top-movie ratings given by the users of the current cluster
    cluster_user_movie_ratings = top200_df[top200_df['userId'].isin(cluster_raw_user_ids)]
    if cluster_user_movie_ratings.empty:
        # None of the cluster's users rated a top movie; nothing to factorise.
        continue

    # Create a Surprise trainset for the filtered ratings
    cluster_data = Dataset.load_from_df(cluster_user_movie_ratings[['userId', 'movieId', 'rating']], reader)
    cluster_trainset = cluster_data.build_full_trainset()

    # Keep only clusters that cover every top movie, so all stored item
    # embeddings share the same shape. Checking before fitting avoids training
    # a model that would be thrown away.
    if cluster_trainset.n_items != n_top_movies:
        continue

    # Train an NMF model on the filtered ratings
    cluster_model = NMF(n_factors=5, biased=False)
    cluster_model.fit(cluster_trainset)

    # Dense rating matrix and observation mask, indexed by the cluster
    # trainset's inner ids. Rows of cluster_model.pu and cluster_model.qi use
    # the same inner ids, so (pu @ qi.T)[u, i] lines up with
    # user_ratings_array[u, i].
    user_ratings_array = np.zeros((cluster_trainset.n_users, n_top_movies))
    mask = np.zeros((cluster_trainset.n_users, n_top_movies), dtype=bool)
    for inner_uid, inner_iid, rating in cluster_trainset.all_ratings():
        user_ratings_array[inner_uid, inner_iid] = rating
        mask[inner_uid, inner_iid] = True

    cluster_item_embeddings.append(cluster_model.qi)
    cluster_user_embeddings.append(cluster_model.pu)
    cluster_ratings.append(user_ratings_array)
    cluster_ratings_mask.append(mask)

# Sanity check on the first retained cluster: the reconstructed rating matrix
# must have the same shape as the observed one.
if cluster_user_embeddings:
    pred = np.matmul(cluster_user_embeddings[0], cluster_item_embeddings[0].T)
    print(pred.shape)
    print(cluster_ratings[0].shape)
else:
    print("No cluster covers all top movies; nothing to check.")

import pickle

# Persist the per-cluster results to disk with pickle.
# The output file name embeds the number of clusters.
file_name = "test_cluster_data_{}.pkl".format(num_clusters)

# Collect every per-cluster list under a descriptive key
data_to_save = {}
data_to_save["cluster_user_embeddings"] = cluster_user_embeddings
data_to_save["cluster_item_embeddings"] = cluster_item_embeddings
data_to_save["cluster_ratings"] = cluster_ratings
data_to_save["cluster_mask"] = cluster_ratings_mask

# Write the whole mapping as a single pickled object (binary mode)
with open(file_name, 'wb') as out_file:
    pickle.dump(data_to_save, out_file)

print(f"Data saved to {file_name}")