itsthedevman

Thursday, September 19, 2024 3:19:26 PM
> FinalProject.py
##############################################################
#	Bryan
#	Final Project
#	Using data from MovieLens, prove or disprove the hypothesis
#   that people 25 years and younger will rate fantasy movies
#   higher than people 35 years and older
###############################################################

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read in the data. Wooo
# movie Data
movie_data = pd.read_csv('movielens\\movies.dat', delimiter='::', names=['MovieID','Title','Genre'], engine='python')

# Rating data, only use columns 0,1, and 2 because we don't care about the timestamp for our hypothosis
rating_data = np.genfromtxt('movielens\\ratings.dat', delimiter='::', dtype=None, names=['UserID','MovieID','Rating'], usecols=(0,1,2))

# User Data, only use columns 0, 1, and 2 because we don't care about occupation or zipcode
user_data = np.genfromtxt('movielens\\users.dat', delimiter='::', dtype=None, names=['UserID','Gender','Age'], usecols=(0,1,2))

# Used to save the data
movie_data_filtered = []
rating_data_filtered = {}
user_data_filtered_old = []
user_data_filtered_young = []

# Pull the movieID for drama movies
for index, movie in movie_data.iterrows():

	# Filter fantasy out
	if "fantasy" in movie["Genre"].lower():

		# Add the MovieID to our list
		movie_data_filtered.append(movie["MovieID"])

# Convert it back to an ndarray
movie_data_filtered = np.array(movie_data_filtered)

# Get the ratings for these movies
for rating in rating_data:

	# Check to see if the movieID is in our filtered out movies
	if rating["MovieID"] in movie_data_filtered:
		# Get their userID
		userID = rating["UserID"]

		# Get any previous ratings
		rating_arr = rating_data_filtered.get(userID, [])

		# Add the ratings
		rating_arr.append(rating["Rating"])

		# Save!
		rating_data_filtered[userID] = rating_arr

# Filter out the users who rated our movies
for user in user_data:
	userID = user["UserID"]

	# Check to see if our user has rated a filtered movie
	if user["Age"] >= 35:
		rating = rating_data_filtered.get(userID,-1)
		if rating != -1:
			user_data_filtered_old += rating
	elif user["Age"] <= 25:
		rating = rating_data_filtered.get(userID,-1)
		if rating != -1:
			user_data_filtered_young += rating

# Calulate the mean for both groups/
oldMean = np.mean(user_data_filtered_old)
youngMean = np.mean(user_data_filtered_young)

# Print out the data so we can see it in plain text
print "35yrs and older"
print "   Mean: {}".format(oldMean)
print "   Count: {}".format(len(user_data_filtered_old))

print "25yrs and younger"
print "   Mean: {}".format(youngMean)
print "   Count: {}".format(len(user_data_filtered_young))

# Create the X-axis for both graphs
objects = ('25yrs and younger', '35yrs and older')
y_pos = np.arange(len(objects))

# Average rating graph
fig1 = plt.figure('Average Rating')
pl1 = fig1.add_subplot(1,1,1)
pl1.bar(y_pos, [youngMean, oldMean], align='center', alpha=0.5)
pl1.set_xticks((0,1))
pl1.set_xticklabels(objects)
pl1.set_yticks((1,2,3,4,5))
pl1.set_ylabel('Rating')
pl1.set_title('Average Rating for Fantasy Movies')

# Number of votes graph
fig2 = plt.figure('Number of Votes')
pl2 = fig2.add_subplot(1,1,1)
pl2.bar(y_pos, [len(user_data_filtered_young), len(user_data_filtered_old)], align='center', alpha=0.5)
pl2.set_xticks((0,1))
pl2.set_xticklabels(objects)
pl2.set_ylabel('Number of Votes')
pl2.set_title('Number of Votes')

# Show em!
plt.show()
Customize

Authenticate