-
Notifications
You must be signed in to change notification settings - Fork 0
/
css4p01.py
117 lines (81 loc) · 4.31 KB
/
css4p01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 31 10:31:56 2024
@author: User
"""
import pandas as pd
# Load the movie dataset; the first CSV column becomes the row index.
data = pd.read_csv("C:/Users/User/Desktop/css_2024_uj/movie_dataset.csv", index_col=0)
# `data` keeps every row exactly as read, while `df` is a separate frame
# without any row that has a missing value in at least one column.
df = data.dropna()
# Movie with the highest rating: find the index label of the largest
# "Rating" value in the NaN-free frame `df`, then fetch that row.
top_rated_label = df['Rating'].idxmax()
highest_rating_movie = df.loc[top_rated_label]
print(highest_rating_movie)
# Average revenue of all movies: mean of the "Revenue (Millions)" column
# taken over the NaN-free frame `df`.
revenue_column = df["Revenue (Millions)"]
average_movie_revenue = revenue_column.mean()
print(f'The average revenue in million is ${average_movie_revenue:.2f}')
'''
average revenue of movies from 2015 to 2017
'''
# Boolean mask for rows whose "Year" lies in 2015..2017, both ends included.
released_2015_to_2017 = df['Year'].between(2015, 2017)
# Sub-frame restricted to those years, then the mean of its revenue column.
df_15_17 = df[released_2015_to_2017]
revenue_15_17 = df_15_17["Revenue (Millions)"]
average_movie_revenue_15_17 = revenue_15_17.mean()
print(f'The average revenue for movies produced between 2015 and 2017 in million is ${average_movie_revenue_15_17:.2f}')
# How many movies were released in 2016?
# Summing the boolean mask counts the rows where it is True. The count is
# taken on `data`, i.e. on all rows as read from the CSV.
released_in_2016 = data["Year"].eq(2016)
movies_2016_count = released_in_2016.sum()
print(f'the number of movies made in 2016 is {movies_2016_count}')
# How many movies were directed by Christopher Nolan?
# The director name is matched exactly; summing the mask gives the count.
directed_by_nolan = data["Director"].eq("Christopher Nolan")
count_nolan_movies = directed_by_nolan.sum()
print(f'Number of movies directed by Christopher Nolan: {count_nolan_movies}')
# How many movies in the dataset have a rating of at least 8.0?
rated_8_or_more = data["Rating"].ge(8.0)
movies_8_rating = rated_8_or_more.sum()
print(f'Number of movies with atleast 8.0 rating: {movies_8_rating}')
# What is the median rating of movies directed by Christopher Nolan?
# Filter the Nolan rows once and reuse the result.
nolan_movies = data[data["Director"] == "Christopher Nolan"]
# Rating-ordered view of the same rows, kept available for inspection.
sorted_nolan_movies = nolan_movies.sort_values(by="Rating")
# The median does not depend on row order, so it is computed (and reported)
# a single time instead of once before and once after sorting.
median_nolan_rating = nolan_movies["Rating"].median()
print(f'Median rating of movies directed by Christopher Nolan: {median_nolan_rating}')
# Which year has the highest average rating?
# Group the ratings by release year and average each group.
rating_by_year = data.groupby("Year")['Rating']
average_rating_by_year = rating_by_year.mean()
# idxmax yields the year label of the largest mean; max yields that mean.
year_highest_average_rating = average_rating_by_year.idxmax()
highest_average_rating = average_rating_by_year.max()
print(f'Year with the highest average rating: {year_highest_average_rating} (Average Rating: {highest_average_rating:.2f})')
# Percentage increase in the number of movies made between 2006 and 2016.
# Filter the DataFrame for movies made in each of the two years.
movies_2006 = data[data['Year'] == 2006]
movies_2016 = data[data['Year'] == 2016]
# Number of movies made in each year.
count_movies_2006 = len(movies_2006)
count_movies_2016 = len(movies_2016)
if count_movies_2006:
    # Relative change with the 2006 count as the baseline.
    percentage_increase = ((count_movies_2016 - count_movies_2006) / count_movies_2006) * 100
    print(f'Percentage increase in the number of movies made between 2006 and 2016: {percentage_increase:.2f}%')
else:
    # With no 2006 movies there is no baseline to divide by; report that
    # instead of raising ZeroDivisionError and aborting the rest of the script.
    percentage_increase = float('nan')
    print('Percentage increase in the number of movies made between 2006 and 2016: undefined (no movies from 2006)')
# Create a new DataFrame with each actor in a separate row.
# Split on the bare comma and strip whitespace afterwards (the same approach
# the genre count uses), so "A,B", "A, B" and "A,  B" all yield the same
# names instead of being counted as different actors.
actor_names = data['Actors'].str.split(',').explode().str.strip()
# Rows with no actor list explode to NaN; leave them out of the frame.
actors_df = actor_names.dropna().to_frame('Actor')
# Find the most common actor (on a tie, mode() lists values in sorted order
# and the first one is taken).
most_common_actor = actors_df['Actor'].mode().iloc[0]
print(f'Most common actor in all movies: {most_common_actor}')
# How many unique genres are there in the dataset?
# Each "Genre" cell is split on commas into a list of genre names.
genre_lists = data['Genre'].str.split(',')
# One row per (movie, genre) with surrounding whitespace removed; despite
# its name, this series still contains repeated genres.
unique_genre = genre_lists.explode().str.strip()
# Number of distinct genre names.
unique_genre_num = unique_genre.nunique()
print(f' The number of unique genres is {unique_genre_num}')
'''
Do a correlation of the numerical features, what insights can you deduce?
Mention at least 5 insights.
And what advice can you give directors to produce better movies?
'''
# Select only numeric columns
numeric_cols = data.select_dtypes(include=['number']).columns
# Calculate the pairwise correlation matrix of those columns
corr_matrix = data[numeric_cols].corr()
# Show the result like every other answer in this script; to_string() keeps
# pandas from truncating columns, and rounding keeps the table readable.
print(corr_matrix.round(2).to_string())