ruk·si

Data Analysis

Updated at 2023-12-24 09:56

Data analysis is the process of understanding the structure of your data. It's done to learn what the data is telling you. It's about finding patterns and relationships hidden in the data.

Exploratory data analysis is about developing an understanding of what your data looks like and what kind of answers might be hiding in it. It is the first step in data analysis.

This is roughly what it looks like; it is usually done in a Jupyter Notebook:

import json
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns

# load the data
movies_df = pd.read_csv("./tmdb-5000-movie-dataset/tmdb_5000_movies.csv")

# check out what the data looks like
movies_df.head()

# remove columns that are not needed for the analysis
del_col_list = [
    "keywords", "homepage", "status", "tagline", "original_language",
    "overview", "production_companies", "original_title",
    "title_y",
]
# NOTE(review): "title_y" looks like a pandas merge suffix, but nothing is
# merged before this point -- presumably a leftover from a version that
# joined in a second file; confirm. errors="ignore" drops whichever of the
# listed columns exist instead of raising KeyError on a missing one.
movies_df = movies_df.drop(columns=del_col_list, errors="ignore")
movies_df.head()

# Drop exact duplicate rows, keeping the first occurrence; the shape is
# printed before and after so the number of removed rows is visible.
print(movies_df.shape)
movies_df = movies_df.drop_duplicates(keep="first")
print(movies_df.shape)

# A budget or revenue of 0 is turned into NaN, one column at a time ...
zero_cols = ["budget", "revenue"]
for money_col in zero_cols:
    movies_df[money_col] = movies_df[money_col].replace(0, np.nan)

# ... so that rows missing either value can be dropped in one go.
movies_df = movies_df.dropna(subset=zero_cols)
movies_df.shape

# parse the release dates into pandas datetimes and derive the release year
movies_df["release_date"] = pd.to_datetime(movies_df["release_date"])
movies_df["release_year"] = movies_df["release_date"].dt.year
movies_df.head()

# store budget and revenue as 64-bit integers (replacing zeros with NaN
# earlier turned these columns into floats). astype converts each column in
# one step; DataFrame.applymap is deprecated in recent pandas releases.
change_cols = ["budget", "revenue"]
movies_df[change_cols] = movies_df[change_cols].astype(np.int64)
movies_df.dtypes


# turn JSON fields to flat lists
def parse_col_json(column, key):
    """Flatten a JSON-encoded column of the global ``movies_df`` in place.

    Every cell of ``column`` must hold a JSON array of objects, e.g.
    ``[{"name": "Action"}, {"name": "Adventure"}]``. Each cell is replaced
    by the *string representation* of the list of ``key`` values taken from
    those objects, e.g. ``"['Action', 'Adventure']"``.

    Args:
        column: Name of the ``movies_df`` column to rewrite.
        key: Key to read from every object in the JSON array.

    Returns:
        None. ``movies_df[column]`` is overwritten.

    Raises:
        KeyError: If ``column`` is missing or an object lacks ``key``.
        TypeError, json.JSONDecodeError: If a cell is not valid JSON text.
    """
    def extract(raw):
        # str(...) keeps the stored value a plain string, not a list object.
        return str([entry[key] for entry in json.loads(raw)])

    # One column-wide assignment instead of one .loc write per row.
    movies_df[column] = movies_df[column].apply(extract)


# Flatten every JSON-encoded column down to the "name" of each entry.
# NOTE(review): nothing in the visible code creates a "cast" column, so it
# is presumably expected to come from the loaded CSV -- confirm.
for json_column in ("genres", "spoken_languages", "cast", "production_countries"):
    parse_col_json(json_column, "name")
movies_df.head()

# the five movies with the largest budget, biggest first
expensive_movies_df = movies_df.sort_values("budget", ascending=False).head(5)
expensive_movies_df

# find the most profitable movies
movies_df["profit"] = movies_df["revenue"] - movies_df["budget"]
# keep only the columns used from here on, money columns first
cols = [
    "budget",
    "profit",
    "revenue",
    "genres",
    "id",
    "popularity",
    "production_countries",
    "release_date",
    "release_year",
    "runtime",
    "spoken_languages",
    "title",
    "cast",
    "vote_average",
    "vote_count",
]
movies_df = movies_df[cols]
# rank by profit; sorting by budget would only repeat the "most expensive"
# ranking computed earlier
movies_df.sort_values(by="profit", ascending=False).head()

# find the most talked about movies
# the "popularity" column is the ranking key here, not the budget
popular_movies_df = movies_df.sort_values(by="popularity", ascending=False).head()
popular_movies_df

# mean of the "runtime" column over the rows that are left
movies_df.loc[:, "runtime"].mean()

# keep only the movies whose average vote is at least 7.0
movies_df.loc[movies_df["vote_average"].ge(7.0)]