Exploring IMDb ratings for the popular TV series,
Law & Order: Special Victims Unit

In this project I examine the IMDb ratings of every episode of Law & Order: SVU. The goals of this project were to practice scraping web data, visualize trends in IMDb ratings across seasons, and identify outlier episodes. I programmed this project in Python and created plots with seaborn and matplotlib.

Part 1: Scraping data from imdb.com

I first extracted the relevant data from imdb.com using BeautifulSoup in Python. I obtained episode titles, episode numbers, release dates, ratings, and numbers of reviews using the following script.

from time import sleep

from bs4 import BeautifulSoup
from requests import get

#url = "https://www.imdb.com/title/tt0203259/episodes?season=1"
#response = get(url)
#soup = BeautifulSoup(response.text, 'html.parser')

# Column accumulators: one entry per scraped episode, kept index-aligned.
titles = []
release_date = []
rating = []
num_reviews = []
episode_num = []

# At the time of data extraction (March 16, 2021), the last episode released was S22, Ep8
seasons = list(range(1, 23))


def _parse_episode(item):
    """Extract one episode's fields from a 'list_item' div.

    Returns a tuple (episode, title, release, rate, reviews) of raw strings,
    or None when the rating or vote-count element is absent (presumably an
    episode that has not been rated yet -- verify against the page), so that
    the caller can skip it instead of crashing on `None.text`.
    """
    rate_tag = item.find("span", class_="ipl-rating-star__rating")
    votes_tag = item.find("span", class_="ipl-rating-star__total-votes")
    if rate_tag is None or votes_tag is None:
        return None
    return (
        item.div.text.strip(),                               # episode number
        item.strong.text,                                    # title
        item.find("div", class_="airdate").text.strip(),     # release date
        rate_tag.text,                                       # rating
        votes_tag.text,                                      # number of reviews
    )


for season in seasons:
    url = "https://www.imdb.com/title/tt0203259/episodes?season=" + str(season)
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    odd_ep = soup.find_all('div', class_='list_item odd')
    even_ep = soup.find_all('div', class_='list_item even')

    # Same row order as before: all odd-class items first, then all even-class items.
    for item in [*odd_ep, *even_ep]:
        parsed = _parse_episode(item)
        if parsed is None:
            continue
        episode, title, release, rate, reviews = parsed
        episode_num.append(episode)
        titles.append(title)
        release_date.append(release)
        rating.append(rate)
        num_reviews.append(reviews)

    # Pause between season pages so the requests are spaced out.
    sleep(1)

import pandas as pd

# Zip the scraped columns into one row per episode, then write them to disk.
column_names = ['episode', 'title', 'release_date', 'rating', 'number_reviews']
episode_rows = list(zip(episode_num, titles, release_date, rating, num_reviews))
SVU = pd.DataFrame(episode_rows, columns=column_names)

SVU.to_csv("/Users/rachelforbes/Desktop/SVU/SVU_imbd.csv", index=False)

Part 2: Data cleaning

Minor data cleaning was required prior to data visualization due to the format of the data extracted from imdb.com (e.g., creating separate columns for season and episode, removing unnecessary characters, converting data types).

import pandas as pd
import numpy as np
import datetime

# Load the episode table that the scraping script saved to CSV.
data = pd.read_csv("/Users/rachelforbes/Desktop/SVU/SVU_imbd.csv")

# split episode column into season number and episode number columns
ep_split = data.episode.str.split(',', expand=True)
data['season'] = ep_split[0]
data['episode'] = ep_split[1]

# remove 'S' from season values and 'Ep' from episode values, then convert
# from str to int. The result is assigned back to the column rather than
# using `data.col.replace(..., inplace=True)`: that chained in-place form
# updates a temporary object under pandas Copy-on-Write and leaves `data`
# unchanged.
data['season'] = data['season'].str.replace('S', '', regex=False).str.strip().astype(int)
data['episode'] = data['episode'].str.replace('Ep', '', regex=False).str.strip().astype(int)

# Keep only the digits of number_reviews (drops the thousands separator ',').
# NOTE(review): the scraped vote counts may also be wrapped in parentheses,
# which would break a plain int conversion -- confirm against the CSV.
data['number_reviews'] = (
    data['number_reviews'].astype(str).str.replace(r'\D', '', regex=True).astype(int)
)

# Remove '.' from date. regex=False is required: as a regular expression '.'
# matches every character, which would blank out the whole date string.
data['release_date'] = data['release_date'].str.replace('.', '', regex=False)

# Change date object to datetime object
data['release_date'] = pd.to_datetime(data['release_date'])

Part 3: Data visualization and interpretation

Episode Ratings by Release Date

This plot visualizes IMDb ratings by episode release date. The pattern indicates more variance in the ratings of later seasons, and also suggests a slight downward trend in episode ratings.

Mean Episode Ratings by Season

This plot visualizes the mean IMDb rating by season, more clearly indicating a slight downward trend over time. The plot shows that season 16 in particular has low episode ratings.

# Plot of episode ratings by release date
import matplotlib.dates as mdates
import matplotlib.pyplot as plt  # `plt` is used below and by the later plots
import seaborn as sns            # `sns` is used below and by the later plots

# Initialize the figure
plt.figure(figsize=(12,6))

# Plot the data; lineplot returns the matplotlib Axes it drew on
release_plot = sns.lineplot(x = data.release_date, y = data.rating, linewidth = 2.0)

# Format labels and axes
plt.title('IMBD Ratings by Episode Release Date', fontsize=15)
plt.xlabel('Release Year', fontsize=15)
plt.ylabel('IMBD Rating', fontsize=15)
plt.tick_params(labelsize=12)
plt.ylim([0,10])
plt.yticks(list(range(0,11)))
plt.xticks(rotation=45)

# Hide the right and top axes
release_plot.spines['right'].set_visible(False)
release_plot.spines['top'].set_visible(False)

# Change x axis to include all years: one major tick per year, labelled 'YYYY'
release_plot.xaxis.set_major_locator(mdates.YearLocator(1))
release_plot.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

# Add horizontal gridlines and make them slightly transparent
release_plot.yaxis.grid(alpha = .5)

# Write the finished figure to the current working directory
plt.savefig('ratings_date.png')
# Regression plot of the mean IMDb rating per season, with confidence intervals
season_ticks = list(range(1, 23))
rating_ticks = list(range(0, 11))

mean_season = sns.lmplot(data=data, x="season", y="rating",
                         x_estimator=np.mean, aspect=2)

# Title, axis labels and tick label size
plt.title('Mean IMBD Ratings by Season', fontsize=15)
plt.xlabel('Season', fontsize=15)
plt.ylabel('IMBD Rating', fontsize=15)
plt.tick_params(labelsize=12)

# Fix the rating axis to 0-10 and show one tick per season
mean_season.set(ylim=(0, 10), xticks=season_ticks, yticks=rating_ticks)

# Horizontal gridlines only, slightly transparent
plt.grid(axis='y', alpha=.5)

Episode Outliers

Which episodes have particularly low ratings? I used boxplots to visualize whether episode outliers exist and an interactive plot to obtain more information on episode outliers.

# One box per season showing the distribution of its episode ratings
plt.figure(figsize=(12, 6))
outlier_plot = sns.boxplot(data=data, x='season', y='rating')

# Title and axis labels
label_size = 18
plt.title('IMBD Ratings by Episode Across All Seasons', fontsize=label_size)
plt.xlabel('Season', fontsize=label_size)
plt.ylabel('IMBD Rating', fontsize=label_size)

# Rating axis runs 0-10 with a tick at every whole number
rating_axis = dict(ylim=(0, 10), yticks=list(range(0, 11)))
outlier_plot.set(**rating_axis)

plt.savefig('szn_boxplot.png')

Season 16 stands out as the most negatively rated season, but is the average IMDb rating of season 16 significantly different from the average IMDb rating of the other seasons?

To address this question, I decided to conduct a simple t-test. However, given the variance indicated by the plots, I thought it was likely that the variances of the two samples would be unequal. A Levene test for equality of variances indicated that the groups did indeed have unequal variances. Therefore, I ran Welch’s t-test, which does not assume equal variances between groups.

Welch’s t-test revealed a significant difference between season 16 and the other seasons (t = 4.33, p = 0.0003). Season 16 was rated significantly more negatively than the other seasons, illustrated in the plot below.

# Dummy code seasons: 1 for season 16, 0 for every other season
data['season_dummy'] = np.where(data['season'] == 16, 1, 0)
szn_data = data[['season_dummy', 'rating']]

import scipy
# Bind `stats` explicitly: the tests below are called through this name, and
# a bare `import scipy` does not define it.
from scipy import stats

season16 = szn_data[szn_data.season_dummy == 1]
other_seasons = szn_data[szn_data.season_dummy == 0]

# Levene's test for equality of variances between the two groups
levene_result = stats.levene(season16['rating'], other_seasons['rating']) # violates assumption of equal variances, must use Welch's t-test
print(levene_result)

# Welch's t-test (equal_var=False), other seasons vs. season 16
welch_result = stats.ttest_ind(other_seasons['rating'], season16['rating'], equal_var = False)
print(welch_result)
# Bar plot of the mean rating for season 16 versus all other seasons combined
group_names = ["All Other Seasons", "Season 16"]
season_plot = sns.catplot(data=data, x="season_dummy", y="rating", kind="bar")
season_plot.set_xticklabels(group_names)

# Title, axis labels and a fixed 0-10 rating axis
plt.title('Comparing Season 16 Ratings to All Other Seasons')
plt.ylabel('IMBD Rating')
plt.xlabel('')
plt.ylim([0, 10])

plt.savefig('szn_barplot.png')

Unfortunately, at this time WordPress does not support adding .html files to website pages. Therefore, I created a gif to illustrate the interactive Plotly plot I built to examine more details about outlier episodes. The plot can be recreated using the code below.

Because the quality of the gif is poor, I’ve provided images below of the points highlighted in the gif, in order of appearance.

# Interactive scatter of rating by season: marker colour encodes the episode
# number, marker size the number of reviews, and an OLS trend line shows how
# ratings change across seasons.
import plotly.express as px
import plotly

axis_labels = dict(
    season="Season",
    rating="IMBD Rating",
    number_reviews="Number of IMBD reviews",
    episode='Episode',
)

fig = px.scatter(
    data_frame=data,
    x="season",
    y="rating",
    size="number_reviews",
    color="episode",
    hover_name="title",
    size_max=60,
    trendline="ols",
    labels=axis_labels,
)

# One x tick for every season present in the data
season_axis = dict(tickmode='array', tickvals=list(data.season.unique()))
fig.update_layout(xaxis=season_axis)

fig.update_yaxes(range=[0, 10], nticks=20)

# Centered title near the top of the figure, larger global font
title_spec = {
    'text': "IMBD Rating Across Seasons by Episode",
    'y': 0.96,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}
fig.update_layout(title=title_spec, font=dict(size=18))

fig.show()

# Also export the figure as a standalone HTML file
plotly.offline.plot(fig, filename='SVU_imbd.html')

Based on my exploration of the data, it is clear that one episode in particular, “Imitation Game” (Season 16), received particularly negative reviews. A quick Google search indicates that this episode was not well received because of its portrayal of the 2014 Gamergate controversy, in which several women in the video game industry were targets of online harassment.