# Matplotlib 101

In [1]:
# Assoc. Prof. Dr. Piyabute Fuangkhon
# Department of Digital Business Management
# Martin de Tours School of Management and Economics
# Assumption University
# Update: 22/05/2024

# Introduction to Matplotlib 101 (Visualizing COVID-19 Data)

In this notebook, we will explore the basics of Matplotlib while visualizing data from the OWID COVID-19 dataset. We will cover various types of plots including line plots, bar plots, scatter plots, histograms, and more. Some of the later examples also use Seaborn, a plotting library built on top of Matplotlib.

In [2]:
# Global file location: path of the OWID COVID-19 CSV that the cells below
# pass to pd.read_csv. It is a relative path, so it is resolved against the
# kernel's current working directory.
file_location = 'owid-covid-data.csv'

In [3]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the CSV and turn the text 'date' column into real datetime values
# in a single chained expression.
df = pd.read_csv(file_location).assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)

# Preview the first five rows as the cell output
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-08,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-09,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,


## Line Plot
Let's start with a simple line plot to visualize the daily new COVID-19 cases in the United States.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset.
# The CSV is read again in this cell, so 'date' comes back as plain text.
# Convert it to datetime; otherwise Matplotlib treats every date string as a
# separate category and the x-axis becomes an unreadable band of tick labels
# instead of a real time axis.
df = pd.read_csv(file_location)
df['date'] = pd.to_datetime(df['date'])

# Filter data for the United States
us_data = df[df['location'] == 'United States']

# Plot daily new cases
plt.figure(figsize=(10, 5))
plt.plot(us_data['date'], us_data['new_cases'], label='New Cases', color='blue')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.title('Daily New COVID-19 Cases in the United States')
plt.legend()
plt.show()

## Bar Plot
Next, let's create a bar plot to compare the total number of COVID-19 cases for each continent.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv(file_location)

# Build the total number of cases per continent in two steps.
# 'total_cases' is treated as a cumulative count per location, so:
#   1. the maximum per (continent, location) is that location's overall total;
#   2. summing those per continent gives the continent's total.
# Taking the maximum directly per continent would only report the single
# largest location, not the continent as a whole.
# Rows without a continent are left out by groupby.
location_cases = df.groupby(['continent', 'location'])['total_cases'].max()
# min_count=1 keeps a continent with no data as NaN so dropna() removes it,
# rather than plotting it as a misleading zero.
continent_cases = location_cases.groupby(level='continent').sum(min_count=1).dropna()

# Plot the bar chart
plt.figure(figsize=(10, 5))
continent_cases.plot(kind='bar', color='orange')
plt.xlabel('Continent')
plt.ylabel('Total Cases')
plt.title('Total COVID-19 Cases by Continent')
plt.show()

## Scatter Plot
Now, let's generate a scatter plot to show the relationship between total COVID-19 cases and total deaths for each country.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv(file_location)
df['date'] = pd.to_datetime(df['date'])

# Get the latest data for each country.
# Filtering on the single most recent date in the whole file silently drops
# every country that has no row, or a missing value, on exactly that day.
# Instead, keep each location's most recent row that has both values.
# Rows without a continent are excluded as well.
# NOTE(review): those rows are presumably regional/global aggregates rather
# than countries - confirm against the dataset codebook.
complete_rows = df.dropna(subset=['continent', 'total_cases', 'total_deaths'])
latest_data = complete_rows.loc[complete_rows.groupby('location')['date'].idxmax()]

# Plot the scatter plot
plt.figure(figsize=(10, 5))
plt.scatter(latest_data['total_cases'], latest_data['total_deaths'], alpha=0.5)
plt.xlabel('Total Cases')
plt.ylabel('Total Deaths')
plt.title('Total COVID-19 Cases vs. Total Deaths by Country')
plt.show()

## Histogram
Let's create a histogram to visualize the distribution of daily new cases globally.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the OWID data
df = pd.read_csv(file_location)

# Daily new-case values from every row, with missing entries removed
new_case_counts = df['new_cases'].dropna()

# Draw the histogram on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(new_case_counts, bins=50, color='green', alpha=0.7)
ax.set_xlabel('New Cases')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Daily New COVID-19 Cases Globally')
plt.show()

## Box Plot
Let's compare the distribution of daily new cases per continent using a box plot.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# seaborn is needed for sns.boxplot below. Import it in this cell so the cell
# does not depend on a later cell having been run first, which would raise
# NameError on a fresh kernel.
import seaborn as sns

# Load the dataset
df = pd.read_csv(file_location)

# Prepare data for box plot: scale daily new cases by population so that
# locations of very different sizes can be compared
df['new_cases_per_million'] = df['new_cases'] / df['population'] * 1e6
plt.figure(figsize=(12, 6))
sns.boxplot(x='continent', y='new_cases_per_million', data=df)
plt.xlabel('Continent')
plt.ylabel('New Cases per Million')
plt.title('Distribution of Daily New COVID-19 Cases per Continent')
plt.show()

## Pie Chart
Next, let's create a pie chart showing the proportion of total COVID-19 deaths by continent.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv(file_location)

# Build the total number of deaths per continent in two steps.
# 'total_deaths' is treated as a cumulative count per location, so:
#   1. the maximum per (continent, location) is that location's overall total;
#   2. summing those per continent gives the continent's total.
# Taking the maximum directly per continent would compare only the single
# worst-hit location of each continent, which distorts the proportions.
# Rows without a continent are left out by groupby.
location_deaths = df.groupby(['continent', 'location'])['total_deaths'].max()
# min_count=1 keeps a continent with no data as NaN so dropna() removes it
continent_deaths = location_deaths.groupby(level='continent').sum(min_count=1).dropna()

# Plot the pie chart
plt.figure(figsize=(8, 8))
continent_deaths.plot(kind='pie', autopct='%1.1f%%', startangle=140)
plt.ylabel('')  # hide the Series name that pandas would print as a y-label
plt.title('Proportion of Total COVID-19 Deaths by Continent')
plt.show()

## Advanced Line Plot with Moving Averages and Annotations
This example plots daily new cases for multiple countries with moving averages and annotations for significant business events.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the data and convert 'date' to datetime for a real time axis
df = pd.read_csv(file_location)
df['date'] = pd.to_datetime(df['date'])

# Countries to compare, and only the rows that belong to them
countries = ['United States', 'India', 'Brazil']
filtered_data = df[df['location'].isin(countries)]

# One smoothed line per country, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
for name in countries:
    rows = filtered_data.loc[filtered_data['location'] == name]
    weekly_mean = rows['new_cases'].rolling(window=7).mean()
    ax.plot(rows['date'], weekly_mean, label=f'{name} (7-day MA)')

# Mark business events with a dashed vertical line and a rotated caption
business_events = {
    '2020-03-11': 'WHO Declares Pandemic',
    '2020-12-14': 'First US Vaccination'
}
for day, caption in business_events.items():
    when = pd.to_datetime(day)
    ax.axvline(when, color='gray', linestyle='--')
    # Place the caption at 90% of the current upper y-limit
    caption_height = ax.get_ylim()[1] * 0.9
    ax.text(when, caption_height, caption, rotation=90, verticalalignment='top')

ax.set_xlabel('Date')
ax.set_ylabel('New Cases (7-day MA)')
ax.set_title('Daily New COVID-19 Cases with Business Events Annotations')
ax.legend()
plt.show()

## Grouped Bar Plot for Total Cases by Continent and Quarter
This example creates a grouped bar plot showing the total number of COVID-19 cases by continent for each quarter, useful for business trend analysis.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv(file_location)
df['date'] = pd.to_datetime(df['date'])
df['quarter'] = df['date'].dt.to_period('Q')

# Aggregate data by continent and quarter.
# The daily 'new_cases' values are summed to get the cases reported in each
# quarter. 'total_cases' is treated as a running total, so adding it up over
# every day of a quarter would count the same cases again and again and
# produce numbers with no meaning.
# Rows without a continent are left out by groupby.
continent_quarterly = df.groupby(['continent', 'quarter'])['new_cases'].sum().unstack().fillna(0)

# Plot the grouped bar plot (transpose so quarters run along the x-axis and
# each continent becomes one bar series)
continent_quarterly.T.plot(kind='bar', stacked=False, figsize=(15, 8))
plt.xlabel('Quarter')
plt.ylabel('Total Cases')
plt.title('Total COVID-19 Cases by Continent and Quarter')
plt.legend(title='Continent')
plt.show()

## Scatter Plot with Bubble Sizes Representing Population
This example generates a scatter plot showing the relationship between total cases and total deaths with bubble sizes representing population size.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv(file_location)
df['date'] = pd.to_datetime(df['date'])

# Latest data for each country.
# Filtering on the single most recent date in the whole file silently drops
# every country that has no row, or a missing value, on exactly that day.
# Instead, keep each location's most recent row that has all plotted values.
# Rows without a continent are excluded as well.
# NOTE(review): those rows are presumably regional/global aggregates rather
# than countries - confirm against the dataset codebook.
complete_rows = df.dropna(subset=['continent', 'total_cases', 'total_deaths', 'population'])
latest_data = complete_rows.loc[complete_rows.groupby('location')['date'].idxmax()]

# Plot the scatter plot with bubble sizes
plt.figure(figsize=(10, 6))
sns.scatterplot(data=latest_data, x='total_cases', y='total_deaths', size='population', sizes=(20, 200), alpha=0.5)
plt.xlabel('Total Cases')
plt.ylabel('Total Deaths')
plt.title('Total COVID-19 Cases vs. Total Deaths by Country (Bubble Size: Population)')
plt.show()

## Histogram with Log Scale
This example creates a histogram for daily new cases globally and applies a log scale to better visualize the distribution.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv(file_location)

# A log scale can only represent strictly positive numbers, and the data
# contains days with 0 new cases. Keep only positive values explicitly so the
# log transform never sees zero (log(0) is -inf) and the reader knows that
# zero-case days are not part of this chart.
new_cases = df['new_cases'].dropna()
positive_new_cases = new_cases[new_cases > 0]

# Plot the histogram with log scale
plt.figure(figsize=(10, 6))
sns.histplot(positive_new_cases, bins=50, color='purple', log_scale=True)
plt.xlabel('New Cases (Log Scale)')
plt.ylabel('Frequency')
plt.title('Distribution of Daily New COVID-19 Cases Globally (Log Scale)')
plt.show()

## Box Plot with Points Overlayed by Continent
This example creates a box plot to compare the distribution of daily new cases per million people across continents and overlays the individual observations as semi-transparent black points.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the data and add the population-scaled daily case rate in one chain
df = pd.read_csv(file_location).assign(
    new_cases_per_million=lambda frame: frame['new_cases'] / frame['population'] * 1e6
)

# Both layers share the same x/y mapping, data, and Axes
fig, ax = plt.subplots(figsize=(12, 6))
shared_args = dict(x='continent', y='new_cases_per_million', data=df, ax=ax)

# Box plot first, then the raw observations as jittered black points on top
sns.boxplot(**shared_args)
sns.stripplot(color='black', alpha=0.3, jitter=True, **shared_args)

ax.set_xlabel('Continent')
ax.set_ylabel('New Cases per Million')
ax.set_title('Distribution of Daily New COVID-19 Cases per Continent with Points Overlayed')
plt.show()

## Donut Pie Chart for Vaccine Distribution
This example creates a donut pie chart showing the proportion of total vaccinations by continent.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv(file_location)

# Build the total number of vaccinations per continent in two steps.
# 'total_vaccinations' is treated as a cumulative count per location, so:
#   1. the maximum per (continent, location) is that location's overall total;
#   2. summing those per continent gives the continent's total.
# Taking the maximum directly per continent would compare only the single
# largest location of each continent, which distorts the proportions.
# Rows without a continent are left out by groupby.
location_vaccinations = df.groupby(['continent', 'location'])['total_vaccinations'].max()
# min_count=1 keeps a continent with no data as NaN so dropna() removes it
continent_vaccinations = location_vaccinations.groupby(level='continent').sum(min_count=1).dropna()

# Plot the donut pie chart (wedge width < 1 leaves the hole in the middle)
plt.figure(figsize=(8, 8))
plt.pie(continent_vaccinations, labels=continent_vaccinations.index, autopct='%1.1f%%', startangle=140, wedgeprops=dict(width=0.3))
plt.title('Proportion of Total COVID-19 Vaccinations by Continent')
plt.show()