import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the insurance dataset from the CSV file in the working directory
data = pd.read_csv('insurance.csv')

# Custom colors (hex codes); paired names feed the same palette below
beige, maroon = '#EBDDBE', '#550000'
dark_grey, light_green = '#666666', '#96EE9C'
yellow = '#FFEC17'
dark_blue = '#001155'
red = '#BF0000'
purple = '#610054'

# Color palettes built from the custom colors above
colors_sex = sns.color_palette([beige, maroon])
colors_smokers = sns.color_palette([dark_grey, light_green])
# Palette derived from yellow via sns.dark_palette, in reversed order
colors_children = sns.dark_palette(yellow, reverse=True)

# Describe the size of the dataset and list its column names
columns = data.columns.tolist()

overview = 'The dataset contains {rows} rows containing data about {columns}.'
print(overview.format(rows=len(data), columns=', '.join(columns)))

print('\nSample of the data:')
# Last expression of the cell: a random sample of five rows
data.sample(5)

The dataset contains 1338 rows containing data about age, sex, bmi, children, smoker, region, charges.

Sample of the data:

# Check that there are no missing data in the dataset:
# info() prints the non-null count and the dtype of every column, so a
# non-null count lower than the number of rows would reveal missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB

# Summary statistics (count, mean, std, min, quartiles, max) of the
# numeric columns; being the last expression, the table is the cell output
data.describe()

# Age distribution: histogram of 'age' with a bin width of one year
age_axis_text = {
    'xlabel': 'Age',
    'ylabel': 'Number of data points',
    'title': 'Age distribution',
}
ax = sns.histplot(x='age', data=data, color=dark_blue, binwidth=1)
ax.set(**age_axis_text)

[Text(0.5, 0, 'Age'),
 Text(0, 0.5, 'Number of data points'),
 Text(0.5, 1.0, 'Age distribution')]

# Create bins for age groups.
# pd.cut is called with right=False, so every bin is half-open:
# [lower, upper). The last edge therefore has to be 65, not 64: with an
# edge of 64 the final bin would be [55, 64) and every 64-year-old (the
# maximum age in the dataset) would silently get a NaN age group and drop
# out of all per-age-group tables and plots.
bins_age = [18, 25, 35, 45, 55, 65]
labels_age = ['18-24', '25-34', '35-44', '45-54', '55-64']
data['age_group'] = pd.cut(data['age'], bins=bins_age, 
    labels=labels_age, right=False)

# Age distribution per age group (one bar for each 'age_group' label)
# NOTE(review): 'age_group' is categorical, so `bins` may have no effect
# on this plot - confirm against the seaborn histplot documentation
age_group_axis_text = {
    'xlabel': 'Age',
    'ylabel': 'Number of data points',
    'title': 'Age distribution',
}
ax = sns.histplot(x='age_group', data=data, color=dark_blue, bins=bins_age)
ax.set(**age_group_axis_text)

[Text(0.5, 0, 'Age'),
 Text(0, 0.5, 'Number of data points'),
 Text(0.5, 1.0, 'Age distribution')]

# Share of women and men in the dataset, as percentages with two decimals
number_total = len(data)
number_female = int(data['sex'].eq('female').sum())
number_male = int(data['sex'].eq('male').sum())
percent_female = round(number_female / number_total * 100, 2)
percent_male = round(number_male / number_total * 100, 2)

sex_message = ('In the dataset, {w}% of the people are women and {m}% are '
               'men.')
print(sex_message.format(w=percent_female, m=percent_male))

# Same information as a bar chart; stat='percent' puts percentages on the y axis
ax = sns.countplot(
    x='sex',
    hue='sex',
    data=data,
    stat='percent',
    palette=colors_sex,
)
ax.set(xlabel=None, ylabel='Percent in the dataset',
       title='Women and men in the dataset')

In the dataset, 49.48% of the people are women and 50.52% are men.

[Text(0.5, 0, ''),
 Text(0, 0.5, 'Percent in the dataset'),
 Text(0.5, 1.0, 'Women and men in the dataset')]

# Check that the sexes are balanced within each age group:
# first as a contingency table, then as grouped bars
print('\nNumber of datapoints divided by age and sex:')
sex_by_age_group = pd.crosstab(index=data['age_group'], columns=data['sex'])
print(sex_by_age_group)

ax = sns.countplot(
    x='age_group',
    hue='sex',
    data=data,
    palette=colors_sex,
)
ax.set(xlabel='Age group', ylabel='Number of data points',
       title='Number of data points per sex and age group')

Number of datapoints divided by age and sex:
sex        female  male
age_group              
18-24         134   144
25-34         132   139
35-44         129   131
45-54         144   143
55-64         112   108

[Text(0.5, 0, 'Age group'),
 Text(0, 0.5, 'Number of data points'),
 Text(0.5, 1.0, 'Number of data points per sex and age group')]

# Percentage of smokers and nonsmokers in the dataset
number_total = len(data)
number_smoker = len(data[data['smoker'] == 'yes'])
number_nonsmoker = len(data[data['smoker'] == 'no'])
percent_smoker = round(number_smoker/number_total*100,2)
percent_nonsmoker = round(number_nonsmoker/number_total*100,2)
print('In the dataset, {y}% of the people are smokers and {n}% ' \
    'are nonsmokers.'.format(y=percent_smoker, n=percent_nonsmoker))

# The tick labels at the end of this cell are assigned by position
# (0 -> 'smoker', 1 -> 'nonsmoker'). Without an explicit order seaborn
# infers the category order from the data, so the labels would be wrong
# whenever 'no' happened to come first. Pinning order and hue_order ties
# both the labels and the palette colors to the 'yes'/'no' categories.
smoker_order = ['yes', 'no']
ax = sns.countplot(data=data, x='smoker', hue='smoker', 
                    order=smoker_order, hue_order=smoker_order,
                    palette=colors_smokers, stat='percent')
ax.set(xlabel=None, ylabel='Percent in the dataset', 
        title='Smokers and nonsmokers in the dataset')
# Replace the raw 'yes'/'no' tick labels with readable names
ax.set_xticks([0,1], labels=['smoker', 'nonsmoker'])

In the dataset, 20.48% of the people are smokers and 79.52% are nonsmokers.

[<matplotlib.axis.XTick at 0x1f69dfb9f90>,
 <matplotlib.axis.XTick at 0x1f69dff4550>]

# Smokers and nonsmokers per age group:
# first as a contingency table, then as grouped bars
print('\nNumber of datapoints divided by age and smoker status:')
smoker_by_age_group = pd.crosstab(index=data['age_group'],
                                  columns=data['smoker'])
print(smoker_by_age_group)

ax = sns.countplot(
    x='age_group',
    hue='smoker',
    data=data,
    palette=colors_smokers,
)
ax.set(xlabel='Age group', ylabel='Number of data points',
       title='Number of (non)smokers in the dataset per age group')

Number of datapoints divided by age and smoker status:
smoker      no  yes
age_group          
18-24      218   60
25-34      215   56
35-44      199   61
45-54      232   55
55-64      185   35

[Text(0.5, 0, 'Age group'),
 Text(0, 0.5, 'Number of data points'),
 Text(0.5, 1.0, 'Number of (non)smokers in the dataset per age group')]

# Share of the dataset (in percent) for each number of children
ax = sns.countplot(
    x='children',
    hue='children',
    data=data,
    stat='percent',
    palette=colors_children,
)
ax.set(xlabel='Number of children', ylabel='Percent in the dataset',
       title='Percent of dataset points per number of children')

[Text(0.5, 0, 'Number of children'),
 Text(0, 0.5, 'Percent in the dataset'),
 Text(0.5, 1.0, 'Percent of dataset points per number of children')]

# Count of data points per age group, split by number of children
ax = sns.countplot(
    x='age_group',
    hue='children',
    data=data,
    palette=colors_children,
)
ax.set(xlabel='Age group', ylabel='Number of data points',
       title='Number of dataset points per number of children')

[Text(0.5, 0, 'Age group'),
 Text(0, 0.5, 'Number of data points'),
 Text(0.5, 1.0, 'Number of dataset points per number of children')]

# BMI distribution: histogram of 'bmi' with a bin width of one BMI unit
bmi_axis_text = {
    'xlabel': 'BMI',
    'ylabel': 'Number of data points',
    'title': 'Distribution of BMI',
}
ax = sns.histplot(x='bmi', data=data, color=purple, binwidth=1)
ax.set(**bmi_axis_text)

[Text(0.5, 0, 'BMI'),
 Text(0, 0.5, 'Number of data points'),
 Text(0.5, 1.0, 'Distribution of BMI')]

# Create BMI groups: bin edges every 5 units from 15 up to 55.
# right=False makes each interval closed on the left: [lower, upper)
bins_bmi = range(15,60,5)
data['bmi_group'] = pd.cut(data['bmi'], right=False, bins=bins_bmi)

# BMI distribution again, this time using the same coarse bin edges
bmi_group_axis_text = {
    'xlabel': 'BMI',
    'ylabel': 'Number of data points',
    'title': 'Distribution of BMI',
}
ax = sns.histplot(x='bmi', data=data, color=purple, bins=bins_bmi)
ax.set(**bmi_group_axis_text)

[Text(0.5, 0, 'BMI'),
 Text(0, 0.5, 'Number of data points'),
 Text(0.5, 1.0, 'Distribution of BMI')]

# Insurance charges per age group as bars, with error bars switched off
ax = sns.barplot(
    x='age_group',
    y='charges',
    data=data,
    color=dark_blue,
    errorbar=None,
)
ax.set(xlabel='Age group', ylabel='Charges (USD)',
       title='Average insurance cost per age group')

[Text(0.5, 0, 'Age group'),
 Text(0, 0.5, 'Charges (USD)'),
 Text(0.5, 1.0, 'Average insurance cost per age group')]

# Every data point: charges against age
charges_age_axis_text = {
    'xlabel': 'Age',
    'ylabel': 'Charges (USD)',
    'title': 'Comparing insurance costs with age',
}
ax = sns.scatterplot(x='age', y='charges', data=data, color=dark_blue)
ax.set(**charges_age_axis_text)

[Text(0.5, 0, 'Age'),
 Text(0, 0.5, 'Charges (USD)'),
 Text(0.5, 1.0, 'Comparing insurance costs with age')]

# Charges against age again, with points colored by smoker status
ax = sns.scatterplot(
    x='age',
    y='charges',
    hue='smoker',
    data=data,
    palette=colors_smokers,
)
ax.set(xlabel='Age', ylabel='Charges (USD)',
       title='Comparing insurance costs with age')

[Text(0.5, 0, 'Age'),
 Text(0, 0.5, 'Charges (USD)'),
 Text(0.5, 1.0, 'Comparing insurance costs with age')]

# Average charges for women and for men
mean_charges_female = data[data['sex'] == 'female']['charges'].mean()
mean_charges_male = data[data['sex'] == 'male']['charges'].mean()

# Round each mean once and reuse it, so the printed difference agrees with
# the printed means. The difference is rounded as well: subtracting two
# floats can leave binary noise (e.g. 1387.1700000000001) in the output.
rounded_female = round(mean_charges_female, 2)
rounded_male = round(mean_charges_male, 2)
print('Average cost is USD {w}  for women and USD {m} for men. ' \
        '\nIt means that on average, men pay USD {d} more than women.'.format(
            w=rounded_female, 
            m=rounded_male, 
            d=round(rounded_male - rounded_female, 2)
            )
    )

# Spread of the charges per sex as box plots
ax = sns.boxplot(data, x='sex', y='charges', 
    hue='sex', palette=colors_sex)
ax.set(xlabel = None, ylabel='Charges (USD)', 
       title='Spread of insurance cost per sex')

Average cost is USD 12569.58  for women and USD 13956.75 for men. 
It means that on average, men pay USD 1387.17 more than women.

[Text(0.5, 0, ''),
 Text(0, 0.5, 'Charges (USD)'),
 Text(0.5, 1.0, 'Spread of insurance cost per sex')]

# Average charges for smokers and for nonsmokers
mean_charges_smoker = data[data['smoker'] == 'yes']['charges'].mean()
mean_charges_nonsmoker = data[data['smoker'] == 'no']['charges'].mean()

# Round each mean once and reuse it, so the printed difference agrees with
# the printed means. The difference is rounded as well: subtracting two
# floats can leave binary noise in the output.
rounded_smoker = round(mean_charges_smoker, 2)
rounded_nonsmoker = round(mean_charges_nonsmoker, 2)
print('Average cost is USD {y}  for smokers and USD {n} for nonsmokers. ' \
        '\nIt means that on average, smokers pay USD {d} more than nonsmokers.'.format(
            y=rounded_smoker, 
            n=rounded_nonsmoker, 
            d=round(rounded_smoker - rounded_nonsmoker, 2)
            )
    )


# The tick labels at the end of this cell are assigned by position
# (0 -> 'smoker', 1 -> 'nonsmoker'). Without an explicit order seaborn
# infers the category order from the data, so the labels would be wrong
# whenever 'no' happened to come first. Pinning order and hue_order ties
# both the labels and the palette colors to the 'yes'/'no' categories.
smoker_order = ['yes', 'no']
ax = sns.boxplot(data, x='smoker', y='charges', 
    hue='smoker', order=smoker_order, hue_order=smoker_order,
    palette=colors_smokers)
ax.set(xlabel=None, ylabel='Charges (USD)', 
       title='Spread of insurance cost for smokers and nonsmokers')
# Replace the raw 'yes'/'no' tick labels with readable names
ax.set_xticks([0,1], labels=['smoker', 'nonsmoker'])

Average cost is USD 32050.23  for smokers and USD 8434.27 for nonsmokers. 
It means that on average, smokers pay USD 23615.96 more than nonsmokers.

[<matplotlib.axis.XTick at 0x1f69f546d50>,
 <matplotlib.axis.XTick at 0x1f69f57a210>]

# Pie charts of the smoker / nonsmoker share, one chart per sex
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

# Colors are looked up by label instead of by position: value_counts()
# orders its result by frequency, so a positional color list would swap the
# two colors in any group where smokers outnumber nonsmokers.
# Assumes 'smoker' only holds 'yes' and 'no', as the rest of the analysis does.
pie_colors = {'no': light_green, 'yes': dark_grey}

for ax, (sex, group) in zip(axes, data.groupby('sex')):
    counts = group['smoker'].value_counts()
    ax.pie(
        counts,
        labels=counts.index,
        autopct='%1.1f%%',
        colors=[pie_colors[status] for status in counts.index]
    )
    ax.set_title(f'Smoker Distribution — {sex}')

plt.suptitle('Smoker Proportion by Sex')
plt.show()

# Insurance charges per BMI group as bars, with error bars switched off
ax = sns.barplot(
    x='bmi_group',
    y='charges',
    data=data,
    color=purple,
    errorbar=None,
)
ax.set(xlabel='BMI', ylabel='Charges (USD)',
       title='Average insurance cost per BMI group')

[Text(0.5, 0, 'BMI'),
 Text(0, 0.5, 'Charges (USD)'),
 Text(0.5, 1.0, 'Average insurance cost per BMI group')]

# Every data point: charges against BMI
charges_bmi_axis_text = {
    'xlabel': 'BMI',
    'ylabel': 'Charges (USD)',
    'title': 'Relation between insurance cost and BMI',
}
ax = sns.scatterplot(x='bmi', y='charges', data=data, color=purple)
ax.set(**charges_bmi_axis_text)

[Text(0.5, 0, 'BMI'),
 Text(0, 0.5, 'Charges (USD)'),
 Text(0.5, 1.0, 'Relation between insurance cost and BMI')]

# Charges against BMI again, with points colored by smoker status
ax = sns.scatterplot(
    x='bmi',
    y='charges',
    hue='smoker',
    data=data,
    palette=colors_smokers,
)
ax.set(xlabel='BMI', ylabel='Charges (USD)',
       title='Relation between insurance cost and BMI')

[Text(0.5, 0, 'BMI'),
 Text(0, 0.5, 'Charges (USD)'),
 Text(0.5, 1.0, 'Relation between insurance cost and BMI')]

# Insurance charges per number of children; hue is only used for coloring,
# so the legend is turned off
ax = sns.barplot(
    x='children',
    y='charges',
    hue='children',
    data=data,
    palette=colors_children,
    errorbar=None,
    legend=False,
)
ax.set(xlabel='Number of children', ylabel='Average charges (USD)',
       title='Average insurance cost per number of children')

[Text(0.5, 0, 'Number of children'),
 Text(0, 0.5, 'Average charges (USD)'),
 Text(0.5, 1.0, 'Average insurance cost per number of children')]

# Insurance charges per age group, split by number of children
ax = sns.barplot(
    x='age_group',
    y='charges',
    hue='children',
    data=data,
    palette=colors_children,
    errorbar=None,
)
ax.set(xlabel='Age group', ylabel='Average cost (USD)',
       title='Average insurance cost per number of children')

[Text(0.5, 0, 'Age group'),
 Text(0, 0.5, 'Average cost (USD)'),
 Text(0.5, 1.0, 'Average insurance cost per number of children')]

# Pearson correlation needs numeric input, so the two yes/no-style
# categorical columns are replaced by 0/1 codes in a separate frame
correlation_columns = ['smoker', 'charges', 'age', 'bmi', 'children', 'sex']
data_encoded = data[correlation_columns].copy()
data_encoded['smoker'] = data_encoded['smoker'].map({'yes': 1, 'no': 0})
data_encoded['sex'] = data_encoded['sex'].map({'male': 1, 'female': 0})

# Annotated heatmap of the pairwise correlations on a fixed -1..1 color scale
correlation_matrix = data_encoded.corr()
ax = sns.heatmap(
    correlation_matrix,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt='.2f',
    cmap=sns.color_palette('coolwarm', as_cmap=True),
)
ax.set(title='Pearson correlation')

[Text(0.5, 1.0, 'Pearson correlation')]

# Mean charges for every (age group, smoker) combination present in the data
# NOTE(review): this series is not used by the plot below - confirm whether
# it is needed elsewhere
charges_by_age_and_smoker = data.groupby(['age_group', 'smoker'],
                                         observed=True)['charges']
data_avg_charges_smoker_age = charges_by_age_and_smoker.mean()


# Insurance charges per age group, split by smoker status
ax = sns.barplot(
    x='age_group',
    y='charges',
    hue='smoker',
    data=data,
    palette=colors_smokers,
    errorbar=None,
)
ax.set(xlabel='Age group', ylabel='Average charges (USD)',
       title='Average insurance cost per age group')

[Text(0.5, 0, 'Age group'),
 Text(0, 0.5, 'Average charges (USD)'),
 Text(0.5, 1.0, 'Average insurance cost per age group')]

	age	bmi	children	charges
count	1338.000000	1338.000000	1338.000000	1338.000000
mean	39.207025	30.663397	1.094918	13270.422265
std	14.049960	6.098187	1.205493	12110.011237
min	18.000000	15.960000	0.000000	1121.873900
25%	27.000000	26.296250	0.000000	4740.287150
50%	39.000000	30.400000	1.000000	9382.033000
75%	51.000000	34.693750	2.000000	16639.912515
max	64.000000	53.130000	5.000000	63770.428010

US Medical Insurance Cost

Table of contents

Dataset overview

Exploring the diversity of the data

What influences the cost of medical insurance?

Age

Sex

Smoking

BMI

Number of children

Correlations

Conclusion

	age	sex	bmi	children	smoker	region	charges
752	64	male	37.905	0	no	northwest	14210.53595
1022	47	male	36.080	1	yes	southeast	42211.13820
342	60	female	27.550	0	no	northeast	13217.09450
121	18	male	23.750	0	no	northeast	1705.62450
1104	37	male	29.800	0	no	southwest	20420.60465