Data Visualization

Part II

Agenda

We are going to visually analyze two datasets and see if we can tell stories from the visuals.

Setting up

Let’s first load the ggplot2 package:

# Install ggplot2 only when it is missing, then attach it.
# requireNamespace() checks availability without attaching the package, so the
# single library() call below is the one place that loads it -- and it stops
# with an error if the installation did not succeed.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

Titanic Survival

Load and explore the dataset

# Read the Titanic passenger table from the course repository and preview
# its first six rows.
titanic <- read.csv(
  file = "https://raw.githubusercontent.com/ahmedmoustafa/datasets/main/titanic/titanic.csv"
)
head(titanic, n = 6L)
name survived sex age class
Allen, Miss. Elisabeth Walton yes female 29.0000 1st
Allison, Master. Hudson Trevor yes male 0.9167 1st
Allison, Miss. Helen Loraine no female 2.0000 1st
Allison, Mr. Hudson Joshua Crei no male 30.0000 1st
Allison, Mrs. Hudson J C (Bessi no female 25.0000 1st
Anderson, Mr. Harry yes male 48.0000 1st
# Turn the categorical columns into factors so that summaries and plots treat
# them as groups; class gets an explicit level order (1st, 2nd, 3rd).
titanic <- within(titanic, {
  survived <- factor(survived)
  sex <- factor(sex)
  class <- factor(class, levels = c("1st", "2nd", "3rd"))
})
head(titanic, n = 6L)
name survived sex age class
Allen, Miss. Elisabeth Walton yes female 29.0000 1st
Allison, Master. Hudson Trevor yes male 0.9167 1st
Allison, Miss. Helen Loraine no female 2.0000 1st
Allison, Mr. Hudson Joshua Crei no male 30.0000 1st
Allison, Mrs. Hudson J C (Bessi no female 25.0000 1st
Anderson, Mr. Harry yes male 48.0000 1st
# Per-column overview: level counts for the factor columns and quantiles
# (plus the number of missing values) for age.
summary(titanic)
name survived sex age class
Length:1309 no :809 female:466 Min. : 0.1667 1st:323
Class :character yes:500 male :843 1st Qu.:21.0000 2nd:277
Mode :character NA NA Median :28.0000 3rd:709
NA NA NA Mean :29.8811 NA
NA NA NA 3rd Qu.:39.0000 NA
NA NA NA Max. :80.0000 NA
NA NA NA NA’s :263 NA

Survival by class

# Passenger counts per class, survivors and non-survivors side by side.
ggplot(titanic, aes(x = class, fill = survived)) +
  geom_bar(position = "dodge")

# Same bars rescaled to proportions, so survival rates can be compared
# across classes of different sizes.
ggplot(titanic, aes(x = class, fill = survived)) +
  geom_bar(position = "fill")

Survival by sex

# Passenger counts per sex, survivors and non-survivors side by side.
ggplot(titanic, aes(x = sex, fill = survived)) +
  geom_bar(position = "dodge")

# Survival shown as a proportion within each sex.
ggplot(titanic, aes(x = sex, fill = survived)) +
  geom_bar(position = "fill")

Survival by age

# Overlaid age densities for the two outcomes; the transparency keeps both
# curves visible where they overlap.
ggplot(titanic, aes(x = age, fill = survived)) +
  geom_density(alpha = 0.5)

# Age distribution of survivors versus non-survivors as boxplots.
ggplot(titanic, aes(x = survived, y = age)) +
  geom_boxplot()

# Same comparison as violins, which also show the shape of each distribution.
ggplot(titanic, aes(x = survived, y = age)) +
  geom_violin()

Survival by age & sex

# Age by sex, with one box per survival outcome inside each sex.
ggplot(titanic, aes(x = sex, y = age, fill = survived)) +
  geom_boxplot(alpha = 0.5)

Survival by class & age

# Age by passenger class, with one box per survival outcome inside each class.
ggplot(titanic, aes(x = class, y = age, fill = survived)) +
  geom_boxplot(alpha = 0.5)

Smoking and Pregnancy

Load and explore the dataset

# Read the smoking-and-pregnancy table from the course repository and preview
# its first six rows.
smoking <- read.csv(
  file = "https://raw.githubusercontent.com/ahmedmoustafa/datasets/main/smoking/smoking.csv"
)
head(smoking, n = 6L)
id date gestation weight parity mom.race mom.age mom.edu mom.height mom.weight dad.race dad.age dad.edu dad.height dad.weight marital income smoke quit.time cigs
15 1411 284 120 1 asian 27 5 62 100 asian 31 5 65 110 1 1 never 0 0
20 1499 282 113 2 white 33 5 64 135 white 38 5 70 148 1 4 never 0 0
100 1673 286 136 4 white 25 2 62 93 white 28 2 64 130 1 4 until_pregnancy 2 2
129 1562 245 132 2 black 23 1 65 140 black 23 4 71 192 1 2 never 0 0
142 1408 289 120 3 white 25 4 62 125 white 26 1 70 180 0 2 never 0 0
171 1593 282 144 4 white 32 2 64 124 white 36 1 74 185 1 2 now 1 1
# Convert the coded / labelled columns to factors so they are handled as
# categories rather than as numbers or free text.
smoking <- within(smoking, {
  parity <- factor(parity)
  mom.race <- factor(mom.race)
  mom.edu <- factor(mom.edu)
  dad.race <- factor(dad.race)
  dad.edu <- factor(dad.edu)
  marital <- factor(marital)
  income <- factor(income)
  smoke <- factor(smoke)
  quit.time <- factor(quit.time)
  cigs <- factor(cigs)
})
head(smoking, n = 6L)
id date gestation weight parity mom.race mom.age mom.edu mom.height mom.weight dad.race dad.age dad.edu dad.height dad.weight marital income smoke quit.time cigs
15 1411 284 120 1 asian 27 5 62 100 asian 31 5 65 110 1 1 never 0 0
20 1499 282 113 2 white 33 5 64 135 white 38 5 70 148 1 4 never 0 0
100 1673 286 136 4 white 25 2 62 93 white 28 2 64 130 1 4 until_pregnancy 2 2
129 1562 245 132 2 black 23 1 65 140 black 23 4 71 192 1 2 never 0 0
142 1408 289 120 3 white 25 4 62 125 white 26 1 70 180 0 2 never 0 0
171 1593 282 144 4 white 32 2 64 124 white 36 1 74 185 1 2 now 1 1
# Per-column overview: quantiles for the numeric columns and level counts for
# the factor columns.
summary(smoking)
id date gestation weight parity mom.race mom.age mom.edu mom.height mom.weight dad.race dad.age dad.edu dad.height dad.weight marital income smoke quit.time cigs
Min. : 15 Min. :1350 Min. :148.0 Min. : 55.0 1 :152 asian : 24 Min. :15.00 0: 3 Min. :54.0 Min. : 87.0 asian : 25 Min. :18.00 2 :189 Min. :60.00 Min. :110.0 0: 2 2 :101 never :282 0 :282 0 :282
1st Qu.:5426 1st Qu.:1469 1st Qu.:272.2 1st Qu.:108.2 0 :145 black :131 1st Qu.:23.00 1: 85 1st Qu.:62.0 1st Qu.:115.0 black :132 1st Qu.:25.00 5 :165 1st Qu.:68.00 1st Qu.:155.0 1:600 1 :100 now :216 1 :216 5 :101
Median :6907 Median :1574 Median :280.0 Median :120.0 2 :117 mexican: 18 Median :27.00 2:229 Median :64.0 Median :125.0 mexican: 17 Median :30.00 4 :131 Median :71.00 Median :170.0 2: 6 3 :100 once_not_now : 60 2 : 52 1 : 75
Mean :6090 Mean :1559 Mean :278.8 Mean :119.3 3 : 82 mixed : 14 Mean :27.54 3: 32 Mean :64.1 Mean :128.9 mixed : 15 Mean :30.44 1 : 91 Mean :70.27 Mean :170.6 3: 1 7 : 90 until_pregnancy: 52 3 : 15 2 : 73
3rd Qu.:7792 3rd Qu.:1650 3rd Qu.:288.0 3rd Qu.:131.0 4 : 49 white :423 3rd Qu.:31.00 4:152 3rd Qu.:66.0 3rd Qu.:140.0 white :421 3rd Qu.:35.00 3 : 20 3rd Qu.:72.00 3rd Qu.:185.0 5: 1 4 : 71 NA 4 : 13 3 : 40
Max. :9263 Max. :1714 Max. :338.0 Max. :174.0 5 : 31 NA Max. :43.00 5:109 Max. :72.0 Max. :220.0 NA Max. :53.00 0 : 12 Max. :78.00 Max. :260.0 NA 5 : 65 NA 7 : 13 6 : 17
NA NA NA NA (Other): 34 NA NA NA NA NA NA NA (Other): 2 NA NA NA (Other): 83 NA (Other): 19 (Other): 22

Mom’s smoking and baby’s weight

# Baby's weight for each category of the mother's smoking status.
ggplot(smoking, aes(x = smoke, y = weight)) +
  geom_boxplot()

Mom’s smoking and baby’s weight with reordered x-axis

# Same boxplots, with the smoking categories sorted by median baby weight so
# the trend reads left to right.
ggplot(
  smoking,
  aes(x = reorder(smoke, weight, FUN = median), y = weight)
) +
  geom_boxplot()

Mom’s race and baby’s weight

# Baby's weight by mother's race, groups sorted by median baby weight.
ggplot(
  smoking,
  aes(x = reorder(mom.race, weight, FUN = median), y = weight)
) +
  geom_boxplot()

Dad’s race and baby’s weight

# Baby's weight by father's race, groups sorted by median baby weight.
ggplot(
  smoking,
  aes(x = reorder(dad.race, weight, FUN = median), y = weight)
) +
  geom_boxplot()

Mom’s race and baby’s weight and dad’s race

# Baby's weight by mother's race (groups sorted by median weight), with every
# baby overlaid as a point colored by the father's race.
# The x/y mapping is shared by both layers, so it is declared once here.
ggplot(
  smoking,
  aes(x = reorder(mom.race, weight, FUN = median), y = weight)
) +
  # The jitter layer below already draws every observation, so hide the
  # boxplot's own outlier markers; otherwise each outlier is plotted twice.
  geom_boxplot(outlier.shape = NA) +
  # Spread the points horizontally only: height = 0 keeps each weight at its
  # recorded value instead of also nudging it vertically.
  geom_jitter(aes(color = dad.race), alpha = 0.5, width = 0.3, height = 0)

Mom’s height and mom’s weight

# Mother's weight against her height: points colored by her race, plus one
# overall linear trend line (the color mapping is limited to the point layer,
# so the fit is not split by race).
ggplot(smoking, aes(x = mom.height, y = mom.weight)) +
  geom_point(aes(color = mom.race), alpha = 0.5) +
  geom_smooth(method = "lm")

# Simple linear regression of mother's weight on mother's height.
model <- lm(mom.weight ~ mom.height, data = smoking)
summary(model)

Call:
lm(formula = mom.weight ~ mom.height, data = smoking)

Residuals:
    Min      1Q  Median      3Q     Max 
-38.579 -11.933  -3.515   7.276  94.839 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -86.174     18.671  -4.615 4.79e-06 ***
mom.height     3.354      0.291  11.526  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 18.55 on 608 degrees of freedom
Multiple R-squared:  0.1793,    Adjusted R-squared:  0.178 
F-statistic: 132.8 on 1 and 608 DF,  p-value: < 2.2e-16

Dad’s height and dad’s weight

# Father's weight against his height: points colored by the father's race,
# plus one overall linear trend line.
# The points were previously colored by mom.race, which mixed the mother's
# attribute into a plot that is otherwise entirely about the father; dad.race
# mirrors how the mothers' plot uses mom.race.
ggplot(smoking, aes(x = dad.height, y = dad.weight)) +
  geom_point(aes(color = dad.race), alpha = 0.5) +
  geom_smooth(method = "lm")

# Simple linear regression of father's weight on father's height.
model <- lm(dad.weight ~ dad.height, data = smoking)
summary(model)

Call:
lm(formula = dad.weight ~ dad.height, data = smoking)

Residuals:
    Min      1Q  Median      3Q     Max 
-48.067 -13.067  -1.825  10.554  86.243 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -132.2898    18.8057  -7.035  5.4e-12 ***
dad.height     4.3105     0.2674  16.120  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 19.02 on 608 degrees of freedom
Multiple R-squared:  0.2994,    Adjusted R-squared:  0.2983 
F-statistic: 259.9 on 1 and 608 DF,  p-value: < 2.2e-16

Mom’s weight and dad’s weight

# Father's weight against mother's weight; color encodes the mother's race
# and point shape encodes the father's race.
ggplot(
  smoking,
  aes(x = mom.weight, y = dad.weight, color = mom.race, shape = dad.race)
) +
  geom_point(alpha = 0.5)

# Simple linear regression of father's weight on mother's weight.
model <- lm(dad.weight ~ mom.weight, data = smoking)
summary(model)

Call:
lm(formula = dad.weight ~ mom.weight, data = smoking)

Residuals:
    Min      1Q  Median      3Q     Max 
-57.481 -15.817  -2.051  14.097  93.646 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 141.54810    5.74900  24.621  < 2e-16 ***
mom.weight    0.22551    0.04407   5.117 4.16e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 22.25 on 608 degrees of freedom
Multiple R-squared:  0.04129,   Adjusted R-squared:  0.03972 
F-statistic: 26.19 on 1 and 608 DF,  p-value: 4.159e-07

Mom’s smoking and mom’s education

# Share of each smoking status within every level of the mother's education.
ggplot(smoking, aes(x = mom.edu, fill = smoke)) +
  geom_bar(position = "fill")

# Same chart with mom.edu wrapped in factor(). The column was already
# converted to a factor when the data was prepared, so the bars match the
# previous plot; only the x-axis title differs.
ggplot(smoking, aes(x = factor(mom.edu), fill = smoke)) +
  geom_bar(position = "fill")

Mom’s smoking and the family’s income

# Share of each smoking status within every family income level.
# income was already converted to a factor when the data was prepared, so it
# is mapped directly; re-wrapping it in factor() changed nothing except to
# turn the axis title into the raw expression "factor(income)".
ggplot(smoking, aes(x = income, fill = smoke)) +
  geom_bar(position = "fill")

Mom’s race and mom’s weight

# Mother's weight by her race, groups sorted by median weight.
ggplot(
  smoking,
  aes(x = reorder(mom.race, mom.weight, FUN = median), y = mom.weight)
) +
  geom_boxplot()

Dad’s race and dad’s weight

# Father's weight by his race, groups sorted by median weight.
ggplot(
  smoking,
  aes(x = reorder(dad.race, dad.weight, FUN = median), y = dad.weight)
) +
  geom_boxplot()

The End