Part II
We are going to visually analyze two datasets and see if we can tell stories from the visuals.
Let’s first load the `ggplot2` package:
# Attach ggplot2 before anything else: the plotting code later in this
# document calls ggplot(), aes(), geom_point() and geom_smooth() unqualified.
library(ggplot2)

# Read the Titanic passenger table from its remote CSV and preview the
# first rows.
titanic = read.csv("https://raw.githubusercontent.com/ahmedmoustafa/datasets/main/titanic/titanic.csv")
head(titanic)
name | survived | sex | age | class |
---|---|---|---|---|
Allen, Miss. Elisabeth Walton | yes | female | 29.0000 | 1st |
Allison, Master. Hudson Trevor | yes | male | 0.9167 | 1st |
Allison, Miss. Helen Loraine | no | female | 2.0000 | 1st |
Allison, Mr. Hudson Joshua Crei | no | male | 30.0000 | 1st |
Allison, Mrs. Hudson J C (Bessi | no | female | 25.0000 | 1st |
Anderson, Mr. Harry | yes | male | 48.0000 | 1st |
# Recode the categorical passenger attributes as factors.
# `survived` and `sex` take their levels from the values present in the data;
# `class` has its levels spelled out explicitly as 1st, 2nd, 3rd.
titanic[c("survived", "sex")] = lapply(titanic[c("survived", "sex")], factor)
titanic[["class"]] = factor(titanic[["class"]], levels = c("1st", "2nd", "3rd"))

# Preview the first six rows after recoding.
head(titanic, n = 6L)
name | survived | sex | age | class |
---|---|---|---|---|
Allen, Miss. Elisabeth Walton | yes | female | 29.0000 | 1st |
Allison, Master. Hudson Trevor | yes | male | 0.9167 | 1st |
Allison, Miss. Helen Loraine | no | female | 2.0000 | 1st |
Allison, Mr. Hudson Joshua Crei | no | male | 30.0000 | 1st |
Allison, Mrs. Hudson J C (Bessi | no | female | 25.0000 | 1st |
Anderson, Mr. Harry | yes | male | 48.0000 | 1st |
name | survived | sex | age | class | |
---|---|---|---|---|---|
Length:1309 | no :809 | female:466 | Min. : 0.1667 | 1st:323 | |
Class :character | yes:500 | male :843 | 1st Qu.:21.0000 | 2nd:277 | |
Mode :character | NA | NA | Median :28.0000 | 3rd:709 | |
NA | NA | NA | Mean :29.8811 | NA | |
NA | NA | NA | 3rd Qu.:39.0000 | NA | |
NA | NA | NA | Max. :80.0000 | NA | |
NA | NA | NA | NA’s :263 | NA |
# Read the smoking dataset from its remote CSV, then preview the first six
# rows.
smoking = read.csv(
  file = "https://raw.githubusercontent.com/ahmedmoustafa/datasets/main/smoking/smoking.csv"
)
head(smoking, n = 6L)
id | date | gestation | weight | parity | mom.race | mom.age | mom.edu | mom.height | mom.weight | dad.race | dad.age | dad.edu | dad.height | dad.weight | marital | income | smoke | quit.time | cigs |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15 | 1411 | 284 | 120 | 1 | asian | 27 | 5 | 62 | 100 | asian | 31 | 5 | 65 | 110 | 1 | 1 | never | 0 | 0 |
20 | 1499 | 282 | 113 | 2 | white | 33 | 5 | 64 | 135 | white | 38 | 5 | 70 | 148 | 1 | 4 | never | 0 | 0 |
100 | 1673 | 286 | 136 | 4 | white | 25 | 2 | 62 | 93 | white | 28 | 2 | 64 | 130 | 1 | 4 | until_pregnancy | 2 | 2 |
129 | 1562 | 245 | 132 | 2 | black | 23 | 1 | 65 | 140 | black | 23 | 4 | 71 | 192 | 1 | 2 | never | 0 | 0 |
142 | 1408 | 289 | 120 | 3 | white | 25 | 4 | 62 | 125 | white | 26 | 1 | 70 | 180 | 0 | 2 | never | 0 | 0 |
171 | 1593 | 282 | 144 | 4 | white | 32 | 2 | 64 | 124 | white | 36 | 1 | 74 | 185 | 1 | 2 | now | 1 | 1 |
# Recode every coded/categorical column of `smoking` as a factor in a single
# pass. The work happens inside local() so the helper vector of column names
# does not linger in the workspace; only `smoking` itself is reassigned.
smoking = local({
  categorical = c(
    "parity", "mom.race", "mom.edu", "dad.race", "dad.edu",
    "marital", "income", "smoke", "quit.time", "cigs"
  )
  smoking[categorical] = lapply(smoking[categorical], factor)
  smoking
})

# Preview the first six rows after recoding.
head(smoking, n = 6L)
id | date | gestation | weight | parity | mom.race | mom.age | mom.edu | mom.height | mom.weight | dad.race | dad.age | dad.edu | dad.height | dad.weight | marital | income | smoke | quit.time | cigs |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
15 | 1411 | 284 | 120 | 1 | asian | 27 | 5 | 62 | 100 | asian | 31 | 5 | 65 | 110 | 1 | 1 | never | 0 | 0 |
20 | 1499 | 282 | 113 | 2 | white | 33 | 5 | 64 | 135 | white | 38 | 5 | 70 | 148 | 1 | 4 | never | 0 | 0 |
100 | 1673 | 286 | 136 | 4 | white | 25 | 2 | 62 | 93 | white | 28 | 2 | 64 | 130 | 1 | 4 | until_pregnancy | 2 | 2 |
129 | 1562 | 245 | 132 | 2 | black | 23 | 1 | 65 | 140 | black | 23 | 4 | 71 | 192 | 1 | 2 | never | 0 | 0 |
142 | 1408 | 289 | 120 | 3 | white | 25 | 4 | 62 | 125 | white | 26 | 1 | 70 | 180 | 0 | 2 | never | 0 | 0 |
171 | 1593 | 282 | 144 | 4 | white | 32 | 2 | 64 | 124 | white | 36 | 1 | 74 | 185 | 1 | 2 | now | 1 | 1 |
id | date | gestation | weight | parity | mom.race | mom.age | mom.edu | mom.height | mom.weight | dad.race | dad.age | dad.edu | dad.height | dad.weight | marital | income | smoke | quit.time | cigs | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Min. : 15 | Min. :1350 | Min. :148.0 | Min. : 55.0 | 1 :152 | asian : 24 | Min. :15.00 | 0: 3 | Min. :54.0 | Min. : 87.0 | asian : 25 | Min. :18.00 | 2 :189 | Min. :60.00 | Min. :110.0 | 0: 2 | 2 :101 | never :282 | 0 :282 | 0 :282 | |
1st Qu.:5426 | 1st Qu.:1469 | 1st Qu.:272.2 | 1st Qu.:108.2 | 0 :145 | black :131 | 1st Qu.:23.00 | 1: 85 | 1st Qu.:62.0 | 1st Qu.:115.0 | black :132 | 1st Qu.:25.00 | 5 :165 | 1st Qu.:68.00 | 1st Qu.:155.0 | 1:600 | 1 :100 | now :216 | 1 :216 | 5 :101 | |
Median :6907 | Median :1574 | Median :280.0 | Median :120.0 | 2 :117 | mexican: 18 | Median :27.00 | 2:229 | Median :64.0 | Median :125.0 | mexican: 17 | Median :30.00 | 4 :131 | Median :71.00 | Median :170.0 | 2: 6 | 3 :100 | once_not_now : 60 | 2 : 52 | 1 : 75 | |
Mean :6090 | Mean :1559 | Mean :278.8 | Mean :119.3 | 3 : 82 | mixed : 14 | Mean :27.54 | 3: 32 | Mean :64.1 | Mean :128.9 | mixed : 15 | Mean :30.44 | 1 : 91 | Mean :70.27 | Mean :170.6 | 3: 1 | 7 : 90 | until_pregnancy: 52 | 3 : 15 | 2 : 73 | |
3rd Qu.:7792 | 3rd Qu.:1650 | 3rd Qu.:288.0 | 3rd Qu.:131.0 | 4 : 49 | white :423 | 3rd Qu.:31.00 | 4:152 | 3rd Qu.:66.0 | 3rd Qu.:140.0 | white :421 | 3rd Qu.:35.00 | 3 : 20 | 3rd Qu.:72.00 | 3rd Qu.:185.0 | 5: 1 | 4 : 71 | NA | 4 : 13 | 3 : 40 | |
Max. :9263 | Max. :1714 | Max. :338.0 | Max. :174.0 | 5 : 31 | NA | Max. :43.00 | 5:109 | Max. :72.0 | Max. :220.0 | NA | Max. :53.00 | 0 : 12 | Max. :78.00 | Max. :260.0 | NA | 5 : 65 | NA | 7 : 13 | 6 : 17 | |
NA | NA | NA | NA | (Other): 34 | NA | NA | NA | NA | NA | NA | NA | (Other): 2 | NA | NA | NA | (Other): 83 | NA | (Other): 19 | (Other): 22 |
# Mother's weight against mother's height.
# The x/y mapping is declared once on the plot and inherited by both layers;
# colour (mother's race) is mapped only on the point layer, so the
# linear-model smoother is fitted as one line over all observations.
ggplot(data = smoking, mapping = aes(x = mom.height, y = mom.weight)) +
  geom_point(mapping = aes(color = mom.race), alpha = 0.5) +
  geom_smooth(method = "lm")
Call:
lm(formula = mom.weight ~ mom.height, data = smoking)
Residuals:
Min 1Q Median 3Q Max
-38.579 -11.933 -3.515 7.276 94.839
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -86.174 18.671 -4.615 4.79e-06 ***
mom.height 3.354 0.291 11.526 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 18.55 on 608 degrees of freedom
Multiple R-squared: 0.1793, Adjusted R-squared: 0.178
F-statistic: 132.8 on 1 and 608 DF, p-value: < 2.2e-16
# Father's weight against father's height, with a single linear-model trend
# line over all observations.
# Points are coloured by the father's race (`dad.race`) so that the colour
# legend describes the same person whose height and weight are on the axes;
# the earlier version coloured these points by `mom.race`.
ggplot(smoking) +
  geom_point(aes(x = dad.height, y = dad.weight, color = dad.race), alpha = 0.5) +
  geom_smooth(aes(x = dad.height, y = dad.weight), method = "lm")
Call:
lm(formula = dad.weight ~ dad.height, data = smoking)
Residuals:
Min 1Q Median 3Q Max
-48.067 -13.067 -1.825 10.554 86.243
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -132.2898 18.8057 -7.035 5.4e-12 ***
dad.height 4.3105 0.2674 16.120 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 19.02 on 608 degrees of freedom
Multiple R-squared: 0.2994, Adjusted R-squared: 0.2983
F-statistic: 259.9 on 1 and 608 DF, p-value: < 2.2e-16
# Father's weight against mother's weight, one semi-transparent point per row.
# Colour encodes the mother's race and point shape encodes the father's race.
ggplot(data = smoking, mapping = aes(x = mom.weight, y = dad.weight)) +
  geom_point(mapping = aes(color = mom.race, shape = dad.race), alpha = 0.5)
Call:
lm(formula = dad.weight ~ mom.weight, data = smoking)
Residuals:
Min 1Q Median 3Q Max
-57.481 -15.817 -2.051 14.097 93.646
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 141.54810 5.74900 24.621 < 2e-16 ***
mom.weight 0.22551 0.04407 5.117 4.16e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 22.25 on 608 degrees of freedom
Multiple R-squared: 0.04129, Adjusted R-squared: 0.03972
F-statistic: 26.19 on 1 and 608 DF, p-value: 4.159e-07