Merge pull request #36 from Nima-Jamshidi/milestone-03

final changes without changing explore_data.R
STAT547-UBC-2019-20 · Mar 17, 2020 · b947d1e · b947d1e
2 parents b777c6f + 0d6833e
commit b947d1e
Show file tree

Hide file tree

Showing 13 changed files with 42 additions and 33 deletions.
diff --git a/data/linear_model/augmented.rds b/data/linear_model/augmented.rds
diff --git a/data/linear_model/glanced.rds b/data/linear_model/glanced.rds
diff --git a/data/linear_model/model.rds b/data/linear_model/model.rds
diff --git a/data/linear_model/tidied.rds b/data/linear_model/tidied.rds
diff --git a/docs/milestone3.Rmd b/docs/milestone3.Rmd
@@ -5,10 +5,8 @@ date: "14/03/2020"
 output: 
   bookdown::html_document2:
     toc: true
-    keep_md: true
   bookdown::pdf_document2:
     toc: true
-
 ---
 
 ```{r setup, include=FALSE}
@@ -49,15 +47,21 @@ This dataset explains the medical insurance costs of a small sample of the USA p
 ```{r load the data, echo=FALSE}
 # import the data 
 costs <- read_csv(
-  here("data", "raw", "data.csv"),
+  here("data", "processed", "processed_data.csv"),
   col_types = cols(
-    age = col_integer(),
-    sex = readr::col_factor(),
-    bmi = col_double(),
-    children = col_integer(),
-    smoker = readr::col_factor(),
-    region = readr::col_factor(),
-    charges = col_double()
+    age = col_double(),
+  sex = col_character(),
+  bmi = col_double(),
+  children = col_double(),
+  smoker = col_character(),
+  region = col_character(),
+  sex_dummy = col_double(),
+  smoker_dummy = col_double(),
+  southeast = col_double(),
+  southwest = col_double(),
+  northwest = col_double(),
+  northeast = col_double(),
+  charges = col_double()
   )
 )
 ```
@@ -81,13 +85,18 @@ Here is a summary of the dataset, and the values of each variable (Table \@ref(t
 
 ```{r summary, echo=FALSE}
 options(knitr.kable.NA="")
-kable(summary(costs), caption = "summary of the dataset")
+kable(summary(costs %>% select(-sex_dummy,-smoker_dummy,-northeast,-northwest,-southeast,-southwest)), caption = "summary of the dataset")
+```
+```{r correlation, include=FALSE}
+costs_correlations <- costs %>%
+    select(-sex, -smoker, -region) %>% # remove the columns that are not dummy variables
+    cor()
 ```
 
 Next, we want to inspect the data set to see if there is any correlation between the variables. From now on we want to consider charges as our dependent variable.
 In order to analyze correlation between variables, the categorical variables with two categories are translated into binary vectors. The only categorical variable with more than two categories is region. We split this variable into four different binary vectors, each indicating whether the sample belongs to that category (1) or not (0).
 
-After using dummy variables for sex, smoker, and region, according to the correlogram show in Figure \@ref(fig:corrplot-png), smoker and charges has the strongest correlation of 0.79. No high collinearity between independent variables is observed.
+After using dummy variables for sex, smoker, and region, according to the correlogram shown in Figure \@ref(fig:corrplot-png), smoker and charges have the strongest correlation of `r round(costs_correlations[5,10],2)`. No high collinearity between independent variables is observed.
 
 
 ```{r corrplot-png, echo = FALSE, fig.cap="Correlation plot", fig.align = 'center', out.width='75%', out.height='75%'}

diff --git a/docs/milestone3.html b/docs/milestone3.html
diff --git a/docs/milestone3.pdf b/docs/milestone3.pdf
diff --git a/images/lmplot001.png b/images/lmplot001.png
diff --git a/images/lmplot002.png b/images/lmplot002.png
diff --git a/images/lmplot003.png b/images/lmplot003.png
diff --git a/images/lmplot004.png b/images/lmplot004.png
diff --git a/images/lmplot005.png b/images/lmplot005.png
diff --git a/scripts/linear_model.R b/scripts/linear_model.R
@@ -50,22 +50,22 @@ main <- function(processed_data,image_path,lm_path) {
   if (!dir.exists(here(lm_path))) {
     dir.create(here(lm_path), recursive = TRUE)
   }
-
+  #read the processed data.
   data <- read.csv(here(processed_data))
-
+  #conduct linear regression
   model <-
     lm(charges ~ age + sex + bmi + children + smoker + region, data = data)
-
+  #plot the first 4 diagnostics graphs
   plots <-
     png(here(image_path, "lmplot%03d.png"))
   plot(model, ask = FALSE)
   dev.off()
 
-
+  #linear regression statistics 
   glanced <- glance(model)
   tidied <- tidy(model)
   augmented <- augment(model)
-
+  #plot the fifth diagnostics graph
   augmented %>%
     ggplot(aes(x = .fitted, y = charges)) +
     geom_point() +
@@ -84,7 +84,7 @@ main <- function(processed_data,image_path,lm_path) {
     )
   )
 
-
+  #save the statistics in separate .rds files
   flist <-
     list(
       model = model,