Predicting pneumonia outcomes: EDA part 2

This post is a supplementary material for an assignment. The assignment is part of the Augmented Machine Learning unit for a Specialised Diploma in Data Science for Business. The aim of the assignment is to use DataRobot for predictive modelling. Exploratory data analysis and feature engineering will be done here in R before the data is imported into DataRobot.

Intro

The aim of this project is to classify if patients with Community Acquired Pneumonia (CAP) became better after seeing a doctor or became worse despite seeing a doctor. The variables of the dataset can be classified into 13 categories. The first 8 categories have been explored in the previous post. The remaining categories will be explored in this post.

library(tidyverse)
theme_set(theme_light())

# previously and partial EDA dataset 
# https://github.com/notast/pneumonia-outcomes/blob/master/pneumonia_EDA1.Rmd
load("CAP_EDA1.RData")

# 13 categories 
categories13<- readxl::read_excel("Incidence rate of community-acquired pneumonia in adults a population-based prospective active surveillance study in three cities in South America.xls", sheet=3)

categories13 %>%  DT::datatable(rownames = F, options = list(searchHighlight = TRUE, paging= T))


Customized EDA functions from the previous post will be used here.

dtype<- function(datafr, x){
datafr%>% select(starts_with(x, ignore.case = F)) %>% str()
}

eda_c<- function(datafr,x){
  datafr %>% select(starts_with(x, ignore.case = F)) %>%  map(~ table(.x, useNA = "always"))
}

eda_n_NAplt<- function (datafr, x){
  datafr %>% select(starts_with(x, ignore.case = F)) %>% summarise(across(starts_with(x), ~mean(is.na(.)))) %>% pivot_longer(cols = everything(), names_to= "Variables" , values_to="pct_na") %>% mutate(Variables= fct_reorder(Variables, pct_na)) %>% ggplot(aes(x=Variables, y=pct_na, fill= pct_na))+ geom_col() + coord_flip() + scale_y_continuous(labels=scales::percent_format()) + scale_fill_viridis_c(option = "plasma")}

eda_n_NAcutoff<- function(datafr, x, low, high){
  datafr%>% select(starts_with(x, ignore.case = F)) %>% summarise(across(starts_with(x), ~mean(is.na(.)))) %>% pivot_longer(cols = everything(), names_to="Variables", values_to="pct_na") %>% filter((pct_na>low & pct_na<high)) %>% pull(Variables)}

eda_n_outlier<-function(datafr, x_selected){
# nested df with plots
  plt<-datafr %>% select(all_of(x_selected)) %>% pivot_longer(cols=everything(),names_to="Variables", values_to="values") %>% nest(-Variables) %>% mutate(plot= map2(.x= data, .y= Variables, 
~ggplot(data=.x, aes(x= values)) + geom_boxplot() + labs(title = .y)
)) 
# print the plots
  for (i in 1:length(x_selected)){
    p<-plt[[3]][[i]]
    print(p)}
  }


12 Care_ continuum of care status category

(dtype(df, "Care"))
## tibble [2,112 x 7] (S3: tbl_df/tbl/data.frame)
##  $ Care_admit           : chr [1:2112] "Yes" "No" "Yes" "Yes" ...
##  $ Care_ICU             : chr [1:2112] "No" "No" "No" "No" ...
##  $ Care_breathingAid    : chr [1:2112] "No" "No" "No" "No" ...
##  $ Care_breathingAidType: chr [1:2112] NA NA NA NA ...
##  $ Care_BPSupport       : chr [1:2112] "No" "No" "Yes" "No" ...
##  $ Care_daysUnfit       : num [1:2112] 10 10 10 15 NA 12 15 6 12 NA ...
##  $ Care_GP/OutptVisit   : num [1:2112] 2 4 1 1 1 2 2 1 3 2 ...
## NULL
(eda_c(df, "Care"))
## $Care_admit
## .x
##          No Unavailable         Yes        <NA> 
##         631           2        1479           0 
## 
## $Care_ICU
## .x
##          No Unavailable         Yes        <NA> 
##        1729          53         330           0 
## 
## $Care_breathingAid
## .x
##          No Unavailable         Yes        <NA> 
##        1928          60         124           0 
## 
## $Care_breathingAidType
## .x
##          ARM CPAP/Bilevel        Other         <NA> 
##           91           26            5         1990 
## 
## $Care_BPSupport
## .x
##          No Unavailable         Yes        <NA> 
##        1930          85          97           0 
## 
## $Care_daysUnfit
## .x
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##    9   17   30   45   47  116   37  298   62   25  408   22   76   23  210  186 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31 
##   18   11   17    6  141   58    9    4    7   25    2    2    1    1   67    2 
##   33   35   36   37   40   45   47   48   50   53   58   59   60   65   69   75 
##    4    6    1    1    8    5    1    1    3    1    1    1    3    1    1    1 
##   80   99  181 <NA> 
##    1   20    1   69 
## 
## $`Care_GP/OutptVisit`
## .x
##    0    1    2    3    4    5    6    7   10   15   21   99 <NA> 
##   57 1460  307  173   56   14    2    5    1    1    1   14   21

replace 99

Again 99 appears as outliers for Care_daysUnfit and Care_GP/OutptVisit. 99 will be replaced with NA.

df<-df %>% mutate(Care_daysUnfit= na_if(Care_daysUnfit, 99),
              `Care_GP/OutptVisit`= na_if(`Care_GP/OutptVisit`, 99)) 

Admission status

Care_admit indicates if the patient was admitted to a hospital and Care_ICU indicates if patient had an ICU stay. 324 patients who were hospitalized also had ICU stay. The labels in Care_admit will include details to reflect patient who were admitted AND had ICU stay (label as Yes (w ICU)). After using information from Care_ICU to expand Care_admit, Care_ICU will be dropped.

(table(df$Care_admit, df$Care_ICU, useNA = "always"))
##              
##                 No Unavailable  Yes <NA>
##   No           584          47    0    0
##   Unavailable    0           2    0    0
##   Yes         1145           4  330    0
##   <NA>           0           0    0    0
df<-df %>% mutate(Care_admit= case_when(
  Care_admit=="Yes" & Care_ICU=="Yes" ~ "Yes (w ICU)",
  T~ Care_admit)) %>% 
  select(-Care_ICU)

(df %>% count(Care_admit, name = "new_tally")) 
## # A tibble: 4 x 2
##   Care_admit  new_tally
##   <chr>           <int>
## 1 No                631
## 2 Unavailable         2
## 3 Yes              1149
## 4 Yes (w ICU)       330

Breathing aid

Care_breathingAid indicates if patient in ICU used any breathing aids. Care_ breathingAidType details the type of breathing aids used. Details from Care_breathingAidType will be integrated into Care_breathingAid and the Care_breathingAidType will be dropped.

(table(df$Care_breathingAid, df$Care_breathingAidType, useNA = "always"))
##              
##                ARM CPAP/Bilevel Other <NA>
##   No             0            0     0 1928
##   Unavailable    0            0     0   60
##   Yes           91           26     5    2
##   <NA>           0            0     0    0
df<-df %>% mutate(Care_breathingAid= case_when(
  Care_breathingAid=="Yes" & Care_breathingAidType=="ARM" ~ "ARM",
  Care_breathingAid=="Yes" & Care_breathingAidType=="CPAP/Bilevel" ~ "CPAP/Bilevel",
  Care_breathingAid=="Yes" & Care_breathingAidType=="Other" ~ "Other",
  Care_breathingAid=="Yes" & is.na(Care_breathingAidType) ~ "Other",
  T~ Care_breathingAid)) %>% select(-Care_breathingAidType)

(count(df, Care_breathingAid, name = "new_tally"))
## # A tibble: 5 x 2
##   Care_breathingAid new_tally
##   <chr>                 <int>
## 1 ARM                      91
## 2 CPAP/Bilevel             26
## 3 No                     1928
## 4 Other                     7
## 5 Unavailable              60


Wrap up

The original dataset had 2302 rows and 176 columns, after EDA the dataset has 2112 rows and 78 columns. More than half of the columns were removed and compressed via EDA.

# Clean up intermediate columns created during EDA 
df<-df %>% select(-Used) %>% rename(Abx_no=New_Abx_no)

dim(df)
## [1] 2112   78

The cleaned up dataset is ready for some action. In the next post, some feature engineering will be done.

df  %>%  DT::datatable(rownames = F, options = list(searchHighlight = TRUE, paging= T))
#save(df, file = "CAP_EDA2.RData")