###
###
###  Heart - Preliminary Analysis
###                                 ver 0.0.4
###
####################################################

# Purpose: load the Heart data set, recode the qualitative columns as
# factors, and run a routine exploratory pass (NA check, correlations,
# pair plots, chi-square tests of association against the response).
#
# Objects created for later use: Heart, Heart2, Orig, resp.col.name,
# Orig_num, list_cols, ChiSq.pval.

#-------------------------------------------------
###--- 0. Preliminary

library(tidyverse)

Heart <- read_csv("https://nmimoto.github.io/datasets/heart.csv")
Heart

class(Heart)               # it's already a tibble
print(Heart, n = 100)      # if you want to see more rows
print(Heart, width = 1000) # if you want to see more cols
names(Heart)

# #0  "index" row index (may appear as "X1" or "...1" when the CSV header
#             is blank -- NOTE(review): confirm against names(Heart))
# #1  "Age" age in years
# #2  "Sex" (Qualitative) sex (1 = male; 0 = female)
# #3  "ChestPain" (Qualitative) chest pain type
#         -- Value 1: typical angina
#         -- Value 2: atypical angina
#         -- Value 3: non-anginal pain
#         -- Value 4: asymptomatic
# #4  "RestBP" resting blood pressure (in mm Hg on admission to the hospital)
# #5  "Chol" serum cholesterol in mg/dl
# #6  "Fbs" (Qualitative) fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
# #7  "RestECG" (Qualitative) resting electrocardiographic results
#         -- Value 0: normal
#         -- Value 1: having ST-T wave abnormality (T wave inversions
#                     and/or ST elevation or depression of > 0.05 mV)
#         -- Value 2: showing probable or definite left ventricular
#                     hypertrophy by Estes' criteria
# #8  "MaxHR" maximum heart rate achieved
# #9  "ExAng" (Qualitative) exercise induced angina (1 = yes; 0 = no)
# #10 "Oldpeak" oldpeak = ST depression induced by exercise relative to rest
# #11 "Slope" the slope of the peak exercise ST segment
# #12 "Ca" number of major vessels (0-3) colored by fluoroscopy
# #13 "Thal" (Qualitative) Thallium stress test
#         3 = normal; 6 = fixed defect; 7 = reversible defect
# #14 "AHD" (Qualitative)(Response) Yes/No based on an angiographic test
#         diagnosis of heart disease (angiographic disease status)
#         -- Value 0: < 50% diameter narrowing
#         -- Value 1: > 50% diameter narrowing
#         (in any major vessel: attributes 59 through 68 are vessels)

#- table of the response variable
table(Heart$AHD)

###
###  column "AHD" is the response variable.
###

# Heart2 does not exist yet at this point, so show the raw data instead.
print(Heart, width = 1000)

# Drop the row-index column, rename AHD to resp, and move it to the front.
# any_of() tolerates whichever default name the index column received.
Heart2 <- Heart %>%
  select(-any_of(c("index", "X1", "...1"))) %>% # remove the index column
  rename(resp = AHD) %>%                        # rename the column
  relocate(resp)                                # move "resp" to 1st column
print(Heart2, width = 1000)

# Turn the response and the qualitative predictors into factors (fct)
# instead of leaving them as numeric / character columns.
Heart2 <- Heart2 %>%
  mutate(
    resp      = as.factor(resp),
    Sex       = as.factor(Sex),
    ChestPain = as.factor(ChestPain),
    Fbs       = as.factor(Fbs),
    RestECG   = as.factor(RestECG),
    ExAng     = as.factor(ExAng),
    Thal      = as.factor(Thal)
  )
print(Heart2, width = 1000)

table(Heart2$resp) # note that first level is "No"


#-------------------------------------------------
###--- 1. Routine Exploratory Analysis (class of resp should be factor)

Orig <- Heart2
resp.col.name <- "resp"

#- Check for N/A in data. Remove if there's any.
summary(Orig)
dim(Orig)
sum(is.na(Orig))

# If there is NA in the data, the rows containing it are dropped here.
dim(Orig)
Orig <- Orig %>% na.omit()
dim(Orig)


##----------
##- Correlation Check

# Pick columns that are numeric
library(corrplot) # install.packages("corrplot")

Orig_num <- Orig %>% select(where(is.numeric))
cor(Orig_num)
corrplot::corrplot(cor(Orig_num))
corrplot::corrplot(cor(Orig_num), method = "number")


##----------
##- Visualization

# pairs(Orig) equivalent, split in two so each panel grid stays readable;
# the response (column 1) is appended to both halves for colouring.
library(GGally) # install.packages("GGally")
GGally::ggpairs(Orig[, c(2:7, 1)], aes(color = resp, alpha = 1))
GGally::ggpairs(Orig[, c(8:14, 1)], aes(color = resp, alpha = 1))


###----------
###- Chi-sq test of association for each column

# Pick columns that have more than 1 unique value
# (chisq.test cannot be applied to a constant column).
list_cols <- Orig %>%
  select(where(~ length(unique(.x)) > 1)) %>%
  names()

# Apply chisq.test to those columns only, so that the positions in
# ChiSq.pval always line up with list_cols. The result is a one-row
# tibble with one p-value per column (resp is tested against itself).
ChiSq.pval <- Orig %>%
  select(all_of(list_cols)) %>%
  summarise(across(everything(), ~ chisq.test(.x, Orig$resp)$p.value))
ChiSq.pval

barplot(as.matrix(ChiSq.pval), las = 2, cex.names = 1,
        main = "p-values from Chi-sq test of association")
abline(h = .05, col = "red")

which(ChiSq.pval < .05)            # Col num of variables w <.05 p-value
list_cols[which(ChiSq.pval < .05)] # Top list of 'important' variables

which(ChiSq.pval > .6)             # Col num of variables w >.6 p-value
list_cols[which(ChiSq.pval > .6)]  # Top list of 'unimportant' variables

### End of Routine Exploratory Analysis.


# Recorded output from a previous run:
#
# which(ChiSq.pval < .05)            # Col num of variables w <.05 p-value
# [1]  1  3  4  8 10 11 12 13 14
# list_cols[which(ChiSq.pval < .05)] # Top list of 'important' variables
# [1] "resp"      "Sex"       "ChestPain" "RestECG"   "ExAng"     "Oldpeak"
# [7] "Slope"     "Ca"        "Thal"
# which(ChiSq.pval > .6)             # Col num of variables w >.6 p-value
# [1] 7
# list_cols[which(ChiSq.pval > .6)]  # Top list of 'unimportant' variables
# [1] "Fbs"