###
###  Data Wrangling with tidyverse
###
library(tidyverse)   # install.packages("tidyverse")

#  Packages included: dplyr,  ggplot2,  readr,  tibble,  tidyr,  purrr

D1 <- read_csv("data/EmployeeData.csv")

##------------------------------------------------------------
##
##  mutate() to create new columns
##

D2 <- D1 %>%
   mutate(
     AnnualBase  = ifelse(frequency=="HOURLY", base*40*52, base*5*52),
     AnnualTotal = ifelse(frequency=="HOURLY", base*40*52, base*5*52),
     AnnualBonus = AnnualTotal - AnnualBase  )


View(D2)

min(D2$AnnualBase) 

ggplot(D2, aes(x=AnnualBase)) + geom_histogram(binwidth=1000)


Boston2 <- Boston %>%
    rename(resp=medv) %>%    # Rename "medv" column as "resp" to streamline analysis.
    relocate(resp)           # move "resp" columnm to 1st
Boston2

# turn "chas" column into 0/1 factor
Boston3 <- Boston2 %>% mutate( chas=as.factor(chas) )
Boston3


Boston4 <- Boston2 %>% mutate( chas=as.numeric(chas),
                               rad=as.numeric(rad))
Boston4

Orig <- Orig %>% 
  mutate(Class_Levels=recode(Class_Levels, 
                    '1'="200 level",
                    '2'="401+461",
                    '3'="4/5/600"  ))

# Combine 2007 and 2007.5 as 2007. 
Orig <- Orig %>% mutate(Semester=round(Semester,0))
Orig


Boston4 <- Boston2 %>% mutate(chas=chas+1)
Boston4

Boston4 <- Boston4 %>% mutate(n=round(rnorm(506,10,3),0), across(chas:rm, ~ .x + n))
Boston4 <- Boston4 %>% relocate(n, .after=chas)


##------------------------------------------------------------
##
##  filter() to extract certain rows
##

D2 %>% filter(year==2017 & county %in% c("CASCADE", "POWELL") | AnnualBonus >= 20000)


# Year is one of 2015, 2016, 1017, county NOT in CASCADE, or POWELL.
D2 %>% filter(year   %in% 2015:2017 &
              !county %in% c("CASCADE", "POWELL") )


##------------------------------------------------------------
##
##  pivoting and unpivoting
##

#  group_by()     # rows summary  (better for later use)
#  pivot_wider()  # column summary
#  summarise()    # summation
#  pivot_longer() # unpivoting  (better for feeding into models)


# Aggregate by county
  D.county <- D2 %>% group_by(county) %>% count()
  D.county

# Aggregate by county and sort 
  D.county <- D2 %>% group_by(county) %>% count() %>% arrange(desc(n))
  D.county

# Aggregate by county and sort 
  D.county <- D2 %>% group_by(county) %>% count() %>% arrange(n)
  D.county

# Show more rows on screen
  print(D.county, n=100)


# plot what's already aggregated
  ggplot(data=D.county, aes(x=county, y=n)) + geom_bar(stat="identity") + coord_flip()

# aggregate while plotting
  ggplot(data=D2, aes(x=county)) + stat_count() + coord_flip()
  ggplot(data=D2, aes(x=county)) + geom_bar() + coord_flip()


# Get mean and median by dept for one county
D3 <- D2 %>% filter(county=="LEWIS AND CLARK") %>%
   group_by(department) %>%
      summarize(
        Av.Base = mean(AnnualBase),
        Med.Base = median(AnnualBase)
      )
print(D3, n=20)


# Get mean of all numeric column
D2 %>% filter(county=="LEWIS AND CLARK") %>%
   group_by(department) %>%
      summarize_if(is.numeric, mean)


# Av Base for each year (long format)
D3 <- D2 %>% filter(county=="LEWIS AND CLARK") %>%
   group_by(department, year) %>%
      summarize(AvBase = mean(AnnualBase))
D3


# Now spread that over side ways (wide format)
D4 <- D3 %>% pivot_wider(names_from=year, values_from=AvBase)
D4


# Stack them back to long fomrat
D5 <- D4 %>% pivot_longer(-department, names_to="Yr", values_to="BaseAv")
D5

# gets 7 more rows than D3


##------------------------------------------------------------
##
## joining
##


EName <- read_csv("data/EmployeeNames.csv")
ESal  <- read_csv("data/EmployeeSalary.csv")
EHist <- read_csv("data/EmployeeHistory.csv")


# EName has wrong col names for EmployeeID
EName
colnames(EName)[1] <- "EmployeeID"
EName

left_join( EName, ESal,  by="EmployeeID")   # salary of 1003 & 1023 are missing. lists 7ppl.
left_join( ESal,  EName, by="EmployeeID")   # name of 1034 is missing. lists 6ppl.
right_join(EName, ESal,  by="EmployeeID")   # same as above except the col order

anti_join(EName, ESal)      # found 1003 & 1023 missing salary
anti_join(ESal,  EName)     # found 1034 missing name  

full_join( EName, ESal)     # list all 8 ppl (3 has some NA)
inner_join(EName, ESal)     # list 5 ppl (no NA)


E1 <- full_join(EName, ESal)  
left_join(E1, EHist)


# Latest Dept Assignment 
EHist %>%
  arrange(desc(Year)) %>%
  distinct(EmployeeID, .keep_all=TRUE)


# First Dept Assignment
EHist %>%
  arrange(Year) %>%
  distinct(EmployeeID, .keep_all=TRUE) %>%
  arrange(EmployeeID)


## Make more data in EHist, and get fitst + last assignment
## Sort in wide format + long format.


##------------------------------------------------------------
##
## Export to Excel 
##

  write_csv("Emmm.csv")

##  use Power Query in Excel