# ==============================
# LOAD LIBRARIES
# ==============================
library(tidyverse)
library(readxl)
library(ggplot2)
library(dplyr)

# ==============================
# LOAD DATA
# ==============================
data <- read_excel("financial_data.xlsx")

# Clean column names. unique = TRUE stops two different headers that
# sanitise to the same syntactic name from ending up as duplicate columns.
names(data) <- make.names(names(data), unique = TRUE)

# Replace missing codes (if any)
# NOTE(review): this blanks every cell equal to 999 in every column, so a
# genuine value of 999 would be lost as well -- confirm 999 is only ever
# used as a missing-value code in this workbook.
data[data == 999] <- NA

# ==============================
# RECODING VARIABLES
# ==============================
# In the `==` recodes below a missing input stays NA; any non-missing value
# other than the tested label falls into the second code.

# Sex (Male=1, Female=2)
data$sex_num <- ifelse(data$Sex == "Male", 1, 2)

# Marital Status (Single=1, anything else=2)
data$marital_num <- ifelse(data$Marital.Status == "Single", 1, 2)

# Yes/No variables (Yes=1, otherwise 0)
data$stock_num <- ifelse(data$Stock.Ownership == "Yes", 1, 0)
data$retire_num <- ifelse(data$Retirement.Plan == "Yes", 1, 0)
data$home_num <- ifelse(data$Home.Owner == "Yes", 1, 0)
data$promo_num <- ifelse(data$Promotion == "Yes", 1, 0)
data$trade_num <- ifelse(data$Trade.Last.Year == "Yes", 1, 0)
data$internet_num <- ifelse(data$Internet == "Yes", 1, 0)

# Education binary (0 = high school or less, 1 = any other level).
# `%in%` never returns NA, so a missing Education would otherwise be coded 1
# and enter the regression as "more than high school"; keep it missing.
low_education_levels <- c("High School", "Less than High School")
data$edu_binary <- ifelse(
  is.na(data$Education),
  NA_real_,
  ifelse(data$Education %in% low_education_levels, 0, 1)
)

# ==============================
# DESCRIPTIVE STATISTICS
# ==============================
# Results are printed explicitly: top-level values are not auto-printed when
# the script is run through source() with its default arguments.

# Categorical frequencies (counts, then percentages)
print(table(data$sex_num))
print(prop.table(table(data$sex_num)) * 100)

print(table(data$marital_num))
print(prop.table(table(data$marital_num)) * 100)

print(table(data$stock_num))
print(prop.table(table(data$stock_num)) * 100)

# Continuous variables
print(summary(data$Age))
print(summary(data$Income))
print(summary(data$Kids))

# ==============================
# GRAPH 1: INCOME BY EDUCATION
# ==============================
plot1 <- data %>%
  group_by(Education) %>%
  summarise(mean_income = mean(Income, na.rm = TRUE))

income_by_education_plot <- ggplot(
  plot1,
  aes(x = Education, y = mean_income)
) +
  geom_col() +
  labs(
    title = "Mean Family Income by Education Level",
    x = "Education Level",
    y = "Mean Income"
  )
print(income_by_education_plot)

# ==============================
# GRAPH 2: AGE VS INCOME
# ==============================
age_income_plot <- ggplot(data, aes(x = Age, y = Income)) +
  geom_point() +
  labs(
    title = "Relationship Between Age and Family Income",
    x = "Age",
    y = "Family Income"
  )
print(age_income_plot)

# ==============================
# T-TEST (INCOME BY SEX)
# ==============================
t_test <- t.test(Income ~ sex_num, data = data)
print(t_test)

# ==============================
# CHI-SQUARE TESTS
# ==============================

# Retirement Plan vs Retirement Perception (q4)
chi1 <- chisq.test(table(data$retire_num, data$q4))
print(chi1)

# Promotion vs Job Security (q3)
chi2 <- chisq.test(table(data$promo_num, data$q3))
print(chi2)

# Home Ownership vs Education Perception (q4)
# NOTE(review): q4 is also the retirement-perception item used for chi1 --
# confirm this test is meant to use the same column.
chi3 <- chisq.test(table(data$home_num, data$q4))
print(chi3)

# ==============================
# LOGISTIC REGRESSION
# ==============================
# Stock ownership (0/1) modelled on income, age, number of kids and the
# binary education indicator.
model <- glm(
  stock_num ~ Income + Age + Kids + edu_binary,
  data = data,
  family = binomial
)
print(summary(model))

# ==============================
# SAVE OUTPUTS
# ==============================
# Write the printed form of each result to a text file under R_outputs/.
output_dir <- "R_outputs"
dir.create(output_dir, showWarnings = FALSE)

capture.output(t_test, file = file.path(output_dir, "t_test_income.txt"))
capture.output(chi1, file = file.path(output_dir, "chi_retirement.txt"))
capture.output(chi2, file = file.path(output_dir, "chi_promotion.txt"))
capture.output(chi3, file = file.path(output_dir, "chi_homeowner.txt"))
capture.output(
  summary(model),
  file = file.path(output_dir, "logistic_regression.txt")
)

print("ALL ANALYSIS COMPLETED SUCCESSFULLY")