# loading packages
library(haven)
library(tidyverse)
library(openxlsx)
library(labelled)


# Data preparation and data cleaning -------------------------------------

# loading data
d <- read_sav("data/democratic_inoculation_data_1126.sav")
nrow(d)
# 4867

# # identifying duplicated IP addresses to check
# dup_ids <- d %>% 
#   group_by(IPAddress) %>% 
#   filter(n() > 1) %>% 
#   ungroup() %>% 
#   dplyr::select(IPAddress, StartDate, EndDate, Duration__in_seconds_, Age, Gender, Education, Settlement, nu) %>% 
#   arrange(IPAddress)
# 
# library(writexl)
# write_xlsx(dup_ids, "duplicated_ids_check.xlsx")


## 1. anonymizing IP addresses for privacy reasons -----
d$IPAddress_anon <- sprintf("IP_%05d", as.integer(factor(d$IPAddress)))

# dup_anonids <- d %>% 
#   group_by(IPAddress_anon) %>% 
#   filter(n() > 1) %>% 
#   ungroup() %>% 
#   dplyr::select(IPAddress_anon, IPAddress, StartDate, EndDate, Duration__in_seconds_, Age, Gender, Education, Settlement, nu) %>% 
#   arrange(IPAddress)
# 
# write_xlsx(dup_anonids, "duplicated_ids_check_withanon.xlsx")

## 2. removing unnecessary variables -----
unnecessary_vars <- c("IPAddress", "postal_code", "Q_BallotBoxStuffing", 
                    "Q_RelevantIDFraudScore", "nu", 
                    "Q_DuplicateRespondent",
                    "LocationLatitude", "LocationLongitude",
                    "RecipientLastName", "RecipientFirstName",
                    "RecipientEmail", "ExternalReference",
                    "DistributionChannel", "UserLanguage",
                    "Meta_Browser", "Meta_Version", 
                    "Meta_Operating_System", "Meta_Resolution",
                    "country_live_5_TEXT", "Q_RecaptchaScore")

d <- d %>% 
  dplyr::select(-all_of(unnecessary_vars))


write_sav(d, "data/democratic_inoculation_data_anonymized.sav")


## 3. basic descriptives -----
# # Check basic descriptives of all respondents 
# # keeping only the first occurrence of duplicated IP addresses
# d <- read_sav("data/democratic_inoculation_data_anonymized.sav")
# 
# d <- d %>%
#   arrange(IPAddress_anon, StartDate) %>%
#   group_by(IPAddress_anon) %>%
#   slice(1) %>%     # keep only the first occurrence
#   ungroup()
# nrow(d)
# # 4605
# 
# 
# # cleaning data
# d <- d %>% 
#   # filter(Q_TerminateFlag == "Complete") %>%
#   mutate(cond = case_when(cont_time_1_Page_Submit > 0 ~ "0",  # control
#                           rep_time_1_Page_Submit > 0 ~ "1", # treatment cracks ("reped")
#                           kiep_time_2_Page_Submit > 0 ~ "2", # treatment build-up ("kiep")
#                           TRUE ~ "other"))
# table(d$cond, useNA = "ifany")
# 
# # checking party preference variable
# attr(d$partypref, "labels")
# table(d$partypref) # van: 1727, nincs: 1545, nem tudja: 242
# 
# attr(d$partypref2, "labels")
# table(d$partypref2)
# round((prop.table(table(d$partypref2)) * 100), 2)
# sum(!is.na(d$partypref2)) # 1725 valid responses
# # Fidesz-KDNP:  553 (32.06%)
# # Tisza:        778 (45.10%)
# # DK:            63 (3.65%)
# # Mi Hazánk:    224 (12.99%)
# # MKKP:          47 (2.72%)
# # Jobbik:        17 (0.99%)
# # MSZP:           8 (0.46%)
# # Egyéb:         35 (2.03%)
# 
# 
# psych::describe(d$Age)
# # M = 41.6, SD = 14.36, Median = 40.5, Min = 18, Max = 120
# 
# attr(d$Gender, "labels")
# round((prop.table(table(d$Gender)) * 100), 2) # male: 43.12%, female: 56.88%
# sum(!is.na(d$Gender)) # 4304 valid responses
# 
# attr(d$Education, "labels")
# round((prop.table(table(d$Education)) * 100), 2) # elementary/vocational: 23.61%, secondary: 51.53%, tertiary: 24.86%
# sum(!is.na(d$Education)) # 4304 valid responses
# 
# attr(d$Settlement, "labels")
# round((prop.table(table(d$Settlement)) * 100), 2) # capital: 20.79%, county seat: 23.07%, city: 34.01%, village: 22.12%,  
# sum(!is.na(d$Settlement)) # 4304 valid responses
# 
# attr(d$nationality, "labels")
# round((prop.table(table(d$nationality)) * 100), 2) # Hungarian: 96.04%, other: 3.96%
# sum(!is.na(d$nationality)) # 4304 valid responses
# 
# round((prop.table(table(d$cond)) * 100), 2) 
# # control: 22.54%, 
# # treatment cracks ("reped"): 22.93%,
# # treatment build-up ("kiep"): 21.87%,
# # other: 32.66%
# sum(!is.na(d$cond)) # 4605 valid responses
# d %>% filter(cond != "other") %>% summarise(n = n()) # 3101 valid responses (without "other" category)
# 
# 
# d %>% filter(cond != "other") %>% group_by(cond) %>% summarise(n = n()) %>%
#   mutate(prop = n / sum(n), perc = sprintf("%.2f", prop * 100))
# # control:                     1038 (33.47%), 
# # treatment cracks ("reped"):  1056 (34.05%),
# # treatment build-up ("kiep"): 1007 (32.47%)
# 
# 
# # completion of second attention checks
# sum(!is.na(d$comp_acheck2))  # 954 valid responses (84 dropped until this point)
# sum(!is.na(d$reped_acheck2)) # 920 valid responses  (126 dropped until this point)
# sum(!is.na(d$kiep_acheck2))  # 916  valid responses (91 dropped until this point)
# 
# 
# 
# # cleaning data filtering only completed sessions
# d <- d %>% 
#   filter(Q_TerminateFlag == "Complete") %>%
#   mutate(cond = case_when(cont_time_1_Page_Submit > 0 ~ "0",  # control
#                           rep_time_1_Page_Submit > 0 ~ "1", # treatment cracks ("reped")
#                           kiep_time_2_Page_Submit > 0 ~ "2", # treatment build-up ("kiep")
#                           TRUE ~ "other"))
# table(d$cond, useNA = "ifany")
# # control:                     388 
# # treatment cracks ("reped"):  340
# # treatment build-up ("kiep"): 442
# # if we continue with only those who reached the cond at the first time
# nrow(d)
# # 1170



## 4. generating a codebook -----
# loading data
d <- read_sav("data/democratic_inoculation_data_anonymized.sav")
nrow(d)
# 4867


# generating a detailed codebook
get_values_from_attr <- function(x, limit = 50) {
  
  labs <- attr(x, "labels")
  
  if (!is.null(labs) && length(labs) > 0) {
    # names = válaszcímkék, values = kódok
    out <- paste0(names(labs), " = ", unname(labs))
    return(collapse_limited(out, limit))
  }
  
  # factor
  if (is.factor(x)) {
    return(collapse_limited(levels(x), limit))
  }
  
  # character / logical
  if (is.character(x) || is.logical(x)) {
    return(collapse_limited(x, limit))
  }
  
  # numeric, kevés diszkrét érték (pl. Likert)
  if (is.numeric(x)) {
    ux <- sort(unique(x[!is.na(x)]))
    if (length(ux) <= 15) {
      return(paste(ux, collapse = " | "))
    }
  }
  
  ""
}

get_var_label <- function(x) {
  lab <- attr(x, "label")
  if (!is.null(lab)) as.character(lab) else ""
}

`%||%` <- function(a, b) if (!is.null(a) && length(a) > 0) a else b

collapse_limited <- function(x, limit = 40) {
  x <- unique(x[!is.na(x)])
  if (length(x) == 0) return("")
  if (length(x) > limit) {
    paste0(paste(x[1:limit], collapse = " | "),
           " | ... (+", length(x) - limit, ")")
  } else {
    paste(x, collapse = " | ")
  }
}

get_range <- function(x) {
  if (inherits(x, c("Date", "POSIXct", "POSIXt"))) {
    r <- suppressWarnings(range(x, na.rm = TRUE))
    if (any(!is.finite(as.numeric(r)))) return("")
    return(paste0(format(r[1]), " – ", format(r[2])))
  }
  if (is.numeric(x)) {
    r <- suppressWarnings(range(x, na.rm = TRUE))
    if (any(!is.finite(r))) return("")
    return(paste0(r[1], " – ", r[2]))
  }
  ""
}

codebook <- tibble(
  variable = names(d),
  question = sapply(d, get_var_label),
  class = sapply(d, \(x) paste(class(x), collapse = "/")),
  n = sapply(d, length),
  missing = sapply(d, \(x) sum(is.na(x))),
  missing_pct = round(100 * missing / n, 2),
  unique_values = sapply(d, \(x) dplyr::n_distinct(x, na.rm = TRUE)),
  range = sapply(d, get_range),
  values = sapply(d, get_values_from_attr)
)

# Excel export
wb <- createWorkbook()
addWorksheet(wb, "Codebook")
writeData(wb, "Codebook", codebook)
freezePane(wb, "Codebook", firstRow = TRUE)
setColWidths(wb, "Codebook", cols = 1:ncol(codebook), widths = "auto")
saveWorkbook(wb, "codebook_quick.xlsx", overwrite = TRUE)



## 5. final data cleaning -----

### add condition variable and remove duplicates based on first reached condition -----
d <- d %>% 
  # 1) create condition variable for every row
  mutate(
    cond = case_when(
      cont_time_1_Page_Submit > 0 ~ "0",  # control
      rep_time_1_Page_Submit  > 0 ~ "1",  # treatment cracks ("reped")
      kiep_time_2_Page_Submit > 0 ~ "2",  # treatment build-up ("kiep")
      TRUE ~ "other"                      # did not reach any condition (left early)
    )
  ) %>%
  # 2) order attempts within IP
  arrange(IPAddress_anon, StartDate) %>%
  group_by(IPAddress_anon) %>%
  # 3) keep only rows where a condition was actually reached
  filter(cond != "other") %>%
  # 4) from those, keep the *first* condition-reach per IP
  slice(1) %>%
  ungroup()

table(d$cond, useNA = "ifany")
# control:                     1053
# treatment cracks ("reped"):  1067
# treatment build-up ("kiep"): 1023
nrow(d)
# 3143

# completion of second attention checks
sum(!is.na(d$comp_acheck2))  # 965 valid responses (88 dropped until this point)
sum(!is.na(d$reped_acheck2)) # 929 valid responses  (138 dropped until this point)
sum(!is.na(d$kiep_acheck2))  # 930  valid responses (93 dropped until this point)


### removing participants who failed both attention checks -----
# cleaning data filtering only completed sessions
d <- d %>%
  filter(Q_TerminateFlag == "Complete")

table(d$cond, useNA = "ifany")
# control:                     391 (574 (59.48%) failed both attention checks)
# treatment cracks ("reped"):  344 (585 (62.97%) failed both attention checks)
# treatment build-up ("kiep"): 450 (480 (51.61%) failed both attention checks)
nrow(d)
# 1185

### removing outliers based on 3 SD rule in session duration -----
mean_time <- mean(d$Duration__in_seconds_, na.rm = TRUE)
sd_time <- sd(d$Duration__in_seconds_, na.rm = TRUE)

d <- d %>%
  filter(
    Duration__in_seconds_ >= mean_time - 3 * sd_time,
    Duration__in_seconds_ <= mean_time + 3 * sd_time
  )


# calculate medians (in minutes) of session length ("duration in seconds") variable 
d %>% 
  group_by(cond) %>%
  summarise(min = min(Duration__in_seconds_, na.rm=TRUE)/60,
            max = max(Duration__in_seconds_, na.rm=TRUE)/60,
            mean = mean(Duration__in_seconds_, na.rm=TRUE)/60,
            median = median(Duration__in_seconds_, na.rm=TRUE)/60,
            sd = sd(Duration__in_seconds_, na.rm=TRUE)/60) %>% 
  as.data.frame(.) %>% 
  dplyr::mutate_if(is.numeric, round, 2)

# cond  min     max    mean     median      sd
#   0   6.43  331.28   38.52     30.57     33.10
#   1   6.27  284.28   38.84     32.92     27.56
#   2   5.20  296.00   36.63     27.78     30.60


table(d$cond, useNA = "ifany")
# control:                     338 
# treatment cracks ("reped"):  342 
# treatment build-up ("kiep"): 445 
nrow(d)
# 1175


### saving cleaned data ----
write_sav(d, "data/democratic_inoculation_data_cleaned.sav")