# loading packages library(haven) library(tidyverse) library(openxlsx) library(labelled) # Data preparation and data cleaning ------------------------------------- # loading data d <- read_sav("data/democratic_inoculation_data_1126.sav") nrow(d) # 4867 # # identifying duplicated IP addresses to check # dup_ids <- d %>% # group_by(IPAddress) %>% # filter(n() > 1) %>% # ungroup() %>% # dplyr::select(IPAddress, StartDate, EndDate, Duration__in_seconds_, Age, Gender, Education, Settlement, nu) %>% # arrange(IPAddress) # # library(writexl) # write_xlsx(dup_ids, "duplicated_ids_check.xlsx") ## 1. anonymizing IP addresses for privacy reasons ----- d$IPAddress_anon <- sprintf("IP_%05d", as.integer(factor(d$IPAddress))) # dup_anonids <- d %>% # group_by(IPAddress_anon) %>% # filter(n() > 1) %>% # ungroup() %>% # dplyr::select(IPAddress_anon, IPAddress, StartDate, EndDate, Duration__in_seconds_, Age, Gender, Education, Settlement, nu) %>% # arrange(IPAddress) # # write_xlsx(dup_anonids, "duplicated_ids_check_withanon.xlsx") ## 2. removing unnecessary variables ----- unnecessary_vars <- c("IPAddress", "postal_code", "Q_BallotBoxStuffing", "Q_RelevantIDFraudScore", "nu", "Q_DuplicateRespondent", "LocationLatitude", "LocationLongitude", "RecipientLastName", "RecipientFirstName", "RecipientEmail", "ExternalReference", "DistributionChannel", "UserLanguage", "Meta_Browser", "Meta_Version", "Meta_Operating_System", "Meta_Resolution", "country_live_5_TEXT", "Q_RecaptchaScore") d <- d %>% dplyr::select(-all_of(unnecessary_vars)) write_sav(d, "data/democratic_inoculation_data_anonymized.sav") ## 3. basic descriptives ----- # # Check basic descriptives of all respondents # # keeping only the first occurrence of duplicated IP addresses # d <- read_sav("data/democratic_inoculation_data_anonymized.sav") # # d <- d %>% # arrange(IPAddress_anon, StartDate) %>% # group_by(IPAddress_anon) %>% # slice(1) %>% # keep only the first occurrence # ungroup() # nrow(d) # # 4605 # # # # cleaning data # d <- d %>% # # filter(Q_TerminateFlag == "Complete") %>% # mutate(cond = case_when(cont_time_1_Page_Submit > 0 ~ "0", # control # rep_time_1_Page_Submit > 0 ~ "1", # treatment cracks ("reped") # kiep_time_2_Page_Submit > 0 ~ "2", # treatment build-up ("kiep") # TRUE ~ "other")) # table(d$cond, useNA = "ifany") # # # checking party preference variable # attr(d$partypref, "labels") # table(d$partypref) # van: 1727, nincs: 1545, nem tudja: 242 # # attr(d$partypref2, "labels") # table(d$partypref2) # round((prop.table(table(d$partypref2)) * 100), 2) # sum(!is.na(d$partypref2)) # 1725 valid responses # # Fidesz-KDNP: 553 (32.06%) # # Tisza: 778 (45.10%) # # DK: 63 (3.65%) # # Mi Hazánk: 224 (12.99%) # # MKKP: 47 (2.72%) # # Jobbik: 17 (0.99%) # # MSZP: 8 (0.46%) # # Egyéb: 35 (2.03%) # # # psych::describe(d$Age) # # M = 41.6, SD = 14.36, Median = 40.5, Min = 18, Max = 120 # # attr(d$Gender, "labels") # round((prop.table(table(d$Gender)) * 100), 2) # male: 43.12%, female: 56.88% # sum(!is.na(d$Gender)) # 4304 valid responses # # attr(d$Education, "labels") # round((prop.table(table(d$Education)) * 100), 2) # elementary/vocational: 23.61%, secondary: 51.53%, tertiary: 24.86% # sum(!is.na(d$Education)) # 4304 valid responses # # attr(d$Settlement, "labels") # round((prop.table(table(d$Settlement)) * 100), 2) # capital: 20.79%, county seat: 23.07%, city: 34.01%, village: 22.12%, # sum(!is.na(d$Settlement)) # 4304 valid responses # # attr(d$nationality, "labels") # round((prop.table(table(d$nationality)) * 100), 2) # Hungarian: 96.04%, other: 3.96% # sum(!is.na(d$nationality)) # 4304 valid responses # # round((prop.table(table(d$cond)) * 100), 2) # # control: 22.54%, # # treatment cracks ("reped"): 22.93%, # # treatment build-up ("kiep"): 21.87%, # # other: 32.66% # sum(!is.na(d$cond)) # 4605 valid responses # d %>% filter(cond != "other") %>% summarise(n = n()) # 3101 valid responses (without "other" category) # # # d %>% filter(cond != "other") %>% group_by(cond) %>% summarise(n = n()) %>% # mutate(prop = n / sum(n), perc = sprintf("%.2f", prop * 100)) # # control: 1038 (33.47%), # # treatment cracks ("reped"): 1056 (34.05%), # # treatment build-up ("kiep"): 1007 (32.47%) # # # # completion of second attention checks # sum(!is.na(d$comp_acheck2)) # 954 valid responses (84 dropped until this point) # sum(!is.na(d$reped_acheck2)) # 920 valid responses (126 dropped until this point) # sum(!is.na(d$kiep_acheck2)) # 916 valid responses (91 dropped until this point) # # # # # cleaning data filtering only completed sessions # d <- d %>% # filter(Q_TerminateFlag == "Complete") %>% # mutate(cond = case_when(cont_time_1_Page_Submit > 0 ~ "0", # control # rep_time_1_Page_Submit > 0 ~ "1", # treatment cracks ("reped") # kiep_time_2_Page_Submit > 0 ~ "2", # treatment build-up ("kiep") # TRUE ~ "other")) # table(d$cond, useNA = "ifany") # # control: 388 # # treatment cracks ("reped"): 340 # # treatment build-up ("kiep"): 442 # # if we continue with only those who reached the cond at the first time # nrow(d) # # 1170 ## 4. generating a codebook ----- # loading data d <- read_sav("data/democratic_inoculation_data_anonymized.sav") nrow(d) # 4867 # generating a detailed codebook get_values_from_attr <- function(x, limit = 50) { labs <- attr(x, "labels") if (!is.null(labs) && length(labs) > 0) { # names = válaszcímkék, values = kódok out <- paste0(names(labs), " = ", unname(labs)) return(collapse_limited(out, limit)) } # factor if (is.factor(x)) { return(collapse_limited(levels(x), limit)) } # character / logical if (is.character(x) || is.logical(x)) { return(collapse_limited(x, limit)) } # numeric, kevés diszkrét érték (pl. Likert) if (is.numeric(x)) { ux <- sort(unique(x[!is.na(x)])) if (length(ux) <= 15) { return(paste(ux, collapse = " | ")) } } "" } get_var_label <- function(x) { lab <- attr(x, "label") if (!is.null(lab)) as.character(lab) else "" } `%||%` <- function(a, b) if (!is.null(a) && length(a) > 0) a else b collapse_limited <- function(x, limit = 40) { x <- unique(x[!is.na(x)]) if (length(x) == 0) return("") if (length(x) > limit) { paste0(paste(x[1:limit], collapse = " | "), " | ... (+", length(x) - limit, ")") } else { paste(x, collapse = " | ") } } get_range <- function(x) { if (inherits(x, c("Date", "POSIXct", "POSIXt"))) { r <- suppressWarnings(range(x, na.rm = TRUE)) if (any(!is.finite(as.numeric(r)))) return("") return(paste0(format(r[1]), " – ", format(r[2]))) } if (is.numeric(x)) { r <- suppressWarnings(range(x, na.rm = TRUE)) if (any(!is.finite(r))) return("") return(paste0(r[1], " – ", r[2])) } "" } codebook <- tibble( variable = names(d), question = sapply(d, get_var_label), class = sapply(d, \(x) paste(class(x), collapse = "/")), n = sapply(d, length), missing = sapply(d, \(x) sum(is.na(x))), missing_pct = round(100 * missing / n, 2), unique_values = sapply(d, \(x) dplyr::n_distinct(x, na.rm = TRUE)), range = sapply(d, get_range), values = sapply(d, get_values_from_attr) ) # Excel export wb <- createWorkbook() addWorksheet(wb, "Codebook") writeData(wb, "Codebook", codebook) freezePane(wb, "Codebook", firstRow = TRUE) setColWidths(wb, "Codebook", cols = 1:ncol(codebook), widths = "auto") saveWorkbook(wb, "codebook_quick.xlsx", overwrite = TRUE) ## 5. final data cleaning ----- ### add condition variable and remove duplicates based on first reached condition ----- d <- d %>% # 1) create condition variable for every row mutate( cond = case_when( cont_time_1_Page_Submit > 0 ~ "0", # control rep_time_1_Page_Submit > 0 ~ "1", # treatment cracks ("reped") kiep_time_2_Page_Submit > 0 ~ "2", # treatment build-up ("kiep") TRUE ~ "other" # did not reach any condition (left early) ) ) %>% # 2) order attempts within IP arrange(IPAddress_anon, StartDate) %>% group_by(IPAddress_anon) %>% # 3) keep only rows where a condition was actually reached filter(cond != "other") %>% # 4) from those, keep the *first* condition-reach per IP slice(1) %>% ungroup() table(d$cond, useNA = "ifany") # control: 1053 # treatment cracks ("reped"): 1067 # treatment build-up ("kiep"): 1023 nrow(d) # 3143 # completion of second attention checks sum(!is.na(d$comp_acheck2)) # 965 valid responses (88 dropped until this point) sum(!is.na(d$reped_acheck2)) # 929 valid responses (138 dropped until this point) sum(!is.na(d$kiep_acheck2)) # 930 valid responses (93 dropped until this point) ### removing participants who failed both attention checks ----- # cleaning data filtering only completed sessions d <- d %>% filter(Q_TerminateFlag == "Complete") table(d$cond, useNA = "ifany") # control: 391 (574 (59.48%) failed both attention checks) # treatment cracks ("reped"): 344 (585 (62.97%) failed both attention checks) # treatment build-up ("kiep"): 450 (480 (51.61%) failed both attention checks) nrow(d) # 1185 ### removing outliers based on 3 SD rule in session duration ----- mean_time <- mean(d$Duration__in_seconds_, na.rm = TRUE) sd_time <- sd(d$Duration__in_seconds_, na.rm = TRUE) d <- d %>% filter( Duration__in_seconds_ >= mean_time - 3 * sd_time, Duration__in_seconds_ <= mean_time + 3 * sd_time ) # calculate medians (in minutes) of session length ("duration in seconds") variable d %>% group_by(cond) %>% summarise(min = min(Duration__in_seconds_, na.rm=TRUE)/60, max = max(Duration__in_seconds_, na.rm=TRUE)/60, mean = mean(Duration__in_seconds_, na.rm=TRUE)/60, median = median(Duration__in_seconds_, na.rm=TRUE)/60, sd = sd(Duration__in_seconds_, na.rm=TRUE)/60) %>% as.data.frame(.) %>% dplyr::mutate_if(is.numeric, round, 2) # cond min max mean median sd # 0 6.43 331.28 38.52 30.57 33.10 # 1 6.27 284.28 38.84 32.92 27.56 # 2 5.20 296.00 36.63 27.78 30.60 table(d$cond, useNA = "ifany") # control: 338 # treatment cracks ("reped"): 342 # treatment build-up ("kiep"): 445 nrow(d) # 1175 ### saving cleaned data ---- write_sav(d, "data/democratic_inoculation_data_cleaned.sav")