# Load the cleaned pilot data and the item-to-factor mapping via the helper
# script. `load_data()` is defined in scripts/read_qualtrics_data.R.
source("scripts/read_qualtrics_data.R")
d <- load_data("data/TPSai+-+Pilot+Tester_27+April+2026_13.02.csv", "data/factors_v1.1.csv")
items_data <- d$items_data # response data; TPS-AI item columns selected via cols_tpsai below
cols_tpsai <- d$item_ids   # character vector of TPS-AI item column names

Item Reduction
Items
Questions
# Display the item wording keyed by item ID (IID).
d$factors_map %>%
  select(IID, Item) %>%
  kable()

| IID | Item |
|---|---|
| COMP_01 | I trust the AI to provide me information that helps me manage my psychological challenges |
| COMP_02 | I trust the AI to help me clarify my thoughts about a difficult situation |
| COMP_03 | I trust that what the AI tells me about psychological concepts and processes is accurate and reliable |
| COMP_04 | I can rely on the AI to make sense of the perspective of someone I am having difficulties to understand |
| COMP_05 | Using the AI enables me to see how my experiences fit into what is common for others |
| PERS_01 | I trust the AI to understand my specific needs when I ask for support |
| PERS_02 | I trust the AI to give me advice that is well-tailored to my specific situation |
| PERS_03 | I trust that the more information the AI has about me, the better it can support me |
| PERS_04 | I trust the AI to help me gain new insights into my issues |
| PERS_05 | I trust the AI to suggest helpful strategies based on my situation |
| EMOT_01 | I trust the AI to reflect the feelings I share in a way that feels accurate |
| EMOT_02 | I trust the AI to make me feel heard and validated |
| EMOT_03 | The AI helps me regulate my emotions when I feel distressed |
| EMOT_04 | I trust the AI to help me feel more confident in my ability to handle a difficult situation |
| EMOT_05 | I trust the AI to adjust its responses appropriately as my emotions change during our conversation |
| SAFE_01 | I feel comfortable sharing my private and sensitive information with the AI |
| SAFE_02 | I feel safe opening up to the AI about anything that is on my mind |
| SAFE_03 | I trust that the AI responses are free from harmful social stereotypes |
| SAFE_04 | I trust the AI to provide me with a space where I can express myself freely |
| SAFE_05 | I trust the AI to respond to me in ways that respect who I am |
| RISK_01 | I am concerned that the AI may not be able to respond appropriately to a mental health crisis (e.g., feeling hopeless, stuck, or wanting to self-harm) |
| RISK_02 | Using the AI as my primary source of psychological support would be risky |
| RISK_03 | I am concerned that the AI does not sufficiently highlight its limitations as a mental health tool |
| RISK_04 | I doubt that the AI can provide sufficient support for complex mental health conditions |
| RISK_05 | I believe the AI may provide inaccurate or misleading mental health information |
| CORP_01 | I am concerned that the AI company may use my data in ways I do not approve of |
| CORP_03 | I am skeptical that the AI company ensures the AI is safe to use for mental health support |
| CORP_04 | I am concerned that the values of the AI company do not sufficiently align with my own |
| CORP_05 | I am concerned that the AI company treats my wellbeing as secondary to their commercial interests |
| CRIT_01 | I am concerned that the way the AI communicates may lead me to trust it more than I should |
| CRIT_02 | It is up to me to guide the AI to give responses that meet my needs |
| CRIT_03 | I trust AI responses less as conversations get longer and the AI starts to ‘drift’ (e.g., speaks repetitively or less consistently) |
| CRIT_04 | When in doubt, I would trust my own experience over what the AI tells me |
| CRIT_05 | I feel the need to verify the AI’s sources when the information is important |
| HUMN_01 | The AI cannot replace human connections for emotional support |
| HUMN_02 | Even though the AI sounds human, our interactions lack the shared experience of human connections |
| HUMN_03 | The emotional support the AI can offer is limited by its lack of embodied feelings and shared presence |
| HUMN_04 | While the AI uses empathic language, it cannot care about my wellbeing as a good friend would |
| HUMN_05 | Even though I can talk with the AI about my feelings, it does not experience emotions the way I do |
Statistics
Total number of responses is 70
# Reshape the TPS-AI items to long format, then compute per-item
# descriptive statistics.
tpsai_long <- items_data %>%
  select(all_of(cols_tpsai)) %>%
  pivot_longer(everything(), names_to = "item", values_to = "response")

item_stats <- tpsai_long %>%
  group_by(item) %>%
  summarise(
    n = sum(!is.na(response)),
    mean = mean(response, na.rm = TRUE),
    sd = sd(response, na.rm = TRUE),
    median = median(response, na.rm = TRUE),
    # suppressWarnings guards the all-NA case, where min/max warn and
    # return Inf/-Inf
    min = suppressWarnings(min(response, na.rm = TRUE)),
    max = suppressWarnings(max(response, na.rm = TRUE)),
    skew = psych::skew(response, na.rm = TRUE),
    kurtosis = psych::kurtosi(response, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(item)
# Render the descriptive statistics table.
# Fixed: removed the trailing comma after col.names (it passes an empty
# argument to kable) and separated the code from the rendered table header.
item_stats %>%
  knitr::kable(
    digits = 2,
    col.names = c("Item", "N", "Mean", "SD",
                  "Median", "Min", "Max", "Skew", "Kurtosis")
  )

| Item | N | Mean | SD | Median | Min | Max | Skew | Kurtosis |
|---|---|---|---|---|---|---|---|---|
| COMP_01 | 69 | 4.29 | 1.91 | 5.0 | 1 | 7 | -0.39 | -1.14 |
| COMP_02 | 70 | 4.67 | 1.78 | 5.0 | 1 | 7 | -0.66 | -0.62 |
| COMP_03 | 69 | 3.90 | 1.78 | 4.0 | 1 | 7 | -0.18 | -1.22 |
| COMP_04 | 67 | 4.19 | 1.91 | 5.0 | 1 | 7 | -0.34 | -1.15 |
| COMP_05 | 67 | 4.30 | 1.72 | 4.0 | 1 | 7 | -0.27 | -0.80 |
| CORP_01 | 69 | 5.86 | 1.39 | 6.0 | 2 | 7 | -1.15 | 0.61 |
| CORP_03 | 69 | 5.54 | 1.58 | 6.0 | 1 | 7 | -1.00 | 0.29 |
| CORP_04 | 65 | 5.28 | 1.77 | 6.0 | 1 | 7 | -0.80 | -0.36 |
| CORP_05 | 69 | 5.68 | 1.59 | 6.0 | 1 | 7 | -1.04 | 0.12 |
| CRIT_01 | 70 | 5.54 | 1.49 | 6.0 | 2 | 7 | -0.81 | -0.39 |
| CRIT_02 | 70 | 5.44 | 1.45 | 6.0 | 1 | 7 | -1.07 | 0.50 |
| CRIT_03 | 66 | 5.55 | 1.47 | 6.0 | 2 | 7 | -1.03 | 0.09 |
| CRIT_04 | 70 | 6.13 | 1.14 | 7.0 | 2 | 7 | -1.34 | 1.43 |
| CRIT_05 | 68 | 6.04 | 1.34 | 6.0 | 1 | 7 | -1.94 | 4.03 |
| EMOT_01 | 67 | 4.33 | 1.61 | 5.0 | 1 | 7 | -0.60 | -0.57 |
| EMOT_02 | 70 | 4.29 | 1.76 | 4.5 | 1 | 7 | -0.43 | -0.67 |
| EMOT_03 | 63 | 4.49 | 1.75 | 5.0 | 1 | 7 | -0.48 | -0.84 |
| EMOT_04 | 67 | 4.31 | 1.85 | 5.0 | 1 | 7 | -0.36 | -1.04 |
| EMOT_05 | 65 | 3.89 | 1.67 | 4.0 | 1 | 7 | -0.15 | -0.85 |
| HUMN_01 | 70 | 6.24 | 1.26 | 7.0 | 2 | 7 | -1.75 | 2.36 |
| HUMN_02 | 69 | 6.12 | 1.33 | 7.0 | 1 | 7 | -1.75 | 2.62 |
| HUMN_03 | 67 | 5.40 | 1.74 | 6.0 | 1 | 7 | -0.93 | -0.25 |
| HUMN_04 | 70 | 6.37 | 1.19 | 7.0 | 2 | 7 | -1.99 | 3.12 |
| HUMN_05 | 69 | 6.42 | 1.14 | 7.0 | 1 | 7 | -2.37 | 6.24 |
| PERS_01 | 69 | 3.94 | 1.85 | 5.0 | 1 | 7 | -0.40 | -1.22 |
| PERS_02 | 69 | 4.17 | 1.95 | 5.0 | 1 | 7 | -0.39 | -1.04 |
| PERS_03 | 69 | 4.58 | 1.71 | 5.0 | 1 | 7 | -0.56 | -0.58 |
| PERS_04 | 69 | 4.41 | 1.67 | 5.0 | 1 | 7 | -0.61 | -0.53 |
| PERS_05 | 68 | 4.49 | 1.76 | 5.0 | 1 | 7 | -0.65 | -0.80 |
| RISK_01 | 70 | 5.41 | 1.81 | 6.0 | 1 | 7 | -0.82 | -0.72 |
| RISK_02 | 70 | 6.03 | 1.35 | 7.0 | 1 | 7 | -1.48 | 1.83 |
| RISK_03 | 69 | 5.30 | 1.83 | 6.0 | 1 | 7 | -0.71 | -0.90 |
| RISK_04 | 69 | 5.86 | 1.45 | 6.0 | 1 | 7 | -1.30 | 1.01 |
| RISK_05 | 70 | 5.49 | 1.51 | 6.0 | 2 | 7 | -0.77 | -0.41 |
| SAFE_01 | 70 | 3.44 | 2.17 | 3.0 | 1 | 7 | 0.15 | -1.49 |
| SAFE_02 | 70 | 3.36 | 2.06 | 3.0 | 1 | 7 | 0.30 | -1.38 |
| SAFE_03 | 67 | 3.66 | 2.14 | 3.0 | 1 | 7 | 0.19 | -1.47 |
| SAFE_04 | 70 | 4.69 | 1.81 | 5.0 | 1 | 7 | -0.47 | -0.80 |
| SAFE_05 | 70 | 4.86 | 1.67 | 5.0 | 1 | 7 | -0.66 | -0.45 |
Missingness
Items where many people respond Don't know likely capture constructs that participants are uncertain about; such items reduce the effective sample size and can distort item statistics such as means and correlations. We decided to drop items exceeding 30% missing responses. Our rationale is that if at least 3 out of 10 individuals were not able to meaningfully engage with an item, dropping it simplifies our scale and allows it to focus on measuring items with generally clearer responses.
Beyond just dropping items with at least 30% missingness, we want to inspect the distribution of missing answers per item. When visually inspecting this distribution we will look for discontinuities, meaning items with a substantially higher proportion of Don't know than other items. We may remove such items even if below our pre-set missingness threshold.
# Pre-registered missingness cut-off (30% of responses).
missing_threshold <- 0.3

# Proportion of missing (Don't know) responses per item, with each item
# tagged by its factor prefix and question number (e.g. "COMP_01").
missing_data <- items_data %>%
  summarise(across(all_of(cols_tpsai), ~ mean(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "item", values_to = "missing") %>%
  separate(item, c("factor", "q_n"), sep = "_", remove = FALSE)
# adding another threshold from visual inspection
v_missing_threshold <- 0.05

# Histogram of per-item missingness. Solid line = pre-registered 30%
# threshold; dashed red line = post-hoc visual threshold.
ggplot(missing_data, aes(x = missing, fill = factor)) +
  geom_histogram(binwidth = 0.0075) +
  geom_vline(xintercept = missing_threshold) +
  geom_vline(xintercept = v_missing_threshold, linetype = "dashed", color = "red")

Visual threshold selects items with at least 3.5 missing responses out of 70.
# Items exceeding either missingness threshold. Since
# v_missing_threshold < missing_threshold, the second condition subsumes the
# first; both are kept to document the two criteria explicitly.
missing_items <- missing_data %>%
  filter(missing > missing_threshold | missing > v_missing_threshold) %>%
  select(item, missing) %>%
  arrange(desc(missing))

if (nrow(missing_items) > 0) {
  missing_items %>% knitr::kable(digits = 2)
}

| item | missing |
|---|---|
| EMOT_03 | 0.10 |
| EMOT_05 | 0.07 |
| CORP_04 | 0.07 |
| CRIT_03 | 0.06 |
Distributional Shapes
Skewness and Kurtosis
We examine skewness and kurtosis for each item to check whether responses are spread across the scale or piled up at one end. If everyone scores high/low on an item, it means that the item is too easy/hard to endorse and cannot discriminate between people with high versus low levels of that trait being measured. Moreover, items where most people gave the same answer have little variation, which makes it harder for factor analysis to detect the underlying structure, regardless of the estimation method used.
Items with negative skew and high positive kurtosis (left tail) may have a “ceiling effect” where an item is too easy to endorse and almost everyone agrees.
Items with positive skew and high negative kurtosis (right tail) may have a “floor effect” where an item is too extreme to endorse and very few people agree.
Avoiding ceiling/floor effects is crucial because if items cluster at the extremes, you lose the ability to distinguish among people at those ends.
Following Kline (2016), we treat absolute skewness and kurtosis values above 1.5 as a sign that an item’s responses are too bunched together to be useful. Items exceeding this threshold will be considered for removal or rewording alongside the other item reduction criteria. Additionally, we may consider keeping items with high skew or kurtosis for comparison or reporting purposes but flag them for removal from factor analysis.
# Kline (2016) cut-off for absolute skewness/kurtosis.
dist_threshold <- 1.5

# Skewness and kurtosis per item.
# Fixed: the original used across(all_of(everything()), ...), which is invalid
# tidyselect syntax and would also sweep in any non-item columns; restrict to
# the TPS-AI items via cols_tpsai, consistent with the other chunks.
dist_data <- items_data %>%
  summarise(
    across(all_of(cols_tpsai), list(
      skew = ~ psych::skew(.x, na.rm = TRUE),
      kurt = ~ psych::kurtosi(.x, na.rm = TRUE)
    ), .names = "{.col}_{.fn}")
  ) %>%
  pivot_longer(
    everything(),
    names_to = c("item", ".value"),
    names_sep = "_(?=[^_]+$)"  # split at the LAST underscore: item id vs. stat
  ) %>%
  separate(item, c("subscale", "q_n"), sep = "_", remove = FALSE)
# adding another threshold at 1.25 from visual inspection
v_dist_threshold <- 1.25

# Scatter of skewness vs. kurtosis per item. Solid lines mark the Kline (2016)
# cut-offs; dashed red lines mark the stricter visual threshold.
ggplot(dist_data, aes(x = skew, y = kurt, colour = subscale, label = item)) +
  geom_point() +
  geom_hline(yintercept = c(-dist_threshold, dist_threshold)) +
  geom_vline(xintercept = c(-dist_threshold, dist_threshold)) +
  geom_hline(yintercept = c(-v_dist_threshold, v_dist_threshold), linetype = "dashed", color = "red") +
  geom_vline(xintercept = c(-v_dist_threshold, v_dist_threshold), linetype = "dashed", color = "red") +
  labs(x = "Skewness", y = "Kurtosis")

# Items flagged by the stricter visual threshold on either statistic
# (this also captures every item beyond the 1.5 cut-off).
dist_items <- dist_data %>%
  filter(abs(skew) > v_dist_threshold | abs(kurt) > v_dist_threshold) %>%
  select(item, skew, kurt)

if (nrow(dist_items) > 0) {
  dist_items %>% knitr::kable(digits = 2)
}

| item | skew | kurt |
|---|---|---|
| SAFE_01 | 0.15 | -1.49 |
| SAFE_02 | 0.30 | -1.38 |
| SAFE_03 | 0.19 | -1.47 |
| RISK_02 | -1.48 | 1.83 |
| RISK_04 | -1.30 | 1.01 |
| CRIT_04 | -1.34 | 1.43 |
| CRIT_05 | -1.94 | 4.03 |
| HUMN_01 | -1.75 | 2.36 |
| HUMN_02 | -1.75 | 2.62 |
| HUMN_04 | -1.99 | 3.12 |
| HUMN_05 | -2.37 | 6.24 |
Visual Inspection
Beyond flagging items with high skew, we can also visually inspect the distribution of each item, looking for items where people consistently avoid using certain points in the scale, such as items with responses clustered at both ends of the scale (bimodal distribution) or items where some scale points are consistently not used. These items might need to be reworded or removed depending on their distributional shape.
# Long format of item responses for plotting, tagged with the factor prefix.
# Fixed: restrict to the TPS-AI items (consistent with the statistics chunk);
# a bare everything() would also pivot any non-item columns in items_data.
long_data <- items_data %>%
  select(all_of(cols_tpsai)) %>%
  pivot_longer(everything(), names_to = "item", values_to = "response") %>%
  mutate(factor = sub("_.*", "", item))

# Shared y-axis ceiling: the largest response-category proportion of any
# single item, so all facets are drawn on a comparable scale.
y_max <- long_data %>%
  count(item, response) %>%
  group_by(item) %>%
  mutate(prop = n / sum(n)) %>%
  pull(prop) %>%
  max()
# One figure per factor: a row of bar charts (one facet per item) showing the
# proportion of responses at each scale point 1-7, on a shared y-axis.
for (f in unique(long_data$factor)) {
  p <- long_data %>%
    filter(factor == f) %>%
    ggplot(aes(x = factor(response), y = after_stat(prop), group = 1)) +
    geom_bar(fill = "#D85A30") +
    scale_x_discrete(limits = as.character(1:7)) +  # show all 7 points even if unused
    scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
    coord_cartesian(ylim = c(0, y_max)) +
    facet_wrap(~ item, nrow = 1) +
    labs(x = "", y = "") +
    theme_minimal(base_size = 16)
  print(p)  # explicit print needed inside a for loop
}

Survey Duration
Displaying survey duration for pilot test responses in order to decide how much time to allow for future data collection.
# Histogram of survey completion time (seconds), trimmed at the 95th
# percentile, with a reference line at 600 s (10 minutes).
# NOTE(review): xlim() removes observations beyond the limit BEFORE binning
# (with a warning), so the tail is dropped from the counts; if the intent is
# only to zoom the view, coord_cartesian(xlim = ...) would keep the data —
# confirm which behavior is wanted.
ggplot(d$raw_data, aes(x = `Duration (in seconds)`)) +
geom_histogram(bins = 50) +
xlim(NA, quantile(d$raw_data$`Duration (in seconds)`, 0.95, na.rm = TRUE)) +
geom_vline(xintercept = 600)