Expert Evaluation

Author

Riccardo Volpato

Data Processing

options(readr.show_col_types = FALSE)
file_path <- "data/TPSai+-+Experts_20+April+2026_22.27.csv"
raw_data <- read_csv(file_path, n_max = 2)
colnames <- names(raw_data)
question_labels <- setNames(sub(" - .*", "", as.character(raw_data[1, ])), colnames)
raw_data <- read_csv(file_path, col_names = colnames, skip = 3)

factors_map <- read_csv('data/factors_v1.csv')
cols_tpsai <- factors_map$IID
choices <- c("Keep" = 1, "Drop" = 2, "Reword" = 3)

# check that survey and factors correspond (should be empty dataframe)
data.frame(
  question_label = unlist(question_labels[cols_tpsai]),
  factor_item = factors_map$Item,
  match = unlist(question_labels[cols_tpsai]) == factors_map$Item
) %>% filter(!match)  %>% nrow()

[1] 0

# generate random data for code development before data collection
random_data <- as.data.frame(
  lapply(cols_tpsai, function(col) {
    sample(names(choices), size=5, replace=TRUE)
  })
)
colnames(random_data) <- cols_tpsai

analysis_data <- raw_data # switch to raw_data after data collection

experts_data <- analysis_data %>%
  select(all_of(cols_tpsai))  %>%
  mutate(across(all_of(cols_tpsai), ~ choices[.x]))

Inter-Rater Agreement

ratings_matrix <- experts_data %>%
  select(all_of(cols_tpsai)) %>%
  mutate(across(everything(), as.integer)) %>%
  as.matrix() 

overall_alpha <- kripp.alpha(ratings_matrix, method = "nominal")
overall_alpha

 Krippendorff's alpha

 Subjects = 40 
   Raters = 8 
    alpha = 0.0549

interpret_alpha <- case_when(
  overall_alpha$value < 0.20 ~ "Slight agreement",
  overall_alpha$value < 0.40 ~ "Fair agreement",
  overall_alpha$value < 0.60 ~ "Moderate agreement",
  overall_alpha$value < 0.80 ~ "Substantial agreement",
  TRUE                        ~ "Almost perfect agreement"
)

cat("Krippendorff's alpha =", round(overall_alpha$value, 3), "-", interpret_alpha)

Krippendorff's alpha = 0.055 - Slight agreement

Inter-disciplinary expertise

Examining the inter-disciplinary nature of expertise by showing which expert had which expertise.

expr_raw <- analysis_data$EXPR

tibble(expert = paste0("E", seq_along(expr_raw)), discipline = expr_raw) %>%
  separate_rows(discipline, sep = ",") %>%
  mutate(discipline = str_trim(discipline)) %>%
  mutate(discipline = fct_infreq(discipline) %>% fct_rev()) %>%  # <- add this
  ggplot(aes(x = expert, y = discipline)) +
  geom_tile(fill = "#378ADD", colour = "white", linewidth = 0.5) +
  labs(x = NULL, y = NULL) +
  theme_minimal(base_size = 12) +
  theme(
    panel.grid  = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Expert Evaluations

Expert judges evaluated each item and voted to Keep, Drop, or Reword it. The majority vote determined the final recommendation for each item. When two options tied, the following rules resolved the decision.

Keep and Reword ties favoured Reword on the basis that any doubt about an item warrants revision rather than acceptance.
Drop and Reword ties favoured Drop on the basis that if an item is difficult enough to reword that experts are split, it is better removed.
Keep and Drop ties favoured Keep, giving the item the benefit of the doubt for further evaluation in subsequent validation stages.

decisions <- experts_data %>%
  pivot_longer(cols = all_of(cols_tpsai), names_to = "item", values_to = "rating") %>%
  mutate(rating = names(choices)[rating]) %>%
  group_by(item) %>%
  summarise(
    n_keep    = sum(rating == "Keep"),
    n_drop    = sum(rating == "Drop"),
    n_reword  = sum(rating == "Reword"),
    pct_agree = round(max(table(rating)) / n(), 2),
    .groups   = "drop"
  ) %>%
  mutate(
    decision = case_when(
      n_reword > n_keep  & n_reword > n_drop  ~ "Reword",
      n_drop   > n_keep  & n_drop   > n_reword ~ "Drop",
      n_keep   > n_drop  & n_keep   > n_reword ~ "Keep",
      n_keep == n_reword & n_keep > n_drop    ~ "Reword",
      n_drop == n_reword & n_drop > n_keep    ~ "Drop",
      n_keep == n_drop  & n_keep > n_reword   ~ "Keep"
    ),
    item = factor(item, levels = cols_tpsai)
  ) %>%
  arrange(item)

kable(decisions, col.names = c("Item", "Keep", "Drop", "Reword", "% Agreement", "Decision"))

Item	Keep	Drop	Reword	% Agreement	Decision
COMP_01	5	0	3	0.62	Keep
COMP_02	6	0	2	0.75	Keep
COMP_03	3	0	5	0.62	Reword
COMP_04	3	1	4	0.50	Reword
COMP_05	4	3	1	0.50	Keep
PERS_01	3	0	5	0.62	Reword
PERS_02	6	0	2	0.75	Keep
PERS_03	5	0	3	0.62	Keep
PERS_04	6	1	1	0.75	Keep
PERS_05	2	2	4	0.50	Reword
EMOT_01	5	0	3	0.62	Keep
EMOT_02	5	0	3	0.62	Keep
EMOT_03	3	1	4	0.50	Reword
EMOT_04	6	1	1	0.75	Keep
EMOT_05	7	0	1	0.88	Keep
SAFE_01	7	0	1	0.88	Keep
SAFE_02	6	0	2	0.75	Keep
SAFE_03	5	0	3	0.62	Keep
SAFE_04	7	0	1	0.88	Keep
SAFE_05	6	1	1	0.75	Keep
RISK_01	4	0	4	0.50	Reword
RISK_02	7	0	1	0.88	Keep
RISK_03	5	1	2	0.62	Keep
RISK_04	5	0	3	0.62	Keep
RISK_05	6	0	2	0.75	Keep
CORP_01	8	0	0	1.00	Keep
CORP_02	2	3	3	0.38	Drop
CORP_03	5	0	3	0.62	Keep
CORP_04	7	0	1	0.88	Keep
CORP_05	6	0	2	0.75	Keep
CRIT_01	4	0	4	0.50	Reword
CRIT_02	5	2	1	0.62	Keep
CRIT_03	2	2	4	0.50	Reword
CRIT_04	7	0	1	0.88	Keep
CRIT_05	7	0	1	0.88	Keep
HUMN_01	7	0	1	0.88	Keep
HUMN_02	8	0	0	1.00	Keep
HUMN_03	5	0	3	0.62	Keep
HUMN_04	5	1	2	0.62	Keep
HUMN_05	8	0	0	1.00	Keep

Decisions by theoretical factor

factor_summary <- decisions %>%
  mutate(factor = str_extract(item, "^[A-Z]+")) %>%
  group_by(factor) %>%
  summarise(
    keep   = sum(n_keep),
    drop   = sum(n_drop),
    reword = sum(n_reword),
    total  = keep + drop + reword
  ) %>%
  mutate(
    pct_keep   = keep   / total * 100,
    pct_reword = reword / total * 100,
    pct_drop   = drop   / total * 100
  ) %>%
  pivot_longer(
    cols = starts_with("pct_"),
    names_to = "decision",
    names_prefix = "pct_",
    values_to = "percentage"
  ) %>%
  mutate(decision = factor(str_to_title(decision), levels = c("Keep", "Reword", "Drop")))

ggplot(factor_summary, aes(x = factor, y = percentage, fill = decision)) +
  geom_col(width = 0.6) +
  scale_fill_manual(values = c(
    "Keep"   = "#639922",
    "Reword" = "#BA7517",
    "Drop"   = "#E24B4A"
  )) +
  labs(x = NULL, y = NULL, fill = NULL, title = NULL) +
  scale_y_continuous(labels = scales::label_percent(scale = 1), expand = c(0, 0)) +
  theme_minimal(base_size = 13) +
  theme(legend.position   = "top")

Factors to add

analysis_data$COMP_ADD

[1] "Trust has 2 main components: cognitive and affective. There are many way to operationalize trust. None clear operationalization of the construct emerge here. Please note a systematic review approach https://www.tandfonline.com/doi/full/10.1080/15366367.2025.2485678"
[2] NA                                                                                                                                                                                                                                                                         
[3] NA                                                                                                                                                                                                                                                                         
[4] NA                                                                                                                                                                                                                                                                         
[5] NA                                                                                                                                                                                                                                                                         
[6] NA                                                                                                                                                                                                                                                                         
[7] "I trust that it won't lead me to misinformation or harm."                                                                                                                                                                                                                 
[8] "Something about agency and competence after asking AI, such as: \"I trust that after asking advice from AI, I will be more competent in taking action to solve my psychological difficulties\""

analysis_data$PERS_ADD

[1] "It is not clear the whole section. Why it us labeled “personalization”? How this construct has been identified? What is it about?"
[2] NA                                                                                                                                 
[3] NA                                                                                                                                 
[4] NA                                                                                                                                 
[5] NA                                                                                                                                 
[6] "I trust the AI to adjust its advice to fit my individual needs rather than giving generic responses"                              
[7] NA                                                                                                                                 
[8] NA

analysis_data$EMOT_ADD

[1] NA                                                                                                                                                                                                                                   
[2] NA                                                                                                                                                                                                                                   
[3] NA                                                                                                                                                                                                                                   
[4] "This may reflect two distinct processes: emotional validation (feeling understood)and emotional regulation (shifting emotional states), which could be worth distinguishing conceptually (and later empirically using EFA, CFA etc)"
[5] "The word trust,I’m thinking of using the word I believe. It sounds clearer from my perspective. When I say I believe this tool can do so and so, compared to when I say I trust, it feels more accurate."                           
[6] "I trust the AI to provide emotionally supportive responses without being dismissive or superficial."                                                                                                                                
[7] "Note: I wouldn't label emotions as \"good\" or \"bad\", as different emotions can have different functions and limitations."                                                                                                        
[8] NA

analysis_data$SAFE_ADD

[1] NA                                                                                                                                                                                      
[2] NA                                                                                                                                                                                      
[3] NA                                                                                                                                                                                      
[4] "#2 and #4 are a bit similar here, perhaps keep just one of them"                                                                                                                       
[5] NA                                                                                                                                                                                      
[6] "I trust the AI to respond without making me feel criticised or dismissed."                                                                                                             
[7] NA                                                                                                                                                                                      
[8] "\"I trust AI to ultimately suggest to me what is best for me\" or \"I trust that AI has my best interest in mind, when it provides suggestions about what I should do to feel better\""

analysis_data$RISK_ADD

[1] "Please note some Q may be reworded" NA                                  
[3] NA                                   NA                                  
[5] NA                                   NA                                  
[7] NA                                   NA

analysis_data$CORP_ADD

[1] NA               NA               NA               "great section!"
[5] NA               NA               NA               NA

analysis_data$CRIT_ADD

[1] NA NA NA NA NA NA NA NA

analysis_data$HUMN_ADD

[1] NA                                                                                       
[2] NA                                                                                       
[3] NA                                                                                       
[4] NA                                                                                       
[5] NA                                                                                       
[6] NA                                                                                       
[7] "AI cannot comprehend and care about my experiences the same way as another human being."
[8] NA