Reply to BMJ 2019;367:l6573

João Martins, 01.2020

Background

BMJ 2019;367:l6573 reports that female researchers in Life Sciences use fewer positive terms to describe the importance of their research than their male counterparts.

Below, we investigate whether those observations can be reproduced for grants funded by the Swiss National Science Foundation (SNSF), the leading public research funding organization in Switzerland.

Main observations

Analysis

For a statistical analysis in R, we use the following packages for convenience:

library(here)
library(tidyverse)
library(tidylog)
library(janitor)
library(stopwords)
library(knitr)
knitr::opts_chunk$set(echo = TRUE)
library(broom)

The SNSF provides public datasets with information regarding the research domain, the titles, and the abstracts of the grants it funds, as well as the gender of the grantees.

To reproduce the analysis in BMJ 2019;367:l6573, we will download two datasets available on the SNSF’s grant portal at p3.snf.ch:

download.file(
  "http://p3.snf.ch/P3Export/P3_GrantExport_with_abstracts.csv",
  here("data", "P3_GrantExport_with_abstracts.csv"))

download.file(
  "http://p3.snf.ch/P3Export/P3_PersonExport.csv",
  here("data", "P3_PersonExport.csv"))

It’s best to read the .csv files with the default column type set to character:

grants <- 
  here("data", "P3_GrantExport_with_abstracts.csv") %>%
  read_csv2(col_types = cols(.default = "c")) %>% 
  clean_names()

grantees <- 
  here("data", "P3_PersonExport.csv") %>%
  read_csv2(col_types = cols(.default = "c")) %>%
  clean_names()

Let’s cut the grants table to the few variables of interest: title, abstract, domain/discipline, and date.

grants <- grants %>%
  select(
    project_number, 
    project_title, 
    project_title_english,
    abstract,
    lay_summary_english,
    discipline_name_hierarchy,
    discipline_name,
    start_date)

(As the SNSF accepts grants in several languages, we try to collect as many English abstracts and titles as possible.) We also need to remove grants for which title and abstract are not available.

grants <- grants %>%
  mutate(
    project_title = ifelse(
      is.na(project_title_english), 
      project_title,
      project_title_english),
    abstract = ifelse(
      is.na(lay_summary_english), 
      abstract,
      lay_summary_english)) %>%
  select(
    -project_title_english, 
    -lay_summary_english) %>%
  drop_na()

We combine the title and the abstract, remove punctuation, double spaces, and parse the text as lists of words.

grants <- grants %>%
  unite("text", c("project_title", "abstract"), sep = " ") %>%
  mutate(
    text = str_replace_all(text, "[[:punct:]]", " "),
    text = str_to_lower(text),
    text = str_squish(text),
    text = str_split(text, " "))

To further tidy the data-set, we separate the column discipline_name_hierarchy into domain and subdomain:

grants <- grants %>%
  separate(
    discipline_name_hierarchy, 
    c("domain", "subdomain"), ";", fill = "right")

We count the use of stop-words for English, German, French, and Italian (heuristic for language detection), and keep the texts where English has the highest score:

count_stopwords <- function(lang, w) {
  sum(w %in% stopwords(lang))
}

detect_language <- function(words, babel = c("en", "de", "fr", "it")) {
  max <- map_int(babel,count_stopwords, w = words) %>%
    which.max()
  pluck(babel, max)
}

grants <- grants %>% 
  mutate(lang = map_chr(text, detect_language)) %>%
  filter(lang == "en")

And we extract the year from the grant start date.

grants <- grants %>% 
  extract(start_date, "start_year", "([[:digit:]]{4}$)") %>%
  mutate(
    start_year = as.numeric(start_year),
    start_year = cut(start_year, breaks = 5))

As for the grantees, we are interested in gender and assume only applicants and responsible applicants actually write the grant.

grantees <- grantees %>%
  select(person_id = person_id_snsf, 
         gender,
         projects_as_responsible_applicant, 
         projects_as_applicant) %>%
  drop_na()

We sort the grants according to whether grantees are all female, all male, or mixed.

df_gender <- grantees %>%
  select(person_id, gender)

df_mapp <- grantees %>%
  select(person_id, project_id = projects_as_responsible_applicant) %>%
  mutate(project_id = str_split(project_id, ";")) %>%
  unnest(project_id) %>%
  semi_join(grants, by = c("project_id" = "project_number"))

df_app <- grantees %>%
  select(person_id, project_id = projects_as_applicant) %>%
  mutate(project_id = str_split(project_id, ";")) %>%
  unnest(project_id) %>%
  semi_join(grants, by = c("project_id" = "project_number"))

df_app <- df_app %>%
  bind_rows(df_mapp) %>%
  distinct(person_id, project_id) %>%
  left_join(df_gender, by = "person_id")

gender_proj <- df_app %>%
  group_by(project_id) %>%
  summarise(
    n_authors = n(),
    share_women = sum(gender == "female") / n_authors) %>% 
  ungroup() %>%
  mutate(share_women = round(share_women, 1))

grants <- grants %>%
  left_join(gender_proj, by = c("project_number" = "project_id")) %>%
  drop_na() %>%
  mutate(
    gender = "mixed",
    gender = ifelse(share_women == 0, "male", gender),
    gender = ifelse(share_women == 1, "female", gender))

For consistency with the method presented in BMJ 2019;367:l6573, we flag whether positive terms occur at least once:

contains_any <- function(w, terms)
  any(terms %in% w)

terms25 <- c( 
  "novel", "unique", "promising", "favorable", "robust",
  "excellent", "prominent", "supportive", "encouraging", "remarkable",
  "innovative", "unprecedented", "bright", "enormous", "reassuring",
  "creative", "assuring", "hopeful", "astonishing", "spectacular",
  "amazing", "inventive", "phenomenal", "groundbreaking", "inspiring")

grants <- grants %>% 
  mutate(is_positive = map_lgl(text, contains_any, terms = terms25))

grants %>%
  count(start_year, gender) %>%
  kable()

grants %>%
  filter(start_year != "(1997,2002]") %>% # 2 women only
  mutate(
    start_year = fct_drop(start_year),
    start_year = fct_rev(start_year),
    domain = factor(
              domain, 
              c("Biology and Medicine",
                "Mathematics, Natural- and Engineering Sciences",
                "Humanities and Social Sciences"),
              c("LS", "PE", "SH")),
    subdomain = factor(subdomain),
    gender = factor(gender, c("male", "female"), c("M", "F")),
    is_positive = factor(is_positive, c("FALSE", "TRUE"), c("N", "Y"))) %>%
  select(is_positive, gender, 
         domain, subdomain, discipline_name, 
         share_women, start_year) %>%
  write_rds(here("data", "clean_counts.Rds"))

And we try to look for possible differences in counts by gender, domain, and start year. Below detailed shares (%):

dataset <- read_rds(here("data", "clean_counts.Rds")) %>%
  filter(!is.na(gender))
## filter: removed 1,650 rows (11%), 13,943 rows remaining
get_positive_share <- function(grouped_df, side_note) {
  grouped_df %>%
    summarise(
      total  = n(),
      n_pos = sum(is_positive == "Y")) %>%
    mutate(share = round(100 * n_pos / total, 1)) %>%
    kable(caption = side_note)
}

dataset %>%
  group_by(domain) %>%
    get_positive_share(side_note = "Percentage of applications for which title and abstract contain at least one positive term by domain.")
## group_by: one grouping variable (domain)
## summarise: now 3 rows and 3 columns, ungrouped
## mutate: new variable 'share' with 3 unique values and 0% NA

Percentage of applications for which title and abstract contain at least one positive term by domain.

domain total n_pos share
LS 5258 2488 47.3
PE 6080 2789 45.9
SH 2605 705 27.1
dataset %>%
  group_by(gender) %>%
  get_positive_share(side_note = "Percentage of applications for which title and abstract contain at least one positive term by gender of the grantee.")
## group_by: one grouping variable (gender)
## summarise: now 2 rows and 3 columns, ungrouped
## mutate: new variable 'share' with 2 unique values and 0% NA

Percentage of applications for which title and abstract contain at least one positive term by gender of the grantee.

gender total n_pos share
M 12012 5225 43.5
F 1931 757 39.2
dataset %>%
  group_by(start_year, gender) %>%
  get_positive_share(side_note = "Percentage of applications for which title and abstract contain at least one positive term by start year of the grant and by gender of the grantee.")
## group_by: 2 grouping variables (start_year, gender)
## summarise: now 8 rows and 4 columns, one group variable remaining (start_year)
## mutate (grouped): new variable 'share' with 8 unique values and 0% NA

Percentage of applications for which title and abstract contain at least one positive term by start year of the grant and by gender of the grantee.

start_year gender total n_pos share
(2015,2020] M 2797 1447 51.7
(2015,2020] F 639 295 46.2
(2011,2015] M 4691 2124 45.3
(2011,2015] F 728 271 37.2
(2006,2011] M 3192 1192 37.3
(2006,2011] F 385 131 34.0
(2002,2006] M 1332 462 34.7
(2002,2006] F 179 60 33.5
dataset %>%
  group_by(domain, gender) %>%
  get_positive_share(
    side_note = "Percentage of applications for which title and abstract contain at least one positive term by domain of the grant and by gender of the grantee.")
## group_by: 2 grouping variables (domain, gender)
## summarise: now 6 rows and 4 columns, one group variable remaining (domain)
## mutate (grouped): new variable 'share' with 6 unique values and 0% NA

Percentage of applications for which title and abstract contain at least one positive term by domain of the grant and by gender of the grantee.

domain gender total n_pos share
LS M 4445 2108 47.4
LS F 813 380 46.7
PE M 5573 2561 46.0
PE F 507 228 45.0
SH M 1994 556 27.9
SH F 611 149 24.4
dataset %>%
  group_by(start_year, domain, gender) %>%
  get_positive_share(
    side_note = "Percentage of applications for which title and abstract contain at least one positive term by start year and domain of the grant and by gender of the grantee.")
## group_by: 3 grouping variables (start_year, domain, gender)
## summarise: now 24 rows and 5 columns, 2 group variables remaining (start_year, domain)
## mutate (grouped): new variable 'share' with 24 unique values and 0% NA

Percentage of applications for which title and abstract contain at least one positive term by start year and domain of the grant and by gender of the grantee.

start_year domain gender total n_pos share
(2015,2020] LS M 1080 618 57.2
(2015,2020] LS F 269 146 54.3
(2015,2020] PE M 1177 647 55.0
(2015,2020] PE F 166 89 53.6
(2015,2020] SH M 540 182 33.7
(2015,2020] SH F 204 60 29.4
(2011,2015] LS M 1593 800 50.2
(2011,2015] LS F 258 124 48.1
(2011,2015] PE M 2262 1084 47.9
(2011,2015] PE F 198 90 45.5
(2011,2015] SH M 836 240 28.7
(2011,2015] SH F 272 57 21.0
(2006,2011] LS M 1111 447 40.2
(2006,2011] LS F 175 67 38.3
(2006,2011] PE M 1631 644 39.5
(2006,2011] PE F 109 38 34.9
(2006,2011] SH M 450 101 22.4
(2006,2011] SH F 101 26 25.7
(2002,2006] LS M 661 243 36.8
(2002,2006] LS F 111 43 38.7
(2002,2006] PE M 503 186 37.0
(2002,2006] PE F 34 11 32.4
(2002,2006] SH M 168 33 19.6
(2002,2006] SH F 34 6 17.6

Generalized linear models (with a logit link function for binary response data) can help checking the significance level of the observed differences:

model_gender <- glm(is_positive ~ gender, 
                    dataset, family = "binomial")
model_year <- glm(is_positive ~ start_year, 
                    dataset, family = "binomial")
model_gender_year <- glm(is_positive ~ gender + start_year, 
                    dataset, family = "binomial")
model_gender_year_domain <- glm(is_positive ~ gender + domain + start_year, 
                           dataset, family = "binomial")

tidy(model_gender) %>% 
  kable(caption = "GLM for gender of the grantee.")

GLM for gender of the grantee.

term estimate std.error statistic p.value
(Intercept) -0.2615542 0.0184046 -14.21138 0.0000000
genderF -0.1772545 0.0501151 -3.53695 0.0004048
tidy(model_year) %>% 
  kable(caption = "GLM for start year of the grant.")

GLM for start year of the grant.

term estimate std.error statistic p.value
(Intercept) 0.0279413 0.0341229 0.8188428 0.4128761
start_year(2011,2015] -0.2611385 0.0437332 -5.9711682 0.0000000
start_year(2006,2011] -0.5607458 0.0486198 -11.5332805 0.0000000
start_year(2002,2006] -0.6669680 0.0639625 -10.4274844 0.0000000
tidy(model_gender_year) %>% 
  kable(caption = "GLM for gender of the grantee and start year of the grant.")

GLM for gender of the grantee and start year of the grant.

term estimate std.error statistic p.value
(Intercept) 0.0715699 0.0354239 2.020388 0.0433432
genderF -0.2348357 0.0506782 -4.633862 0.0000036
start_year(2011,2015] -0.2736773 0.0438631 -6.239353 0.0000000
start_year(2006,2011] -0.5798493 0.0488458 -11.871019 0.0000000
start_year(2002,2006] -0.6837244 0.0641222 -10.662826 0.0000000
tidy(model_gender_year_domain) %>% 
  kable(caption = "GLM for gender of the grantee and start year and domain of the grant.")

GLM for gender of the grantee and start year and domain of the grant.

term estimate std.error statistic p.value
(Intercept) 0.2773068 0.0422339 6.565978 0.0000000
genderF -0.1231801 0.0521501 -2.362030 0.0181752
domainPE -0.0711986 0.0384672 -1.850891 0.0641852
domainSH -0.9409120 0.0528571 -17.801038 0.0000000
start_year(2011,2015] -0.2828457 0.0446153 -6.339658 0.0000000
start_year(2006,2011] -0.6353008 0.0496612 -12.792707 0.0000000
start_year(2002,2006] -0.7690391 0.0650683 -11.818954 0.0000000