Analysis
For a statistical analysis in R
, we use the following packages for convenience:
library(here)
library(tidyverse)
library(tidylog)
library(janitor)
library(stopwords)
library(knitr)
knitr::opts_chunk$set(echo = TRUE)
library(broom)
The SNSF provides public datasets with information regarding the research domain, the titles, and the abstracts of the grants it funds, as well as the gender of the grantees.
To reproduce the analysis in BMJ 2019;367:l6573, we will download two datasets available on the SNSF’s grant portal at p3.snf.ch:
download.file(
"http://p3.snf.ch/P3Export/P3_GrantExport_with_abstracts.csv",
here("data", "P3_GrantExport_with_abstracts.csv"))
download.file(
"http://p3.snf.ch/P3Export/P3_PersonExport.csv",
here("data", "P3_PersonExport.csv"))
It’s best to read the .csv
files with the default column type set to character
:
grants <-
here("data", "P3_GrantExport_with_abstracts.csv") %>%
read_csv2(col_types = cols(.default = "c")) %>%
clean_names()
grantees <-
here("data", "P3_PersonExport.csv") %>%
read_csv2(col_types = cols(.default = "c")) %>%
clean_names()
Let’s cut the grants
table to the few variables of interest: title, abstract, domain/discipline, and date.
grants <- grants %>%
select(
project_number,
project_title,
project_title_english,
abstract,
lay_summary_english,
discipline_name_hierarchy,
discipline_name,
start_date)
(As the SNSF accepts grants in several languages, we try to collect as many English abstracts and titles as possible.) We also need to remove grants for which title and abstract are not available.
grants <- grants %>%
mutate(
project_title = ifelse(
is.na(project_title_english),
project_title,
project_title_english),
abstract = ifelse(
is.na(lay_summary_english),
abstract,
lay_summary_english)) %>%
select(
-project_title_english,
-lay_summary_english) %>%
drop_na()
We combine the title and the abstract, remove punctuation, double spaces, and parse the text as lists of words.
grants <- grants %>%
unite("text", c("project_title", "abstract"), sep = " ") %>%
mutate(
text = str_replace_all(text, "[[:punct:]]", " "),
text = str_to_lower(text),
text = str_squish(text),
text = str_split(text, " "))
To further tidy the data-set, we separate the column discipline_name_hierarchy into domain and subdomain:
grants <- grants %>%
separate(
discipline_name_hierarchy,
c("domain", "subdomain"), ";", fill = "right")
We count the use of stop-words for English, German, French, and Italian (heuristic for language detection), and keep the texts where English has the highest score:
count_stopwords <- function(lang, w) {
sum(w %in% stopwords(lang))
}
detect_language <- function(words, babel = c("en", "de", "fr", "it")) {
max <- map_int(babel,count_stopwords, w = words) %>%
which.max()
pluck(babel, max)
}
grants <- grants %>%
mutate(lang = map_chr(text, detect_language)) %>%
filter(lang == "en")
And we extract the year from the grant start date.
grants <- grants %>%
extract(start_date, "start_year", "([[:digit:]]{4}$)") %>%
mutate(
start_year = as.numeric(start_year),
start_year = cut(start_year, breaks = 5))
As for the grantees, we are interested in gender and assume only applicants and responsible applicants actually write the grant.
grantees <- grantees %>%
select(person_id = person_id_snsf,
gender,
projects_as_responsible_applicant,
projects_as_applicant) %>%
drop_na()
We sort the grants according to whether grantees are all female, all male, or mixed.
df_gender <- grantees %>%
select(person_id, gender)
df_mapp <- grantees %>%
select(person_id, project_id = projects_as_responsible_applicant) %>%
mutate(project_id = str_split(project_id, ";")) %>%
unnest(project_id) %>%
semi_join(grants, by = c("project_id" = "project_number"))
df_app <- grantees %>%
select(person_id, project_id = projects_as_applicant) %>%
mutate(project_id = str_split(project_id, ";")) %>%
unnest(project_id) %>%
semi_join(grants, by = c("project_id" = "project_number"))
df_app <- df_app %>%
bind_rows(df_mapp) %>%
distinct(person_id, project_id) %>%
left_join(df_gender, by = "person_id")
gender_proj <- df_app %>%
group_by(project_id) %>%
summarise(
n_authors = n(),
share_women = sum(gender == "female") / n_authors) %>%
ungroup() %>%
mutate(share_women = round(share_women, 1))
grants <- grants %>%
left_join(gender_proj, by = c("project_number" = "project_id")) %>%
drop_na() %>%
mutate(
gender = "mixed",
gender = ifelse(share_women == 0, "male", gender),
gender = ifelse(share_women == 1, "female", gender))
For consistency with the method presented in BMJ 2019;367:l6573, we flag whether positive terms occur at least once:
contains_any <- function(w, terms)
any(terms %in% w)
terms25 <- c(
"novel", "unique", "promising", "favorable", "robust",
"excellent", "prominent", "supportive", "encouraging", "remarkable",
"innovative", "unprecedented", "bright", "enormous", "reassuring",
"creative", "assuring", "hopeful", "astonishing", "spectacular",
"amazing", "inventive", "phenomenal", "groundbreaking", "inspiring")
grants <- grants %>%
mutate(is_positive = map_lgl(text, contains_any, terms = terms25))
grants %>%
count(start_year, gender) %>%
kable()
grants %>%
filter(start_year != "(1997,2002]") %>% # 2 women only
mutate(
start_year = fct_drop(start_year),
start_year = fct_rev(start_year),
domain = factor(
domain,
c("Biology and Medicine",
"Mathematics, Natural- and Engineering Sciences",
"Humanities and Social Sciences"),
c("LS", "PE", "SH")),
subdomain = factor(subdomain),
gender = factor(gender, c("male", "female"), c("M", "F")),
is_positive = factor(is_positive, c("FALSE", "TRUE"), c("N", "Y"))) %>%
select(is_positive, gender,
domain, subdomain, discipline_name,
share_women, start_year) %>%
write_rds(here("data", "clean_counts.Rds"))
And we try to look for possible differences in counts by gender, domain, and start year. Below detailed shares (%):
dataset <- read_rds(here("data", "clean_counts.Rds")) %>%
filter(!is.na(gender))
## filter: removed 1,650 rows (11%), 13,943 rows remaining
get_positive_share <- function(grouped_df, side_note) {
grouped_df %>%
summarise(
total = n(),
n_pos = sum(is_positive == "Y")) %>%
mutate(share = round(100 * n_pos / total, 1)) %>%
kable(caption = side_note)
}
dataset %>%
group_by(domain) %>%
get_positive_share(side_note = "Percentage of applications for which title and abstract contain at least one positive term by domain.")
## group_by: one grouping variable (domain)
## summarise: now 3 rows and 3 columns, ungrouped
## mutate: new variable 'share' with 3 unique values and 0% NA
Percentage of applications for which title and abstract contain at least one positive term by domain.
LS |
5258 |
2488 |
47.3 |
PE |
6080 |
2789 |
45.9 |
SH |
2605 |
705 |
27.1 |
dataset %>%
group_by(gender) %>%
get_positive_share(side_note = "Percentage of applications for which title and abstract contain at least one positive term by gender of the grantee.")
## group_by: one grouping variable (gender)
## summarise: now 2 rows and 3 columns, ungrouped
## mutate: new variable 'share' with 2 unique values and 0% NA
Percentage of applications for which title and abstract contain at least one positive term by gender of the grantee.
M |
12012 |
5225 |
43.5 |
F |
1931 |
757 |
39.2 |
dataset %>%
group_by(start_year, gender) %>%
get_positive_share(side_note = "Percentage of applications for which title and abstract contain at least one positive term by start year of the grant and by gender of the grantee.")
## group_by: 2 grouping variables (start_year, gender)
## summarise: now 8 rows and 4 columns, one group variable remaining (start_year)
## mutate (grouped): new variable 'share' with 8 unique values and 0% NA
Percentage of applications for which title and abstract contain at least one positive term by start year of the grant and by gender of the grantee.
(2015,2020] |
M |
2797 |
1447 |
51.7 |
(2015,2020] |
F |
639 |
295 |
46.2 |
(2011,2015] |
M |
4691 |
2124 |
45.3 |
(2011,2015] |
F |
728 |
271 |
37.2 |
(2006,2011] |
M |
3192 |
1192 |
37.3 |
(2006,2011] |
F |
385 |
131 |
34.0 |
(2002,2006] |
M |
1332 |
462 |
34.7 |
(2002,2006] |
F |
179 |
60 |
33.5 |
dataset %>%
group_by(domain, gender) %>%
get_positive_share(
side_note = "Percentage of applications for which title and abstract contain at least one positive term by domain of the grant and by gender of the grantee.")
## group_by: 2 grouping variables (domain, gender)
## summarise: now 6 rows and 4 columns, one group variable remaining (domain)
## mutate (grouped): new variable 'share' with 6 unique values and 0% NA
Percentage of applications for which title and abstract contain at least one positive term by domain of the grant and by gender of the grantee.
LS |
M |
4445 |
2108 |
47.4 |
LS |
F |
813 |
380 |
46.7 |
PE |
M |
5573 |
2561 |
46.0 |
PE |
F |
507 |
228 |
45.0 |
SH |
M |
1994 |
556 |
27.9 |
SH |
F |
611 |
149 |
24.4 |
dataset %>%
group_by(start_year, domain, gender) %>%
get_positive_share(
side_note = "Percentage of applications for which title and abstract contain at least one positive term by start year and domain of the grant and by gender of the grantee.")
## group_by: 3 grouping variables (start_year, domain, gender)
## summarise: now 24 rows and 5 columns, 2 group variables remaining (start_year, domain)
## mutate (grouped): new variable 'share' with 24 unique values and 0% NA
Percentage of applications for which title and abstract contain at least one positive term by start year and domain of the grant and by gender of the grantee.
(2015,2020] |
LS |
M |
1080 |
618 |
57.2 |
(2015,2020] |
LS |
F |
269 |
146 |
54.3 |
(2015,2020] |
PE |
M |
1177 |
647 |
55.0 |
(2015,2020] |
PE |
F |
166 |
89 |
53.6 |
(2015,2020] |
SH |
M |
540 |
182 |
33.7 |
(2015,2020] |
SH |
F |
204 |
60 |
29.4 |
(2011,2015] |
LS |
M |
1593 |
800 |
50.2 |
(2011,2015] |
LS |
F |
258 |
124 |
48.1 |
(2011,2015] |
PE |
M |
2262 |
1084 |
47.9 |
(2011,2015] |
PE |
F |
198 |
90 |
45.5 |
(2011,2015] |
SH |
M |
836 |
240 |
28.7 |
(2011,2015] |
SH |
F |
272 |
57 |
21.0 |
(2006,2011] |
LS |
M |
1111 |
447 |
40.2 |
(2006,2011] |
LS |
F |
175 |
67 |
38.3 |
(2006,2011] |
PE |
M |
1631 |
644 |
39.5 |
(2006,2011] |
PE |
F |
109 |
38 |
34.9 |
(2006,2011] |
SH |
M |
450 |
101 |
22.4 |
(2006,2011] |
SH |
F |
101 |
26 |
25.7 |
(2002,2006] |
LS |
M |
661 |
243 |
36.8 |
(2002,2006] |
LS |
F |
111 |
43 |
38.7 |
(2002,2006] |
PE |
M |
503 |
186 |
37.0 |
(2002,2006] |
PE |
F |
34 |
11 |
32.4 |
(2002,2006] |
SH |
M |
168 |
33 |
19.6 |
(2002,2006] |
SH |
F |
34 |
6 |
17.6 |
Generalized linear models (with a logit link function for binary response data) can help checking the significance level of the observed differences:
model_gender <- glm(is_positive ~ gender,
dataset, family = "binomial")
model_year <- glm(is_positive ~ start_year,
dataset, family = "binomial")
model_gender_year <- glm(is_positive ~ gender + start_year,
dataset, family = "binomial")
model_gender_year_domain <- glm(is_positive ~ gender + domain + start_year,
dataset, family = "binomial")
tidy(model_gender) %>%
kable(caption = "GLM for gender of the grantee.")
GLM for gender of the grantee.
(Intercept) |
-0.2615542 |
0.0184046 |
-14.21138 |
0.0000000 |
genderF |
-0.1772545 |
0.0501151 |
-3.53695 |
0.0004048 |
tidy(model_year) %>%
kable(caption = "GLM for start year of the grant.")
GLM for start year of the grant.
(Intercept) |
0.0279413 |
0.0341229 |
0.8188428 |
0.4128761 |
start_year(2011,2015] |
-0.2611385 |
0.0437332 |
-5.9711682 |
0.0000000 |
start_year(2006,2011] |
-0.5607458 |
0.0486198 |
-11.5332805 |
0.0000000 |
start_year(2002,2006] |
-0.6669680 |
0.0639625 |
-10.4274844 |
0.0000000 |
tidy(model_gender_year) %>%
kable(caption = "GLM for gender of the grantee and start year of the grant.")
GLM for gender of the grantee and start year of the grant.
(Intercept) |
0.0715699 |
0.0354239 |
2.020388 |
0.0433432 |
genderF |
-0.2348357 |
0.0506782 |
-4.633862 |
0.0000036 |
start_year(2011,2015] |
-0.2736773 |
0.0438631 |
-6.239353 |
0.0000000 |
start_year(2006,2011] |
-0.5798493 |
0.0488458 |
-11.871019 |
0.0000000 |
start_year(2002,2006] |
-0.6837244 |
0.0641222 |
-10.662826 |
0.0000000 |
tidy(model_gender_year_domain) %>%
kable(caption = "GLM for gender of the grantee and start year and domain of the grant.")
GLM for gender of the grantee and start year and domain of the grant.
(Intercept) |
0.2773068 |
0.0422339 |
6.565978 |
0.0000000 |
genderF |
-0.1231801 |
0.0521501 |
-2.362030 |
0.0181752 |
domainPE |
-0.0711986 |
0.0384672 |
-1.850891 |
0.0641852 |
domainSH |
-0.9409120 |
0.0528571 |
-17.801038 |
0.0000000 |
start_year(2011,2015] |
-0.2828457 |
0.0446153 |
-6.339658 |
0.0000000 |
start_year(2006,2011] |
-0.6353008 |
0.0496612 |
-12.792707 |
0.0000000 |
start_year(2002,2006] |
-0.7690391 |
0.0650683 |
-11.818954 |
0.0000000 |