library(bibliometrix)
library(dplyr)
library(purrr)
library(stringr)
library(grr)
library(tidyr)
library(ggplot2)
library(cowplot)
library(pscl)
library(MASS)
library(car)
library(multcomp)
library(xtable)

rm(list = ls())

#>----------------------------------------------------------
##DATA_IMPORT----
# Initialise dataframe for the processed bib files
dta <- data.frame()
setwd("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/bib files/2010_2014")
file_list <- list.files()
# Convert each bib file and append it to the merged dataset
for (file in file_list) {
  lines <- readLines(file)
  dta2 <- convert2df(lines, dbsource = "isi", format = "bibtex")
  dta <- rbind(dta, dta2)
  rm(dta2)
}
bibdf <- dta

# Sort on total cites
bibdf <- bibdf[order(bibdf$TC, decreasing = TRUE), ]
# Split most and least cited at the median value for total cites
bibdf$cite_split <- ifelse(bibdf$TC > median(bibdf$TC), 1, 0)
bibdf$cite_split <- as.factor(bibdf$cite_split)

# Papers with no affiliation data
which(is.na(bibdf$C1))
# Remove papers with no affiliation data
bibdf <- bibdf[!(is.na(bibdf$C1)), ]

# Search for equal co-authorship affiliations
grep('contributed equally', bibdf$C1, ignore.case = TRUE)
grep('first authors', bibdf$C1, ignore.case = TRUE)
grep('last authors', bibdf$C1, ignore.case = TRUE)
grep('equal contributions', bibdf$C1, ignore.case = TRUE)

# Papers with no 'REPRINT' info, which cause the script to stop
reprint <- which(!grepl('reprint', bibdf$C1, ignore.case = TRUE))
# Remove them (guarded: an empty negative index would otherwise drop every row)
if (length(reprint) > 0) bibdf <- bibdf[-reprint, ]
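# Quick sanity check (sketch) of the name-parsing regex used in the next section:
# it turns each "Last,Initial," pair into "Last,Initial;" so that a complete name
# becomes a single element. The affiliation string here is made up for illustration.
demo_af <- "SMITH,J.,JONES,A.,UnivMelbourne,Parkville,Australia."
gsub("([^,]+,[^,]+),", "\\1;", demo_af)
# Expected: "SMITH,J.;JONES,A.;UnivMelbourne,Parkville;Australia."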
#>----------------------------------------------------------
##CREATE_DF----
# Initialise output df
append <- data.frame(matrix(ncol = 9, nrow = 1))
colnames(append) <- c("Study", "UT", "Journal", "Tot_Cites", "Cite_Split",
                      "Num_Affil", "Num_Auth", "Affil_No", "Freq")
append$UT <- as.character(append$UT)
append$Journal <- factor(append$Journal)
append$Cite_Split <- factor(append$Cite_Split)
append$Affil_No <- factor(append$Affil_No)

# Loop through studies
for (j in seq_len(nrow(bibdf))) {
  # Create Authors data
  auth_names <- strsplit(bibdf$AU[j], ";")  # Split author names (at semi-colon)
  auth_names <- unlist(auth_names)
  auth_names <- unique(auth_names)  # Only keep unique occurrences of each author's name
  # Create Affiliations data
  split_af <- str_replace_all(bibdf$C1[j], fixed(" "), "")  # Remove affiliation whitespace
  split_af <- strsplit(split_af, ";")  # Split by affiliations (at semi-colon)
  split_af <- split_af[[1]][-1]  # Remove redundant first element (reprint author); this address is repeated later
  affil_length <- length(split_af)
  # Replace every second comma with a semi-colon, so that each complete
  # "Last,First" name becomes one element
  split_af <- gsub("([^,]+,[^,]+),", "\\1;", split_af)
  split_af <- strsplit(split_af, ";")  # Split at semi-colon
  # Keep only unique names attached to each affiliation
  for (i in 1:length(split_af)) {
    split_af[[i]] <- unique(split_af[[i]])
  }
  split_af <- unlist(split_af)
  # Delete elements with no comma - these only contain affiliation data
  z <- !grepl(',', split_af)  # Logical identifying which elements contain no comma
  zvec <- which(z == TRUE)    # Vector of those elements
  # Skip this step when every element contains a comma; an empty zvec used as a
  # negative index would otherwise drop everything
  if (length(zvec) > 0) {
    split_af <- split_af[-zvec]
  }
  split_af <- strsplit(split_af, ",")  # Now split at comma to create last- and first-name columns
  mat <- matrix(unlist(split_af), ncol = 2, byrow = TRUE)  # Matrix with name info (also includes redundant institution details)
  df <- as.data.frame(mat)
  colnames(df) <- c("Last", "First")
  df$Init <- substr(df$First, 1, 1)   # Extract first-name initial - used in the name matching
  df$Last <- paste(df$Last, df$Init)  # Attach initial to last name
  auth_affil <- df$Last               # This will be matched with auth_names
  matched <- matches(auth_names, auth_affil)  # grr::matches - occurrences of each author's name in the affiliations data
  affil <- table(matched$x)   # Table of matched occurrences
  affil_tab <- table(affil)   # Summarise further to determine number of affiliations
  no_affil <- as.data.frame(affil_tab)
  colnames(no_affil) <- c("Affil_No", "Freq")
  no_affil <- cbind("Study" = j, "UT" = bibdf$UT[j], "Journal" = bibdf$SO[j],
                    "Tot_Cites" = bibdf$TC[j], "Cite_Split" = bibdf$cite_split[j],
                    "Num_Affil" = affil_length, "Num_Auth" = length(auth_names),
                    no_affil)
  append <- rbind(no_affil, append)  # Append data to existing
}

# Reorder levels of Affil_No
append$Affil_No <- factor(append$Affil_No,
                          levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "12"))
# Convert to wide format
append_wide <- spread(append, Affil_No, Freq)
# Rename Affil cols
names(append_wide)[c(8:18)] <- c("Affil_1", "Affil_2", "Affil_3", "Affil_4", "Affil_5", "Affil_6",
                                 "Affil_7", "Affil_8", "Affil_9", "Affil_10", "Affil_12")
# Delete NA row (if Study = NA)
del <- which(is.na(append_wide$Study))
append_wide <- append_wide[-del, ]
# Replace NA's with 0
append_wide[is.na(append_wide)] <- 0
# Delete NA column
append_wide <- append_wide[, -19]
# Sort on Study Num
append_wide <- append_wide[order(append_wide$Study), ]
# Create variable indicating max number of affiliations for each study
temp <- aggregate(as.numeric(Affil_No) ~ Study, data = append, max)
append_wide$Affil_Max <- temp[, 2]
append_wide$Affil_Max <- factor(append_wide$Affil_Max)
levels(append_wide$Affil_Max)
# There is no level 10 for Affil_Max: the one study with 10 affiliations was the
# same study that had 12, so 10 was never a maximum. Also, level 11 is actually
# 12 (as.numeric() on the factor converts level "12" to index 11).
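# Note (sketch): tidyr::spread() is superseded by pivot_wider(). A toy equivalence
# check -- not applied to the real data above, because the downstream code indexes
# columns by position (8:18, 19) and pivot_wider() orders new columns by first
# appearance unless names_sort = TRUE:
toy <- data.frame(id = c(1, 1, 2), key = c("a", "b", "a"), val = c(10, 20, 30))
spread(toy, key, val)
pivot_wider(toy, names_from = key, values_from = val)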
# Quantiles of author numbers - split at 4, 6, and 10, which are the 25%, 50%
# and 75% quantiles for the number of authors on each paper. This categorises
# author numbers for the modelling.
quantile(append_wide$Num_Auth)
append_wide$Cat_Auth <- 0
append_wide$Cat_Auth[append_wide$Num_Auth < 4] <- 1
append_wide$Cat_Auth[append_wide$Num_Auth > 3 & append_wide$Num_Auth < 6] <- 2
append_wide$Cat_Auth[append_wide$Num_Auth > 5 & append_wide$Num_Auth < 10] <- 3
append_wide$Cat_Auth[append_wide$Num_Auth > 9] <- 4
append_wide$Cat_Auth <- factor(append_wide$Cat_Auth)

# Subset to papers with no more than 6 affiliations
append_wide_sub <- subset(append_wide, as.numeric(Affil_Max) < 7)
append_wide_sub <- append_wide_sub[, -14:-18]
append_wide_sub$Affil_Max <- factor(append_wide_sub$Affil_Max)

#>----------------------------------------------------------
##SUMMARIES----
# Number of articles
length(append_wide$Study)
# Total citations over 5 years
sum(append_wide$Tot_Cites)
# Total author appearances over 5 years - these are not unique
sum(append_wide$Num_Auth)
# Total affiliations over 5 years
sum(append_wide$Num_Affil)
# Maximum number of citations on a paper
max(append_wide$Tot_Cites)
# Minimum number of citations on a paper
min(append_wide$Tot_Cites)
# Average number of citations per paper
mean(append_wide$Tot_Cites)
# Median number of citations per paper
median(append_wide$Tot_Cites)
# Maximum number of authors on a paper
max(append_wide$Num_Auth)
# Minimum number of authors on a paper
min(append_wide$Num_Auth)
# Average number of authors per paper
mean(append_wide$Num_Auth)
# Median number of authors per paper
median(append_wide$Num_Auth)
# Maximum number of affiliations on a paper
max(append_wide$Num_Affil)
# Minimum number of affiliations on a paper
min(append_wide$Num_Affil)
# Average number of affiliations per paper
mean(append_wide$Num_Affil)
# Median number of affiliations per paper
median(append_wide$Num_Affil)
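# One-shot equivalent of the descriptive summaries above in a single table
# (sketch; assumes dplyr >= 1.0 for across()):
append_wide %>%
  summarise(across(c(Tot_Cites, Num_Auth, Num_Affil),
                   list(max = max, min = min, mean = mean, median = median),
                   .names = "{.col}_{.fn}"))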
# Relative fractions (RFs) of authors for each affiliation
append_wide$RF_1 <- append_wide$Affil_1/append_wide$Num_Auth
append_wide$RF_2 <- append_wide$Affil_2/append_wide$Num_Auth
append_wide$RF_3 <- append_wide$Affil_3/append_wide$Num_Auth
append_wide$RF_4 <- append_wide$Affil_4/append_wide$Num_Auth
append_wide$RF_5 <- append_wide$Affil_5/append_wide$Num_Auth
append_wide$RF_6 <- append_wide$Affil_6/append_wide$Num_Auth
RF_means <- map(append_wide[21:26], mean)
RF_means <- unlist(RF_means)
RF_av_percent <- RF_means*100

# Table giving number of papers in each category (Affil_Max * Cat_Auth)
source("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/Functions/crosstab.r")
xtabf <- crosstab(append_wide_sub, row.vars = "Cat_Auth", col.vars = "Affil_Max", type = "f")  # Frequency
xtabj <- crosstab(append_wide_sub, row.vars = "Cat_Auth", col.vars = "Affil_Max", type = "j")  # Percent
xtable(xtabf$crosstab, digits = 0)
xtable(xtabj$crosstab)

# Difference in number of authors - most vs least cited
t.test(Num_Auth ~ Cite_Split, append_wide)
# Difference in number of affiliations - most vs least cited
t.test(Num_Affil ~ Cite_Split, append_wide)
# Difference in number of authors for Affil = 1 to 6
t.test(Affil_1 ~ Cite_Split, append_wide)
t.test(Affil_2 ~ Cite_Split, append_wide)
t.test(Affil_3 ~ Cite_Split, append_wide)
t.test(Affil_4 ~ Cite_Split, append_wide)
t.test(Affil_5 ~ Cite_Split, append_wide)
t.test(Affil_6 ~ Cite_Split, append_wide)

# Number of papers in most-cited group
length(append_wide$Num_Affil[append_wide$Cite_Split == 1])
# Number of papers in least-cited group
length(append_wide$Num_Affil[append_wide$Cite_Split == 0])

# Compare **number of papers** across each category of affiliation. Categories are
# not mutually exclusive, i.e. a paper may have more than one affiliation category
# and will therefore be counted more than once.
tabfreq <- table(append$Affil_No, append$Cite_Split)
xtable(tabfreq)
most_vec <- tabfreq[, 2]                  # Frequency of affiliations in most-cited half of all papers
total_vec <- tabfreq[, 1] + tabfreq[, 2]  # Total frequency of affiliations across all papers
prop.test(most_vec, total_vec)        # Proportions test (equivalent to Chi-Square)
prop.trend.test(most_vec, total_vec)  # Proportions trend test

# Compare **number of authors** across each category of affiliation
lowcite <- subset(append_wide, Cite_Split == 0)
highcite <- subset(append_wide, Cite_Split == 1)
lowcite_sum <- map(lowcite[, 8:18], sum)
lowcite_sum <- unlist(lowcite_sum)
sum(lowcite_sum)
highcite_sum <- map(highcite[, 8:18], sum)
highcite_sum <- unlist(highcite_sum)
sum(highcite_sum)
row_tot <- lowcite_sum + highcite_sum
row_perc <- row_tot/sum(row_tot)
col_tot <- c(sum(lowcite_sum), sum(highcite_sum), sum(row_tot), 1)
col_perc <- col_tot/sum(row_tot)
citedf <- data.frame(lowcite_sum, highcite_sum, row_tot, row_perc)
citedf <- rbind(citedf[1:10, ], c(0, 0, 0, 0), citedf[-(1:10), ])  # Insert zero row for Affil 11
citedf <- rbind(citedf, col_tot, col_perc)
citedf$row_perc[14] <- NA
rownames(citedf)[13] <- "col_tot"
rownames(citedf)[14] <- "col_perc"
xtable(citedf, digits = 0)
prop.test(citedf$highcite_sum[1:8], citedf$row_tot[1:8])        # Proportions test (equivalent to Chi-Square)
prop.trend.test(citedf$highcite_sum[1:6], citedf$row_tot[1:6])  # Proportions trend test

#>----------------------------------------------------------
##MODELLING----
# LM
summary(mod1a <- lm(Tot_Cites ~ Cat_Auth + Affil_Max, data = append_wide_sub))
# Log-Linear
summary(mod1b <- lm(log(Tot_Cites + 1) ~ Cat_Auth + Affil_Max, data = append_wide_sub))
mod1b$coefficients*100
# NB
summary(mod1c <- glm.nb(Tot_Cites ~ Cat_Auth + Affil_Max, data = append_wide_sub))
exp(mod1c$coef)
# Poisson
summary(mod2 <- glm(Tot_Cites ~ Cat_Auth + Affil_Max, family = "poisson", data = append_wide_sub))
exp(mod2$coef)
# Compare via likelihood ratio test: a large chi-square (small p-value) indicates
# the NB model fits better than the Poisson
pchisq(2 * (logLik(mod1c) - logLik(mod2)), df = 1, lower.tail = FALSE)
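# Quick overdispersion check on the Poisson fit (sketch): the Pearson chi-square
# statistic divided by the residual df should be near 1 for a well-specified
# Poisson model; values well above 1 are further evidence for the NB model.
sum(residuals(mod2, type = "pearson")^2) / df.residual(mod2)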
# Compare conditional means and variances of Tot_Cites within each level of
# Affil_Max. Variances exceed means, suggesting overdispersion - more evidence
# that the NB model is the better fit.
with(append_wide_sub, tapply(Tot_Cites, Affil_Max, function(x) {
  sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))

# Logistic
summary(mod1d <- glm(Cite_Split ~ Cat_Auth + Affil_Max, data = append_wide_sub, family = "binomial"))
exp(mod1d$coef)

# LM - Interaction
summary(mod1e <- lm(Tot_Cites ~ Affil_Max + Cat_Auth + Affil_Max:Cat_Auth, data = append_wide_sub))
# summary(mod1e <- lm(Tot_Cites ~ Cat_Auth + Affil_Max + Cat_Auth:Affil_Max, data = append_wide_sub))

# Marginal Means
library(emmeans)
emmeans(mod1e, ~ Affil_Max + Cat_Auth)
emmip(mod1e, Affil_Max ~ Cat_Auth)

# Global test for dropping the interaction (rows select the 15 interaction coefficients)
L <- matrix(c(0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1),
            byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
# Anova - should match the global contrast test above
anova(mod1a, mod1e)
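# The positional L matrices below are easy to misalign; an equivalent way to build
# a contrast row is by coefficient name (a sketch -- check names(coef(mod1e)) to
# confirm the labels, e.g. "Affil_Max2", before relying on it):
cf <- names(coef(mod1e))
Lnamed <- matrix(0, nrow = 1, ncol = length(cf), dimnames = list(NULL, cf))
Lnamed[, "Affil_Max2"] <- 1  # same as the first "Auth Num = 1, Max Affil = 2" contrast below
summary(glht(mod1e, Lnamed), test = Chisqtest())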
# Test Contrasts
# Auth Num = 1, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 1, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 1, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 2, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 2, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 2, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,1, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 3, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 3, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 3, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 4, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,1,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 4, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 4, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 1, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 1, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 1, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 2, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0, 1,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 2, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 2, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,1,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 3, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,1,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 3, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 3, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 4, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,0,1, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 4, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 4, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))

# Predictions
# NB: cov_pat6 is constructed in the BALLOON PLOT section below; run that block
# before this one.
nd <- cov_pat6[, 1:2]
# cbind predictions and SE's for 3 models
nd <- cbind(nd,
            Mean_LM = predict(mod1a, newdata = nd, type = "response"),
            SE_LM = predict(mod1a, newdata = nd, type = "response", se.fit = TRUE)$se.fit,
            Mean_LogLM = predict(mod1b, newdata = nd, type = "response"),
            SE_LogLM = predict(mod1b, newdata = nd, type = "response", se.fit = TRUE)$se.fit,
            Mean_NB = predict(mod1c, newdata = nd, type = "response"),
            SE_NB = predict(mod1c, newdata = nd, type = "response", se.fit = TRUE)$se.fit)
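# Why the + 0.5*sigma^2 term in the back-transform below: for a lognormal response,
# E[Y] = exp(mu + sigma^2/2), not exp(mu). A tiny simulation (sketch):
set.seed(42)
y_demo <- rlnorm(1e5, meanlog = 2, sdlog = 0.8)
mean(y_demo)        # empirical mean
exp(2 + 0.8^2/2)    # lognormal mean formula; the naive exp(2) underestimates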
mean_cites <- aggregate(Tot_Cites ~ Cat_Auth + Affil_Max, data = append_wide_sub, mean)
# Back-transform means and SE's for the loglinear model
nd2 <- cbind(nd,
             Mean_LogLM_Back = exp(nd[, 5] + 0.5*summary(mod1b)$sigma^2),
             SE_LogLM_Back = exp(nd[, 5])*nd[, 6],
             Mean_Cites = mean_cites[, 3])
# DF of means and CI's
nd_ci <- cbind(nd2[, 1:2],
               "Mean_Cites" = nd2[, 11],
               "LM_lower" = nd2[, 3] - (1.96*nd2[, 4]),
               "LM_mean" = nd2[, 3],
               "LM_upper" = nd2[, 3] + (1.96*nd2[, 4]),
               "LogLM_lower" = nd2[, 9] - (1.96*nd2[, 10]),
               "LogLM_mean" = nd2[, 9],
               "LogLM_upper" = nd2[, 9] + (1.96*nd2[, 10]),
               "NB_lower" = nd2[, 7] - (1.96*nd2[, 8]),
               "NB_mean" = nd2[, 7],
               "NB_upper" = nd2[, 7] + (1.96*nd2[, 8]))
# Convert to long for plotting
nd_ci_long <- data.frame(cbind("Cat_Auth" = rep(nd_ci$Cat_Auth, 4),
                               "Affil_Max" = rep(nd_ci$Affil_Max, 4),
                               "Model" = rep(c(1:4), each = 24),
                               "Mean" = c(nd_ci$Mean_Cites, nd_ci$LM_mean, nd_ci$LogLM_mean, nd_ci$NB_mean),
                               "Lower" = c(rep(NA, 24), nd_ci$LM_lower, nd_ci$LogLM_lower, nd_ci$NB_lower),
                               "Upper" = c(rep(NA, 24), nd_ci$LM_upper, nd_ci$LogLM_upper, nd_ci$NB_upper)))

# Format Model Output
# LM
cia <- as.data.frame(confint(mod1a))
mod1a_out <- cbind(mod1a$coefficients, coef(summary(mod1a))[, 2], cia[, 1], cia[, 2],
                   coef(summary(mod1a))[, 3], coef(summary(mod1a))[, 4])
mod1a_out <- data.frame(mod1a_out)
colnames(mod1a_out) <- c("beta", "S.E.", "2.5%", "97.5%", "t", "p-value")
xtable(mod1a_out, digits = 3)
# Log-LM
cib <- as.data.frame(confint(mod1b))
mod1b_out <- cbind(mod1b$coefficients, coef(summary(mod1b))[, 2], cib[, 1], cib[, 2],
                   coef(summary(mod1b))[, 3], coef(summary(mod1b))[, 4])
mod1b_out <- data.frame(mod1b_out)
colnames(mod1b_out) <- c("beta", "S.E.", "2.5%", "97.5%", "t", "p-value")
xtable(mod1b_out, digits = 3)
# NB
cic <- as.data.frame(exp(confint(mod1c)))
mod1c_out <- cbind(mod1c$coefficients, exp(mod1c$coefficients), coef(summary(mod1c))[, 2],
                   cic[, 1], cic[, 2], coef(summary(mod1c))[, 3], coef(summary(mod1c))[, 4])
mod1c_out <- data.frame(mod1c_out)
colnames(mod1c_out) <- c("beta", "Rate", "S.E.", "2.5%", "97.5%", "Z", "p-value")
xtable(mod1c_out, digits = 3)

# Check fit - p-value interpretation (should be > 0.05 for a good fit)
1 - pchisq(mod1c$deviance, mod1c$df.residual)
# Check fit - residual deviance interpretation (deviance should not exceed this value for a good fit)
qchisq(0.95, df.residual(mod1c))
deviance(mod1c)
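# Cross-check of the Poisson vs NB comparison via AIC (sketch; both model the same
# response on the same scale, so their AICs are directly comparable):
AIC(mod2, mod1c)  # the NB model should show the lower AIC, consistent with the LR test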
#>----------------------------------------------------------
##PLOTS----
# HISTOGRAMS
# Hist Citations
hist_1 <- ggplot(append_wide, aes(Tot_Cites)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous("Citations/Paper", breaks = seq(0, 500, 50), limits = c(0, 500)) +
  labs(y = "Frequency")
# Hist Authors
hist_2 <- ggplot(append_wide, aes(Num_Auth)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous("Authors/Paper", breaks = seq(0, 50, 5), limits = c(0, 50)) +
  labs(y = "Frequency")
# Hist Affiliations
hist_3 <- ggplot(append_wide, aes(Num_Affil)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous("Affiliations/Paper", breaks = seq(0, 50, 5), limits = c(0, 50)) +
  labs(y = "Frequency")
plot_grid(hist_1, hist_2, hist_3, align = 'h', ncol = 1, labels = c('A', 'B', 'C'))

# Label helpers for the boxplots: each returns a named vector (y = label position,
# label = rounded value), as required by stat_summary(fun.data = ...)
med.n <- function(x){
  return(c(y = median(x) + 10, label = round(median(x), 0)))  # experiment with the offset to find the perfect position
}
mean.n <- function(x){
  return(c(y = -8, label = round(mean(x), 0)))  # experiment with the offset to find the perfect position
}

# BOXPLOTS
ggplot(append_wide_sub, aes(Affil_Max, Tot_Cites)) +
  geom_boxplot(outlier.size = 0.2) +
  coord_cartesian(ylim = c(0, 500)) +
  facet_grid(~Cat_Auth) +
  theme(plot.subtitle = element_text(vjust = 1),
        plot.caption = element_text(vjust = 1),
        axis.line = element_line(colour = "gray50", size = 0.5, linetype = "solid"),
        panel.background = element_rect(fill = NA)) +
  labs(x = "Maximum Affiliation", y = "Citations") +
  stat_summary(fun.data = mean.n, geom = "text") +
  stat_summary(fun.data = med.n, geom = "text")

# CORRELATIONS
cor_df <- append_wide_sub[, c(4, 6, 7, 14, 8:13)]
cor_df$Affil_Max <- as.numeric(cor_df$Affil_Max)
# cor_df$Cat_Auth <- as.numeric(cor_df$Cat_Auth)  # disabled: Cat_Auth is not selected into cor_df, so this conversion would fail
names(cor_df) <- c("Citations", "Affiliations", "Authors", "Max. Affiliation",
                   "Affiliation = 1", "Affiliation = 2", "Affiliation = 3",
                   "Affiliation = 4", "Affiliation = 5", "Affiliation = 6")
cor_mat <- cor(cor_df, use = "complete.obs")
source("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/Functions/corrplot.r")
rquery.cormat(cor_df)

# BALLOON PLOT
# Create covariate pattern from the ungrouped data
test <- append_wide[, c(4, 19, 20)]
temp <- aggregate(Tot_Cites ~ Cat_Auth + Affil_Max, data = test, sum)
# Create zero values for the missing covariate patterns
temp <- rbind(temp, c(1, 8, 0))
temp <- rbind(temp, c(2, 8, 0))
temp <- rbind(temp, c(1, 9, 0))
temp <- rbind(temp, c(2, 9, 0))
temp <- rbind(temp, c(1, 11, 0))
temp <- rbind(temp, c(2, 11, 0))
temp <- rbind(temp, c(3, 11, 0))
# Only first 6 affiliations
cov_pat6 <- temp[1:24, ]
# Add offset - number of papers per category
cov_pat6$offs <- c(3142, 2715, 2898, 1387, 1371, 2207, 3845, 3374, 454, 811, 1509, 1859,
                   103, 210, 419, 695, 24, 61, 119, 250, 4, 9, 35, 64)
# p. 110 of R Graphics Cookbook - models summed citations from individual paper data
ggplot(cov_pat6, aes(x = Affil_Max, y = Cat_Auth, size = Tot_Cites)) +
  geom_point(shape = 21, colour = "black", fill = "cornsilk") +
  scale_size_area(max_size = 40, guide = FALSE) +
  labs(x = "Maximum Affiliation", y = "Author Number") +
  geom_text(aes(label = Tot_Cites), vjust = 7, colour = "grey50", size = 4)

# # Expected? summed citations if the number of papers in each category was the same
# # (i.e. original data -> fewer papers with higher Affil_Max)
# a <- table(append_wide_sub$Cat_Auth, append_wide_sub$Affil_Max)
# b <- as.data.frame.matrix(a)
# c <- max(b)/b
# d <- c(c[,1], c[,2], c[,3], c[,4], c[,5], c[,6])
# e <- cbind(cov_pat6, "adj" = d)
# e$Tot_Cites2 <- e$Tot_Cites*e$adj
# ggplot(e, aes(x = Affil_Max, y = Cat_Auth, size = Tot_Cites2)) +
#   geom_point(shape = 21, colour = "black", fill = "cornsilk") +
#   scale_size_area(max_size = 40, guide = FALSE) +
#   labs(x = "Maximum Affiliation", y = "Author Category") +
#   geom_text(aes(label = Tot_Cites2), vjust = 7, colour = "grey50", size = 4)
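# The hard-coded offs vector above could presumably be derived from the data
# (sketch; assumes aggregate() orders Cat_Auth fastest within Affil_Max, which
# matches as.vector()'s column-major order on this table - worth verifying):
offs_check <- as.vector(with(append_wide_sub, table(Cat_Auth, Affil_Max)))
# all(offs_check == cov_pat6$offs)  # should be TRUE if the orderings line up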
# LINE PLOTS
# With CI's
nd_ci_long %>%
  mutate(Model = as.factor(Model)) %>%
  ggplot(aes(Affil_Max, Mean)) +
  geom_line(aes(color = Model, group = Model, linetype = Model), size = 0.6) +
  scale_linetype_manual(values = c(2, 1, 1, 1)) +
  geom_ribbon(alpha = .3, aes(ymin = Lower, ymax = Upper, group = Model, fill = Model)) +
  scale_color_manual(values = c("black", "dodgerblue", "red3", "springgreen3")) +
  scale_fill_manual(values = c("black", "dodgerblue", "red3", "springgreen3")) +
  facet_grid(. ~ Cat_Auth)

# Without CI's
# Create gridline cuts
minorsy <- seq(50, 200, by = 10)
minorsx <- seq(1, 6, by = 1)
nd_ci_long %>%
  mutate(Model = as.factor(Model)) %>%
  ggplot(aes(Affil_Max, Mean)) +
  geom_hline(mapping = NULL, yintercept = minorsy, colour = 'grey95') +
  geom_vline(mapping = NULL, xintercept = minorsx, colour = 'grey95') +
  geom_line(aes(color = Model, group = Model, linetype = Model), size = 0.6) +
  scale_linetype_manual(values = c(2, 1, 1, 1)) +
  scale_color_manual(values = c("black", "dodgerblue", "red3", "springgreen3"),
                     labels = c("Data", "Linear Model", "LogLinear Model", "Negative Binomial Model")) +
  facet_grid(. ~ Cat_Auth) +
  scale_x_continuous("Maximum Affiliation", breaks = seq(1, 6, 1)) +
  scale_y_continuous("Citations", breaks = seq(0, 200, 10), limits = c(50, 200)) +
  theme(legend.position = "top", legend.title = element_blank()) +
  guides(color = guide_legend(override.aes = list(linetype = c(2, 1, 1, 1))), linetype = FALSE)

#>----------------------------------------------------------
#>----------------------------------------------------------
# ADDITIONAL STUFF
#>----------------------------------------------------------
#>----------------------------------------------------------
# AUTHORS
# NB
summary(m1 <- glm.nb(Tot_Cites ~ Num_Auth, maxit = 100, data = append_wide))  # Models citations at the paper level
exp(m1$coef)
1 - pchisq(m1$deviance, m1$df.residual)

# AFFILIATIONS
# NB
summary(m1 <- glm.nb(Tot_Cites ~ Num_Affil, data = append_wide))  # Models citations at the paper level
exp(m1$coef)
1 - pchisq(m1$deviance, m1$df.residual)

# Scatterplot: Number of Authors vs Number of Affiliations
plot(append_wide$Num_Auth, append_wide$Num_Affil)
cor.test(append_wide$Num_Auth, append_wide$Num_Affil)
plot(log(append_wide$Num_Auth), log(append_wide$Num_Affil))
# Scatterplot: Number of Authors vs Total Citations
plot(append_wide$Num_Auth, append_wide$Tot_Cites)
cor.test(append_wide$Num_Auth, append_wide$Tot_Cites)
plot(log(append_wide$Num_Auth), log(append_wide$Tot_Cites))
# Scatterplot: Number of Affiliations vs Total Citations
plot(append_wide$Num_Affil, append_wide$Tot_Cites)
cor.test(append_wide$Num_Affil, append_wide$Tot_Cites)
plot(log(append_wide$Num_Affil), log(append_wide$Tot_Cites))

# SMALL DATASET
# datan <- readLines("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/bib files/2014 Only/Nature.bib")
# bibdfn <- convert2df(datan, dbsource = "isi", format = "bibtex")
# bibresn <- biblioAnalysis(bibdfn, sep = ";")
#
# datas <- readLines("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/bib files/2014 Only/Science.bib")
# bibdfs <- convert2df(datas, dbsource = "isi", format = "bibtex")
# bibress <- biblioAnalysis(bibdfs, sep = ";")
#
# # Join Nature and Science df's
# bibdf <- rbind(bibdfn, bibdfs)

# Compare most and least cited halves and do PLOTS
# Create binary variable for the most-cited half (Bin_Cites_Tot = 1) and
# least-cited half (Bin_Cites_Tot = 0) OVERALL - equal numbers of papers in each half
append2 <- append[order(append$Tot_Cites), ]
append2$Bin_Cites_Tot <- 0
# NB: the next line expects append_wide$Bin_Cites_Tot, which is only created by the
# commented-out block at the end of this script; `temp` is not used below
temp <- append_wide$Study[append_wide$Bin_Cites_Tot == 1]
append2$Bin_Cites_Tot[append2$Study < 472] <- 1
append2$Bin_Cites_Tot[append2$Study > 828 & append2$Study < 1157] <- 1
append2 <- append2[order(append2$Study), ]
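# The Study-number cutoffs above (472, 828, 1157) are dataset-specific. A less
# brittle alternative (sketch) would flag most-cited papers by matching UT against
# the split already computed in append_wide - assuming the median-based Cite_Split
# is the intended grouping, which differs slightly from an equal-halves split.
# Bin_Cites_Chk is a hypothetical check variable, not used below:
append2$Bin_Cites_Chk <- as.integer(append2$UT %in% append_wide$UT[append_wide$Cite_Split == 1])
# table(append2$Bin_Cites_Tot, append2$Bin_Cites_Chk)  # compare the two splits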
# Summary Function
summarySE <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                      conf.interval = .95, .drop = TRUE) {
  library(plyr)
  # New version of length which can handle NA's: if na.rm == TRUE, don't count them
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) sum(!is.na(x)) else length(x)
  }
  # This does the summary: for each group's data frame, return a vector with
  # N, mean, and sd
  datac <- ddply(data, groupvars, .drop = .drop,
                 .fun = function(xx, col) {
                   c(N    = length2(xx[[col]], na.rm = na.rm),
                     mean = mean(xx[[col]], na.rm = na.rm),
                     sd   = sd(xx[[col]], na.rm = na.rm))
                 },
                 measurevar)
  # Rename the "mean" column
  datac <- rename(datac, c("mean" = measurevar))
  datac$se <- datac$sd / sqrt(datac$N)  # Standard error of the mean
  # Confidence interval multiplier for the standard error:
  # e.g., if conf.interval is .95, use .975 (above/below), with df = N-1
  ciMult <- qt(conf.interval/2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult
  return(datac)
}

#----------------------------------------------------------
# Number of Authors
# Summary measures
append2most <- subset(append2, Bin_Cites_Tot == 1)   # Most cited
append2least <- subset(append2, Bin_Cites_Tot == 0)  # Least cited
sum_most <- summarySE(append2most, measurevar = "Freq", groupvars = "Affil_No")
sum_most <- sum_most[1:6, ]  # Make max affiliations = 6
sum_least <- summarySE(append2least, measurevar = "Freq", groupvars = "Affil_No")
# Create Cite Group variable
sum_most$cite <- rep(1, 6)
sum_least$cite <- rep(0, 6)
# Join Cite Group summary data together
citesum <- rbind(sum_most, sum_least)
citesum$Affil_No <- factor(citesum$Affil_No, levels = c("1", "2", "3", "4", "5", "6"))  # Re-order levels
citesum$cite <- factor(citesum$cite)
# Plot the 2 Cite Groups
pd <- position_dodge(.1)
ggplot(citesum, aes(x = Affil_No, y = Freq, colour = cite, group = cite)) +
  theme_classic() +
  geom_errorbar(aes(ymin = Freq - se, ymax = Freq + se), colour = "black", width = .3, position = pd) +
  geom_line(position = pd, aes(group = cite)) +
  geom_point(aes(shape = cite), size = 4) +
  scale_shape(name = "", labels = c("Least-Cited", "Most-Cited")) +
  theme(legend.position = "top") +
  xlab("Number of Affiliations") +
  ylab("Average Number of Authors per Affiliation") +
  scale_colour_grey(start = 0, end = 0.5)

#----------------------------------------------------------
# Relative Fraction of Authors
# Compute RFs
append2$RF <- append2$Freq/append2$Num_Auth
# Summary measures
append2most <- subset(append2, Bin_Cites_Tot == 1)   # Most cited
append2least <- subset(append2, Bin_Cites_Tot == 0)  # Least cited
sum_most <- summarySE(append2most, measurevar = "RF", groupvars = "Affil_No")
sum_most <- sum_most[1:6, ]  # Make max affiliations = 6
sum_least <- summarySE(append2least, measurevar = "RF", groupvars = "Affil_No")
# Create Cite Group variable
sum_most$cite <- rep(1, 6)
sum_least$cite <- rep(0, 6)
# Join Cite Group summary data together
citesum <- rbind(sum_most, sum_least)
citesum$Affil_No <- factor(citesum$Affil_No, levels = c("1", "2", "3", "4", "5", "6"))  # Re-order levels
citesum$cite <- factor(citesum$cite)
# Plot the 2 Cite Groups
pd <- position_dodge(.1)
ggplot(citesum, aes(x = Affil_No, y = RF, colour = cite, group = cite)) +
  theme_classic() +
  geom_errorbar(aes(ymin = RF - se, ymax = RF + se), colour = "black", width = .3, position = pd) +
  geom_line(position = pd, aes(group = cite)) +
  geom_point(aes(shape = cite), size = 4) +
  scale_shape(name = "", labels = c("Least-Cited", "Most-Cited")) +
  theme(legend.position = "top") +
  xlab("Number of Affiliations") +
  ylab("Relative Fraction of Authors per Affiliation") +
  scale_colour_grey(start = 0, end = 0.5)
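# A dplyr equivalent of summarySE for this use case (sketch). summarySE attaches
# plyr, which masks dplyr::summarise, hence the explicit namespacing; rf_check is
# a hypothetical name introduced here for illustration only:
rf_check <- append2most %>%
  dplyr::group_by(Affil_No) %>%
  dplyr::summarise(N = sum(!is.na(RF)),
                   RF_mean = mean(RF, na.rm = TRUE),
                   sd = sd(RF, na.rm = TRUE),
                   se = sd / sqrt(N),
                   ci = se * qt(0.975, N - 1))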
# # Randomly check data matches that on the WOS Portal
# # Identify a random study by Accession Number online, then
# # match it to a study (UT) in the append_wide dataframe:
# which(append_wide$UT == "ISI000343799700048")
# # Check that the record matches
# append_wide[562, ]

# # Create binary variable for most-cited half (Bin_Cites_Jour = 1) and least-cited
# # half (Bin_Cites_Jour = 0) BY Journal (effectively a subgroup analysis)
# tempsci <- subset(append_wide, Journal == "SCIENCE")
# tempnat <- subset(append_wide, Journal == "NATURE")
# tempsci$Bin_Cites_Jour <- 0
# tempsci$Bin_Cites_Jour[384:768] <- 1
# tempnat$Bin_Cites_Jour <- 0
# tempnat$Bin_Cites_Jour[414:828] <- 1
# # Rejoin append_wide
# append_wide <- rbind(tempsci, tempnat)

# # Create binary variable for most-cited half (Bin_Cites_Tot = 1) and least-cited
# # half (Bin_Cites_Tot = 0) OVERALL
# append_wide <- append_wide[order(append_wide$Tot_Cites), ]
# append_wide$Bin_Cites_Tot <- 0
# append_wide$Bin_Cites_Tot[798:1596] <- 1
# append_wide <- append_wide[order(append_wide$Study), ]