library(bibliometrix)
library(dplyr)
library(purrr)
library(stringr)
library(grr)
library(tidyr)
library(ggplot2)
library(cowplot)
library(pscl)
library(MASS)
library(car)
library(multcomp)
library(xtable)

rm(list = ls())

#>----------------------------------------------------------
##DATA_IMPORT----
# Initialise dataframe for the processed bib files
dta <- data.frame()
setwd("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/bib files/2010_2014")
file_list <- list.files()
# Convert each bib file and append it to the merged dataset
for (file in file_list) {
  lines <- readLines(file)
  dta2 <- convert2df(lines, dbsource = "isi", format = "bibtex")
  dta <- rbind(dta, dta2)
  rm(dta2)
}
bibdf <- dta

# Sort on total cites
bibdf <- bibdf[order(bibdf$TC, decreasing = TRUE), ]
# Split most and least cited at the median value for total cites
bibdf$cite_split <- ifelse(bibdf$TC > median(bibdf$TC), 1, 0)
bibdf$cite_split <- as.factor(bibdf$cite_split)

# Papers with no affiliation data
which(is.na(bibdf$C1))
# Remove papers with no affiliation data
bibdf <- bibdf[!(is.na(bibdf$C1)), ]

# Search for equal co-authorship affiliations
grep('contributed equally', bibdf$C1, ignore.case = TRUE)
grep('first authors', bibdf$C1, ignore.case = TRUE)
grep('last authors', bibdf$C1, ignore.case = TRUE)
grep('equal contributions', bibdf$C1, ignore.case = TRUE)

# Papers with no 'REPRINT' info, which cause the script to stop
reprint <- which(!grepl('reprint', bibdf$C1, ignore.case = TRUE))
# Remove them (guarded: an empty negative index would otherwise drop every row)
if (length(reprint) > 0) bibdf <- bibdf[-reprint, ]
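# Quick sanity check (sketch) of the name-parsing regex used in the next section:
# it turns each "Last,Initial," pair into "Last,Initial;" so that a complete name
# becomes a single element. The affiliation string here is made up for illustration.
demo_af <- "SMITH,J.,JONES,A.,UnivMelbourne,Parkville,Australia."
gsub("([^,]+,[^,]+),", "\\1;", demo_af)
# Expected: "SMITH,J.;JONES,A.;UnivMelbourne,Parkville;Australia."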
#>----------------------------------------------------------
##CREATE_DF----
# Initialise output df
append <- data.frame(matrix(ncol = 9, nrow = 1))
colnames(append) <- c("Study", "UT", "Journal", "Tot_Cites", "Cite_Split",
                      "Num_Affil", "Num_Auth", "Affil_No", "Freq")
append$UT <- as.character(append$UT)
append$Journal <- factor(append$Journal)
append$Cite_Split <- factor(append$Cite_Split)
append$Affil_No <- factor(append$Affil_No)

# Loop through studies
for (j in seq_len(nrow(bibdf))) {
  # Create Authors data
  auth_names <- strsplit(bibdf$AU[j], ";")  # Split author names (at semi-colon)
  auth_names <- unlist(auth_names)
  auth_names <- unique(auth_names)  # Only keep unique occurrences of each author's name
  # Create Affiliations data
  split_af <- str_replace_all(bibdf$C1[j], fixed(" "), "")  # Remove affiliation whitespace
  split_af <- strsplit(split_af, ";")  # Split by affiliations (at semi-colon)
  split_af <- split_af[[1]][-1]  # Remove redundant first element (reprint author); this address is repeated later
  affil_length <- length(split_af)
  # Replace every second comma with a semi-colon, so that each complete
  # "Last,First" name becomes one element
  split_af <- gsub("([^,]+,[^,]+),", "\\1;", split_af)
  split_af <- strsplit(split_af, ";")  # Split at semi-colon
  # Keep only unique names attached to each affiliation
  for (i in 1:length(split_af)) {
    split_af[[i]] <- unique(split_af[[i]])
  }
  split_af <- unlist(split_af)
  # Delete elements with no comma - these only contain affiliation data
  z <- !grepl(',', split_af)  # Logical identifying which elements contain no comma
  zvec <- which(z == TRUE)    # Vector of those elements
  # Skip this step when every element contains a comma; an empty zvec used as a
  # negative index would otherwise drop everything
  if (length(zvec) > 0) {
    split_af <- split_af[-zvec]
  }
  split_af <- strsplit(split_af, ",")  # Now split at comma to create last- and first-name columns
  mat <- matrix(unlist(split_af), ncol = 2, byrow = TRUE)  # Matrix with name info (also includes redundant institution details)
  df <- as.data.frame(mat)
  colnames(df) <- c("Last", "First")
  df$Init <- substr(df$First, 1, 1)   # Extract first-name initial - used in the name matching
  df$Last <- paste(df$Last, df$Init)  # Attach initial to last name
  auth_affil <- df$Last               # This will be matched with auth_names
  matched <- matches(auth_names, auth_affil)  # grr::matches - occurrences of each author's name in the affiliations data
  affil <- table(matched$x)   # Table of matched occurrences
  affil_tab <- table(affil)   # Summarise further to determine number of affiliations
  no_affil <- as.data.frame(affil_tab)
  colnames(no_affil) <- c("Affil_No", "Freq")
  no_affil <- cbind("Study" = j, "UT" = bibdf$UT[j], "Journal" = bibdf$SO[j],
                    "Tot_Cites" = bibdf$TC[j], "Cite_Split" = bibdf$cite_split[j],
                    "Num_Affil" = affil_length, "Num_Auth" = length(auth_names),
                    no_affil)
  append <- rbind(no_affil, append)  # Append data to existing
}

# Reorder levels of Affil_No
append$Affil_No <- factor(append$Affil_No,
                          levels = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "12"))
# Convert to wide format
append_wide <- spread(append, Affil_No, Freq)
# Rename Affil cols
names(append_wide)[c(8:18)] <- c("Affil_1", "Affil_2", "Affil_3", "Affil_4", "Affil_5", "Affil_6",
                                 "Affil_7", "Affil_8", "Affil_9", "Affil_10", "Affil_12")
# Delete NA row (if Study = NA)
del <- which(is.na(append_wide$Study))
append_wide <- append_wide[-del, ]
# Replace NA's with 0
append_wide[is.na(append_wide)] <- 0
# Delete NA column
append_wide <- append_wide[, -19]
# Sort on Study Num
append_wide <- append_wide[order(append_wide$Study), ]
# Create variable indicating max number of affiliations for each study
temp <- aggregate(as.numeric(Affil_No) ~ Study, data = append, max)
append_wide$Affil_Max <- temp[, 2]
append_wide$Affil_Max <- factor(append_wide$Affil_Max)
levels(append_wide$Affil_Max)
# There is no level 10 for Affil_Max: the one study with 10 affiliations was the
# same study that had 12, so 10 was never a maximum. Also, level 11 is actually
# 12 (as.numeric() on the factor converts level "12" to index 11).
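# Note (sketch): tidyr::spread() is superseded by pivot_wider(). A toy equivalence
# check -- not applied to the real data above, because the downstream code indexes
# columns by position (8:18, 19) and pivot_wider() orders new columns by first
# appearance unless names_sort = TRUE:
toy <- data.frame(id = c(1, 1, 2), key = c("a", "b", "a"), val = c(10, 20, 30))
spread(toy, key, val)
pivot_wider(toy, names_from = key, values_from = val)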
# Quantiles of author numbers - split at 4, 6, and 10, which are the 25%, 50%
# and 75% quantiles for the number of authors on each paper. This categorises
# author numbers for the modelling.
quantile(append_wide$Num_Auth)
append_wide$Cat_Auth <- 0
append_wide$Cat_Auth[append_wide$Num_Auth < 4] <- 1
append_wide$Cat_Auth[append_wide$Num_Auth > 3 & append_wide$Num_Auth < 6] <- 2
append_wide$Cat_Auth[append_wide$Num_Auth > 5 & append_wide$Num_Auth < 10] <- 3
append_wide$Cat_Auth[append_wide$Num_Auth > 9] <- 4
append_wide$Cat_Auth <- factor(append_wide$Cat_Auth)

# Subset to papers with no more than 6 affiliations
append_wide_sub <- subset(append_wide, as.numeric(Affil_Max) < 7)
append_wide_sub <- append_wide_sub[, -14:-18]
append_wide_sub$Affil_Max <- factor(append_wide_sub$Affil_Max)

#>----------------------------------------------------------
##SUMMARIES----
# Number of articles
length(append_wide$Study)
# Total citations over 5 years
sum(append_wide$Tot_Cites)
# Total author appearances over 5 years - these are not unique
sum(append_wide$Num_Auth)
# Total affiliations over 5 years
sum(append_wide$Num_Affil)
# Maximum number of citations on a paper
max(append_wide$Tot_Cites)
# Minimum number of citations on a paper
min(append_wide$Tot_Cites)
# Average number of citations per paper
mean(append_wide$Tot_Cites)
# Median number of citations per paper
median(append_wide$Tot_Cites)
# Maximum number of authors on a paper
max(append_wide$Num_Auth)
# Minimum number of authors on a paper
min(append_wide$Num_Auth)
# Average number of authors per paper
mean(append_wide$Num_Auth)
# Median number of authors per paper
median(append_wide$Num_Auth)
# Maximum number of affiliations on a paper
max(append_wide$Num_Affil)
# Minimum number of affiliations on a paper
min(append_wide$Num_Affil)
# Average number of affiliations per paper
mean(append_wide$Num_Affil)
# Median number of affiliations per paper
median(append_wide$Num_Affil)
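# One-shot equivalent of the descriptive summaries above in a single table
# (sketch; assumes dplyr >= 1.0 for across()):
append_wide %>%
  summarise(across(c(Tot_Cites, Num_Auth, Num_Affil),
                   list(max = max, min = min, mean = mean, median = median),
                   .names = "{.col}_{.fn}"))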
# Relative fractions (RFs) of authors for each affiliation
append_wide$RF_1 <- append_wide$Affil_1/append_wide$Num_Auth
append_wide$RF_2 <- append_wide$Affil_2/append_wide$Num_Auth
append_wide$RF_3 <- append_wide$Affil_3/append_wide$Num_Auth
append_wide$RF_4 <- append_wide$Affil_4/append_wide$Num_Auth
append_wide$RF_5 <- append_wide$Affil_5/append_wide$Num_Auth
append_wide$RF_6 <- append_wide$Affil_6/append_wide$Num_Auth
RF_means <- map(append_wide[21:26], mean)
RF_means <- unlist(RF_means)
RF_av_percent <- RF_means*100

# Table giving number of papers in each category (Affil_Max * Cat_Auth)
source("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/Functions/crosstab.r")
xtabf <- crosstab(append_wide_sub, row.vars = "Cat_Auth", col.vars = "Affil_Max", type = "f")  # Frequency
xtabj <- crosstab(append_wide_sub, row.vars = "Cat_Auth", col.vars = "Affil_Max", type = "j")  # Percent
xtable(xtabf$crosstab, digits = 0)
xtable(xtabj$crosstab)

# Difference in number of authors - most vs least cited
t.test(Num_Auth ~ Cite_Split, append_wide)
# Difference in number of affiliations - most vs least cited
t.test(Num_Affil ~ Cite_Split, append_wide)
# Difference in number of authors for Affil = 1 to 6
t.test(Affil_1 ~ Cite_Split, append_wide)
t.test(Affil_2 ~ Cite_Split, append_wide)
t.test(Affil_3 ~ Cite_Split, append_wide)
t.test(Affil_4 ~ Cite_Split, append_wide)
t.test(Affil_5 ~ Cite_Split, append_wide)
t.test(Affil_6 ~ Cite_Split, append_wide)

# Number of papers in most-cited group
length(append_wide$Num_Affil[append_wide$Cite_Split == 1])
# Number of papers in least-cited group
length(append_wide$Num_Affil[append_wide$Cite_Split == 0])

# Compare **number of papers** across each category of affiliation. Categories are
# not mutually exclusive, i.e. a paper may have more than one affiliation category
# and will therefore be counted more than once.
tabfreq <- table(append$Affil_No, append$Cite_Split)
xtable(tabfreq)
most_vec <- tabfreq[, 2]                  # Frequency of affiliations in most-cited half of all papers
total_vec <- tabfreq[, 1] + tabfreq[, 2]  # Total frequency of affiliations across all papers
prop.test(most_vec, total_vec)        # Proportions test (equivalent to Chi-Square)
prop.trend.test(most_vec, total_vec)  # Proportions trend test

# Compare **number of authors** across each category of affiliation
lowcite <- subset(append_wide, Cite_Split == 0)
highcite <- subset(append_wide, Cite_Split == 1)
lowcite_sum <- map(lowcite[, 8:18], sum)
lowcite_sum <- unlist(lowcite_sum)
sum(lowcite_sum)
highcite_sum <- map(highcite[, 8:18], sum)
highcite_sum <- unlist(highcite_sum)
sum(highcite_sum)
row_tot <- lowcite_sum + highcite_sum
row_perc <- row_tot/sum(row_tot)
col_tot <- c(sum(lowcite_sum), sum(highcite_sum), sum(row_tot), 1)
col_perc <- col_tot/sum(row_tot)
citedf <- data.frame(lowcite_sum, highcite_sum, row_tot, row_perc)
citedf <- rbind(citedf[1:10, ], c(0, 0, 0, 0), citedf[-(1:10), ])  # Insert zero row for Affil 11
citedf <- rbind(citedf, col_tot, col_perc)
citedf$row_perc[14] <- NA
rownames(citedf)[13] <- "col_tot"
rownames(citedf)[14] <- "col_perc"
xtable(citedf, digits = 0)
prop.test(citedf$highcite_sum[1:8], citedf$row_tot[1:8])        # Proportions test (equivalent to Chi-Square)
prop.trend.test(citedf$highcite_sum[1:6], citedf$row_tot[1:6])  # Proportions trend test

#>----------------------------------------------------------
##MODELLING----
# LM
summary(mod1a <- lm(Tot_Cites ~ Cat_Auth + Affil_Max, data = append_wide_sub))
# Log-Linear
summary(mod1b <- lm(log(Tot_Cites + 1) ~ Cat_Auth + Affil_Max, data = append_wide_sub))
mod1b$coefficients*100
# NB
summary(mod1c <- glm.nb(Tot_Cites ~ Cat_Auth + Affil_Max, data = append_wide_sub))
exp(mod1c$coef)
# Poisson
summary(mod2 <- glm(Tot_Cites ~ Cat_Auth + Affil_Max, family = "poisson", data = append_wide_sub))
exp(mod2$coef)
# Compare via likelihood ratio test: a large chi-square (small p-value) indicates
# the NB model fits better than the Poisson
pchisq(2 * (logLik(mod1c) - logLik(mod2)), df = 1, lower.tail = FALSE)
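# Quick overdispersion check on the Poisson fit (sketch): the Pearson chi-square
# statistic divided by the residual df should be near 1 for a well-specified
# Poisson model; values well above 1 are further evidence for the NB model.
sum(residuals(mod2, type = "pearson")^2) / df.residual(mod2)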
# Compare conditional means and variances of Tot_Cites within each level of
# Affil_Max. Variances exceed means, suggesting overdispersion - more evidence
# that the NB model is the better fit.
with(append_wide_sub, tapply(Tot_Cites, Affil_Max, function(x) {
  sprintf("M (SD) = %1.2f (%1.2f)", mean(x), sd(x))
}))

# Logistic
summary(mod1d <- glm(Cite_Split ~ Cat_Auth + Affil_Max, data = append_wide_sub, family = "binomial"))
exp(mod1d$coef)

# LM - Interaction
summary(mod1e <- lm(Tot_Cites ~ Affil_Max + Cat_Auth + Affil_Max:Cat_Auth, data = append_wide_sub))
# summary(mod1e <- lm(Tot_Cites ~ Cat_Auth + Affil_Max + Cat_Auth:Affil_Max, data = append_wide_sub))

# Marginal Means
library(emmeans)
emmeans(mod1e, ~ Affil_Max + Cat_Auth)
emmip(mod1e, Affil_Max ~ Cat_Auth)

# Global test for dropping the interaction (rows select the 15 interaction coefficients)
L <- matrix(c(0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
              0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1),
            byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
# Anova - should match the global contrast test above
anova(mod1a, mod1e)
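# The positional L matrices below are easy to misalign; an equivalent way to build
# a contrast row is by coefficient name (a sketch -- check names(coef(mod1e)) to
# confirm the labels, e.g. "Affil_Max2", before relying on it):
cf <- names(coef(mod1e))
Lnamed <- matrix(0, nrow = 1, ncol = length(cf), dimnames = list(NULL, cf))
Lnamed[, "Affil_Max2"] <- 1  # same as the first "Auth Num = 1, Max Affil = 2" contrast below
summary(glht(mod1e, Lnamed), test = Chisqtest())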
# Test Contrasts
# Auth Num = 1, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 1, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 1, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 2, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 2, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 2, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,1, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 3, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 3, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 3, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 4, Max Affil = 2
L <- matrix(c(0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,1,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 4, Max Affil = 3
L <- matrix(c(0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Auth Num = 4, Max Affil = 4
L <- matrix(c(0,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 1, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 1, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 1, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 2, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0, 1,0,0,1,0,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 2, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 2, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,1,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 3, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,1,0, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 3, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,1,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 3, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,1,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 4, Auth Num = 2
L <- matrix(c(0,0,0,0,0,0, 1,0,0,0,0,1, 0,0,0,0,0,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 4, Auth Num = 3
L <- matrix(c(0,0,0,0,0,0, 0,1,0,0,0,0, 0,0,0,0,1,0, 0,0,0,0,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))
# Max Affil = 4, Auth Num = 4
L <- matrix(c(0,0,0,0,0,0, 0,0,1,0,0,0, 0,0,0,0,0,0, 0,0,0,1,0,0), byrow = TRUE, ncol = 24)
(summary(glht(mod1e, L), test = Chisqtest()))
(contrast <- confint(glht(mod1e, L)))

# Predictions
# NB: cov_pat6 is constructed in the BALLOON PLOT section below; run that block
# before this one.
nd <- cov_pat6[, 1:2]
# cbind predictions and SE's for 3 models
nd <- cbind(nd,
            Mean_LM = predict(mod1a, newdata = nd, type = "response"),
            SE_LM = predict(mod1a, newdata = nd, type = "response", se.fit = TRUE)$se.fit,
            Mean_LogLM = predict(mod1b, newdata = nd, type = "response"),
            SE_LogLM = predict(mod1b, newdata = nd, type = "response", se.fit = TRUE)$se.fit,
            Mean_NB = predict(mod1c, newdata = nd, type = "response"),
            SE_NB = predict(mod1c, newdata = nd, type = "response", se.fit = TRUE)$se.fit)
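# Why the + 0.5*sigma^2 term in the back-transform below: for a lognormal response,
# E[Y] = exp(mu + sigma^2/2), not exp(mu). A tiny simulation (sketch):
set.seed(42)
y_demo <- rlnorm(1e5, meanlog = 2, sdlog = 0.8)
mean(y_demo)        # empirical mean
exp(2 + 0.8^2/2)    # lognormal mean formula; the naive exp(2) underestimates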
mean_cites <- aggregate(Tot_Cites ~ Cat_Auth + Affil_Max, data = append_wide_sub, mean)
# Back-transform means and SE's for the loglinear model
nd2 <- cbind(nd,
             Mean_LogLM_Back = exp(nd[, 5] + 0.5*summary(mod1b)$sigma^2),
             SE_LogLM_Back = exp(nd[, 5])*nd[, 6],
             Mean_Cites = mean_cites[, 3])
# DF of means and CI's
nd_ci <- cbind(nd2[, 1:2],
               "Mean_Cites" = nd2[, 11],
               "LM_lower" = nd2[, 3] - (1.96*nd2[, 4]),
               "LM_mean" = nd2[, 3],
               "LM_upper" = nd2[, 3] + (1.96*nd2[, 4]),
               "LogLM_lower" = nd2[, 9] - (1.96*nd2[, 10]),
               "LogLM_mean" = nd2[, 9],
               "LogLM_upper" = nd2[, 9] + (1.96*nd2[, 10]),
               "NB_lower" = nd2[, 7] - (1.96*nd2[, 8]),
               "NB_mean" = nd2[, 7],
               "NB_upper" = nd2[, 7] + (1.96*nd2[, 8]))
# Convert to long for plotting
nd_ci_long <- data.frame(cbind("Cat_Auth" = rep(nd_ci$Cat_Auth, 4),
                               "Affil_Max" = rep(nd_ci$Affil_Max, 4),
                               "Model" = rep(c(1:4), each = 24),
                               "Mean" = c(nd_ci$Mean_Cites, nd_ci$LM_mean, nd_ci$LogLM_mean, nd_ci$NB_mean),
                               "Lower" = c(rep(NA, 24), nd_ci$LM_lower, nd_ci$LogLM_lower, nd_ci$NB_lower),
                               "Upper" = c(rep(NA, 24), nd_ci$LM_upper, nd_ci$LogLM_upper, nd_ci$NB_upper)))

# Format Model Output
# LM
cia <- as.data.frame(confint(mod1a))
mod1a_out <- cbind(mod1a$coefficients, coef(summary(mod1a))[, 2], cia[, 1], cia[, 2],
                   coef(summary(mod1a))[, 3], coef(summary(mod1a))[, 4])
mod1a_out <- data.frame(mod1a_out)
colnames(mod1a_out) <- c("beta", "S.E.", "2.5%", "97.5%", "t", "p-value")
xtable(mod1a_out, digits = 3)
# Log-LM
cib <- as.data.frame(confint(mod1b))
mod1b_out <- cbind(mod1b$coefficients, coef(summary(mod1b))[, 2], cib[, 1], cib[, 2],
                   coef(summary(mod1b))[, 3], coef(summary(mod1b))[, 4])
mod1b_out <- data.frame(mod1b_out)
colnames(mod1b_out) <- c("beta", "S.E.", "2.5%", "97.5%", "t", "p-value")
xtable(mod1b_out, digits = 3)
# NB
cic <- as.data.frame(exp(confint(mod1c)))
mod1c_out <- cbind(mod1c$coefficients, exp(mod1c$coefficients), coef(summary(mod1c))[, 2],
                   cic[, 1], cic[, 2], coef(summary(mod1c))[, 3], coef(summary(mod1c))[, 4])
mod1c_out <- data.frame(mod1c_out)
colnames(mod1c_out) <- c("beta", "Rate", "S.E.", "2.5%", "97.5%", "Z", "p-value")
xtable(mod1c_out, digits = 3)

# Check fit - p-value interpretation (should be > 0.05 for a good fit)
1 - pchisq(mod1c$deviance, mod1c$df.residual)
# Check fit - residual deviance interpretation (deviance should not exceed this value for a good fit)
qchisq(0.95, df.residual(mod1c))
deviance(mod1c)
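# Cross-check of the Poisson vs NB comparison via AIC (sketch; both model the same
# response on the same scale, so their AICs are directly comparable):
AIC(mod2, mod1c)  # the NB model should show the lower AIC, consistent with the LR test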
#>----------------------------------------------------------
##PLOTS----
# HISTOGRAMS
# Hist Citations
hist_1 <- ggplot(append_wide, aes(Tot_Cites)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous("Citations/Paper", breaks = seq(0, 500, 50), limits = c(0, 500)) +
  labs(y = "Frequency")
# Hist Authors
hist_2 <- ggplot(append_wide, aes(Num_Auth)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous("Authors/Paper", breaks = seq(0, 50, 5), limits = c(0, 50)) +
  labs(y = "Frequency")
# Hist Affiliations
hist_3 <- ggplot(append_wide, aes(Num_Affil)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous("Affiliations/Paper", breaks = seq(0, 50, 5), limits = c(0, 50)) +
  labs(y = "Frequency")
plot_grid(hist_1, hist_2, hist_3, align = 'h', ncol = 1, labels = c('A', 'B', 'C'))

# Label helpers for the boxplots: each returns a named vector (y = label position,
# label = rounded value), as required by stat_summary(fun.data = ...)
med.n <- function(x){
  return(c(y = median(x) + 10, label = round(median(x), 0)))  # experiment with the offset to find the perfect position
}
mean.n <- function(x){
  return(c(y = -8, label = round(mean(x), 0)))  # experiment with the offset to find the perfect position
}

# BOXPLOTS
ggplot(append_wide_sub, aes(Affil_Max, Tot_Cites)) +
  geom_boxplot(outlier.size = 0.2) +
  coord_cartesian(ylim = c(0, 500)) +
  facet_grid(~Cat_Auth) +
  theme(plot.subtitle = element_text(vjust = 1),
        plot.caption = element_text(vjust = 1),
        axis.line = element_line(colour = "gray50", size = 0.5, linetype = "solid"),
        panel.background = element_rect(fill = NA)) +
  labs(x = "Maximum Affiliation", y = "Citations") +
  stat_summary(fun.data = mean.n, geom = "text") +
  stat_summary(fun.data = med.n, geom = "text")

# CORRELATIONS
cor_df <- append_wide_sub[, c(4, 6, 7, 14, 8:13)]
cor_df$Affil_Max <- as.numeric(cor_df$Affil_Max)
# cor_df$Cat_Auth <- as.numeric(cor_df$Cat_Auth)  # disabled: Cat_Auth is not selected into cor_df, so this conversion would fail
names(cor_df) <- c("Citations", "Affiliations", "Authors", "Max. Affiliation",
                   "Affiliation = 1", "Affiliation = 2", "Affiliation = 3",
                   "Affiliation = 4", "Affiliation = 5", "Affiliation = 6")
cor_mat <- cor(cor_df, use = "complete.obs")
source("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/Functions/corrplot.r")
rquery.cormat(cor_df)

# BALLOON PLOT
# Create covariate pattern from the ungrouped data
test <- append_wide[, c(4, 19, 20)]
temp <- aggregate(Tot_Cites ~ Cat_Auth + Affil_Max, data = test, sum)
# Create zero values for the missing covariate patterns
temp <- rbind(temp, c(1, 8, 0))
temp <- rbind(temp, c(2, 8, 0))
temp <- rbind(temp, c(1, 9, 0))
temp <- rbind(temp, c(2, 9, 0))
temp <- rbind(temp, c(1, 11, 0))
temp <- rbind(temp, c(2, 11, 0))
temp <- rbind(temp, c(3, 11, 0))
# Only first 6 affiliations
cov_pat6 <- temp[1:24, ]
# Add offset - number of papers per category
cov_pat6$offs <- c(3142, 2715, 2898, 1387, 1371, 2207, 3845, 3374, 454, 811, 1509, 1859,
                   103, 210, 419, 695, 24, 61, 119, 250, 4, 9, 35, 64)
# p. 110 of R Graphics Cookbook - models summed citations from individual paper data
ggplot(cov_pat6, aes(x = Affil_Max, y = Cat_Auth, size = Tot_Cites)) +
  geom_point(shape = 21, colour = "black", fill = "cornsilk") +
  scale_size_area(max_size = 40, guide = FALSE) +
  labs(x = "Maximum Affiliation", y = "Author Number") +
  geom_text(aes(label = Tot_Cites), vjust = 7, colour = "grey50", size = 4)

# # Expected? summed citations if the number of papers in each category was the same
# # (i.e. original data -> fewer papers with higher Affil_Max)
# a <- table(append_wide_sub$Cat_Auth, append_wide_sub$Affil_Max)
# b <- as.data.frame.matrix(a)
# c <- max(b)/b
# d <- c(c[,1], c[,2], c[,3], c[,4], c[,5], c[,6])
# e <- cbind(cov_pat6, "adj" = d)
# e$Tot_Cites2 <- e$Tot_Cites*e$adj
# ggplot(e, aes(x = Affil_Max, y = Cat_Auth, size = Tot_Cites2)) +
#   geom_point(shape = 21, colour = "black", fill = "cornsilk") +
#   scale_size_area(max_size = 40, guide = FALSE) +
#   labs(x = "Maximum Affiliation", y = "Author Category") +
#   geom_text(aes(label = Tot_Cites2), vjust = 7, colour = "grey50", size = 4)
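# The hard-coded offs vector above could presumably be derived from the data
# (sketch; assumes aggregate() orders Cat_Auth fastest within Affil_Max, which
# matches as.vector()'s column-major order on this table - worth verifying):
offs_check <- as.vector(with(append_wide_sub, table(Cat_Auth, Affil_Max)))
# all(offs_check == cov_pat6$offs)  # should be TRUE if the orderings line up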
# LINE PLOTS
# With CI's
nd_ci_long %>%
  mutate(Model = as.factor(Model)) %>%
  ggplot(aes(Affil_Max, Mean)) +
  geom_line(aes(color = Model, group = Model, linetype = Model), size = 0.6) +
  scale_linetype_manual(values = c(2, 1, 1, 1)) +
  geom_ribbon(alpha = .3, aes(ymin = Lower, ymax = Upper, group = Model, fill = Model)) +
  scale_color_manual(values = c("black", "dodgerblue", "red3", "springgreen3")) +
  scale_fill_manual(values = c("black", "dodgerblue", "red3", "springgreen3")) +
  facet_grid(. ~ Cat_Auth)

# Without CI's
# Create gridline cuts
minorsy <- seq(50, 200, by = 10)
minorsx <- seq(1, 6, by = 1)
nd_ci_long %>%
  mutate(Model = as.factor(Model)) %>%
  ggplot(aes(Affil_Max, Mean)) +
  geom_hline(mapping = NULL, yintercept = minorsy, colour = 'grey95') +
  geom_vline(mapping = NULL, xintercept = minorsx, colour = 'grey95') +
  geom_line(aes(color = Model, group = Model, linetype = Model), size = 0.6) +
  scale_linetype_manual(values = c(2, 1, 1, 1)) +
  scale_color_manual(values = c("black", "dodgerblue", "red3", "springgreen3"),
                     labels = c("Data", "Linear Model", "LogLinear Model", "Negative Binomial Model")) +
  facet_grid(. ~ Cat_Auth) +
  scale_x_continuous("Maximum Affiliation", breaks = seq(1, 6, 1)) +
  scale_y_continuous("Citations", breaks = seq(0, 200, 10), limits = c(50, 200)) +
  theme(legend.position = "top", legend.title = element_blank()) +
  guides(color = guide_legend(override.aes = list(linetype = c(2, 1, 1, 1))), linetype = FALSE)

#>----------------------------------------------------------
#>----------------------------------------------------------
# ADDITIONAL STUFF
#>----------------------------------------------------------
#>----------------------------------------------------------
# AUTHORS
# NB
summary(m1 <- glm.nb(Tot_Cites ~ Num_Auth, maxit = 100, data = append_wide))  # Models citations at the paper level
exp(m1$coef)
1 - pchisq(m1$deviance, m1$df.residual)

# AFFILIATIONS
# NB
summary(m1 <- glm.nb(Tot_Cites ~ Num_Affil, data = append_wide))  # Models citations at the paper level
exp(m1$coef)
1 - pchisq(m1$deviance, m1$df.residual)

# Scatterplot: Number of Authors vs Number of Affiliations
plot(append_wide$Num_Auth, append_wide$Num_Affil)
cor.test(append_wide$Num_Auth, append_wide$Num_Affil)
plot(log(append_wide$Num_Auth), log(append_wide$Num_Affil))
# Scatterplot: Number of Authors vs Total Citations
plot(append_wide$Num_Auth, append_wide$Tot_Cites)
cor.test(append_wide$Num_Auth, append_wide$Tot_Cites)
plot(log(append_wide$Num_Auth), log(append_wide$Tot_Cites))
# Scatterplot: Number of Affiliations vs Total Citations
plot(append_wide$Num_Affil, append_wide$Tot_Cites)
cor.test(append_wide$Num_Affil, append_wide$Tot_Cites)
plot(log(append_wide$Num_Affil), log(append_wide$Tot_Cites))

# SMALL DATASET
# datan <- readLines("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/bib files/2014 Only/Nature.bib")
# bibdfn <- convert2df(datan, dbsource = "isi", format = "bibtex")
# bibresn <- biblioAnalysis(bibdfn, sep = ";")
#
# datas <- readLines("/Users/paulsanfilippo/Dropbox/Research Projects/2017/Polygamous Affiliations/bib files/2014 Only/Science.bib")
# bibdfs <- convert2df(datas, dbsource = "isi", format = "bibtex")
# bibress <- biblioAnalysis(bibdfs, sep = ";")
#
# # Join Nature and Science df's
# bibdf <- rbind(bibdfn, bibdfs)

# Compare most and least cited halves and do PLOTS
# Create binary variable for the most-cited half (Bin_Cites_Tot = 1) and
# least-cited half (Bin_Cites_Tot = 0) OVERALL - equal numbers of papers in each half
append2 <- append[order(append$Tot_Cites), ]
append2$Bin_Cites_Tot <- 0
# NB: the next line expects append_wide$Bin_Cites_Tot, which is only created by the
# commented-out block at the end of this script; `temp` is not used below
temp <- append_wide$Study[append_wide$Bin_Cites_Tot == 1]
append2$Bin_Cites_Tot[append2$Study < 472] <- 1
append2$Bin_Cites_Tot[append2$Study > 828 & append2$Study < 1157] <- 1
append2 <- append2[order(append2$Study), ]
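# The Study-number cutoffs above (472, 828, 1157) are dataset-specific. A less
# brittle alternative (sketch) would flag most-cited papers by matching UT against
# the split already computed in append_wide - assuming the median-based Cite_Split
# is the intended grouping, which differs slightly from an equal-halves split.
# Bin_Cites_Chk is a hypothetical check variable, not used below:
append2$Bin_Cites_Chk <- as.integer(append2$UT %in% append_wide$UT[append_wide$Cite_Split == 1])
# table(append2$Bin_Cites_Tot, append2$Bin_Cites_Chk)  # compare the two splits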
# Summary Function
summarySE <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE,
                      conf.interval = .95, .drop = TRUE) {
  library(plyr)
  # New version of length which can handle NA's: if na.rm == TRUE, don't count them
  length2 <- function(x, na.rm = FALSE) {
    if (na.rm) sum(!is.na(x)) else length(x)
  }
  # This does the summary: for each group's data frame, return a vector with
  # N, mean, and sd
  datac <- ddply(data, groupvars, .drop = .drop,
                 .fun = function(xx, col) {
                   c(N    = length2(xx[[col]], na.rm = na.rm),
                     mean = mean(xx[[col]], na.rm = na.rm),
                     sd   = sd(xx[[col]], na.rm = na.rm))
                 },
                 measurevar)
  # Rename the "mean" column
  datac <- rename(datac, c("mean" = measurevar))
  datac$se <- datac$sd / sqrt(datac$N)  # Standard error of the mean
  # Confidence interval multiplier for the standard error:
  # e.g., if conf.interval is .95, use .975 (above/below), with df = N-1
  ciMult <- qt(conf.interval/2 + .5, datac$N - 1)
  datac$ci <- datac$se * ciMult
  return(datac)
}

#----------------------------------------------------------
# Number of Authors
# Summary measures
append2most <- subset(append2, Bin_Cites_Tot == 1)   # Most cited
append2least <- subset(append2, Bin_Cites_Tot == 0)  # Least cited
sum_most <- summarySE(append2most, measurevar = "Freq", groupvars = "Affil_No")
sum_most <- sum_most[1:6, ]  # Make max affiliations = 6
sum_least <- summarySE(append2least, measurevar = "Freq", groupvars = "Affil_No")
# Create Cite Group variable
sum_most$cite <- rep(1, 6)
sum_least$cite <- rep(0, 6)
# Join Cite Group summary data together
citesum <- rbind(sum_most, sum_least)
citesum$Affil_No <- factor(citesum$Affil_No, levels = c("1", "2", "3", "4", "5", "6"))  # Re-order levels
citesum$cite <- factor(citesum$cite)
# Plot the 2 Cite Groups
pd <- position_dodge(.1)
ggplot(citesum, aes(x = Affil_No, y = Freq, colour = cite, group = cite)) +
  theme_classic() +
  geom_errorbar(aes(ymin = Freq - se, ymax = Freq + se), colour = "black", width = .3, position = pd) +
  geom_line(position = pd, aes(group = cite)) +
  geom_point(aes(shape = cite), size = 4) +
  scale_shape(name = "", labels = c("Least-Cited", "Most-Cited")) +
  theme(legend.position = "top") +
  xlab("Number of Affiliations") +
  ylab("Average Number of Authors per Affiliation") +
  scale_colour_grey(start = 0, end = 0.5)

#----------------------------------------------------------
# Relative Fraction of Authors
# Compute RFs
append2$RF <- append2$Freq/append2$Num_Auth
# Summary measures
append2most <- subset(append2, Bin_Cites_Tot == 1)   # Most cited
append2least <- subset(append2, Bin_Cites_Tot == 0)  # Least cited
sum_most <- summarySE(append2most, measurevar = "RF", groupvars = "Affil_No")
sum_most <- sum_most[1:6, ]  # Make max affiliations = 6
sum_least <- summarySE(append2least, measurevar = "RF", groupvars = "Affil_No")
# Create Cite Group variable
sum_most$cite <- rep(1, 6)
sum_least$cite <- rep(0, 6)
# Join Cite Group summary data together
citesum <- rbind(sum_most, sum_least)
citesum$Affil_No <- factor(citesum$Affil_No, levels = c("1", "2", "3", "4", "5", "6"))  # Re-order levels
citesum$cite <- factor(citesum$cite)
# Plot the 2 Cite Groups
pd <- position_dodge(.1)
ggplot(citesum, aes(x = Affil_No, y = RF, colour = cite, group = cite)) +
  theme_classic() +
  geom_errorbar(aes(ymin = RF - se, ymax = RF + se), colour = "black", width = .3, position = pd) +
  geom_line(position = pd, aes(group = cite)) +
  geom_point(aes(shape = cite), size = 4) +
  scale_shape(name = "", labels = c("Least-Cited", "Most-Cited")) +
  theme(legend.position = "top") +
  xlab("Number of Affiliations") +
  ylab("Relative Fraction of Authors per Affiliation") +
  scale_colour_grey(start = 0, end = 0.5)
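# A dplyr equivalent of summarySE for this use case (sketch). summarySE attaches
# plyr, which masks dplyr::summarise, hence the explicit namespacing; rf_check is
# a hypothetical name introduced here for illustration only:
rf_check <- append2most %>%
  dplyr::group_by(Affil_No) %>%
  dplyr::summarise(N = sum(!is.na(RF)),
                   RF_mean = mean(RF, na.rm = TRUE),
                   sd = sd(RF, na.rm = TRUE),
                   se = sd / sqrt(N),
                   ci = se * qt(0.975, N - 1))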
# # Randomly check data matches that on the WOS Portal
# # Identify a random study by Accession Number online, then
# # match it to a study (UT) in the append_wide dataframe:
# which(append_wide$UT == "ISI000343799700048")
# # Check that the record matches
# append_wide[562, ]

# # Create binary variable for most-cited half (Bin_Cites_Jour = 1) and least-cited
# # half (Bin_Cites_Jour = 0) BY Journal (effectively a subgroup analysis)
# tempsci <- subset(append_wide, Journal == "SCIENCE")
# tempnat <- subset(append_wide, Journal == "NATURE")
# tempsci$Bin_Cites_Jour <- 0
# tempsci$Bin_Cites_Jour[384:768] <- 1
# tempnat$Bin_Cites_Jour <- 0
# tempnat$Bin_Cites_Jour[414:828] <- 1
# # Rejoin append_wide
# append_wide <- rbind(tempsci, tempnat)

# # Create binary variable for most-cited half (Bin_Cites_Tot = 1) and least-cited
# # half (Bin_Cites_Tot = 0) OVERALL
# append_wide <- append_wide[order(append_wide$Tot_Cites), ]
# append_wide$Bin_Cites_Tot <- 0
# append_wide$Bin_Cites_Tot[798:1596] <- 1
# append_wide <- append_wide[order(append_wide$Study), ]