######################################################################
## Workbook for Reproducing of Discriminant Function Analysis
#################################################################

## PRELIMINARY: 
##    Prepare infile.csv with appropriate rows, grouping columns and missing data fields. 
##    Identify column numbers to be used in the analysis (groupings and data columns).
##    Choose variable for coefficient, determine type of PAC if mentioned, and whether the overall PAC or that of a particular group is to be used.

library(MASS)
library(candisc)

##############################################################################
## Function needed for standardised coefficients 
##  - modified from http://little-book-of-r-for-multivariate-analysis.readthedocs.org/en/latest/src/multivariateanalysis.html#linear-discriminant-analysis 
##############################################################################
calcWithinGroupsVariance <- function(variable,groupvariable)
{
  # find out how many values the group variable can take
  groupvariable2 <- as.factor(groupvariable[[1]])
  levels <- levels(groupvariable2)
  numlevels <- length(levels)
  # get the mean and standard deviation for each group:
  numtotal <- 0
  denomtotal <- 0
  for (i in 1:numlevels)
  {
    leveli <- levels[i]
    levelidata <- variable[groupvariable==leveli]
    levelilength <- length(levelidata)
    # get the standard deviation for group i:
    sdi <- sd(levelidata)
    numi <- (levelilength - 1)*(sdi * sdi)
    denomi <- levelilength
    numtotal <- numtotal + numi
    denomtotal <- denomtotal + denomi
  }
  # calculate the within-groups variance
  Vw <- numtotal / (denomtotal - numlevels)
  return(Vw)
}

###############################################################
## Load spreadsheet - see v1 to download from Google Drive
###############################################################
## Read data file - need to specify quotes because some fields contain apostrophes
spreadsheetdir="./"
dfa_repro0=read.table(paste(spreadsheetdir,"Reproducing DFAs 2014-09-21 updated2.txt",sep=""),na.strings=c("NA","",".","#DIV/0!","#REF!","#VALUE!","#NUM!","9999"),header=T,sep="\t",quote="\"",blank.lines.skip = TRUE,stringsAsFactors=F)

dfa_repro=dfa_repro0 # keep an unaltered version 

#######################################################################
## Add new columns to hold results
#######################################################################
dfa_repro$nrows=NA # Number of rows in dataset
dfa_repro$missing=NA # Missing values in data files
dfa_repro$nincomplete=NA
dfa_repro$PVE_flat=NA # PVE with flat priors
dfa_repro$PVE_prop=NA # PVE with proportional priors
dfa_repro$PVE_best=NA # Closest value to the published value
dfa_repro$PVE_diff_perc=NA # Difference between the published value and the best reanalysed one
dfa_repro$PVE_best_type=NA # flat or proportional
dfa_repro$COEF_flat_raw=NA # Raw coefficient, flat priors
dfa_repro$COEF_prop_raw=NA # Raw coefficient, proportional priors
dfa_repro$COEF_flat_std=NA # Standardised coefficient, flat priors
dfa_repro$COEF_prop_std=NA # Standardised coefficient, proportional priors
dfa_repro$COEF_best=NA # Closest value to the published value
dfa_repro$COEF_diff_perc=NA # Difference between the published value and the best reanalysed one
dfa_repro$COEF_best_type=NA # The type of coefficient closest to the published value
dfa_repro$PAC_flat_basic=NA # Percent assigned correctly, based on resubstitution and flat priors
dfa_repro$PAC_flat_cv=NA # Percent assigned correctly, based on jackknife cross-validation and flat priors
dfa_repro$PAC_prop_basic=NA # Percent assigned correctly, based on resubstitution and proportional priors
dfa_repro$PAC_prop_cv=NA # Percent assigned correctly, based on jackknife cross-validation and proportional priors
dfa_repro$PAC_best=NA # Closest value to the published value
dfa_repro$PAC_diff_perc=NA # Difference between the published value and the best reanalysed one
dfa_repro$PAC_best_type=NA # The type of coefficient closest to the published value

###############################################################
## Run analysis using information from spreadsheet
###############################################################
## Which rows to include or exclude
include=dfa_repro$paper.no.[which(dfa_repro$Infile=="y")]
exclude=dfa_repro$paper.no.[which(dfa_repro$Exclude=="y")]

for(j in include[!include %in% exclude]){ 
  i=match(j,dfa_repro[,"paper.no."]) 
  
  #######################################################################
  ## Information block - takes values from the spreadsheet
  #######################################################################
  ## Each data set is in a specific folder nested within a 
  ##  folder for studies published in that year
  year=dfa_repro[i,'PY']
  datadir=dfa_repro[i,'Data.folder.name']
   
  ## Load and parse the input file ("infile")
  newdat0=read.csv(file.path(year,datadir,data"infile"),na.strings=c("NA","na","",".","#N/A","#DIV/0!","#REF!","#VALUE!","#NUM!","9999","-9999"))

  groupcol=as.numeric(dfa_repro[i,'Grouping.column']) # Column containing grouping factor
  datcols0=dfa_repro[i,'Data.columns'] # Text string listing columns used for DFA
  datcols=eval(parse(text=paste("c(",datcols0,")",sep=""))) # Gives warning or error if formatting is wrong
  coefcol=as.numeric(dfa_repro[i,'COEF_col'])# Column containing variable to focus on for DF coefficient
  newdat0[,groupcol]=factor(newdat0[,groupcol]) # set grouping column as factor
  ngroups=nlevels(newdat0[,groupcol]) # number of groups
  naxes=as.numeric(dfa_repro[i,'PVE_nsummed'])# number of axes summed for PVE (usually 1)
  coeftype=dfa_repro[i,'COEF_type'] # standardised or raw DF coefficients
 
  ## Check sample size and exclude rows with missing group data
  newdat=newdat0[which(!is.na(newdat0[,groupcol])),]
  dfa_repro$nrows[i]=nrow(newdat)
  dfa_repro$missing[i]=any(is.na(newdat[,datcols]))
  dfa_repro$nincomplete[i]=nrow(newdat)-length(which(complete.cases(newdat[,datcols])))
  if(dfa_repro$missing[i])next

  #######################################################################
  ## Run analysis with flat or proportional priors
  #######################################################################
  
  lda.flat=lda(data.frame(newdat[,datcols]),grouping=newdat[,groupcol],prior=c(rep(1,ngroups)/ngroups))
  lda.prop=lda(data.frame(newdat[,datcols]),grouping=newdat[,groupcol])
  
  #######################################################################
  ## PVE 
  #######################################################################
  if(is.na(dfa_repro$PVE_nsummed[i]))naxes=1
  
  lda.flat.pve=lda.flat$svd^2/sum(lda.flat$svd^2)
  lda.prop.pve=lda.prop$svd^2/sum(lda.prop$svd^2)
  
  dfa_repro$PVE_flat[i]=sum(lda.flat.pve[1:naxes])*100
  dfa_repro$PVE_prop[i]=sum(lda.prop.pve[1:naxes])*100
  
  ## Identify closest value and fill in results columns
  if(!is.na(dfa_repro$PVE_paper[i])){
    ord=rank(abs(dfa_repro[i,c("PVE_prop","PVE_flat")]-dfa_repro$PVE_paper[i]),ties="min")
    dfa_repro$PVE_best[i]=unlist(dfa_repro[i,c("PVE_prop","PVE_flat")][order(ord)][1])
    dfa_repro$PVE_diff_perc[i]=100*(abs(dfa_repro$PVE_best[i]-dfa_repro$PVE_paper[i])/dfa_repro$PVE_paper[i])
    dfa_repro$PVE_best_type[i]=paste(c("prop","flat")[ord==1],collapse=", ") 
  }
  
  #######################################################################
  ## Coefficients
  #######################################################################
  if(!is.na(dfa_repro$COEF_paper[i])){
    ## Raw coefficients
    dfa_repro$COEF_flat_raw[i]=coef(lda.flat)[match(coefcol,datcols),1]
    dfa_repro$COEF_prop_raw[i]=coef(lda.prop)[match(coefcol,datcols),1]
    
    ## Standardised coefficients
    wgsd=sqrt(calcWithinGroupsVariance(newdat[,coefcol],newdat[,groupcol]))
    dfa_repro$COEF_flat_std[i]=dfa_repro$COEF_flat_raw[i]*wgsd
    dfa_repro$COEF_prop_std[i]=dfa_repro$COEF_prop_raw[i]*wgsd  
  
    ## Identify closest value and fill in results columns
    if(!is.na(coeftype)){
      ord=rank(abs(abs(dfa_repro[i,paste(c("COEF_prop_","COEF_flat_"),coeftype,sep="")])-abs(dfa_repro$COEF_paper[i])),ties="min")
      dfa_repro$COEF_best[i]=unlist(dfa_repro[i,paste(c("COEF_prop_","COEF_flat_"),coeftype,sep="")][order(ord)][1])
      dfa_repro$COEF_diff_perc[i]=100*(abs(abs(dfa_repro$COEF_best[i])-abs(dfa_repro$COEF_paper[i]))/abs(dfa_repro$COEF_paper[i]))
      dfa_repro$COEF_best_type[i]=paste(c("prop","flat")[ord==1],collapse=", ")     
    }
  }
  
  #######################################################################
  ## Percent assigned correctly
  #######################################################################
  ## Classification tables for flat and proportional priors
  if(!is.na(dfa_repro$PAC_paper[i])){
    ct_flat_basic=table(newdat[,groupcol], predict(lda.flat)$class)
    ct_prop_basic=table(newdat[,groupcol], predict(lda.prop)$class)
    ct_flat_cv=table(newdat[,groupcol], update(lda.flat,CV=T)$class)
    ct_prop_cv=table(newdat[,groupcol], update(lda.prop,CV=T)$class)
    
    ## Percentage of total or just one group assigned correctly
    if(dfa_repro$PAC_group[i]=="all" & !is.na(dfa_repro$PAC_group[i])){ ## Percentage of total
      dfa_repro$PAC_flat_basic[i]=100*sum(diag(ct_flat_basic))/sum(ct_flat_cv)
      dfa_repro$PAC_prop_basic[i]=100*sum(diag(ct_prop_basic))/sum(ct_prop_cv)
      dfa_repro$PAC_flat_cv[i]=100*sum(diag(ct_flat_cv))/sum(ct_flat_cv)
      dfa_repro$PAC_prop_cv[i]=100*sum(diag(ct_prop_cv))/sum(ct_prop_cv)
    } else { ## Percentage for just one group - use proportions by row
      thislevel=match(dfa_repro$PAC_group[i],levels(newdat[,groupcol]))
      dfa_repro$PAC_flat_basic[i]=100*diag(prop.table(ct_flat_basic,1))[thislevel]
      dfa_repro$PAC_prop_basic[i]=100*diag(prop.table(ct_prop_basic,1))[thislevel]
      dfa_repro$PAC_flat_cv[i]=100*diag(prop.table(ct_flat_cv,1))[thislevel]
      dfa_repro$PAC_prop_cv[i]=100*diag(prop.table(ct_prop_cv,1))[thislevel]   
    }  
    
    ## Choose best, depending on whether CV or basic
    pactype=dfa_repro$PAC_method[i] # Some papers specified which type of PAC was reported
    if(pactype %in% c("basic","cv")){
      ord=rank(abs(dfa_repro[i,paste(c("PAC_prop_","PAC_flat_"),pactype,sep="")]-dfa_repro$PAC_paper[i]),ties="min")
      dfa_repro$PAC_best[i]=unlist(dfa_repro[i,paste(c("PAC_prop_","PAC_flat_"),pactype,sep="")][order(ord)][1])
      dfa_repro$PAC_diff_perc[i]=100*(abs(dfa_repro$PAC_best[i]-dfa_repro$PAC_paper[i])/dfa_repro$PAC_paper[i])
      dfa_repro$PAC_best_type[i]=paste(c("prop","flat")[ord==1],collapse=", ") 
    } else {
      ord=rank(abs(dfa_repro[i,c("PAC_prop_cv","PAC_prop_basic","PAC_flat_cv","PAC_flat_basic")]-dfa_repro$PAC_paper[i]),ties="min")
      dfa_repro$PAC_best[i]=unlist(dfa_repro[i,c("PAC_prop_cv","PAC_prop_basic","PAC_flat_cv","PAC_flat_basic")][order(ord)])[1]
      dfa_repro$PAC_diff_perc[i]=100*(abs(dfa_repro$PAC_best[i]-dfa_repro$PAC_paper[i])/dfa_repro$PAC_paper[i])
      dfa_repro$PAC_best_type[i]=paste(c("prop_cv","prop_basic","flat_cv","flat_basic")[ord==1],collapse=", ")  
    }  
  }
  dfa_repro[i,]
}


write.table(dfa_repro,file="dfa_ranalysed.txt",sep="\t",na="",row.names=F)