###################################################################### ## Workbook for Reproducing of Discriminant Function Analysis ################################################################# ## PRELIMINARY: ## Prepare infile.csv with appropriate rows, grouping columns and missing data fields. ## Identify column numbers to be used in the analysis (groupings and data columns). ## Choose variable for coefficient, determine type of PAC if mentioned, and whether the overall PAC or that of a particular group is to be used. library(MASS) library(candisc) ############################################################################## ## Function needed for standardised coefficients ## - modified from http://little-book-of-r-for-multivariate-analysis.readthedocs.org/en/latest/src/multivariateanalysis.html#linear-discriminant-analysis ############################################################################## calcWithinGroupsVariance <- function(variable,groupvariable) { # find out how many values the group variable can take groupvariable2 <- as.factor(groupvariable[[1]]) levels <- levels(groupvariable2) numlevels <- length(levels) # get the mean and standard deviation for each group: numtotal <- 0 denomtotal <- 0 for (i in 1:numlevels) { leveli <- levels[i] levelidata <- variable[groupvariable==leveli] levelilength <- length(levelidata) # get the standard deviation for group i: sdi <- sd(levelidata) numi <- (levelilength - 1)*(sdi * sdi) denomi <- levelilength numtotal <- numtotal + numi denomtotal <- denomtotal + denomi } # calculate the within-groups variance Vw <- numtotal / (denomtotal - numlevels) return(Vw) } ############################################################### ## Load spreadsheet - see v1 to download from Google Drive ############################################################### ## Read data file - need to specify quotes because some fields contain apostrophes spreadsheetdir="./" dfa_repro0=read.table(paste(spreadsheetdir,"Reproducing DFAs 2014-09-21 updated2.txt",sep=""),na.strings=c("NA","",".","#DIV/0!","#REF!","#VALUE!","#NUM!","9999"),header=T,sep="\t",quote="\"",blank.lines.skip = TRUE,stringsAsFactors=F) dfa_repro=dfa_repro0 # keep an unaltered version ####################################################################### ## Add new columns to hold results ####################################################################### dfa_repro$nrows=NA # Number of rows in dataset dfa_repro$missing=NA # Missing values in data files dfa_repro$nincomplete=NA dfa_repro$PVE_flat=NA # PVE with flat priors dfa_repro$PVE_prop=NA # PVE with proportional priors dfa_repro$PVE_best=NA # Closest value to the published value dfa_repro$PVE_diff_perc=NA # Difference between the published value and the best reanalysed one dfa_repro$PVE_best_type=NA # flat or proportional dfa_repro$COEF_flat_raw=NA # Raw coefficient, flat priors dfa_repro$COEF_prop_raw=NA # Raw coefficient, proportional priors dfa_repro$COEF_flat_std=NA # Standardised coefficient, flat priors dfa_repro$COEF_prop_std=NA # Standardised coefficient, proportional priors dfa_repro$COEF_best=NA # Closest value to the published value dfa_repro$COEF_diff_perc=NA # Difference between the published value and the best reanalysed one dfa_repro$COEF_best_type=NA # The type of coefficient closest to the published value dfa_repro$PAC_flat_basic=NA # Percent assigned correctly, based on resubstitution and flat priors dfa_repro$PAC_flat_cv=NA # Percent assigned correctly, based on jackknife cross-validation and flat priors dfa_repro$PAC_prop_basic=NA # Percent assigned correctly, based on resubstitution and proportional priors dfa_repro$PAC_prop_cv=NA # Percent assigned correctly, based on jackknife cross-validation and proportional priors dfa_repro$PAC_best=NA # Closest value to the published value dfa_repro$PAC_diff_perc=NA # Difference between the published value and the best reanalysed one dfa_repro$PAC_best_type=NA # The type of coefficient closest to the published value ############################################################### ## Run analysis using information from spreadsheet ############################################################### ## Which rows to include or exclude include=dfa_repro$paper.no.[which(dfa_repro$Infile=="y")] exclude=dfa_repro$paper.no.[which(dfa_repro$Exclude=="y")] for(j in include[!include %in% exclude]){ i=match(j,dfa_repro[,"paper.no."]) ####################################################################### ## Information block - takes values from the spreadsheet ####################################################################### ## Each data set is in a specific folder nested within a ## folder for studies published in that year year=dfa_repro[i,'PY'] datadir=dfa_repro[i,'Data.folder.name'] ## Load and parse the input file ("infile") newdat0=read.csv(file.path(year,datadir,data"infile"),na.strings=c("NA","na","",".","#N/A","#DIV/0!","#REF!","#VALUE!","#NUM!","9999","-9999")) groupcol=as.numeric(dfa_repro[i,'Grouping.column']) # Column containing grouping factor datcols0=dfa_repro[i,'Data.columns'] # Text string listing columns used for DFA datcols=eval(parse(text=paste("c(",datcols0,")",sep=""))) # Gives warning or error if formatting is wrong coefcol=as.numeric(dfa_repro[i,'COEF_col'])# Column containing variable to focus on for DF coefficient newdat0[,groupcol]=factor(newdat0[,groupcol]) # set grouping column as factor ngroups=nlevels(newdat0[,groupcol]) # number of groups naxes=as.numeric(dfa_repro[i,'PVE_nsummed'])# number of axes summed for PVE (usually 1) coeftype=dfa_repro[i,'COEF_type'] # standardised or raw DF coefficients ## Check sample size and exclude rows with missing group data newdat=newdat0[which(!is.na(newdat0[,groupcol])),] dfa_repro$nrows[i]=nrow(newdat) dfa_repro$missing[i]=any(is.na(newdat[,datcols])) dfa_repro$nincomplete[i]=nrow(newdat)-length(which(complete.cases(newdat[,datcols]))) if(dfa_repro$missing[i])next ####################################################################### ## Run analysis with flat or proportional priors ####################################################################### lda.flat=lda(data.frame(newdat[,datcols]),grouping=newdat[,groupcol],prior=c(rep(1,ngroups)/ngroups)) lda.prop=lda(data.frame(newdat[,datcols]),grouping=newdat[,groupcol]) ####################################################################### ## PVE ####################################################################### if(is.na(dfa_repro$PVE_nsummed[i]))naxes=1 lda.flat.pve=lda.flat$svd^2/sum(lda.flat$svd^2) lda.prop.pve=lda.prop$svd^2/sum(lda.prop$svd^2) dfa_repro$PVE_flat[i]=sum(lda.flat.pve[1:naxes])*100 dfa_repro$PVE_prop[i]=sum(lda.prop.pve[1:naxes])*100 ## Identify closest value and fill in results columns if(!is.na(dfa_repro$PVE_paper[i])){ ord=rank(abs(dfa_repro[i,c("PVE_prop","PVE_flat")]-dfa_repro$PVE_paper[i]),ties="min") dfa_repro$PVE_best[i]=unlist(dfa_repro[i,c("PVE_prop","PVE_flat")][order(ord)][1]) dfa_repro$PVE_diff_perc[i]=100*(abs(dfa_repro$PVE_best[i]-dfa_repro$PVE_paper[i])/dfa_repro$PVE_paper[i]) dfa_repro$PVE_best_type[i]=paste(c("prop","flat")[ord==1],collapse=", ") } ####################################################################### ## Coefficients ####################################################################### if(!is.na(dfa_repro$COEF_paper[i])){ ## Raw coefficients dfa_repro$COEF_flat_raw[i]=coef(lda.flat)[match(coefcol,datcols),1] dfa_repro$COEF_prop_raw[i]=coef(lda.prop)[match(coefcol,datcols),1] ## Standardised coefficients wgsd=sqrt(calcWithinGroupsVariance(newdat[,coefcol],newdat[,groupcol])) dfa_repro$COEF_flat_std[i]=dfa_repro$COEF_flat_raw[i]*wgsd dfa_repro$COEF_prop_std[i]=dfa_repro$COEF_prop_raw[i]*wgsd ## Identify closest value and fill in results columns if(!is.na(coeftype)){ ord=rank(abs(abs(dfa_repro[i,paste(c("COEF_prop_","COEF_flat_"),coeftype,sep="")])-abs(dfa_repro$COEF_paper[i])),ties="min") dfa_repro$COEF_best[i]=unlist(dfa_repro[i,paste(c("COEF_prop_","COEF_flat_"),coeftype,sep="")][order(ord)][1]) dfa_repro$COEF_diff_perc[i]=100*(abs(abs(dfa_repro$COEF_best[i])-abs(dfa_repro$COEF_paper[i]))/abs(dfa_repro$COEF_paper[i])) dfa_repro$COEF_best_type[i]=paste(c("prop","flat")[ord==1],collapse=", ") } } ####################################################################### ## Percent assigned correctly ####################################################################### ## Classification tables for flat and proportional priors if(!is.na(dfa_repro$PAC_paper[i])){ ct_flat_basic=table(newdat[,groupcol], predict(lda.flat)$class) ct_prop_basic=table(newdat[,groupcol], predict(lda.prop)$class) ct_flat_cv=table(newdat[,groupcol], update(lda.flat,CV=T)$class) ct_prop_cv=table(newdat[,groupcol], update(lda.prop,CV=T)$class) ## Percentage of total or just one group assigned correctly if(dfa_repro$PAC_group[i]=="all" & !is.na(dfa_repro$PAC_group[i])){ ## Percentage of total dfa_repro$PAC_flat_basic[i]=100*sum(diag(ct_flat_basic))/sum(ct_flat_cv) dfa_repro$PAC_prop_basic[i]=100*sum(diag(ct_prop_basic))/sum(ct_prop_cv) dfa_repro$PAC_flat_cv[i]=100*sum(diag(ct_flat_cv))/sum(ct_flat_cv) dfa_repro$PAC_prop_cv[i]=100*sum(diag(ct_prop_cv))/sum(ct_prop_cv) } else { ## Percentage for just one group - use proportions by row thislevel=match(dfa_repro$PAC_group[i],levels(newdat[,groupcol])) dfa_repro$PAC_flat_basic[i]=100*diag(prop.table(ct_flat_basic,1))[thislevel] dfa_repro$PAC_prop_basic[i]=100*diag(prop.table(ct_prop_basic,1))[thislevel] dfa_repro$PAC_flat_cv[i]=100*diag(prop.table(ct_flat_cv,1))[thislevel] dfa_repro$PAC_prop_cv[i]=100*diag(prop.table(ct_prop_cv,1))[thislevel] } ## Choose best, depending on whether CV or basic pactype=dfa_repro$PAC_method[i] # Some papers specified which type of PAC was reported if(pactype %in% c("basic","cv")){ ord=rank(abs(dfa_repro[i,paste(c("PAC_prop_","PAC_flat_"),pactype,sep="")]-dfa_repro$PAC_paper[i]),ties="min") dfa_repro$PAC_best[i]=unlist(dfa_repro[i,paste(c("PAC_prop_","PAC_flat_"),pactype,sep="")][order(ord)][1]) dfa_repro$PAC_diff_perc[i]=100*(abs(dfa_repro$PAC_best[i]-dfa_repro$PAC_paper[i])/dfa_repro$PAC_paper[i]) dfa_repro$PAC_best_type[i]=paste(c("prop","flat")[ord==1],collapse=", ") } else { ord=rank(abs(dfa_repro[i,c("PAC_prop_cv","PAC_prop_basic","PAC_flat_cv","PAC_flat_basic")]-dfa_repro$PAC_paper[i]),ties="min") dfa_repro$PAC_best[i]=unlist(dfa_repro[i,c("PAC_prop_cv","PAC_prop_basic","PAC_flat_cv","PAC_flat_basic")][order(ord)])[1] dfa_repro$PAC_diff_perc[i]=100*(abs(dfa_repro$PAC_best[i]-dfa_repro$PAC_paper[i])/dfa_repro$PAC_paper[i]) dfa_repro$PAC_best_type[i]=paste(c("prop_cv","prop_basic","flat_cv","flat_basic")[ord==1],collapse=", ") } } dfa_repro[i,] } write.table(dfa_repro,file="dfa_ranalysed.txt",sep="\t",na="",row.names=F)