#Set of functions to clean data from the Paleobiology Database (PBDB) for geographic range and sampling calculations #Written by James Boyle May 20th, 2016 #############Sequence of functions for pre-processing PBDB brachiopod dataset of Boyle 2017################### #install.packages("GeoRange") #library(GeoRange) ##Load in pbdb csv file with comments on data requested in the original sheet deleted #brach<-read.csv(file.choose()) ##Check for errors in the dataset where names have duplicate numbers #dupList<-duplicateNumbers_forName(brach) ##Reduce the taxonomic resolution of the dataset to genus-level #Brach_Genus<-genus_Cleaning(brach) ##If desired, find and then remove taxa which are younger than some cutoff data #CenoGen<-PreTimeGenera(Brach_Genus,minAge=66) #Brach_GenusPreCeno<-Brach_Genus[-CenoGen$TaxaPos,] ##Trim the dataset to taxa that only occur in a set number of localities or sections #BrachGenusPreCenoCommon<-PBDB_TaxaTrim(Brach_GenusPreCeno,nSectCutoff=50,nLocCutoff=50) ##Geographic range rarefaction analyses of geographic range were performed using BrachGenusPreCenoCommon as one of the input variables ##############Sequence of functions to perform PBDB analyses in Boyle 2017######################################### ##Create a taxa X locations occurrence matrix for later steps #CoordTest<-CoordList_PBDB(BrachGenusPreCenoCommon) ##Calculate the measures of geographic range for all brachiopod genera #BrachGeo<-GeoRange_MultiTaxa(CoordTest,TaxaStart=3) ##Calculate the skewness and coefficient of variation of the distribution of geographic range measures #BrachSkewCV<-GeoPerformance_SkewCV(BrachGeo) ##Perform a rarefaction analysis of dataset to calculate 7 measures of geographic range at various sample sizes #PBDBRareTest_BrachGenusPreCenoCommon<-GeoRarefaction_MultiTaxa(CoordTest,TaxaStart=3,iter=20,steps=c(1,500,400,300,200,100,50,25,10),replacePts=TRUE) ##PEE calculations for Brachiopods #PEETestBrach_AllTaxa<-PEE_MultiTaxa(PBDBRareTest_BrachGenusPreCenoCommon) ##GLM models (performed one at a time, adjusting for total variance in each geographic range measure) ##Example #g<-glm(BrachGeo$duration~BrachGeo$n.sect) #aov(g) #Call: # aov(formula = g) # #Terms: # BrachGeo$n.sect Residuals #Sum of Squares 158478.7 812101.9 #Deg. of Freedom 1 368 # #Residual standard error: 46.97657 #Estimated effects may be unbalanced #> g # #Call: glm(formula = BrachGeo$duration ~ BrachGeo$n.sect) # #Coefficients: # (Intercept) BrachGeo$n.sect # 51.1017 0.0877 # #Degrees of Freedom: 369 Total (i.e. Null); 368 Residual #Null Deviance: 970600 #Residual Deviance: 812100 AIC: 3903 #Percent Variance Explained = ("Sum of Squares" for variable/Sum of "Sum of squares") * 100 #{158478/(158478.7+812101.9)}*100 #################################################FUNCTIONS#################################################### #Function to look for taxa in a Paleobiology csv dataset that have multiple ID numbers duplicateNumbers_forName<-function(pbdb){ TNames<-c(levels(pbdb$accepted_name)[as.numeric(pbdb$accepted_name)]) TNames<-unique(TNames) Taxa<-unique(pbdb$accepted_no) nTaxa<-length(Taxa) numTaxa<-length(TNames) res<-c(NULL) for(i in 1:numTaxa){ num<-c(NULL) spots<-which(pbdb$accepted_name==TNames[i]) uniq<-unique(pbdb$accepted_no[spots]) if(length(uniq)>1){ res<-c(res,TNames[i]) print(TNames[i]) } } return(res) } #dupList<-duplicateNumbers_forName(brach) ################################################################################### #Function for genus-level PBDB occurrences to assign species/spp/subgenus/etc. to same genus-level ID #Arbitrarily assigns ID numbers for genera starting at 900001 genus_Cleaning<-function(pbdb){ pbdbMat<-pbdb #Changes accepted_name column to a character, otherwise run into type errors later #does not recognize genus names as valid factor levels when trying to reassign, turns to NAs pbdbMat$accepted_name<-as.character(pbdbMat$accepted_name) nOccs<-length(pbdb[,1]) genusList<-c(NULL) #Loop to read through all occurrences and gather a genus list for(i in 1:nOccs){ print(i) tempSplit<-strsplit(pbdbMat$accepted_name[i]," ") pbdbMat$accepted_name[i]<-tempSplit[[1]][1] genusList<-c(genusList,tempSplit[[1]][1]) } genusList<-unique(genusList) nGen<-length(genusList) for(j in 1:nGen){ genPos<-which(pbdbMat$accepted_name==genusList[j]) pbdbMat$accepted_no[genPos]<-900000+j } #Change accepted_name variable back to a factor to avoid problems in PBDBOccurrneceCleaner() function pbdbMat$accepted_name<-as.factor(pbdbMat$accepted_name) return(pbdbMat) } #testPBDB_Genus<-genus_Cleaning(brach) ################################################################################## #Compile list of genera with first occurrences less than or equal to the minAge PreTimeGenera<-function(pbdb,minAge=66){ TNames<-c(levels(pbdb$accepted_name)[as.numeric(pbdb$accepted_name)]) TNames<-unique(TNames) #Taxa<-unique(pbdb$accepted_no) nTaxa<-length(TNames) CenoTaxa<-c() CenoPos<-c() for(i in 1:nTaxa){ print(i) taxPos<-which(pbdb$accepted_name==TNames[i]) minMA<-min(pbdb$min_ma[taxPos]) if(minMA<=minAge){ CenoTaxa<-c(CenoTaxa,TNames[i]) CenoPos<-c(CenoPos,taxPos) } } res<-list(CenozoicTaxa=CenoTaxa,TaxaPos=CenoPos) } #CenoGen<-PreTimeGenera(testPBDB_Genus,minAge=66) ############################################################################################################################ #Function to remove taxa that are observed fewer than some set number of times #nSectCutoff is the minimum number of unique collections #nLocCutoff is the minimum number of unique geographic locations PBDB_TaxaTrim<-function(pbdb,nSectCutoff=50,nLocCutoff=50){ #Checks for and removes any occurrences with no associated paleocoordinates LocUnknown<-which(is.na(pbdb$paleolng)) pbdb<-pbdb[-LocUnknown,] TNames<-c(levels(pbdb$accepted_name)[as.numeric(pbdb$accepted_name)]) TNames<-unique(TNames) Taxa<-unique(pbdb$accepted_no) nTaxa<-length(Taxa) TaxaCutPos<-c() for(i in 1:nTaxa){ print(i) taxPos<-which(pbdb$accepted_no==Taxa[i]) if(length(unique(pbdb$collection_no[taxPos]))