#Set of functions to clean data from the Paleobiology Database (PBDB) for geographic range and sampling calculations
#Written by James Boyle May 20th, 2016


#############Sequence of functions for pre-processing PBDB brachiopod dataset of Boyle 2017###################
#install.packages("GeoRange")
#library(GeoRange)

##Load in pbdb csv file with comments on data requested in the original sheet deleted
	#brach<-read.csv(file.choose())

##Check for errors in the dataset where names have duplicate numbers
	#dupList<-duplicateNumbers_forName(brach)

##Reduce the taxonomic resolution of the dataset to genus-level
	#Brach_Genus<-genus_Cleaning(brach)

##If desired, find and then remove taxa which are younger than some cutoff data
	#CenoGen<-PreTimeGenera(Brach_Genus,minAge=66)
	#Brach_GenusPreCeno<-Brach_Genus[-CenoGen$TaxaPos,]

##Trim the dataset to taxa that only occur in a set number of localities or sections
	#BrachGenusPreCenoCommon<-PBDB_TaxaTrim(Brach_GenusPreCeno,nSectCutoff=50,nLocCutoff=50)

##Geographic range rarefaction analyses of geographic range were performed using BrachGenusPreCenoCommon as one of the input variables


##############Sequence of functions to perform PBDB analyses in Boyle 2017#########################################
##Create a taxa X locations occurrence matrix for later steps
	#CoordTest<-CoordList_PBDB(BrachGenusPreCenoCommon)

##Calculate the measures of geographic range for all brachiopod genera
	#BrachGeo<-GeoRange_MultiTaxa(CoordTest,TaxaStart=3)

##Calculate the skewness and coefficient of variation of the distribution of geographic range measures
	#BrachSkewCV<-GeoPerformance_SkewCV(BrachGeo)

##Perform a rarefaction analysis of dataset to calculate 7 measures of geographic range at various sample sizes
	#PBDBRareTest_BrachGenusPreCenoCommon<-GeoRarefaction_MultiTaxa(CoordTest,TaxaStart=3,iter=20,steps=c(1,500,400,300,200,100,50,25,10),replacePts=TRUE)

##PEE calculations for Brachiopods
	#PEETestBrach_AllTaxa<-PEE_MultiTaxa(PBDBRareTest_BrachGenusPreCenoCommon)


##GLM models (performed one at a time, adjusting for total variance in each geographic range measure)
##Example
	#g<-glm(BrachGeo$duration~BrachGeo$n.sect)

	#aov(g)
	#Call:
	#   aov(formula = g)
	#
	#Terms:
	#                BrachGeo$n.sect Residuals
	#Sum of Squares         158478.7  812101.9
	#Deg. of Freedom               1       368
	#
	#Residual standard error: 46.97657
	#Estimated effects may be unbalanced

	#> g
	#
	#Call:  glm(formula = BrachGeo$duration ~ BrachGeo$n.sect)
	#
	#Coefficients:
	#    (Intercept)  BrachGeo$n.sect  
	#        51.1017           0.0877  
	#
	#Degrees of Freedom: 369 Total (i.e. Null);  368 Residual
	#Null Deviance:      970600 
	#Residual Deviance: 812100       AIC: 3903
	
	#Percent Variance Explained = ("Sum of Squares" for variable/Sum of "Sum of squares") * 100
		#{158478/(158478.7+812101.9)}*100


#################################################FUNCTIONS####################################################

#Function to look for taxa in a Paleobiology csv dataset that have multiple ID numbers
duplicateNumbers_forName<-function(pbdb){
	TNames<-c(levels(pbdb$accepted_name)[as.numeric(pbdb$accepted_name)])
	TNames<-unique(TNames)
	Taxa<-unique(pbdb$accepted_no)
	nTaxa<-length(Taxa)
	numTaxa<-length(TNames)
	res<-c(NULL)
	for(i in 1:numTaxa){
		num<-c(NULL)
		spots<-which(pbdb$accepted_name==TNames[i])
		uniq<-unique(pbdb$accepted_no[spots])
		if(length(uniq)>1){
			res<-c(res,TNames[i])
			print(TNames[i])
		}
	}
	return(res)
}

#dupList<-duplicateNumbers_forName(brach)

###################################################################################
#Function for genus-level PBDB occurrences to assign species/spp/subgenus/etc. to same genus-level ID
	#Arbitrarily assigns ID numbers for genera starting at 900001
genus_Cleaning<-function(pbdb){
	pbdbMat<-pbdb
	#Changes accepted_name column to a character, otherwise run into type errors later
		#does not recognize genus names as valid factor levels when trying to reassign, turns to NAs
	pbdbMat$accepted_name<-as.character(pbdbMat$accepted_name)
	nOccs<-length(pbdb[,1])
	genusList<-c(NULL)
	#Loop to read through all occurrences and gather a genus list
	for(i in 1:nOccs){
		print(i)
		tempSplit<-strsplit(pbdbMat$accepted_name[i]," ")
		pbdbMat$accepted_name[i]<-tempSplit[[1]][1]
		genusList<-c(genusList,tempSplit[[1]][1])
	}
	genusList<-unique(genusList)
	nGen<-length(genusList)
	for(j in 1:nGen){
		genPos<-which(pbdbMat$accepted_name==genusList[j])
		pbdbMat$accepted_no[genPos]<-900000+j
	}
	#Change accepted_name variable back to a factor to avoid problems in PBDBOccurrneceCleaner() function
	pbdbMat$accepted_name<-as.factor(pbdbMat$accepted_name)
	return(pbdbMat)
}

#testPBDB_Genus<-genus_Cleaning(brach)

##################################################################################
#Compile list of genera with first occurrences less than or equal to the minAge
PreTimeGenera<-function(pbdb,minAge=66){
	TNames<-c(levels(pbdb$accepted_name)[as.numeric(pbdb$accepted_name)])
	TNames<-unique(TNames)
	#Taxa<-unique(pbdb$accepted_no)
	nTaxa<-length(TNames)
	CenoTaxa<-c()
	CenoPos<-c()
	for(i in 1:nTaxa){
		print(i)
		taxPos<-which(pbdb$accepted_name==TNames[i])
		minMA<-min(pbdb$min_ma[taxPos])
		if(minMA<=minAge){
			CenoTaxa<-c(CenoTaxa,TNames[i])
			CenoPos<-c(CenoPos,taxPos)
		}
	}
	res<-list(CenozoicTaxa=CenoTaxa,TaxaPos=CenoPos)
}

#CenoGen<-PreTimeGenera(testPBDB_Genus,minAge=66)

############################################################################################################################
#Function to remove taxa that are observed fewer than some set number of times
	#nSectCutoff is the minimum number of unique collections
	#nLocCutoff is the minimum number of unique geographic locations
PBDB_TaxaTrim<-function(pbdb,nSectCutoff=50,nLocCutoff=50){
	#Checks for and removes any occurrences with no associated paleocoordinates
	LocUnknown<-which(is.na(pbdb$paleolng))
	pbdb<-pbdb[-LocUnknown,]
	TNames<-c(levels(pbdb$accepted_name)[as.numeric(pbdb$accepted_name)])
	TNames<-unique(TNames)
	Taxa<-unique(pbdb$accepted_no)
	nTaxa<-length(Taxa)
	TaxaCutPos<-c()
	for(i in 1:nTaxa){
		print(i)
		taxPos<-which(pbdb$accepted_no==Taxa[i])
		if(length(unique(pbdb$collection_no[taxPos]))<nSectCutoff){
			TaxaCutPos<-c(TaxaCutPos,taxPos)
		}
		else{
			LongList<-c()
			LatList<-c()
			for(j in taxPos){
				LongList<-c(LongList,pbdb$paleolng[j])
				LatList<-c(LatList,pbdb$paleolat[j])
			}
			#CoordCollapse function is in MSTDist6 script
			CoordList<-CoordCollapse(LongList,LatList)
			nCoords<-length(CoordList[,1])
			if(nCoords<nLocCutoff){
				TaxaCutPos<-c(TaxaCutPos,taxPos)
			}
		}
	}
	pbdb<-pbdb[-TaxaCutPos,]
	return(pbdb)
}

#BrachGenusPreCenoCommon<-PBDB_TaxaTrim(BrachGenusPreCeno,nSectCutoff=50,nlocCutoff=50)