# Set my working directory to put things in the right place

setwd("/Users/mmgdepartment/Dropbox/Long_Term_Avida/high_res/")

# Now we're going to look at the runs evolved for 200k generations in 
# logic-77 from the basic ancestor.  These are the runs that produced the
# ancestors used for the reintroductions from logic-77.  Everything here
# is with the additive fitness version, instead of the power distribution.


dirs <- list.dirs(path = "./l77", recursive=FALSE)


#Make sure that files from same replicate get merged appropriately

# Take just the max update for each file; looking at just endpoints
results.data <- NULL #This section adapted from stackoverflow.com answer


for (d in dirs) {
	for (d2 in list.dirs(path = d, recursive = FALSE)){
    
		average_file_path <- paste(d2, "average.dat", sep = "/")


		{
    

			fitness <- read.csv(average_file_path, header = F, sep = " ", na.strings = "", colClasses = "character", skip = 19)
			wanted <- cbind(fitness[,1], fitness[,4], fitness[,13])

			wanted[,1]<-as.numeric(as.character(wanted[,1]))
			wanted[,2]<-as.numeric(as.character(wanted[,2]))
			wanted[,3]<-as.numeric(as.character(wanted[,3]))

			grr<-as.numeric(as.character(wanted[,3]))
			grr<-floor(grr/20)
			grr<-grr*20
			#max.update.fitness<-max(fitness[,1])
			#fitness<-subset(fitness, fitness[,1]==max.update.fitness)


			output.matrix<-matrix(ncol=5, nrow=length(wanted[,1]))

			output.matrix[,1]<-as.numeric(as.character(wanted[,1]))
			output.matrix[,2]<-as.numeric(as.character(wanted[,2]))
			#output.matrix[,3]<-as.numeric(as.character(wanted[,3]))
			output.matrix[,3]<-as.numeric(as.character(grr))
			
		
			foo <- tail(unlist(strsplit(d, split = "_", fixed = T)), n=2)[1]
			output.matrix[,4] <- foo


			output.matrix[,5] <- tail(unlist(strsplit(d, split="_", fixed=T)), n = 2)[2]
			

			results.data <- rbind(results.data, output.matrix)
			}
		}
	}


backup.results.data<-results.data


results.data<-backup.results.data

results.data<-as.data.frame(results.data)

colnames(results.data)<-c("Update", "Fitness", "Generation", "Condition", "Seed")

results.data$Update<-as.numeric(as.character(results.data$Update))
results.data$Fitness<-as.numeric(as.character(results.data$Fitness))
results.data$Generation<-as.numeric(as.character(results.data$Generation))
results.data$Seed <- as.numeric(as.character(results.data$Seed))

# Remove Update -1 and 0 (generation 0)

results.data<-subset(results.data, Generation>0.5)

# Sort the data, to ensure the generations are monotonicly increasing:

results.data <- results.data[order(results.data$Generation),]

# Write this file as output:

write.csv(results.data, file="L77.Phase.1.csv")

# Set color pallette

library(RColorBrewer)
my.pallette <- brewer.pal(n=10, name="Spectral")


results.data$Log.Fitness <- log(results.data$Fitness, base=2)

# Sort the data, to ensure the generations are monotonicly increasing:

results.data <- results.data[order(results.data$Generation),]

# Scale each point relative to lowest observed fitness:

ancestral.fitness<-.246114

results.data$Scaled.Fitness<-results.data$Fitness/ancestral.fitness

# Calculate log scaled fitness:

results.data$Log.Scaled.Fitness<-log(results.data$Scaled.Fitness, 2)


# Write this file as output:

write.csv(results.data, file="L77.Phase.1.detailed.csv")


# Establish the list of seeds:

Seed.list <- unique(results.data$Seed)

# Plot Scaled Fitness as a function of Generation for this phase:

plot(x=results.data$Generation, y=results.data$Scaled.Fitness, xlab="Time (generations) ", ylab="Scaled Fitness", col="white")

Seed.list<-unique(results.data$Seed)
for (i in 1:length(Seed.list)) {
	plotting.data <- subset(results.data, Seed==Seed.list[i])
	lines(x=plotting.data$Generation, y=plotting.data$Scaled.Fitness, col=my.pallette[i], lwd=1)
	}


# Plot Fitness as a function of Generation for this phase:


graphics.off()
png(filename="Detailed.Standard.L77.Phase.1.png", height=600, width=1200, units="px", res=200)

plot(x=results.data$Generation, y=results.data$Log.Scaled.Fitness, xlab="Time (generations) ", ylab="Log 2 Scaled Fitness", col="white")

#Seed.list<-unique(results.data$Seed)
Seed.list <- c(1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010)
for (i in 1:length(Seed.list)) {
	plotting.data <- subset(results.data, Seed==Seed.list[i])
	lines(x=plotting.data$Generation, y=plotting.data$Log.Scaled.Fitness, col=my.pallette[i], lwd=1)
	}


dev.off()

######

# Now let's fit some curves:

model.results.matrix <- matrix(nrow=length(Seed.list), ncol=7)

for (i in 1:length(Seed.list)) {
	current.data <- subset(results.data, Seed==Seed.list[i])

	power.full<-nls(Log.Scaled.Fitness  ~ (b*Generation + 1)^a, data=current.data, start=list(a=0.5, b=.01), algorithm="port", trace=T, na.action=na.omit, model=T, control=nls.control(maxiter=1000, warnOnly=T))
	values.power.full<-coef(power.full)
	model.power.full.a<-values.power.full[1]
	model.power.full.b<-values.power.full[2]

	hyper.full<-nls(Log.Scaled.Fitness + 1 ~ a*Generation/(b + Generation) + 1, data=current.data, start=list(a=60, b=10000), algorithm="port", trace=T, na.action=na.omit, model=T, control=nls.control(maxiter=1000, warnOnly=T))
	values.hyper.full<-coef(hyper.full)
	model.hyper.full.a<-values.hyper.full[1]
	model.hyper.full.b<-values.hyper.full[2]

	model.results.matrix[i, 1] <- Seed.list[i]
	model.results.matrix[i, 2] <- BIC(power.full)
	model.results.matrix[i, 3] <- BIC(hyper.full)
	model.results.matrix[i, 4] <- model.power.full.a
	model.results.matrix[i, 5] <- model.power.full.b
	model.results.matrix[i, 6] <- model.hyper.full.a
	model.results.matrix[i, 7] <- model.hyper.full.b

	}

colnames(model.results.matrix) <- c("Seed", "BIC.Power", "BIC.Hyper", "Power.a", "Power.b", "Hyper.a", "Hyper.b")

model.results.frame <- as.data.frame(model.results.matrix)

model.results.frame$Diff <- model.results.frame$BIC.Power - model.results.frame$BIC.Hyper

write.csv(model.results.frame, file="Model.results.frame.csv")

# Store the results of the phase 1 analysis:

Phase.1.results.data <- results.data


#############

# Now we have to look at phase 2.  What do the new graphs look like?  And which of 
# them exceeded expectations?


setwd("/Users/mmgdepartment/Dropbox/Long_Term_Avida/l77/")

# Now we're going to look at the runs evolved for 200k generations in 
# logic-77 from ancestors which had themselves been evolved for 200k
# generations in logic-77:


dirs <- list.dirs(path = "./long", recursive=FALSE)


#Make sure that files from same replicate get merged appropriately

# Take just the max update for each file; looking at just endpoints
results.data <- NULL #This section adapted from stackoverflow.com answer


for (d in dirs) {
	for (d2 in list.dirs(path = d, recursive = FALSE)){
    
		average_file_path <- paste(d2, "average.dat", sep = "/")


		{
    

			fitness <- read.csv(average_file_path, header = F, sep = " ", na.strings = "", colClasses = "character", skip = 19)
			wanted <- cbind(fitness[,1], fitness[,4], fitness[,13])

			wanted[,1]<-as.numeric(as.character(wanted[,1]))
			wanted[,2]<-as.numeric(as.character(wanted[,2]))
			wanted[,3]<-as.numeric(as.character(wanted[,3]))

			grr<-as.numeric(as.character(wanted[,3]))
			grr<-floor(grr/20)
			grr<-grr*20
			#max.update.fitness<-max(fitness[,1])
			#fitness<-subset(fitness, fitness[,1]==max.update.fitness)


			output.matrix<-matrix(ncol=6, nrow=length(wanted[,1]))

			output.matrix[,1]<-as.numeric(as.character(wanted[,1]))
			output.matrix[,2]<-as.numeric(as.character(wanted[,2]))
			#output.matrix[,3]<-as.numeric(as.character(wanted[,3]))
			output.matrix[,3]<-as.numeric(as.character(grr))
			
		
			foo <- tail(unlist(strsplit(d, split = "/", fixed = T)), n=1)
			output.matrix[,4] <- foo


			output.matrix[,5] <- tail(unlist(strsplit(d, split="_", fixed=T)), n = 2)[1]
			output.matrix[,6] <- tail(unlist(strsplit(d, split="_", fixed=T)), n = 2)[2]
			

			results.data <- rbind(results.data, output.matrix)
			}
		}
	}


backup.results.data<-results.data


results.data<-backup.results.data

results.data<-as.data.frame(results.data)

colnames(results.data)<-c("Update", "Fitness", "Generation", "Condition", "Ancestor", "Seed")

results.data$Update<-as.numeric(as.character(results.data$Update))
results.data$Fitness<-as.numeric(as.character(results.data$Fitness))
results.data$Generation<-as.numeric(as.character(results.data$Generation))

# Remove Update -1 and 0 (generation 0)

results.data<-subset(results.data, Generation>0.5)

# Store unmodified:

Phase.2.raw.results.data <- results.data

write.csv(results.data, file="L77.Phase.2.results.csv")

Ancestor.list <- unique(results.data$Ancestor)

current.Ancestor <- Ancestor.list[1]
current.data <- subset(results.data, Ancestor==current.Ancestor)
earliest.data <- subset(current.data, Generation==20)
min.fitness <- min(earliest.data$Fitness)
current.data$Scaled.Fitness <- current.data$Fitness/min.fitness
rescaled.data <- current.data


for (i in 2:length(Ancestor.list)) {
	current.Ancestor <- Ancestor.list[i]
	current.data <- subset(results.data, Ancestor==current.Ancestor)
	earliest.data <- subset(current.data, Generation==20)
	min.fitness <- min(earliest.data$Fitness)
	current.data$Scaled.Fitness <- current.data$Fitness/min.fitness
	rescaled.data <- rbind(rescaled.data, current.data)
	}


# Establish color pallette:

library(RColorBrewer)
my.pallette <- brewer.pal(n=10, name="Spectral")


# Log transform Fitness

rescaled.data$Log.Scaled.Fitness <- log(rescaled.data$Scaled.Fitness, base=2)

# Rescale to same scale as previous data:

rescaled.data$Ancestral.Scaled.Fitness <- rescaled.data$Fitness/ancestral.fitness

rescaled.data$Log.Ancestral.Scaled.Fitness <- log(rescaled.data$Ancestral.Scaled.Fitness, base=2)

# Save these results:

Phase.2.results.data <- rescaled.data

plot(x=rescaled.data$Generation, y=rescaled.data$Log.Scaled.Fitness, xlab="Time (generations) ", ylab="Log 2 Scaled Fitness", col="white")

#Ancestor.list<-unique(rescaled.data$Ancestor)
Ancestor.list <- c(1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010)
for (i in 1:length(Ancestor.list)) {
	plotting.data <- subset(rescaled.data, Ancestor==Ancestor.list[i])
	Seed.list <- unique(plotting.data$Seed)
	for (j in 1:length(Seed.list)) {
		internal.data <- subset(plotting.data, Seed==Seed.list[j])
		internal.data <- internal.data[order(internal.data$Generation),]	
		lines(x=internal.data$Generation, y=internal.data$Log.Scaled.Fitness, col=my.pallette[i], lwd=1)
		}
	}

# Repeat this but in a png format:

graphics.off()
png(filename="L77_Phase_2.png", height=1200, width=1200, units="px", res=200)

plot(x=rescaled.data$Generation, y=rescaled.data$Log.Scaled.Fitness, xlab="Time (generations) ", ylab="Log 2 Scaled Fitness", col="white")

#Ancestor.list<-unique(rescaled.data$Ancestor)
Ancestor.list <- c(1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010)
for (i in 1:length(Ancestor.list)) {
	plotting.data <- subset(rescaled.data, Ancestor==Ancestor.list[i])
	Seed.list <- unique(plotting.data$Seed)
	for (j in 1:length(Seed.list)) {
		internal.data <- subset(plotting.data, Seed==Seed.list[j])
		internal.data <- internal.data[order(internal.data$Generation),]	
		lines(x=internal.data$Generation, y=internal.data$Log.Scaled.Fitness, col=my.pallette[i], lwd=1)
		}
	}

dev.off()


# Now, the important questions:  
# How many of these exceed their predicted values at the end of phase 2?
# How many of them exceed their predicted asymptotes?

final.points <- subset(rescaled.data, Generation==200000)

model.results.frame$Power.End <- ((model.results.frame$Power.b*400000) + 1) ^model.results.frame$Power.a  
model.results.frame$Hyper.End <- (model.results.frame$Hyper.a*400000) / (model.results.frame$Hyper.b + 400000)

Ancestor.list <- c(1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010)
Seed.list <- c(101, 102, 103, 104, 105, 106, 107, 108, 109, 110)

Exceed.matrix <- matrix(nrow=10, ncol=4)

for (i in 1:length(Ancestor.list)) {
	current.data <- subset(final.points, Ancestor==Ancestor.list[i])
	comparison.data <- subset(model.results.frame, Seed==Ancestor.list[i])
	Exceed.Hyper <- 0
	Exceed.Power <- 0
	Exceed.Asymptote <- 0
	for (j in 1:length(Seed.list)) {
		inner.data <- subset(current.data, Seed==Seed.list[j])
		if (inner.data$Log.Ancestral.Scaled.Fitness > comparison.data$Hyper.End) 
			{Exceed.Hyper <- Exceed.Hyper + 1}
		if (inner.data$Log.Ancestral.Scaled.Fitness > comparison.data$Power.End) 
			{Exceed.Power <- Exceed.Power + 1}
		if (inner.data$Log.Ancestral.Scaled.Fitness > comparison.data$Hyper.a) 
			{Exceed.Asymptote <- Exceed.Asymptote + 1}
		}
	Exceed.matrix[i, 1] <- Ancestor.list[i]
	Exceed.matrix[i, 2] <- Exceed.Hyper
	Exceed.matrix[i, 3] <- Exceed.Power
	Exceed.matrix[i, 4] <- Exceed.Asymptote
	}
			
colnames(Exceed.matrix) <- c("Ancestor.Seed", "Number.Exceed.Hyper", "Number.Exceed.Power", "Number.Exceed.Asymptote")

Exceed.frame <- as.data.frame(Exceed.matrix)

# Both models tend to overpredict fit at 400k.  
# 17 of the 100 instances exceed the Hyperbolic asymptote; these come from 5 of the initial seeds.

# Now let's look at a few specific ancestors.  The one where the hyperbola does the worst
# is 1009.  The one where the power law does the worst is 1005.  


Seed.grapher <- function(chosen.seed) {

	phase.1.data <- subset(Phase.1.results.data, Seed==chosen.seed)
	phase.2.data <- subset(Phase.2.results.data, Ancestor==chosen.seed)
	#phase.2.data <- subset(l77.results.data, Ancestor==chosen.seed)
	model.data <- subset(model.results.frame, Seed==chosen.seed)

	phase.1.x <- seq(from=0, to=200000, by=100)
	phase.2.x <- seq(from=200000, to=400000, by=100)

	phase.1.hyper <- (phase.1.x*model.data$Hyper.a)/(phase.1.x + model.data$Hyper.b)
	phase.2.hyper <- (phase.2.x*model.data$Hyper.a)/((phase.2.x) + model.data$Hyper.b)

	phase.1.power <- (model.data$Power.b*phase.1.x + 1)^model.data$Power.a
	phase.2.power <- (model.data$Power.b*phase.2.x + 1)^model.data$Power.a

	observed.max <- max(phase.2.data$Log.Ancestral.Scaled.Fitness)
	hyper.max <- max(phase.2.hyper)
	power.max <- max(phase.2.power)

	ymax <- max(observed.max, hyper.max, power.max)


	graphics.off()
	png(filename=paste(chosen.seed,".png"), height=1200, width=1200, units="px", res=200)
	plot(x=phase.2.data$Generation, y=phase.2.data$Log.Ancestral.Scaled.Fitness, xlim=c(0, 400000), ylim=c(0, ymax+2), xlab="Time (generations) ", ylab="Log 2 Scaled Fitness", col="white", xaxt='n')

	axis(side=1, at=c(0, 100000, 200000, 300000, 400000), labels=c("0", "100,000", "200,000", "300,000", "400,000"))

	for (i in 1:length(Seed.list)) {
		current.data <- subset(phase.2.data, Seed==Seed.list[i])
		lines(x=current.data$Generation+200000, y=current.data$Log.Ancestral.Scaled.Fitness, lwd=1, col="gray50")
	}

	lines(x=phase.1.data$Generation, y=phase.1.data$Log.Scaled.Fitness, col="black", lwd=2)

	lines(y=phase.1.hyper, x=phase.1.x, lwd=2, col="darkred")
	lines(y=phase.1.power, x=phase.1.x, lwd=2, col="mediumblue")

	lines(y=phase.2.hyper, x=phase.2.x, lwd=2, col="darkred", lty=2)
	lines(y=phase.2.power, x=phase.2.x, lwd=2, col="mediumblue", lty=2)

	abline(v=200000, col="black", lwd=2, lty=2)

	dev.off()
	return(0)

	}


for (i in 1001:1010) {
	Seed.grapher(i)
	}


Seed.grapher.combination <- function(chosen.seed) {

	phase.1.data <- subset(Phase.1.results.data, Seed==chosen.seed)
	phase.2.data <- subset(Phase.2.results.data, Ancestor==chosen.seed)
	#phase.2.data <- subset(l77.results.data, Ancestor==chosen.seed)
	model.data <- subset(model.results.frame, Seed==chosen.seed)

	phase.1.x <- seq(from=0, to=200000, by=100)
	phase.2.x <- seq(from=200000, to=400000, by=100)

	phase.1.hyper <- (phase.1.x*model.data$Hyper.a)/(phase.1.x + model.data$Hyper.b)
	phase.2.hyper <- (phase.2.x*model.data$Hyper.a)/((phase.2.x) + model.data$Hyper.b)

	phase.1.power <- (model.data$Power.b*phase.1.x + 1)^model.data$Power.a
	phase.2.power <- (model.data$Power.b*phase.2.x + 1)^model.data$Power.a

	observed.max <- max(phase.2.data$Log.Ancestral.Scaled.Fitness)
	hyper.max <- max(phase.2.hyper)
	power.max <- max(phase.2.power)

	ymax <- max(observed.max, hyper.max, power.max)


	#graphics.off()
	#png(filename=paste(chosen.seed,".png"), height=1200, width=1200, units="px", res=200)
	plot(x=phase.2.data$Generation, y=phase.2.data$Log.Ancestral.Scaled.Fitness, xlim=c(0, 400000), ylim=c(0, ymax+2), xlab="Time (generations) ", ylab="Log 2 Scaled Fitness", col="white", xaxt='n')

	axis(side=1, at=c(0, 100000, 200000, 300000, 400000), labels=c("0", "100,000", "200,000", "300,000", "400,000"))

	for (i in 1:length(Seed.list)) {
		current.data <- subset(phase.2.data, Seed==Seed.list[i])
		lines(x=current.data$Generation+200000, y=current.data$Log.Ancestral.Scaled.Fitness, lwd=1, col="gray50")
	}

	lines(x=phase.1.data$Generation, y=phase.1.data$Log.Scaled.Fitness, col="black", lwd=2)

	lines(y=phase.1.hyper, x=phase.1.x, lwd=2, col="darkred")
	lines(y=phase.1.power, x=phase.1.x, lwd=2, col="mediumblue")

	lines(y=phase.2.hyper, x=phase.2.x, lwd=2, col="darkred", lty=3)
	lines(y=phase.2.power, x=phase.2.x, lwd=2, col="mediumblue", lty=3)

	abline(v=200000, col="black", lwd=2, lty=2)

	return(0)

	}


# Combined plot showing 4 of the cases: 1010, 1006, 1008, 1009)

graphics.off()
png(filename="Combined.png", height=1200, width=1200, units="px", res=200)
par(mfrow=c(2,2), cex.axis=0.7)
Seed.grapher.combination(1010)
mtext("A", side=3, adj=0, font=2)
Seed.grapher.combination(1006)
mtext("B", side=3, adj=0, font=2)
Seed.grapher.combination(1008)
mtext("C", side=3, adj=0, font=2)
Seed.grapher.combination(1009)
mtext("D", side=3, adj=0, font=2)
dev.off()