# script file for Green and Hickey, 2005 in American Journal of Science

# the figures for this paper have been around for a long time
# (about 3 years) and therefore this script file only actually
# produces the raw version of Fig 3; all the other figures were
# already in sufficiently complete form to be used without
# modification

# for them, consult the script files importCI.R.txt and newimportCI.R.txt


s <- read.delim('/Users/wag/Archive/Active/CITrends/AJSMS/data/CI.csv', header = TRUE, colClasses = 'character')

# Here are the CIC codes
CICs <- c(100:155, 160:164, 170:172, 180:186, 190, 200, 210:220, 230:238, 240, 300, 350, 400, 500:509, 600, 700, 710, 800, 900, 910, 920, 930, 940, 950, 990)

# Here are the names of the ages given by the age codes Z-F
ages <- c("pleisto", "plio", "mio", "oligo", "eo", "paleo", "maas", "camp", "sant", "coni", "turo", "ceno", "alb", "apt", "barr", "neoc", "malm", "dogger", "lias", "trias")

# And here are the corresponding age codes
agesc <- c('Z', 'Y', 'X', 'W', 'V', 'U', 'T', 'S', 'R', 'Q', 'P', 'N', 'M', 'L', 'K', 'J', 'I', 'H', 'G', 'F')

# Make variable CIC
CIC <- substr(as.character(s[,1]), 1, 3)
s <- cbind (s, I(CIC))

# Make variable AGECODE
AGECODE <- substr(as.character(s[,1]), 4, 4)
s <- cbind (s, I(AGECODE))

# Just looking at what we have
table(s$AGECODE)
table(s$CIC)
str(s)

#corrupt CICs
(1:nrow(s))[s$CIC %in% CICs == FALSE]
#[1] 5680 8435 9795 9796

#corrupt age codes
(1:nrow(s))[s$AGECODE %in% agesc == FALSE]
#[1] 5680 7468 8435 9795 9796

# Add an indicator for corrupt records
CORRUPT <- s$AGECODE %in% agesc == FALSE | s$CIC %in% CICs == FALSE
s <- cbind (s, I(CORRUPT))
sum(s$CORRUPT)

#Check that corrupt records have been removed
length(table(s$CIC[!CORRUPT]))
#[1] 118
length(table(s$AGECODE[!CORRUPT]))
#[1] 20

##########
# FIG 1

plot(table(s$CIC[!CORRUPT]), type = 'h')
qqnorm(log(table(s$CIC[!CORRUPT])))


# Break things down by age and CIC

CICs.by.age <- matrix(nrow = length(ages), ncol = length(CICs))
rownames(CICs.by.age) <- ages
colnames(CICs.by.age) <- paste('cic', CICs, sep = '')
for(i in 1:20){
  foo <- s$CIC[s$AGECODE == agesc[i] & s$CORRUPT == FALSE]
  CICs.by.age[i,] <- table(factor(foo, levels = CICs))
}

##########
# FIG 3

CICs.by.age[CICs.by.age == 0] <- NA
plot.default(log(CICs.by.age[7,1:56]), log(CICs.by.age[6,1:56]), pch = '.', xaxt = 'n', yaxt = 'n')
axis(1, at = log(c(1, 5, 10, 50, 100)), labels = c(1, 5, 10, 50, 100))
axis(2, at = log(c(1, 5, 10, 50, 100)), labels = c(1, 5, 10, 50, 100))
text(log(CICs.by.age[7,1:56]), log(CICs.by.age[6,1:56]), labels = CICs[1:56])
abline(lm(log(CICs.by.age[6,1:56]) ~ log(CICs.by.age[7,1:56]))$coef, lty = 2)
abline(0,1)

summary(lm(log(CICs.by.age[6,1:56]) ~ log(CICs.by.age[7,1:56])))

cor.test(CICs.by.age[6,1:56], CICs.by.age[7,1:56])
cor.test(CICs.by.age[6,1:56], CICs.by.age[7,1:56], method = 'spearman')
cor.test(CICs.by.age[6,1:56], CICs.by.age[7,1:56], method = 'kendall')

# Why doesn't this work?
#plot.default(CICs.by.age[5,1:56], CICs.by.age[6,1:56], pch = '.', log = 'xy')
#text(CICs.by.age[5,1:56], CICs.by.age[6,1:56], labels = CICs[1:56])
#abline(lm(log(CICs.by.age[6,1:56]) ~ log(CICs.by.age[5,1:56]))$coef)