#
# This file contains R commands that reproduce the analyses 
# presented in Chapter 4
#
# Read data from a named file. You will need to edit the text within " "
# to identify the file in which you have stored the data. The optional
# argument header=T tells the software that the first line of the file
# gives the names of each of the variables whose values fill the
# remaining rows. The argument row.names=1 tells the software that the 
# first column of the data-file is a set of row-names, which in this case
# identify the individual genes. 
# The data can be accessed directly from the web-page:
#       http://www.lancs.ac.uk/staff/diggle/intro-stats-book/datasets/
# 
data<-read.table("../data/genes.data",header=T,row.names=1)
#
# normalised expression levels of 22810 arabadospis thaliana genes
# measured on 12 Affymetrix microarrays
#
# select random sample of 25 genes - changing the argument of set.seed()
# will give a different sample
#
set.seed(498661)
sample25<-sample(1:22810,25)
batch25<-data[sample25,1]
#
# dot-plot of expression levels of sampled genes as measured on array 1 (Figure 4.2)
#
pdf(file="dotplot.pdf",paper="special", width=6,height=2)
plot(batch25,rep(0,25),pch=19,cex=0.5,yaxt="n",xlab="expression level",ylab=" ",
     xlim=c(0,2500),cex.axis=0.8,cex.lab=0.8,bty="n")
dev.off()
#
# repeat for log-base-two-transformed expression levels (Figure 4.3)
#
pdf(file="logdotplot.pdf",paper="special",
           width=6,height=2)
plot(log2(batch25),rep(0,25),pch=19,yaxt="n",cex=0.5,
     xlab="log-base-two-expression level",ylab=" ",
     xlim=c(0,8),cex.axis=0.8,cex.lab=0.8,bty="n")
dev.off()
#
# cumulative plot of log-base-two-transformed expression levels (Figure 4.4)
#
pdf(file="cumplot.pdf",height=6,width=8)
plot(sort(log2(batch25)),(1:25)/25,pch=19,
     xlim=c(0,8),ylim=c(0,1),cex.axis=1.5,cex.lab=1.5,cex=1.5,
      xlab="sorted log-base-two-expression level",ylab="cumulative proportion")
dev.off()
#
# select a larger sample (500 genes)
#
set.seed(987235)
sample500<-sample(1:22810,500)
batch500<-log2(data[sample500,1])
#
# frequency table using log-transformed expression levels rounded up to integers, 
# for example 36 of the 500 values were between 1 and 2, 177 between 2 and 3, etc
#
counts500<-table(1+floor(batch500))
#
#  2   3   4   5   6   7   8   9  10  11  12  13  14 
# 36 177  82  51  46  36  26  14  12  14   4   1   1 
#
# frequency polygon corresponding to above table (Figure 4.5)
#
pdf(file="frequencypoly.pdf",paper="special",
           width=6,height=5)
plot((-0.5)+(2:14),counts500,pch=19,
     xlim=c(0,15),ylim=c(0,180),
      xlab="log-base-two-expression level",ylab="frequency ")
lines((-0.5)+(2:14),counts500)
dev.off()
#
# bar-chart and histogram corresponding to above table (Figure 4.6)
#
pdf(file="barhist.pdf",paper="special",
           width=8,height=5)
par(mfrow=c(1,2))
plot(as.numeric(names(counts500))-0.5,counts500, type = "h",lwd=5,
      xlab="log-base-two-expression level",ylab="frequency ",
     xlim=c(0,15),ylim=c(0,180))
hist(batch500,breaks=0:14,xlab="log-base-two-expression level",
     ylab="frequency",main=" ",
     xlim=c(0,15),ylim=c(0,180))
#
# the simple command hist(batch500) would also do the job, but would select 
# bin-widths for you, whereas the longer version of the command allows you
# to make this choice 
#
dev.off()
#
# histogram of all 22810 log-transformed expression levels on array 1 (Figure 4.7)
#
pdf(file="histall.pdf",paper="special",
           width=6,height=5)
hist(log2(data[,1]),breaks=0.5*(0:30),
     xlab="log-base-two-expression level",
     ylab="frequency",main=" ",
     xlim=c(0,14),ylim=c(0,5000))
dev.off()
#
# Dot-plots of log-transformed expression levels for a random sample of 25 genes 
# from arrays 1 to 4 (Figure 4.8)
#
pdf(file="dotplots4.pdf",paper="special",
           width=6,height=2.5)
set.seed(498661)
sample25<-sample(1:22810,25)
batch25RL<-data[sample25,1]
batch25RH<-data[sample25,4]
batch25WL<-data[sample25,7]
batch25WH<-data[sample25,10]
plot(log2(batch25RL),rep(0,25),pch=19,yaxt="n",cex=0.5,
     xlab="log-base-two-expression level",ylab=" ",
     xlim=c(0,8),ylim=c(-0.02,0.32),cex.axis=1.0,cex.lab=1.0,bty="n")
points(log2(batch25RH),rep(0.1,25),pch=1,cex=0.5)
points(log2(batch25WL),rep(0.2,25),pch=3,cex=0.5)
points(log2(batch25WH),rep(0.3,25),pch=4,cex=0.5)
dev.off()
#
# superimposed cumulative plots of the same data (Figure 4.9)
#
pdf(file="cumplot4.pdf")
set.seed(498661)
sample25<-sample(1:22810,25)
batch25RL<-data[sample25,1]
batch25RH<-data[sample25,4]
batch25WL<-data[sample25,7]
batch25WH<-data[sample25,10]
plot(sort(log2(batch25RL)),(1:25)/25,type="l",lwd=2,
     xlab="sorted log-base-two-expression level",ylab="cumulative proportion",
     xlim=c(0,8),ylim=c(0,1),cex.axis=1.5,cex.lab=1.5)
lines(sort(log2(batch25RH)),(1:25)/25,lwd=4)
lines(sort(log2(batch25WL)),(1:25)/25,lwd=2,lty=2)
lines(sort(log2(batch25WH)),(1:25)/25,lwd=4,lty=2)
dev.off()
#
# log-transformed expression levels for a random sample of 500 genes from arrays 1 to 4,
# 
multibatch<-log2(data[sample500,c(1,4,7,10)])
#
# superimposed histograms of these data... very cluttered and not recommended!)
#
pdf(file="histograms4.pdf",height=6,width=8)
par(mfrow=c(1,1))
hist(multibatch[,1],breaks=0.5*(0:30),cex.axis=1.5,cex.lab=1.5,
     xlab="log-base-two-expression level",
     ylab="frequency",
     xlim=c(0,15),ylim=c(0,120),main=" ")
hist(multibatch[,2],breaks=0.5*(0:30),lty=2,add=T)
hist(multibatch[,3],breaks=0.5*(0:30),lty=3,add=T)
hist(multibatch[,4],breaks=0.5*(0:30),lty=4,add=T)
dev.off()
#
# histograms of the same data in a two-by-two layout that mimics the
# experimental treatment structure... much less cluttered (Figure 4.10)
#
pdf(file="histograms_panels.pdf",height=6,width=8)
par(mfrow=c(2,2))
hist(multibatch[,1],breaks=0.5*(0:30),
     xlab="log-base-two-expression level",
     ylab="frequency",main="resistant, low-Ca",
     xlim=c(0,15),ylim=c(0,120))
hist(multibatch[,2],breaks=0.5*(0:30),
     xlab=" ",
     ylab=" ",main="resistant, high-Ca",
     xlim=c(0,15),ylim=c(0,120))
hist(multibatch[,3],breaks=0.5*(0:30),
     xlab=" ",
     ylab=" ",main="wild-type, low-Ca",
     xlim=c(0,15),ylim=c(0,120))
hist(multibatch[,4],breaks=0.5*(0:30),
     xlab=" ",
     ylab="frequency",main="wild-type, high-Ca",
     xlim=c(0,15 ),ylim=c(0,120))
dev.off()
#
# expression levels for two different genes measured
# on 12 arrays
#
A<-c(12.9, 15.2, 18.0, 16.7, 15.2, 15.7, 11.8, 16.4, 24.2, 17.5, 14.7, 20.2)
B <-c(7.9, 10.2,  9.1,  8.3,  8.7, 12.2,  8.7,  8.4,  8.9, 8.7,  7.3, 13.2)
#
# scatterplot (Figure 4.11)
#
pdf(file="scatter12.pdf",height=6,width=8)
plot(A,B,pch=19,cex=1.5,cex.lab=1.5,cex.axis=1.5,ylim=c(6,14))
dev.off()
#
# enhanced scatterplot using plotting symbols to identify the four
# different experimental treatments (Figure 4.12)
#
pdf(file="scatter12plus.pdf",height=6,width=8)
plot(A,B,type="n",cex.lab=1.5,cex.axis=1.5,ylim=c(6,14))
points(A[1:3],B[1:3],pch=19,cex=1.5)
points(A[4:6],B[4:6],pch=1,cex=1.5)
points(A[7:9],B[7:9],pch=3,cex=1.5)
points(A[10:12],B[10:12],pch=4,cex=1.5)
dev.off()
#
# scatterplot of log-transformed expression levels in calcium-resistant plants
# for a random sample of 500 genes, comparing results for plant subjected
# to a low calcium challenge (x-axis) and a high calcium challenge (y-axis)
# (Figure 4.13)
#
pdf(file="scatter500.pdf",height=6,width=8)
plot(multibatch[,1],multibatch[,2],pch=19,cex.axis=1.5,cex.lab=1.5,lwd=2,
   xlab="low Ca exposure",ylab="high Ca exposure")
dev.off()
#
# a levelled version of the previous scatterplot, the x-axis plots the average
# of each pair of log-transformed expression levels whilst the y-axis plots
# the difference (Figure 4.14)
#
pdf(file="scatter5002.pdf",height=6,width=8)
plot(0.5*(multibatch[,2]+multibatch[,1]),(multibatch[,2]-multibatch[,1]),pch=19,cex=0.5,
   cex.axis=1.5,cex.lab=1.5,lwd=2,
   xlab="average",ylab="difference")
lines(c(0,15),c(0,0),lty=2,lwd=2)
dev.off()
#
# For the R source-code to reproduce Figure 4.15, see Chapter 9. Time series analysis
#
#
# Measurements of a schizophrenia patient's mental state (PANSS: Positive and Negative Symptom Score)
# at weeks 0, 1, 2, 4, 6 and 8 after initiation of treatment
#
y<-c(87,73,69,54,69,52)
x<-c(0,1,2,4,6,8)
#
# time-series plot of the PANSS data from a single patient (Figure 4.16)
#
pdf(file="PANSS1.pdf",paper="special",
           width=5,height=4)
plot(x,y,type="l",lwd=2,xlab="time (weeks)",ylab="PANSS",ylim=c(50,90))
dev.off()
#
# Read PANSS data on 50 patients. The data can be accessed directly from the web-page:
#       http://www.lancs.ac.uk/staff/diggle/intro-stats-book/datasets/
#
PANSS<-read.table("../data/PANSS.data",header=T)
range(PANSS,na.rm=T)
#
# Spaghetti plot of the PANSS data from 50 patients (Figure 4.17)
#
pdf(file="PANSS2.pdf",height=6,width=8)
plot(x,y,type="n",cex.lab=1.5,cex.axis=1.5,xlab="time (weeks)",ylab="PANSS",ylim=c(30,150))
for (i in 1:50) {
   lines(x,PANSS[i,])
   m<-sum(!is.na(PANSS[i,]))
   if (m<6) points(x[m],PANSS[i,m],pch=19)
   }
#
# add thick line showing average PANSS score at each time over all patients
# still being measured
#
means<-apply(PANSS,2,mean,na.rm=T)
lines(x,means,lwd=4)
dev.off()
#
# John Snow's data on locations of cholera cases and water-pumps in Soho, London,
# during the 1854 cholera epidemic. The data can be accessed directly 
# from the web-page:
#       http://www.lancs.ac.uk/staff/diggle/intro-stats-book/datasets/
#
bspump<-c(12.57136,11.72717)
pumps<-read.table("../data/cholera_pumps.txt",header=F)
cases<-read.table("../data/cholera_deaths.txt",header=F)
#
# map based on John Snow's data (Figure 4.18). 
#
pdf(file="Snow.pdf",paper="special",
           width=6,height=6)
library(splancs)
#
# this command loads a package of R functions for displaying and analysing
# spatial point patterns
#
par(pty="s")
pointmap(pumps[-7,],pch=3,cex=1.5)
pointmap(cases,pch=19,cex=0.5,add=T)
points(pumps[7,1],pumps[7,2],pch=1,cex=5)
dev.off()
#
# Read data on locations and sizes of Spruce trees. The data can be accessed directly 
# from the web-page:
#       http://www.lancs.ac.uk/staff/diggle/intro-stats-book/datasets
#
ixyz<-read.table("../data/spruce_trees.data",header=T)
library(geoR)
#
# this command loads a package of R functions for displaying and analysing
# geostatistical data
# 
spruce<-as.geodata(ixyz,coords.col=c(2,3),data.col=4)
#
# map of the spruce tree data (Figure 4.19)
# the geoR package recognises that the data have been stored in a format that takes account
# of their geostatistical character (locations and associated sizes) and adapts the built-in 
# R function points() to behave in a way that is sensible for data of this kind
#
pdf(file="spruce_trees.pdf",paper="special",height=4,width=6)
points(spruce,bty="n",xaxt="n",yaxt="n",xlab=" ",ylab=" ")
lines(c(0,56,56,0,0),c(0,0,38,38,0))
dev.off()
#
# For the R source-code to reproduce Figure 4.20, see Chapter 10. Spatial statistics
#
# proportions of species-of-origin for cases of human campylobacteriosis 
# reported in Lancashire, England
#
origins<-c(39.6,40.6,18.7,1.1)
species<-c("cattle","chicken","sheep","other")
#
# a pie chart of these data (Figure 4.21)
#
pdf(file="pie.pdf",paper="special",
           width=4,height=4)
pie(origins,species,density=c(10,20,30,0))
dev.off()
#
# summary statistics 
#
data<-read.table("../data/genes.data",header=T,row.names=1)
#
# select random sample of 25 values as before
#
set.seed(498661)
sample25<-sample(1:22810,25)
batch25<-data[sample25,1]
x<-log2(batch25)
#
# mean and standard deviation
#
xbar<-mean(x)
SD<-sqrt(var(x))
round(c(xbar,SD),3)
#
# five-number summary
#
summary(x)[-4]
#
# correlation between expression levels for two different genes measured
# on 12 arrays (same data as shown in Figure 4.12)
#
A<-c(12.9, 15.2, 18.0, 16.7, 15.2, 15.7, 11.8, 16.4, 24.2, 17.5, 14.7, 20.2)
B <-c(7.9, 10.2,  9.1,  8.3,  8.7, 12.2,  8.7,  8.4,  8.9, 8.7,  7.3, 13.2)
#
# first, the hard way
#
Abar<-mean(A)
SDA<-sqrt(var(A))
Bbar<-mean(B)
SDB<-sqrt(var(B))
n<-length(A)
r<-(sum(A*B) - n*Abar*Bbar)/((n-1)*SDA*SDB)
#
# and now the easy way, using the built-in cor() function
#
r<-cor(A,B)
#
# check that these give the same answer!
#
# synthetic data showing a strong non-linear relationship but a correlation
# close to zero (Figure 4.22)
#
set.seed(90165)
x<-0.1*((-20):20)+0.02*runif(41)-0.01
y<-x^2+0.25*rnorm(41)
cor(x,y)
pdf(file="quadratic.pdf",paper="special",height=4,width=5)
plot(x,y,pch=19)
dev.off()