# # This file contains R commands that reproduce the analyses # presented in Chapter 4 # # Read data from a named file. You will need to edit the text within " " # to identify the file in which you have stored the data. The optional # argument header=T tells the software that the first line of the file # gives the names of each of the variables whose values fill the # remaining rows. The argument row.names=1 tells the software that the # first column of the data-file is a set of row-names, which in this case # identify the individual genes. # The data can be accessed directly from the web-page: # http://www.lancs.ac.uk/staff/diggle/intro-stats-book/datasets/ # data<-read.table("../data/genes.data",header=T,row.names=1) # # normalised expression levels of 22810 arabadospis thaliana genes # measured on 12 Affymetrix microarrays # # select random sample of 25 genes - changing the argument of set.seed() # will give a different sample # set.seed(498661) sample25<-sample(1:22810,25) batch25<-data[sample25,1] # # dot-plot of expression levels of sampled genes as measured on array 1 (Figure 4.2) # pdf(file="dotplot.pdf",paper="special", width=6,height=2) plot(batch25,rep(0,25),pch=19,cex=0.5,yaxt="n",xlab="expression level",ylab=" ", xlim=c(0,2500),cex.axis=0.8,cex.lab=0.8,bty="n") dev.off() # # repeat for log-base-two-transformed expression levels (Figure 4.3) # pdf(file="logdotplot.pdf",paper="special", width=6,height=2) plot(log2(batch25),rep(0,25),pch=19,yaxt="n",cex=0.5, xlab="log-base-two-expression level",ylab=" ", xlim=c(0,8),cex.axis=0.8,cex.lab=0.8,bty="n") dev.off() # # cumulative plot of log-base-two-transformed expression levels (Figure 4.4) # pdf(file="cumplot.pdf",height=6,width=8) plot(sort(log2(batch25)),(1:25)/25,pch=19, xlim=c(0,8),ylim=c(0,1),cex.axis=1.5,cex.lab=1.5,cex=1.5, xlab="sorted log-base-two-expression level",ylab="cumulative proportion") dev.off() # # select a larger sample (500 genes) # set.seed(987235) sample500<-sample(1:22810,500) batch500<-log2(data[sample500,1]) # # frequency table using log-transformed expression levels rounded up to integers, # for example 36 of the 500 values were between 1 and 2, 177 between 2 and 3, etc # counts500<-table(1+floor(batch500)) # # 2 3 4 5 6 7 8 9 10 11 12 13 14 # 36 177 82 51 46 36 26 14 12 14 4 1 1 # # frequency polygon corresponding to above table (Figure 4.5) # pdf(file="frequencypoly.pdf",paper="special", width=6,height=5) plot((-0.5)+(2:14),counts500,pch=19, xlim=c(0,15),ylim=c(0,180), xlab="log-base-two-expression level",ylab="frequency ") lines((-0.5)+(2:14),counts500) dev.off() # # bar-chart and histogram corresponding to above table (Figure 4.6) # pdf(file="barhist.pdf",paper="special", width=8,height=5) par(mfrow=c(1,2)) plot(as.numeric(names(counts500))-0.5,counts500, type = "h",lwd=5, xlab="log-base-two-expression level",ylab="frequency ", xlim=c(0,15),ylim=c(0,180)) hist(batch500,breaks=0:14,xlab="log-base-two-expression level", ylab="frequency",main=" ", xlim=c(0,15),ylim=c(0,180)) # # the simple command hist(batch500) would also do the job, but would select # bin-widths for you, whereas the longer version of the command allows you # to make this choice # dev.off() # # histogram of all 22810 log-transformed expression levels on array 1 (Figure 4.7) # pdf(file="histall.pdf",paper="special", width=6,height=5) hist(log2(data[,1]),breaks=0.5*(0:30), xlab="log-base-two-expression level", ylab="frequency",main=" ", xlim=c(0,14),ylim=c(0,5000)) dev.off() # # Dot-plots of log-transformed expression levels for a random sample of 25 genes # from arrays 1 to 4 (Figure 4.8) # pdf(file="dotplots4.pdf",paper="special", width=6,height=2.5) set.seed(498661) sample25<-sample(1:22810,25) batch25RL<-data[sample25,1] batch25RH<-data[sample25,4] batch25WL<-data[sample25,7] batch25WH<-data[sample25,10] plot(log2(batch25RL),rep(0,25),pch=19,yaxt="n",cex=0.5, xlab="log-base-two-expression level",ylab=" ", xlim=c(0,8),ylim=c(-0.02,0.32),cex.axis=1.0,cex.lab=1.0,bty="n") points(log2(batch25RH),rep(0.1,25),pch=1,cex=0.5) points(log2(batch25WL),rep(0.2,25),pch=3,cex=0.5) points(log2(batch25WH),rep(0.3,25),pch=4,cex=0.5) dev.off() # # superimposed cumulative plots of the same data (Figure 4.9) # pdf(file="cumplot4.pdf") set.seed(498661) sample25<-sample(1:22810,25) batch25RL<-data[sample25,1] batch25RH<-data[sample25,4] batch25WL<-data[sample25,7] batch25WH<-data[sample25,10] plot(sort(log2(batch25RL)),(1:25)/25,type="l",lwd=2, xlab="sorted log-base-two-expression level",ylab="cumulative proportion", xlim=c(0,8),ylim=c(0,1),cex.axis=1.5,cex.lab=1.5) lines(sort(log2(batch25RH)),(1:25)/25,lwd=4) lines(sort(log2(batch25WL)),(1:25)/25,lwd=2,lty=2) lines(sort(log2(batch25WH)),(1:25)/25,lwd=4,lty=2) dev.off() # # log-transformed expression levels for a random sample of 500 genes from arrays 1 to 4, # multibatch<-log2(data[sample500,c(1,4,7,10)]) # # superimposed histograms of these data... very cluttered and not recommended!) # pdf(file="histograms4.pdf",height=6,width=8) par(mfrow=c(1,1)) hist(multibatch[,1],breaks=0.5*(0:30),cex.axis=1.5,cex.lab=1.5, xlab="log-base-two-expression level", ylab="frequency", xlim=c(0,15),ylim=c(0,120),main=" ") hist(multibatch[,2],breaks=0.5*(0:30),lty=2,add=T) hist(multibatch[,3],breaks=0.5*(0:30),lty=3,add=T) hist(multibatch[,4],breaks=0.5*(0:30),lty=4,add=T) dev.off() # # histograms of the same data in a two-by-two layout that mimics the # experimental treatment structure... much less cluttered (Figure 4.10) # pdf(file="histograms_panels.pdf",height=6,width=8) par(mfrow=c(2,2)) hist(multibatch[,1],breaks=0.5*(0:30), xlab="log-base-two-expression level", ylab="frequency",main="resistant, low-Ca", xlim=c(0,15),ylim=c(0,120)) hist(multibatch[,2],breaks=0.5*(0:30), xlab=" ", ylab=" ",main="resistant, high-Ca", xlim=c(0,15),ylim=c(0,120)) hist(multibatch[,3],breaks=0.5*(0:30), xlab=" ", ylab=" ",main="wild-type, low-Ca", xlim=c(0,15),ylim=c(0,120)) hist(multibatch[,4],breaks=0.5*(0:30), xlab=" ", ylab="frequency",main="wild-type, high-Ca", xlim=c(0,15 ),ylim=c(0,120)) dev.off() # # expression levels for two different genes measured # on 12 arrays # A<-c(12.9, 15.2, 18.0, 16.7, 15.2, 15.7, 11.8, 16.4, 24.2, 17.5, 14.7, 20.2) B <-c(7.9, 10.2, 9.1, 8.3, 8.7, 12.2, 8.7, 8.4, 8.9, 8.7, 7.3, 13.2) # # scatterplot (Figure 4.11) # pdf(file="scatter12.pdf",height=6,width=8) plot(A,B,pch=19,cex=1.5,cex.lab=1.5,cex.axis=1.5,ylim=c(6,14)) dev.off() # # enhanced scatterplot using plotting symbols to identify the four # different experimental treatments (Figure 4.12) # pdf(file="scatter12plus.pdf",height=6,width=8) plot(A,B,type="n",cex.lab=1.5,cex.axis=1.5,ylim=c(6,14)) points(A[1:3],B[1:3],pch=19,cex=1.5) points(A[4:6],B[4:6],pch=1,cex=1.5) points(A[7:9],B[7:9],pch=3,cex=1.5) points(A[10:12],B[10:12],pch=4,cex=1.5) dev.off() # # scatterplot of log-transformed expression levels in calcium-resistant plants # for a random sample of 500 genes, comparing results for plant subjected # to a low calcium challenge (x-axis) and a high calcium challenge (y-axis) # (Figure 4.13) # pdf(file="scatter500.pdf",height=6,width=8) plot(multibatch[,1],multibatch[,2],pch=19,cex.axis=1.5,cex.lab=1.5,lwd=2, xlab="low Ca exposure",ylab="high Ca exposure") dev.off() # # a levelled version of the previous scatterplot, the x-axis plots the average # of each pair of log-transformed expression levels whilst the y-axis plots # the difference (Figure 4.14) # pdf(file="scatter5002.pdf",height=6,width=8) plot(0.5*(multibatch[,2]+multibatch[,1]),(multibatch[,2]-multibatch[,1]),pch=19,cex=0.5, cex.axis=1.5,cex.lab=1.5,lwd=2, xlab="average",ylab="difference") lines(c(0,15),c(0,0),lty=2,lwd=2) dev.off() # # For the R source-code to reproduce Figure 4.15, see Chapter 9. Time series analysis # # # Measurements of a schizophrenia patient's mental state (PANSS: Positive and Negative Symptom Score) # at weeks 0, 1, 2, 4, 6 and 8 after initiation of treatment # y<-c(87,73,69,54,69,52) x<-c(0,1,2,4,6,8) # # time-series plot of the PANSS data from a single patient (Figure 4.16) # pdf(file="PANSS1.pdf",paper="special", width=5,height=4) plot(x,y,type="l",lwd=2,xlab="time (weeks)",ylab="PANSS",ylim=c(50,90)) dev.off() # # Read PANSS data on 50 patients. The data can be accessed directly from the web-page: # http://www.lancs.ac.uk/staff/diggle/intro-stats-book/datasets/ # PANSS<-read.table("../data/PANSS.data",header=T) range(PANSS,na.rm=T) # # Spaghetti plot of the PANSS data from 50 patients (Figure 4.17) # pdf(file="PANSS2.pdf",height=6,width=8) plot(x,y,type="n",cex.lab=1.5,cex.axis=1.5,xlab="time (weeks)",ylab="PANSS",ylim=c(30,150)) for (i in 1:50) { lines(x,PANSS[i,]) m<-sum(!is.na(PANSS[i,])) if (m<6) points(x[m],PANSS[i,m],pch=19) } # # add thick line showing average PANSS score at each time over all patients # still being measured # means<-apply(PANSS,2,mean,na.rm=T) lines(x,means,lwd=4) dev.off() # # John Snow's data on locations of cholera cases and water-pumps in Soho, London, # during the 1854 cholera epidemic. The data can be accessed directly # from the web-page: # http://www.lancs.ac.uk/staff/diggle/intro-stats-book/datasets/ # bspump<-c(12.57136,11.72717) pumps<-read.table("../data/cholera_pumps.txt",header=F) cases<-read.table("../data/cholera_deaths.txt",header=F) # # map based on John Snow's data (Figure 4.18). # pdf(file="Snow.pdf",paper="special", width=6,height=6) library(splancs) # # this command loads a package of R functions for displaying and analysing # spatial point patterns # par(pty="s") pointmap(pumps[-7,],pch=3,cex=1.5) pointmap(cases,pch=19,cex=0.5,add=T) points(pumps[7,1],pumps[7,2],pch=1,cex=5) dev.off() # # Read data on locations and sizes of Spruce trees. The data can be accessed directly # from the web-page: # http://www.lancs.ac.uk/staff/diggle/intro-stats-book/datasets # ixyz<-read.table("../data/spruce_trees.data",header=T) library(geoR) # # this command loads a package of R functions for displaying and analysing # geostatistical data # spruce<-as.geodata(ixyz,coords.col=c(2,3),data.col=4) # # map of the spruce tree data (Figure 4.19) # the geoR package recognises that the data have been stored in a format that takes account # of their geostatistical character (locations and associated sizes) and adapts the built-in # R function points() to behave in a way that is sensible for data of this kind # pdf(file="spruce_trees.pdf",paper="special",height=4,width=6) points(spruce,bty="n",xaxt="n",yaxt="n",xlab=" ",ylab=" ") lines(c(0,56,56,0,0),c(0,0,38,38,0)) dev.off() # # For the R source-code to reproduce Figure 4.20, see Chapter 10. Spatial statistics # # proportions of species-of-origin for cases of human campylobacteriosis # reported in Lancashire, England # origins<-c(39.6,40.6,18.7,1.1) species<-c("cattle","chicken","sheep","other") # # a pie chart of these data (Figure 4.21) # pdf(file="pie.pdf",paper="special", width=4,height=4) pie(origins,species,density=c(10,20,30,0)) dev.off() # # summary statistics # data<-read.table("../data/genes.data",header=T,row.names=1) # # select random sample of 25 values as before # set.seed(498661) sample25<-sample(1:22810,25) batch25<-data[sample25,1] x<-log2(batch25) # # mean and standard deviation # xbar<-mean(x) SD<-sqrt(var(x)) round(c(xbar,SD),3) # # five-number summary # summary(x)[-4] # # correlation between expression levels for two different genes measured # on 12 arrays (same data as shown in Figure 4.12) # A<-c(12.9, 15.2, 18.0, 16.7, 15.2, 15.7, 11.8, 16.4, 24.2, 17.5, 14.7, 20.2) B <-c(7.9, 10.2, 9.1, 8.3, 8.7, 12.2, 8.7, 8.4, 8.9, 8.7, 7.3, 13.2) # # first, the hard way # Abar<-mean(A) SDA<-sqrt(var(A)) Bbar<-mean(B) SDB<-sqrt(var(B)) n<-length(A) r<-(sum(A*B) - n*Abar*Bbar)/((n-1)*SDA*SDB) # # and now the easy way, using the built-in cor() function # r<-cor(A,B) # # check that these give the same answer! # # synthetic data showing a strong non-linear relationship but a correlation # close to zero (Figure 4.22) # set.seed(90165) x<-0.1*((-20):20)+0.02*runif(41)-0.01 y<-x^2+0.25*rnorm(41) cor(x,y) pdf(file="quadratic.pdf",paper="special",height=4,width=5) plot(x,y,pch=19) dev.off()