# # This file contains R commands that reproduce the analyses # presented in Chapter 7 # # Read data on root-length of plants grown in glyphosate-polluted water # glyphosate<-read.table("../data/glyphosate.data",header=F) names(glyphosate)<-c("concentration","water","rep","rootlength") # # Plot root-length against glyphosate concentration, using different plotting # symbols to differentiate plants grown in distilled or tap water (Figure 7.1) # pdf(file="glyphosate_data.pdf",height=6,width=8) plot(glyphosate$conc,glyphosate$rootlength,type="n",xlab="concentration",ylab="root-length", cex.axis=1.5,cex.lab=1.5) take<-glyphosate$water==1 points(glyphosate$conc[take],glyphosate$rootlength[take],pch=19,cex=1.5) take<-glyphosate$water==2 points(glyphosate$conc[take],glyphosate$rootlength[take],pch=1,cex=1.5) dev.off() # # Small-scale illustration of linear and non-linear relationships (Figure 7.2) # set.seed(2347393) x.fict<-c(0.5,1.5,1.5,2.5) y1.fict<-x.fict+1.5*rnorm(4) y2.fict<-y1.fict y2.fict[2]<-max(c(y2.fict[2],y2.fict[3])) y2.fict[3]<-y2.fict[2]+0.2 pdf(file="fictitious.pdf",height=6,width=8) par(mfrow=c(1,2)) plot(x.fict,y1.fict,xlab="x",ylab="y",pch=19,cex=1.5,cex.axis=1.5,cex.lab=1.5,xlim=c(0,3),ylim=c(-1,5)) plot(x.fict,y2.fict,xlab="x",ylab="y",pch=19,cex=1.5,cex.axis=1.5,cex.lab=1.5,xlim=c(0,3),ylim=c(-1,5)) dev.off() # # schematic of the simple linear model (Figure 7.3) # set.seed(481689) x<-0.5*(1:10) alpha<-3 beta<-4 sigma<-5.0 n<-length(x) pdf(file="linear_schematic.pdf",height=6,width=8) par(pty="s") z<-sigma*rnorm(n) y<-alpha+beta*x+z plot(x,y,pch=19,cex=1.5,cex.lab=1.5,cex.axis=1.5,xlim=c(0,5.5),ylim=c(0,25)) lines(c(0,x,5.5),alpha+beta*c(0,x,5.5),lwd=2) lines(c(1,2),alpha+beta*c(1,1)) lines(c(2,2),alpha+beta*c(1,2)) lines(c(x[8],x[8]),c(y[8]+0.5,alpha+beta*x[8]-0.5)) lines(c(x[8]-0.1,x[8],x[8]+0.1),c(y[8]+1,y[8]+0.5,y[8]+1)) y0<-alpha+beta*x[8] lines(c(x[8]-0.1,x[8],x[8]+0.1),c(y0-1,y0-0.5,y0-1)) y0<-alpha+beta lines(c(2.1,2.2,2.3),c(y0+0.5,y0,y0+0.5)) y0<-alpha+2*beta lines(c(2.1,2.2,2.3),c(y0-0.5,y0,y0-0.5)) lines(c(2.2,2.2),c(alpha+beta,alpha+2*beta)) lines(c(-0.1,0.0,0.1),c(0.75,0.25,0.75)) lines(c(-0.1,0.0,0.1),c(alpha-0.75,alpha-0.25,alpha-0.75)) lines(c(0.0,0.0),c(0.25,alpha-0.255)) text(x=0.5,y=alpha/2,labels="alpha") text(x=2.5,y=alpha+1.5*beta,labels="beta") text(x=x[8]+0.2,y=(alpha+beta*x[8]+y[8])/2,labels="z") lines(c(0,5.5),c(0,0),lty=2) dev.off() # # The Normal probability density funciton with mean 5 and standard # deviation 1 (Figure 7.4) mu<-5 sigma<-1 x<-mu+sigma*0.01*((-400):400) y<-dnorm(x,mu,sigma) pdf("Normal-pdf.pdf",height=5,width=6) plot(x,y,type="l",lwd=2,xlab="x",ylab="pdf",xaxp=c(1,9,8)) lines(c(mu,mu),c(0,0.04),lwd=1.5) lines(c(mu-0.1,mu,mu+0.1),c(0.015,0,0.015),lwd=1.5) text(mu,0.065,"mu") lines(c(4.5,5.5),c(0.2,0.2),lwd=1.5) lines(c(4.6,4.5,4.6),c(0.207,0.2,0.193),lwd=1.5) lines(c(5.4,5.5,5.4),c(0.207,0.2,0.193),lwd=1.5) text(5,0.235,"sigma") dev.off() # # Three small-scale simulations of the simple linear model (Figure 7.5) # set.seed(480713) x<-1:10 alpha<-3 beta<-2 sigma<-2.0 n<-length(x) pdf(file="linear_sims.pdf",paper="special",width=6,height=4) par(mfrow=c(1,3),pty="s") for (i in 1:3) { z<-sigma*rnorm(n) y<-alpha+beta*x+z estimates<-lm(y~x)$coef yhat<-estimates[1]+estimates[2]*x plot(x,y,pch=19,xlim=c(0,11),ylim=c(0,25)) lines(x,alpha+beta*x) lines(x,yhat,lty=2) } dev.off() # # Plots to show how transformations of the data change the shape of the relationship # between root-length and glyphosate concentration (Figures 7.6 and 7.7) # glyphosate<-read.table("../data/glyphosate.data",header=F) names(glyphosate)<-c("concentration","water","rep","rootlength") pdf(file="glyphosate_transformed.pdf",height=6,width=8) par(mfrow=c(1,2)) plot(glyphosate$conc,glyphosate$rootlength,type="n",xlab="concentration",ylab="root-length", cex.axis=1.5,cex.lab=1.5) take<-glyphosate$water==1 points(glyphosate$conc[take],glyphosate$rootlength[take],pch=19,cex=1.5) take<-glyphosate$water==2 points(glyphosate$conc[take],glyphosate$rootlength[take],pch=1,cex=1.5) plot(log(1+glyphosate$conc),glyphosate$rootlength,type="n",xlab="log(1+concentration)",ylab="root-length", cex.axis=1.5,cex.lab=1.5) take<-glyphosate$water==1 points(log(1+glyphosate$conc[take]),glyphosate$rootlength[take],pch=19,cex=1.5) take<-glyphosate$water==2 points(log(1+glyphosate$conc[take]),glyphosate$rootlength[take],pch=1,cex=1.5) dev.off() glyphosate<-read.table("../data/glyphosate.data",header=F) names(glyphosate)<-c("concentration","water","rep","rootlength") pdf(file="glyphosate_transformed2.pdf",height=6,width=8) par(mfrow=c(1,2)) plot(glyphosate$conc,glyphosate$rootlength,type="n",xlab="concentration",ylab="root-length", cex.axis=1.5,cex.lab=1.5) take<-glyphosate$water==1 points(glyphosate$conc[take],glyphosate$rootlength[take],pch=19,cex=1.5) take<-glyphosate$water==2 points(glyphosate$conc[take],glyphosate$rootlength[take],pch=1,cex=1.5) plot(log(1+glyphosate$conc),log(glyphosate$rootlength),type="n",xlab="log(1+concentration)",ylab="log(root-length)", cex.axis=1.5,cex.lab=1.5) take<-glyphosate$water==1 points(log(1+glyphosate$conc[take]),log(glyphosate$rootlength[take]),pch=19,cex=1.5) take<-glyphosate$water==2 points(log(1+glyphosate$conc[take]),log(glyphosate$rootlength[take]),pch=1,cex=1.5) dev.off() # # Crossover analysis of the asthma trial data using a linear model, with d # denoting the difference between PEF values for the two drugs used on the # same child, and v coded as 1 or -1 according to the order in which the two # drugs were given (Section 7.6.4) # d<-c(40,50,70,20,40,30,-35,15,90,30,30,80,130) v<-c(1,1,1,1,1,1,1,-1,-1,-1,-1,-1,-1) fit<-lm(d~v) summary(fit) # # Analysing a four-treatment experiment (log-transformed expression levels for a single # arabadopsis thaliana gene) using a linear model, with treatments initially considered # as a single factor on four levels (Section 7.6.5) # y<-c(6.469,6.575,7.080,7.714,8.429,6.998,6.357,6.278,5.906,6.874,7.870,3.872) treatment<-as.factor(c(3,3,3,4,4,4,1,1,1,2,2,2)) fit1<-lm(log2(y)~-1+treatment) summary(fit1) fit2<-lm(log2(y)~treatment) summary(fit2) # # Analysis of the same data recognising the factorial structure of the four treatments, # namely two factors (strain and challenge), each on two levels (Section 7.6.5) # strain<-c(1,1,1,1,1,1,0,0,0,0,0,0) challenge<-c(0,0,0,1,1,1,0,0,0,1,1,1) fit3<-lm(log2(y)~strain+challenge+strain:challenge) summary(fit3) # # Fitting a linear model to the transformed glyphosate data (Section 7.6.8) # glyphosate<-read.table("../data/glyphosate.data",header=F) names(glyphosate)<-c("concentration","water","rep","rootlength") y<-log(glyphosate$rootlength) x<-log(1+glyphosate$concentration) water<-as.factor(glyphosate$water) fit.glyphosate<-lm(y~water+x) summary(fit.glyphosate) n<-54 sigmasquared<-sum(fit.glyphosate$resid^2)/n L0<-(-n/2)*log(sigmasquared) fit2.glyphosate<-lm(y~water+x+water:x) summary(fit2.glyphosate) sigmasquared<-sum(fit2.glyphosate$resid^2)/n L1<-(-n/2)*log(sigmasquared) # # The preferred model fits separate mean response profiles for root-length as a function of # glyphosate concentration, according to whether plants are grown in distilled or tap water # (Figure 7.8) # x<-0.01*(0:350) ymat<-NULL # # Draw four fitted curves, corresponding to each of two parameters set at the lower and upper # ends of their respective 95% confidence intervals # for (alpha in c(4.6193,4.8749)) { for (beta in c(-1.1046, -0.7866)) { ymat<-rbind(ymat,exp(alpha+beta*log(1+x))) } } pdf(file="fitted_untransformed.pdf",height=6,width=8) par(mfrow=c(1,1)) plot(glyphosate$conc,glyphosate$rootlength,type="n",xlab="concentration",ylab="root-length", cex.axis=1.5,cex.lab=1.5) take<-glyphosate$water==1 points(glyphosate$conc[take],glyphosate$rootlength[take],pch=19,cex=1.5) for (i in 1:2) { lines(x,ymat[i,],lwd=2) } for (i in 3:4) { lines(x,ymat[i,],lwd=2,lty=2) } dev.off() # # Anscombe's quartet: four synthetic data-sets that give identical linear model fits, # but contrasting patterns of residuals (Figures 7.9, 7.10) # data<-read.table("../data/anscombe_data.txt",header=T) y<-data$y1 x<-data$x fit<-lm(y~x) summary(fit) # #Call: #lm(formula = y ~ x) # #Residuals: # Min 1Q Median 3Q Max #-1.92127 -0.45577 -0.04136 0.70941 1.83882 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #(Intercept) 3.0001 1.1247 2.667 0.02573 * #x 0.5001 0.1179 4.241 0.00217 ** #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 1.237 on 9 degrees of freedom #Multiple R-squared: 0.6665, Adjusted R-squared: 0.6295 #F-statistic: 17.99 on 1 and 9 DF, p-value: 0.002170 # xfit<-c(0,20) yfit<-3.0001+0.5001*xfit x4<-data$x4 y1<-y y2<-data$y2 y3<-data$y3 y4<-data$y4 xfit<-c(0,20) yfit<-3.0001+0.5001*xfit x4<-data$x4 y1<-y y2<-data$y2 y3<-data$y3 y4<-data$y4 pdf(file="anscombe.pdf",height=6,width=8) par(mfrow=c(2,2)) plot(xfit,yfit,type="l",xlab="x",ylab="y",xlim=c(0,20),ylim=c(0,14),cex.lab=1.5,cex.axis=1.5) points(x,y1,pch=19,cex=1.5) plot(xfit,yfit,type="l",xlab="x",ylab="y",xlim=c(0,20),ylim=c(0,14),cex.lab=1.5,cex.axis=1.5) points(x,y2,pch=19,cex=1.5) plot(xfit,yfit,type="l",xlab="x",ylab="y",xlim=c(0,20),ylim=c(0,14),cex.lab=1.5,cex.axis=1.5) points(x,y3,pch=19,cex=1.5) plot(xfit,yfit,type="l",xlab="x",ylab="y",xlim=c(0,20),ylim=c(0,14),cex.lab=1.5,cex.axis=1.5) points(x4,y4,pch=19,cex=1.5) dev.off() f1<-3.0001+0.5001*x f2<-f1 f3<-f1 f4<-3.0001+0.5001*x4 r1<-y1-f1 r2<-y2-f2 r3<-y3-f3 r4<-y4-f4 pdf(file="anscombe_residuals.pdf",height=6,width=8) par(mfrow=c(2,2)) plot(f1,r1,pch=19,cex=1.5,xlab="fitted values",ylab="residuals",cex.lab=1.5,cex.axis=1.5,xlim=c(4,14),ylim=c(-3.5,3.5)) lines(c(4,14),c(0,0),lty=2) plot(f2,r2,pch=19,cex=1.5,xlab="fitted values",ylab="residuals",cex.lab=1.5,cex.axis=1.5,xlim=c(4,14),ylim=c(-3.5,3.5)) lines(c(4,14),c(0,0),lty=2) plot(f3,r3,pch=19,cex=1.5,xlab="fitted values",ylab="residuals",cex.lab=1.5,cex.axis=1.5,xlim=c(4,14),ylim=c(-3.5,3.5)) lines(c(4,14),c(0,0),lty=2) plot(f4,r4,pch=19,cex=1.5,xlab="fitted values",ylab="residuals",cex.lab=1.5,cex.axis=1.5,xlim=c(4,14),ylim=c(-3.5,3.5)) lines(c(4,14),c(0,0),lty=2) dev.off() # # A plot of residuals against time, for a one-year series of daily maximum temperatures fitted # by a simple harmonic regression model (Figure 7.11) # maxtemp<-read.table("../data/maxtemp.data",header=F) names(maxtemp)<-c("year","month","day","temperature") n<-dim(maxtemp)[1] time<-1:n temperature<-maxtemp$temperature x1<-sin(2*pi*time/n) x2<-cos(2*pi*time/n) fit<-lm(temperature~x1+x2) pdf(file="maxtemp_residuals.pdf",height=6,width=8) plot(time,fit$resid,type="l",xlab="time (days)",ylim=c(-8,8),cex.axis=1.5, cex.lab=1.7,ylab="residual") lines(c(0,366),c(0,0),lty=2) dev.off() # # A plot of residuals against fitted values for the data from the glyphosate experiment # (Figure 7.12) # r<-fit2.glyphosate$residuals f<-fit2.glyphosate$fitted.values pdf(file="fitted_v_residuals.pdf",height=6,width=8) plot(f,r,xlab="fitted values",ylab="residuals",type="n",cex.lab=1.5,cex.axis=1.5) points(f[1:27],r[1:27],pch=19,cex=1.5) points(f[28:54],r[28:54],pch=1,cex=1.5) dev.off() # # Fitting a simple exponential growth model to world population data # # Tabulate data and differences between population in successive five-year periods # (Table 7.1) # pop<-read.table("../data/world_population.data",header=T) y<-pop[,2] d<-c(NA,y[2:13]-y[1:12]) r<-c(NA,y[2:13]/y[1:12]) round(cbind(pop,d,r),3) # # Plot log-population against year to show that exponential growth # model is reasonable up to 1990 but not beyond (Figure 7.13) # pdf("world_population.pdf",paper="special",height=6,width=6) plot(pop[,1],log(pop[,2]),pch=19,xlab="year",ylab="log(population)") y<-log(pop[1:9,2]) x<-pop[1:9,1] fit.pop<-lm(y~x) lines(x,fit.pop$fitted.values,lwd=2) dev.off() # # Plot population against year, and log-transformed population against year, # to see if exponential growth is a reasonable assumption # ratio<-c(NA,pop[2:13,2]/pop[1:12,2]) difference<-c(NA,pop[2:13,2]-pop[1:12,2]) cbind(pop,difference,ratio) year<-pop$year population<-pop$population pdf(file="world_population.pdf",height=6,width=8) par(mfrow=c(1,2)) plot(year,population,pch=19,cex=1.5,cex.lab=1.5,cex.axis=1.5) plot(year,log(population),pch=19,cex=1.5,cex.lab=1.5,cex.axis=1.5) dev.off() # # Fit split-line model, with change in slope in 1990 (Figure 7.14) # pdf("world_population_fit.pdf",paper="special",height=6,width=6) par(pty="s") plot(pop[,1],log(pop[,2]),pch=19,xlab="year",ylab="log(population)") y<-log(pop[,2]) x<-pop[,1]-1950 u<-c(rep(0,9),pop[10:13,1]-pop[9,1]) fit3<-lm(y~x+u) lines(x,fit3$fitted.values,lwd=2) dev.off() # # Graphical comparison between exponential and logistic growth (Figure 7.15) # t<-0.1*(0:100) alpha<-0 beta<-1 gamma<-100 y1<-exp(alpha+beta*t); y1<-y1/y1[1] y2<-gamma*exp(alpha + beta*t)/(gamma + exp(alpha + beta*t));y2<-y2/y2[1] pdf(file="exponential_and_logistic.pdf",height=6,width=8) plot(t,y2,type="l",lwd=2,cex.lab=1.5,cex.axis=1.5,xlab="t",ylab="mu(t)") lines(t,y1,lty=2,lwd=2) dev.off() # # Illustration of the logistic model for binary data. Each curve shows how # the modelled probability of a positive response changes with the value of # a single explanatory variable (Figure 7.16) # x<-0.01*((-500):500) y1<-exp(x)/(1+exp(x)) y2<-exp(1.0+x)/(1+exp(1.0+x)) y3<-exp(2*x)/(1+exp(2*x)) pdf(file="logit_inverse.pdf",height=6,width=8) plot(x,y1,type="l",cex.lab=1.5,cex.axis=1.5,lwd=2,ylab="mu(x)") lines(x,y2,lty=2,lwd=2) lines(x,y3,lty=3,lwd=2) dev.off() # # Graphical comparison between linear and quartic logistic models. Note # how the quartic model produces an asymmetry in the shape of the curve # (Figure 7.17) # x<-0.01*(0:1000) y1<-exp(-5+2*x)/(1+exp(-5+2*x)) y2<-exp(-5+0.01*x*x*x*x)/(1+exp(-5+0.01*x*x*x*x)) pdf(file="logistic_quartic.pdf",height=6,width=8) plot(x,y1,type="l",cex.lab=1.5,cex.axis=1.5,lwd=2,ylab="mu(x)") lines(x,y2,lty=2,lwd=2) dev.off() # # Poisson probability distributions with means 2, 4 and 8 (Figure 7.18) # x<-0:20 y2<-dpois(x,2) y4<-dpois(x,4) y8<-dpois(x,8) pdf(file="poisson.pdf",paper="special",width=6,height=5) plot(x,y2,type="l",xlab="x",ylab="p(x)",lwd=2) lines(x,y4,lty=2,lwd=2) lines(x,y8,lty=3,lwd=2) dev.off()