Teaching resources for statistics and R: Assessing normality

This will be the first in a series of articles sharing digital resources that I’ve developed to illustrate concepts in statistics. I use these in my own biostatistics course. My wish is that these will be useful for instructors and students alike.

I created the figures below to illustrate some common methods used to assess the normality of a sample: histogram overlaid with normal curve, Q-Q plot, and a normality hypothesis test (specifically, the Shapiro-Wilk test). I also chose to include measures of skewness and excess kurtosis, which I think are useful points of additional information for understanding the properties of various distributions.

The samples that I’ve used for illustration come from populations with four, distinct frequency distributions: normal, uniform, exponential, and bimodal.

I made these figures using R. The code I wrote to generate these graphics is found below each image. PDF versions of the images are available for download by using the links in the image captions.

4 x 1 image (for phone/tablet screen)

Here is a 4 x 1 version of the graphic. This would probably work best on tablets or smartphone screens.

A 4×1 version of this graphic, for display on phone/tablet screens. Image can also be downloaded as a PDF file.

Here is some R code you can use to make this graphic:

### testing for normality, 4x1 graphic ----------------------------------------------
n <- 100 # number of samples to take from each distribution

library(psych) # load library containing 'describe' function
s.list <- list() # create an empty list to hold results of sampling

s.list$normal <- rnorm(n) # normal distribution, random sample
s.list$uniform <- runif(n) # uniform distribution, random sample
s.list$exponential <- rexp(n) # exponential distribution, random sample
s.list$bimodal <- c(rnorm(n/2, 0, 1), rnorm(n/2, 10, 1)) # bimodal normal distribution

pdf("INSERT_FILE_NAME_HERE.pdf", width=6, height=10)

par(mfrow=c(4,2) # graph window, 4 rows x 2 columns
   , oma=c(3,0,4,0)) # set outer margins

for(i in 1:length(s.list)){
    samp <- s.list[[i]] # the sampled data points
    
    hist.out <- hist(samp, plot=FALSE) # histogram calculations
    
    xmin <- min(hist.out$breaks) # x min of plot
    xmax <- max(hist.out$breaks) # x max of plot
    
    des <- describe(samp) # get descriptive stats; store results in object 'des'
        
    xnorm <- seq(from=xmin, to=xmax, by=((xmax - xmin)/200)) # x values for normal distribution
    ynorm <- dnorm(x=xnorm, # using x values of normal distribution
    mean=des$mean, sd=des$sd) # use sample statistics to estimate population params
    
    ymax <- 1.4*max(hist.out$density, ynorm) # adjust the multiplier to adjust space above plot
    
    hist(samp, freq=FALSE, ylim=c(0, ymax), xlab=NULL, main=NULL) # density histogram
    lines(x=xnorm, y=ynorm, col="blue", lwd=1.5) # overlay normal curve onto histogram

    skew.text <- paste("skew =", signif(des$skew, 2))
    mtext(text=skew.text , side=3, line=-1, adj=1) # 1 lines down from top margin

    kurt.text <- paste("excess kurtosis =", signif(des$kurtosis, 2))
    mtext(text=kurt.text, side=3, line=-2.5, adj=1) # 2 lines down from top margin

    qqnorm(samp, main="") # Q-Q plot
    qqline(samp, col="red", lwd=2) # add straight line to Q-Q plot

    shap <- shapiro.test(samp) # output of Shapiro-Wilk test

    mtext(text="Shapiro-Wilk:", side=3, line=-1.5, adj=0.1, font=2) 
    
    W.text <- paste("W =", signif(shap$statistic, 2))   
    mtext(text=W.text, side=3, line=-3, adj=0.1)

    p.text <- paste("p =", signif(shap$p.value, 2))
    mtext(text=p.text, side=3, line=-4.5, adj=0.1) # 3 down from top, near right edge
    
   # add plot title
   text(x=grconvertX(0.5, "ndc", "user"), y=grconvertY(0.98, "nfc", "user")
      , labels=paste("Population:", names(s.list)[i])
      , xpd=TRUE, cex=2.2, pos=1, xpd=NA)
   
   # add divider lines   
   abline(h=grconvertY(c(0,1), "nfc", "user"), xpd=NA, lwd=2)   

} # end for() loop

# add main title to top of page
mtext("Assessing Normality", outer=TRUE, side=3, cex=2, line=1, col="darkred")


# add copyright stamp to bottom of page
yr <- tail(strsplit(date(), split=" ")[[1]], 1) # current year
mtext(paste("Copyright ", yr, ", Taylan Morcol, PhD", sep="")
   , outer=TRUE, side=1, cex=1, line=1, col="gray66")

dev.off() # finish PDF file

2 x 2 image (for slides)

This 2 x 2 version will probably work better for slides. Note that the data are slightly different than in the 1 x 4 version due to random sampling variability.

A 2 x 2 version of this graphic, for display on phone/tablet screens. Image can also be downloaded as a PDF file.

Here is some R code you can use to make this graphic. It is essentially a slightly modified version of the previous code:

### testing for normality, 2x2 graphic ----------------------------------------------
n <- 100 # number of samples to take from each distribution

library(psych) # load library containing 'describe' function
s.list <- list() # create an empty list to hold results of sampling

s.list$normal <- rnorm(n) # normal distribution, random sample
s.list$uniform <- runif(n) # uniform distribution, random sample
s.list$exponential <- rexp(n) # exponential distribution, random sample
s.list$bimodal <- c(rnorm(n/2, 0, 1), rnorm(n/2, 10, 1)) # bimodal normal distribution

pdf("INSERT_FILE_NAME_HERE.pdf", width=11)

par(mfrow=c(2,4) # graph window, 2 rows x 4 columns
   , oma=c(3,0,4,0)) # set outer margins

for(i in 1:length(s.list)){
    samp <- s.list[[i]] # the sampled data points
    
    hist.out <- hist(samp, plot=FALSE) # histogram calculations
    
    xmin <- min(hist.out$breaks) # x min of plot
    xmax <- max(hist.out$breaks) # x max of plot
    
    des <- describe(samp) # get descriptive stats; store results in object 'des'
    
    
    xnorm <- seq(from=xmin, to=xmax, by=((xmax - xmin)/200)) # x values for normal distribution
    ynorm <- dnorm(x=xnorm, # using x values of normal distribution
    mean=des$mean, sd=des$sd) # use sample statistics to estimate population params
    
    ymax <- 1.4*max(hist.out$density, ynorm) # adjust the multiplier to adjust space above plot
    
    hist(samp, freq=FALSE, ylim=c(0, ymax), xlab=NULL, main=NULL) # density histogram
    lines(x=xnorm, y=ynorm, col="blue", lwd=1.5) # overlay normal curve onto histogram

    skew.text <- paste("skew =", signif(des$skew, 2))
    mtext(text=skew.text , side=3, line=-1, adj=1) # 1 lines down from top margin

    kurt.text <- paste("excess kurtosis =", signif(des$kurtosis, 2))
    mtext(text=kurt.text, side=3, line=-2.5, adj=1) # 2 lines down from top margin

    qqnorm(samp, main="") # Q-Q plot
    qqline(samp, col="red", lwd=2) # add straight line to Q-Q plot

    shap <- shapiro.test(samp) # output of Shapiro-Wilk test

    mtext(text="Shapiro-Wilk:", side=3, line=-1.5, adj=0.1, font=2) 
    
    W.text <- paste("W =", signif(shap$statistic, 2))   
    mtext(text=W.text, side=3, line=-3, adj=0.1)

    p.text <- paste("p =", signif(shap$p.value, 2))
    mtext(text=p.text, side=3, line=-4.5, adj=0.1) # 3 down from top, near right edge
    
   # add plot title
   text(x=grconvertX(0, "nfc", "user"), y=grconvertY(0.98, "nfc", "user")
      , labels=paste("Population:", names(s.list)[i])
      , xpd=TRUE, cex=2.2, pos=1, xpd=NA)
   
   # add horizontal divider lines   
   abline(h=grconvertY(c(0,1), "nfc", "user"), xpd=NA, lwd=2)  

} # end for() loop

# add vertical divider line
lines(x=grconvertX(c(0.5,0.5), "ndc", "user") # ndc = normalized device coordinates
    , y=grconvertY(c(0,1), "nic", "user") # nic = normalized inner coordinates
    , xpd=NA, lwd=2)


# add main title to top of page
mtext("Assessing Normality", outer=TRUE, side=3, cex=2, line=1, col="darkred")


# add copyright stamp to bottom of page
yr <- tail(strsplit(date(), split=" ")[[1]], 1) # current year
mtext(paste("Copyright ", yr, ", Taylan Morcol, PhD", sep="")
   , outer=TRUE, side=1, cex=1, line=1, col="gray66")

   
dev.off() # finish PDF file

4 x 1 image (for phone/tablet screen)

2 x 2 image (for slides)

Leave a ReplyCancel Reply