This will be the first in a series of articles sharing digital resources that I’ve developed to illustrate concepts in statistics. I use these in my own biostatistics course. My wish is that these will be useful for instructors and students alike.
I created the figures below to illustrate some common methods used to assess the normality of a sample: histogram overlaid with normal curve, Q-Q plot, and a normality hypothesis test (specifically, the Shapiro-Wilk test). I also chose to include measures of skewness and excess kurtosis, which I think are useful points of additional information for understanding the properties of various distributions.
The samples that I’ve used for illustration come from populations with four, distinct frequency distributions: normal, uniform, exponential, and bimodal.
I made these figures using R. The code I wrote to generate these graphics is found below each image. PDF versions of the images are available for download by using the links in the image captions.
4 x 1 image (for phone/tablet screen)
Here is a 4 x 1 version of the graphic. This would probably work best on tablets or smartphone screens.
Here is some R code you can use to make this graphic:
### testing for normality, 4x1 graphic ----------------------------------------------
n <- 100 # number of samples to take from each distribution
library(psych) # load library containing 'describe' function
s.list <- list() # create an empty list to hold results of sampling
s.list$normal <- rnorm(n) # normal distribution, random sample
s.list$uniform <- runif(n) # uniform distribution, random sample
s.list$exponential <- rexp(n) # exponential distribution, random sample
s.list$bimodal <- c(rnorm(n/2, 0, 1), rnorm(n/2, 10, 1)) # bimodal normal distribution
pdf("INSERT_FILE_NAME_HERE.pdf", width=6, height=10)
par(mfrow=c(4,2) # graph window, 4 rows x 2 columns
, oma=c(3,0,4,0)) # set outer margins
for(i in 1:length(s.list)){
samp <- s.list[[i]] # the sampled data points
hist.out <- hist(samp, plot=FALSE) # histogram calculations
xmin <- min(hist.out$breaks) # x min of plot
xmax <- max(hist.out$breaks) # x max of plot
des <- describe(samp) # get descriptive stats; store results in object 'des'
xnorm <- seq(from=xmin, to=xmax, by=((xmax - xmin)/200)) # x values for normal distribution
ynorm <- dnorm(x=xnorm, # using x values of normal distribution
mean=des$mean, sd=des$sd) # use sample statistics to estimate population params
ymax <- 1.4*max(hist.out$density, ynorm) # adjust the multiplier to adjust space above plot
hist(samp, freq=FALSE, ylim=c(0, ymax), xlab=NULL, main=NULL) # density histogram
lines(x=xnorm, y=ynorm, col="blue", lwd=1.5) # overlay normal curve onto histogram
skew.text <- paste("skew =", signif(des$skew, 2))
mtext(text=skew.text , side=3, line=-1, adj=1) # 1 lines down from top margin
kurt.text <- paste("excess kurtosis =", signif(des$kurtosis, 2))
mtext(text=kurt.text, side=3, line=-2.5, adj=1) # 2 lines down from top margin
qqnorm(samp, main="") # Q-Q plot
qqline(samp, col="red", lwd=2) # add straight line to Q-Q plot
shap <- shapiro.test(samp) # output of Shapiro-Wilk test
mtext(text="Shapiro-Wilk:", side=3, line=-1.5, adj=0.1, font=2)
W.text <- paste("W =", signif(shap$statistic, 2))
mtext(text=W.text, side=3, line=-3, adj=0.1)
p.text <- paste("p =", signif(shap$p.value, 2))
mtext(text=p.text, side=3, line=-4.5, adj=0.1) # 3 down from top, near right edge
# add plot title
text(x=grconvertX(0.5, "ndc", "user"), y=grconvertY(0.98, "nfc", "user")
, labels=paste("Population:", names(s.list)[i])
, xpd=TRUE, cex=2.2, pos=1, xpd=NA)
# add divider lines
abline(h=grconvertY(c(0,1), "nfc", "user"), xpd=NA, lwd=2)
} # end for() loop
# add main title to top of page
mtext("Assessing Normality", outer=TRUE, side=3, cex=2, line=1, col="darkred")
# add copyright stamp to bottom of page
yr <- tail(strsplit(date(), split=" ")[[1]], 1) # current year
mtext(paste("Copyright ", yr, ", Taylan Morcol, PhD", sep="")
, outer=TRUE, side=1, cex=1, line=1, col="gray66")
dev.off() # finish PDF file
2 x 2 image (for slides)
This 2 x 2 version will probably work better for slides. Note that the data are slightly different than in the 1 x 4 version due to random sampling variability.
Here is some R code you can use to make this graphic. It is essentially a slightly modified version of the previous code:
### testing for normality, 2x2 graphic ----------------------------------------------
n <- 100 # number of samples to take from each distribution
library(psych) # load library containing 'describe' function
s.list <- list() # create an empty list to hold results of sampling
s.list$normal <- rnorm(n) # normal distribution, random sample
s.list$uniform <- runif(n) # uniform distribution, random sample
s.list$exponential <- rexp(n) # exponential distribution, random sample
s.list$bimodal <- c(rnorm(n/2, 0, 1), rnorm(n/2, 10, 1)) # bimodal normal distribution
pdf("INSERT_FILE_NAME_HERE.pdf", width=11)
par(mfrow=c(2,4) # graph window, 2 rows x 4 columns
, oma=c(3,0,4,0)) # set outer margins
for(i in 1:length(s.list)){
samp <- s.list[[i]] # the sampled data points
hist.out <- hist(samp, plot=FALSE) # histogram calculations
xmin <- min(hist.out$breaks) # x min of plot
xmax <- max(hist.out$breaks) # x max of plot
des <- describe(samp) # get descriptive stats; store results in object 'des'
xnorm <- seq(from=xmin, to=xmax, by=((xmax - xmin)/200)) # x values for normal distribution
ynorm <- dnorm(x=xnorm, # using x values of normal distribution
mean=des$mean, sd=des$sd) # use sample statistics to estimate population params
ymax <- 1.4*max(hist.out$density, ynorm) # adjust the multiplier to adjust space above plot
hist(samp, freq=FALSE, ylim=c(0, ymax), xlab=NULL, main=NULL) # density histogram
lines(x=xnorm, y=ynorm, col="blue", lwd=1.5) # overlay normal curve onto histogram
skew.text <- paste("skew =", signif(des$skew, 2))
mtext(text=skew.text , side=3, line=-1, adj=1) # 1 lines down from top margin
kurt.text <- paste("excess kurtosis =", signif(des$kurtosis, 2))
mtext(text=kurt.text, side=3, line=-2.5, adj=1) # 2 lines down from top margin
qqnorm(samp, main="") # Q-Q plot
qqline(samp, col="red", lwd=2) # add straight line to Q-Q plot
shap <- shapiro.test(samp) # output of Shapiro-Wilk test
mtext(text="Shapiro-Wilk:", side=3, line=-1.5, adj=0.1, font=2)
W.text <- paste("W =", signif(shap$statistic, 2))
mtext(text=W.text, side=3, line=-3, adj=0.1)
p.text <- paste("p =", signif(shap$p.value, 2))
mtext(text=p.text, side=3, line=-4.5, adj=0.1) # 3 down from top, near right edge
# add plot title
text(x=grconvertX(0, "nfc", "user"), y=grconvertY(0.98, "nfc", "user")
, labels=paste("Population:", names(s.list)[i])
, xpd=TRUE, cex=2.2, pos=1, xpd=NA)
# add horizontal divider lines
abline(h=grconvertY(c(0,1), "nfc", "user"), xpd=NA, lwd=2)
} # end for() loop
# add vertical divider line
lines(x=grconvertX(c(0.5,0.5), "ndc", "user") # ndc = normalized device coordinates
, y=grconvertY(c(0,1), "nic", "user") # nic = normalized inner coordinates
, xpd=NA, lwd=2)
# add main title to top of page
mtext("Assessing Normality", outer=TRUE, side=3, cex=2, line=1, col="darkred")
# add copyright stamp to bottom of page
yr <- tail(strsplit(date(), split=" ")[[1]], 1) # current year
mtext(paste("Copyright ", yr, ", Taylan Morcol, PhD", sep="")
, outer=TRUE, side=1, cex=1, line=1, col="gray66")
dev.off() # finish PDF file