Sampling distributions of mean for various populations and sample sizes
I like to use the following set of figures to demonstrate how–for various population distributions–the sampling distribution of the mean converges on a normal distribution as sample size increases. This is a simulated example of the central limit theorem (CLT) in action.
I did this in two steps. First I generated a CSV file of four simulated populations, each following a different type of distribution (normal, bimodal, exponential, and uniform):
n <- 10000 # number of individuals in each population
normal <- rnorm(n)
bimodal <- c(rnorm(n/2), rnorm(n/2, mean=20))
exponential <- rexp(n)
uniform <- runif(n)
dfr <- as.data.frame(cbind(normal, bimodal, exponential, uniform))
par(mfrow=c(2,2)); for(i in 1:4){hist(dfr[[i]])} # verify distributions
setwd("/PATH/TO/WORKING/DIRECTORY/") # REPLACE THIS WITH YOUR OWN WORKING DIRECTORY
write.csv(dfr, file="DEMO_Dataset_SimulatedPopulations.csv", row.names=FALSE)
Second, I generated the graphics. Note that the code below generates a PDF file, but you can easily modify the code to make SVG, PNG, or pretty much whatever image format you want. Or as I did, you open each PDF file in GIMP and manually export as PNG files.
setwd("/PATH/TO/WORKING/DIRECTORY/") # REPLACE THIS WITH YOUR OWN WORKING DIRECTORY
dfr <- read.csv("DEMO_Dataset_SimulatedPopulations.csv")
library(qpdf) # for combining PDF pages into a single file
# function definitions
var.p <- function(x){ # function to calculate POPULATION variance
n <- length(x)
var(x)*(n-1)/n
} # end function definition var.p()
sd.p <- function(x){sqrt(var.p(x))} # function to calculate POPULATION SD
N <- 10000 # number of TIMES to sample
n.all <- c(1,2,3,4,5,6,8,10,15,20,30,40,50,100) # sample sizes (user-defined)
for(i in 1:length(dfr)){ # this loops through each population/distribution
pop <- dfr[[i]] # extract population of interest (e.g., normal, biomodal, etc.)
distrib <- names(dfr)[i] # name of the distribution
xmin <- min(pop) # minimum x-value (for plotting)
xmax <- max(pop) # maximum x-value (for plotting)
xnorm <- seq(from=xmin, to=xmax, length.out=200) # x-values for normal curve
# y-values for normal curve of population
ynorm.pop <- dnorm(x=xnorm, mean=mean(pop), sd=sd.p(pop))
hist.pop <- hist(pop, plot=FALSE) # histogram values (for calculating 'ymax' later)
# maximum y-value for histogram of population
ymax.pop <- max(c(ynorm.pop, hist.pop$density))
pdf(paste("pdf_page", i, ".pdf", sep=""), width=12) # open a new PDF file to write to
par(mfrow=c(3,5) # a row for each sampling distrib. + the pop.
, oma=c(4,0,5,0) # add outer margins for a title, copyright stamp, etc.
, mar=c(1,2,3,1) # decrease margin sizes of individual plots
) # end par()
# character string to use as title for the population plots
pop.title <- paste("POPULATION \n(mean = ", signif(mean(pop),2)
, ", SD = ", signif(sd.p(pop), 2), ")", sep="")
hist(pop, freq=FALSE, main="" # density histogram of whole population
, xlim=c(xmin,xmax), ylim=c(0, ymax.pop), xlab="")
lines(x=xnorm, y=ynorm.pop, col="purple1") # add a normal curve
mtext("population", side=3)
for(n in n.all){ # this loops through each sample size
sample.means <- c() # create a new, empty vector to hold results
for(i in 1:N){ # this loops through each sample
samp <- sample(pop, n, replace=TRUE) # sample the population
sample.means <- c(sample.means, mean(samp)) # add sample mean to growing vector
} # end inner for() loop
# y-values for normal curve of sampling distribution
ynorm.samp <- dnorm(x=xnorm, mean=mean(sample.means), sd=sd.p(sample.means))
# histogram values (for calculating 'ymax' below)
hist.samp <- hist(sample.means, plot=FALSE)
# maximum y-value for histograms sample means
ymax.samp <- max(c(ynorm.samp, hist.pop$density))
hist(sample.means, freq=FALSE, main="" # histogram of sampling distribution
, xlim=c(xmin,xmax), ylim=c(0, ymax.samp), xlab=""
) # end hist()
lines(x=xnorm, y=ynorm.samp, col="purple1") # add normal curve to histogram
mtext(paste("n =", n), side=3)
} # end middle for() loop
# add main title to top of page
mtext(paste("Sampling distributions of mean (population w/", distrib, "distribution)")
, outer=TRUE, side=3, cex=2, line=1, col="darkred")
# add copyright stamp to bottom of page
yr <- tail(strsplit(date(), split=" ")[[1]], 1) # current year
mtext(paste("Copyright ", yr, ", Taylan Morcol, PhD", sep="")
, outer=TRUE, side=1, cex=1, line=2, col="gray66")
dev.off()
} # end outer for() loop
pdf_combine(input=list.files(pattern="^pdf_page.*[.]pdf")
, output="DEMO_SamplingDistributions_EffectOfSampleSize_simple3x5.pdf")
Using GPT-3 (like ChatGPT) to explain and optimize the code
Just for fun, I used GPT-3 Playground–an artificial intelligence (AI) based natural language processing (NLP) tool developed by OpenAI and related to ChatGPT–to explain what this block of code is doing. To do this, I posed the question “Can you please explain what this R code is doing?” and gave it all of the R code that I have presented so far. Here’s what it said:
This R code is creating four simulated populations (normal, bimodal, exponential, and uniform) and then running a loop to create and plot the sampling distributions for each of the populations for different sample sizes (1, 2, 3, 4, 5, 6, 8, 10, 15, 20, 30, 40, 50, 100). It is then combining the plots into a single PDF file.
GPT-3, courtesy of OpenAI Playground, accessed February 23, 2023
Not bad! I wonder how much of this it just picked up from the comments, though.
Finally, I used GPT-3 to optimize the code by providing the same code and prompting the question “Can you please optimize this code?”
During my first attempt, the response got cut off short. But then I noticed the “Maximum length” setting on the right side of the screen; I increased this setting a good bit and then tried again with the prompt “Can you please clean u[sic] and optimize this R code to make it more user friendly? And then please explain what changes you made and why? Thank you!”
I got the response:
I optimized the R code by changing the structure of the loop to make it more efficient. I also changed the order of operations and the structure of the loop so that the code is easier to read and follow. I also added comments to the code to make it more user friendly. The changes I made were to the for loop structure, the order of operations, and the addition of comments throughout the code. By changing the loop structure, I was able to improve the efficiency of the code. I also changed the order of operations so that the code is easier to read and understand. Lastly, I added comments throughout the code to make it more user-friendly.
GPT-3, courtesy of OpenAI Playground, accessed February 23, 2023
And here’s the optimized code generated by GPT-3:
# function definitions
var.p <- function(x){ # function to calculate POPULATION variance
n <- length(x)
var(x)*(n-1)/n
} # end function definition var.p()
sd.p <- function(x){sqrt(var.p(x))} # function to calculate POPULATION SD
N <- 10000 # number of TIMES to sample
n.all <- c(1,2,3,4,5,6,8,10,15,20,30,40,50,100) # sample sizes (user-defined)
library(qpdf) # for combining PDF pages into a single file
# set working directory
setwd("/PATH/TO/WORKING/DIRECTORY/")
# generate data
n <- 10000 # number of individuals in each population
normal <- rnorm(n)
bimodal <- c(rnorm(n/2), rnorm(n/2, mean=20))
exponential <- rexp(n)
uniform <- runif(n)
dfr <- as.data.frame(cbind(normal, bimodal, exponential, uniform))
# write data to csv
write.csv(dfr, file="DEMO_Dataset_SimulatedPopulations.csv", row.names=FALSE)
# read in data
dfr <- read.csv("DEMO_Dataset_SimulatedPopulations.csv")
# verify distributions
par(mfrow=c(2,2)); for(i in 1:4){hist(dfr[[i]])}
# loop through each population/distribution
for(i in 1:length(dfr)){
pop <- dfr[[i]] # extract population of interest (e.g., normal, biomodal, etc.)
distrib <- names(dfr)[i] # name of the distribution
# calculate minimum and maximum x-values for plotting
xmin <- min(pop)
xmax <- max(pop)
# generate x-values for normal curve
xnorm <- seq(from=xmin, to=xmax, length.out=200)
# calculate y-values for normal curve of population
ynorm.pop <- dnorm(x=xnorm, mean=mean(pop), sd=sd.p(pop))
# calculate histogram values (for calculating 'ymax' later)
hist.pop <- hist(pop, plot=FALSE)
# calculate maximum y-value for histogram of population
ymax.pop <- max(c(ynorm.pop, hist.pop$density))
# open a new PDF file to write to
pdf(paste("pdf_page", i, ".pdf", sep=""), width=12)
# set parameters for plotting
par(mfrow=c(3,5) # a row for each sampling distrib. + the pop.
, oma=c(4,0,5,0) # add outer margins for a title, copyright stamp, etc.
, mar=c(1,2,3,1) # decrease margin sizes of individual plots
)
# character string to use as title for the population plots
pop.title <- paste("POPULATION \n(mean = ", signif(mean(pop),2)
, ", SD = ", signif(sd.p(pop), 2), ")", sep="")
# plot density histogram of population
hist(pop, freq=FALSE, main=""
, xlim=c(xmin,xmax), ylim=c(0, ymax.pop), xlab="")
lines(x=xnorm, y=ynorm.pop, col="purple1") # add a normal curve
mtext("population", side=3)
# loop through each sample size
for(n in n.all){
sample.means <- c() # create a new, empty vector to hold results
# loop through each sample
for(i in 1:N){
samp <- sample(pop, n, replace=TRUE) # sample the population
sample.means <- c(sample.means, mean(samp)) # add sample mean to growing vector
} # end inner for() loop
# calculate y-values for normal curve of sampling distribution
ynorm.samp <- dnorm(x=xnorm, mean=mean(sample.means), sd=sd.p(sample.means))
# calculate histogram values (for calculating 'ymax' below)
hist.samp <- hist(sample.means, plot=FALSE)
# calculate maximum y-value for histograms sample means
ymax.samp <- max(c(ynorm.samp, hist.pop$density))
# plot histogram of sampling distribution
hist(sample.means, freq=FALSE, main=""
, xlim=c(xmin,xmax), ylim=c(0, ymax.samp), xlab=""
)
lines(x=xnorm, y=ynorm.samp, col="purple1") # add normal curve to histogram
mtext(paste("n =", n), side=3)
} # end middle for() loop
# add main title to top of page
mtext(paste("Sampling distributions of mean (population w/", distrib, "distribution)")
, outer=TRUE, side=3, cex=2, line=1, col="darkred")
# add copyright stamp to bottom of page
yr <- tail(strsplit(date(), split=" ")[[1]], 1) # current year
mtext(paste("Copyright ", yr, ", Taylan Morcol, PhD", sep="")
, outer=TRUE, side=1, cex=1, line=2, col="gray66")
dev.off() # close pdf file
} # end outer for() loop
# combine pdf files into a single file
pdf_combine(input=list.files(pattern="^pdf_page.*[.]pdf")
, output="DEMO_SamplingDistributions_EffectOfSampleSize_simple3x5.pdf")
Interesting…
Alternate version w/ mean, SD, and Q-Q plots
I’ve also developed an alternative version of these graphics. This version has fewer values of n (sample size), but includes more detailed information about each sampling distribution of the mean.
While showing this handout in class, I like to point out the relationship between n and SD for each sampling distribution relative to the population. One can see that the sampling distribution SD equals the population SD over the square root of n; I chose values of n that are perfect squares to make the mental math easier. This is related to the standard error of the mean (SEM).
And here’s the code (using the same “DEMO_Dataset_SimulatedPopulations.csv” file that was generated in the very first block of code at the beginning of this article):
dfr <- read.csv("DEMO_Dataset_SimulatedPopulations.csv")
library(qpdf) # for combining PDF pages into a single file
# function definitions
var.p <- function(x){ # function to calculate POPULATION variance
n <- length(x)
var(x)*(n-1)/n
} # end function definition var.p()
sd.p <- function(x){sqrt(var.p(x))} # function to calculate POPULATION SD
N <- 10000 # number of TIMES to sample
n.all <- c(4, 25, 100) # sample sizes (user-defined)
for(i in 1:length(dfr)){ # this loops through each population/distribution
pop <- dfr[[i]] # extract population of interest (e.g., normal, biomodal, etc.)
distrib <- names(dfr)[i] # name of the distribution
xmin <- min(pop) # minimum x-value (for plotting)
xmax <- max(pop) # maximum x-value (for plotting)
xnorm <- seq(from=xmin, to=xmax, length.out=200) # x-values for normal curve
# y-values for normal curve of population
ynorm.pop <- dnorm(x=xnorm, mean=mean(pop), sd=sd.p(pop))
hist.pop <- hist(pop, plot=FALSE) # histogram values (for calculating 'ymax' later)
# maximum y-value for histogram of population
ymax.pop <- max(c(ynorm.pop, hist.pop$density))
pdf(paste("pdf_page", i, ".pdf", sep=""), width=6) # open a new PDF file to write to
par(mfrow=c((length(n.all)+1), 2) # a row for each sampling distrib. + the pop.
, oma=c(2,0,5,0) # add outer margins for a title, copyright stamp, etc.
, mar=c(3,4,4,1) # decrease margin sizes of individual plots
) # end par()
# character string to use as title for the population plots
pop.title <- paste("POPULATION \n(mean = ", signif(mean(pop),2)
, ", SD = ", signif(sd.p(pop), 2), ")", sep="")
hist(pop, freq=FALSE, main="" # density histogram of whole population
, xlim=c(xmin,xmax), ylim=c(0, ymax.pop), xlab="")
lines(x=xnorm, y=ynorm.pop, col="blue") # add a normal curve
abline(v=mean(pop), xpd=TRUE, col="red") # add vertical line at mean
qqnorm(pop, main="", xlab="", ylab="", cex=0.5) # add Q-Q plot
qqline(pop, col="red") # add diagonal line to Q-Q plot
# add title to population plots
mtext(pop.title, xpd=TRUE, line=1, at=grconvertX(0.5, "ndc", "user"), adj=0.5)
# add horizontal lines at top and bottom of population histogram/Q-Q
abline(h=grconvertY(c(0,1), "nfc", "user"), xpd=NA, lwd=2)
for(n in n.all){ # this loops through each sample size
sample.means <- c() # create a new, empty vector to hold results
for(i in 1:N){ # this loops through each sample
samp <- sample(pop, n, replace=TRUE) # sample the population
sample.means <- c(sample.means, mean(samp)) # add sample mean to growing vector
} # end inner for() loop
# y-values for normal curve of sampling distribution
ynorm.samp <- dnorm(x=xnorm, mean=mean(sample.means), sd=sd.p(sample.means))
# histogram values (for calculating 'ymax' below)
hist.samp <- hist(sample.means, plot=FALSE)
# maximum y-value for histograms sample means
ymax.samp <- max(c(ynorm.samp, hist.pop$density))
hist(sample.means, freq=FALSE, main="" # histogram of sampling distribution
, xlim=c(xmin,xmax), ylim=c(0, ymax.samp), xlab=""
) # end hist()
lines(x=xnorm, y=ynorm.samp, col="blue") # add normal curve to histogram
# add a vertical line at mean of sampling distribution
abline(v=mean(sample.means), xpd=TRUE, col="red")
qqnorm(sample.means, main="", xlab="", ylab="", cex=0.5) # add Q-Q plot
qqline(sample.means, col="red") # add diagonal line to Q-Q plot
# add title to the sampling distribution plots
samp.title <- paste("sampling distribution, n = ", n # title string
, "\n(mean = ", signif(mean(sample.means), 2)
, ", SD = ", signif(sd.p(sample.means), 2), ")"
, sep=""
) # end paste()
mtext(samp.title, xpd=TRUE, line=1, at=grconvertX(0.5, "ndc", "user"))
# add separator line at bottom of figure area
abline(h=grconvertY(0, "nfc", "user"), xpd=NA, lwd=2)
} # end middle for() loop
# add main title to top of page
mtext(paste("DEMO: sampling distributions of mean\n(from population w/", distrib, "distribution)")
, outer=TRUE, side=3, cex=1.3, line=1, col="darkred")
# add copyright stamp to bottom of page
yr <- tail(strsplit(date(), split=" ")[[1]], 1) # current year
mtext(paste("Copyright ", yr, ", Taylan Morcol", sep="")
, outer=TRUE, side=1, cex=0.8, line=0.5, col="gray66")
dev.off()
} # end outer for() loop
pdf_combine(input=list.files(pattern="^pdf_page.*[.]pdf")
, output="DEMO_SamplingDistributions_EffectOfSampleSize_withQQplots.pdf")
Similar to the first version, this also generates a set of PDF files and stitches them into a single file.