# examine help file for functions related to normal distributions help(Normal) # each function takes the mean and the standard deviation as # arguments, with default value of 0 and 1, respectively; # lower.tail is a logical argument that specifies if we are # interested in values below a stated value (the default) or # above the stated value # rnorm: returns vector of n random samples drawn from a normal # distribution with stated mean and standard deviation # dnorm: returns value of f(x), the probability or density, for # normal distribution with stated mean and standard deviation # prorm: returns probability of a result ≤ x if lower.tail = # TRUE; returns probability of a result > x if lower.tail = # FALSE; results are for a normal distribution with the stated # mean and standard deviation # qnorm: returns value of x for which the range -inf to x has the # probability p for a normal distributin with the stated mean and # standard deviation # let's create a small data set by grabbing 1000 random samples # from a normal distribution described by a mean of 0 and a # standard deviation of 1 d = rnorm(n = 1000, mean = 0, sd = 1) # a plot of the 1000 values places their individual values on the # y-axis as a function of their index values on the x-axis; note # that the values appear random and that the density of points is # greater near the middle of the y-axis and lesser as we move to # the ends of the y-axis plot(d) # as expected, the mean and the standard deviation are close to # their expected values mean(d) sd(d) # a boxplot shows that the individual data points are # symmetically distributed about the median, although there are # more points beyond the upper whisker than below the lower # whisker boxplot(d, horizontal = TRUE, ylim = c(-4, 4)) # the rug() will add small tick marks along the x-axis for each # of the 1000 individual results, which helps see that the # density of points thins out from the middle of the box to the # whiskers rug(d) # a histogram of the data shows the expected shape for a normal # distribution hist(d, freq = FALSE, xlim = c(-4,4), ylim = c(0, 0.5)) rug(d) # the dnorm function returns the value of f(x), the probability # or density, for normal distribution with stated mean and # standard deviation where x is any possible value; let's take a # closer look at this by creating a sequence of x values x.data = seq(-4, 4, 0.1) # calculating the corresponding values of f(x) y.data = dnorm(x.data) # and then plotting the results, which is the normal distribution # curve plot(x = x.data, y = y.data, type = "l", lwd = 2, col = "blue", ylim = c(0, 0.5)) # this is useful way to generate a normal distibution curve that # we can overlay on a histogram of our data, which allows us to # model our data using the normal distribution; first w plot the # histogram adding freq = FALSE so that the y-axis is a plot of # probability (total probability adds to 1) instead of frequency hist(d, freq = FALSE, xlim = c(-4,4), ylim = c(0, 0.5)) rug(d) lines(x = x.data, y = y.data, lwd = 2, col = "blue") # the pnorm function returns the probability of a result ≤ x if # lower.tail = TRUE and returns the probability of a result > x # if lower.tail = FALSE; ; let's take a # closer look at this by creating a sequence of x values x.data = seq(-4, 4, 0.1) # calculating the corresponding probabilities for results less # than x y.data = pnorm(x.data, lower.tail = TRUE) # plotting the results shows that the curve gives the cumulative # probility of obtaining a result less than x, beginning with a # cumulative probability of 0 for values of x less than -4 and # increasing to a probability of 1 for values of x less than 4 plot(x = x.data, y = y.data, type = "l", lwd = 2, col = "blue") # if we set lower.tail to FALSE we get the opposite result y.data = pnorm(x.data, lower.tail = FALSE) lines(x = x.data, y = y.data, lwd = 2, col = "red") # here we just return individual values for x = -4, -3, -2, -1, # 0, 1, 2, 3, and 4 pnorm(c(-4:4)) pnorm(c(-4:4), lower.tail = FALSE) # the qnorm function returns value of x for which the range -inf # to x has the probability p; let's see what this looks like prob = seq(0, 1, 0.01) plot(prob, qnorm(prob), type = "l", lwd = 2, col = "blue") # this is a generic shape that is indicative of data that follows # a normal distribution; the qqnorm() and the qqline() functions # allows us to replot data in this format and qualitatively # evaluate the extent to which is appears to follow a normal # distribution d = rnorm(n = 1000, mean = 0, sd = 1) qqnorm(d, pch = 19, cex = 0.5, col = "blue") qqline(d, lwd = 2) # let's use this on some additional data from the # Distributions.RData file, which includes several types of data # sets (be sure your working directory points to the folder with # this file) load("Distributions.RData") hist(normal) qqnorm(normal, pch = 19, cex = 0.5, col = "blue") qqline(normal) hist(leftskew) qqnorm(leftskew, pch = 19, cex = 0.5, col = "blue") qqline(leftskew) hist(rightskew) qqnorm(rightskew, pch = 19, cex = 0.5, col = "blue") qqline(rightskew) # and now, let's look at the data from SPS02 load("MM.RData") yellow = MMdata$yellow table(yellow) hist(yellow) qqnorm(yellow, pch = 19, col = "gold") qqline(yellow, lwd = 2) # find the probability between 16 and 23; the script shadeArea.R # draws normal distribution curves using a data sets mean and # standard deviation and then shades in the are of interest source("shadeArea.R") shadeArea(yellow, 16, 23, xlab = "number of yellow M&Ms", ylab = "probability") # the area in blue is the probability of result less than 23 # minus the proability of results less than 16; thus probless23 = pnorm(23, mean(yellow), sd(yellow), lower.tail = TRUE) probless16 = pnorm(16, mean(yellow), sd(yellow), lower.tail = TRUE) probless23 - probless16 # probability greater than 17 shadeArea(yellow, 17, 35, xlab = "number of yellow M&Ms", ylab = "probability") probgreater17 = pnorm(17, mean(yellow), sd(yellow), lower.tail = FALSE) probgreater17 # probability less than 13 shadeArea(yellow, 0, 13, xlab = "number of yellow M&Ms", ylab = "probability") probless13 = pnorm(13, mean(yellow), sd(yellow), lower.tail = TRUE) probless13