# examine help file for functions related to normal distributions

help(Normal)

# each function takes the mean and the standard deviation as
# arguments, with default value of 0 and 1, respectively;
# lower.tail is a logical argument that specifies if we are
# interested in values below a stated value (the default) or
# above the stated value

# rnorm: returns vector of n random samples drawn from a normal
# distribution with stated mean and standard deviation

# dnorm: returns value of f(x), the probability or density, for
# normal distribution with stated mean and standard deviation

# prorm: returns probability of a result ≤ x if lower.tail =
# TRUE; returns probability of a result > x if lower.tail =
# FALSE; results are for a normal distribution with the stated
# mean and standard deviation

# qnorm: returns value of x for which the range -inf to x has the
# probability p for a normal distributin with the stated mean and
# standard deviation

# let's create a small data set by grabbing 1000 random samples
# from a normal distribution described by a mean of 0 and a
# standard deviation of 1

d = rnorm(n = 1000, mean = 0, sd = 1)

# a plot of the 1000 values places their individual values on the
# y-axis as a function of their index values on the x-axis; note
# that the values appear random and that the density of points is
# greater near the middle of the y-axis and lesser as we move to
# the ends of the y-axis

plot(d)

# as expected, the mean and the standard deviation are close to
# their expected values

mean(d)

sd(d)

# a boxplot shows that the individual data points are
# symmetically distributed about the median, although there are
# more points beyond the upper whisker than below the lower
# whisker

boxplot(d, horizontal = TRUE, ylim = c(-4, 4))

# the rug() will add small tick marks along the x-axis for each
# of the 1000 individual results, which helps see that the
# density of points thins out from the middle of the box to the
# whiskers

rug(d)

# a histogram of the data shows the expected shape for a normal
# distribution

hist(d, freq = FALSE, xlim = c(-4,4), ylim = c(0, 0.5))

rug(d)

# the dnorm function returns the value of f(x), the probability
# or density, for normal distribution with stated mean and
# standard deviation where x is any possible value; let's take a
# closer look at this by creating a sequence of x values

x.data = seq(-4, 4, 0.1)

# calculating the corresponding values of f(x)

y.data = dnorm(x.data)

# and then plotting the results, which is the normal distribution
# curve

plot(x = x.data, y = y.data, type = "l", lwd = 2, 
     col = "blue", ylim = c(0, 0.5))

# this is useful way to generate a normal distibution curve that
# we can overlay on a histogram of our data, which allows us to
# model our data using the normal distribution; first w plot the
# histogram adding freq = FALSE so that the y-axis is a plot of
# probability (total probability adds to 1) instead of frequency

hist(d, freq = FALSE, xlim = c(-4,4), ylim = c(0, 0.5))

rug(d)

lines(x = x.data, y = y.data, lwd = 2, col = "blue")

# the pnorm function returns the probability of a result ≤ x if
# lower.tail = TRUE and returns the probability of a result > x
# if lower.tail = FALSE; ; let's take a
# closer look at this by creating a sequence of x values

x.data = seq(-4, 4, 0.1)

# calculating the corresponding probabilities for results less
# than x

y.data = pnorm(x.data, lower.tail = TRUE)

# plotting the results shows that the curve gives the cumulative
# probility of obtaining a result less than x, beginning with a
# cumulative probability of 0 for values of x less than -4 and
# increasing to a probability of 1 for values of x less than 4

plot(x = x.data, y = y.data, type = "l", lwd = 2, col = "blue")

# if we set lower.tail to FALSE we get the opposite result

y.data = pnorm(x.data, lower.tail = FALSE)

lines(x = x.data, y = y.data, lwd = 2, col = "red")

# here we just return individual values for x = -4, -3, -2, -1,
# 0, 1, 2, 3, and 4

pnorm(c(-4:4))

pnorm(c(-4:4), lower.tail = FALSE)

# the qnorm function returns value of x for which the range -inf
# to x has the probability p; let's see what this looks like

prob = seq(0, 1, 0.01)

plot(prob, qnorm(prob), type = "l", lwd = 2, col = "blue")

# this is a generic shape that is indicative of data that follows
# a normal distribution; the qqnorm() and the qqline() functions
# allows us to replot data in this format and qualitatively
# evaluate the extent to which is appears to follow a normal
# distribution

d = rnorm(n = 1000, mean = 0, sd = 1)

qqnorm(d, pch = 19, cex = 0.5, col = "blue")

qqline(d, lwd = 2)

# let's use this on some additional data from the
# Distributions.RData file, which includes several types of data
# sets (be sure your working directory points to the folder with 
# this file)

load("Distributions.RData")

hist(normal)

qqnorm(normal, pch = 19, cex = 0.5, col = "blue")

qqline(normal)

hist(leftskew)

qqnorm(leftskew, pch = 19, cex = 0.5, col = "blue")

qqline(leftskew)

hist(rightskew)

qqnorm(rightskew, pch = 19, cex = 0.5, col = "blue")

qqline(rightskew)

# and now, let's look at the data from SPS02

load("MM.RData")

yellow = MMdata$yellow

table(yellow)

hist(yellow)

qqnorm(yellow, pch = 19, col = "gold")

qqline(yellow, lwd = 2)

# find the probability between 16 and 23; the script shadeArea.R
# draws normal distribution curves using a data sets mean and
# standard deviation and then shades in the are of interest

source("shadeArea.R")

shadeArea(yellow, 16, 23, xlab = "number of yellow M&Ms", 
          ylab = "probability")

# the area in blue is the probability of result less than 23
# minus the proability of results less than 16; thus

probless23 = pnorm(23, mean(yellow), sd(yellow), 
                   lower.tail = TRUE)

probless16 = pnorm(16, mean(yellow), sd(yellow), 
                   lower.tail = TRUE)

probless23 - probless16

# probability greater than 17

shadeArea(yellow, 17, 35, xlab = "number of yellow M&Ms", 
          ylab = "probability")

probgreater17 = pnorm(17, mean(yellow), sd(yellow), 
                      lower.tail = FALSE)

probgreater17

# probability less than 13

shadeArea(yellow, 0, 13, xlab = "number of yellow M&Ms", 
          ylab = "probability")

probless13 = pnorm(13, mean(yellow), sd(yellow), 
                   lower.tail = TRUE)

probless13