# filename: Basic_Stats_Visual.R
# first we set your working directory to the folder that contains
# your files (code not shown here) and load the data file MM.RData
load("MM.RData")
# note that the environment now contains a single data set with
# the name MMdata that has 30 observations of 9 variables; let's
# take a look at its structure
str(MMdata)
# we are interested in the data for yellow M&Ms so let's read the
# appropriate column into a vector; we can do this using the
# bracket notation---in this case MMdata[ , 8]---but that
# requires that we are careful to identify the correct column by
# number; an easier approach is to identify the desired column
# by using the column's name; note that we assign the result to
# an object using <-, which is R's conventional assignment
# operator; thus
yellow <- MMdata$yellow
# entering the name of an object prints its contents
yellow
# BASIC DESCRIPTIVE STATISTICS ----
# to find the mean and the median, we use the functions mean()
# and median()
mean(yellow)
median(yellow)
# to find the variance and the standard deviation, we use the
# functions var() and sd()
var(yellow)
sd(yellow)
# or we can take the square root of the variance, which gives
# the same value as sd(); note that we can nest one command
# inside of another
sqrt(var(yellow))
# to find the IQR, we use the function IQR()
IQR(yellow)
# ...but wait, this does not agree with the value of 4 given in
# SPS01; this is because R uses a different algorithm to find the
# IQR than we did, which we can see if we use summary()
summary(yellow)
# where we see that it returns the same result for F_L
# (identified here as 1st Qu.) but a different result for F_U
# (identified here as 3rd Qu.); thus, R's function for
# calculating the IQR is different from the function we described
# in our last class; let's source the script file iqr.R, which
# defines a function to calculate IQR as we defined it in class,
# and then examine the script to see how it works
source("iqr.R")
# NOTE(review): View() on a function presumably relies on
# RStudio's viewer; outside of RStudio, entering iqr at the
# console should print the function's definition instead---confirm
View(iqr)
# okay...now let's use our new function; note that R is case
# sensitive, so iqr() and IQR() are different functions
iqr(yellow)
# which returns the value from SPS01
# to find the MAD, we use the function mad()
mad(yellow)
# ...but wait, this does not match the value of 2 given in SPS01;
# here the problem is that we did not account for the default
# inputs to mad() because we were not aware of them; let's look at
# the help file for mad()...
help(mad)
# ...which tells us that the command has six inputs, five of which
# have default values that are used if we do not specify them when
# we enter the command:
#   x: our data in the form of a vector
#   center: defines the value for which we calculate individual
#     deviations; note that it defaults to the median so we did not
#     need to change its value
#   constant: a scaling factor; the default value of 1.4826
#     rescales the MAD (the help file relates it to 1/qnorm(3/4),
#     which makes the result comparable to a standard deviation
#     for normally distributed data), but setting the constant to
#     1 will solve our problem
#   na.rm: a logical value on whether to exclude missing (NA)
#     values before the calculation; with the default of FALSE a
#     missing value is not removed and the result is NA, which
#     alerts us that a value is missing (we can set this to TRUE
#     if we know we have missing values)
#   low and high: these provide alternative ways to calculate the
#     median; both default to FALSE, which gives the median as we
#     have defined it
# let's try using a constant of 1
mad(yellow, constant = 1)
# gives us the correct value
# VISUALIZING DATA ----
# a good first step is to examine the data in a table that shows
# the frequency of each unique result; we can accomplish this
# using the table() command
table(yellow)
# R has a command for stem-and-leaf diagrams, stem(), which
# prints its result in the console
stem(yellow)
# the result is hard to interpret because it groups the unique
# results into pairs without drawing our attention to this; thus,
# the entry 4|0 is really 4|0 and 5|0 combined; when in doubt,
# look at the help file
help(stem)
# which shows us that we can use the scale argument to expand the
# number of unique stems
stem(yellow, scale = 2)
# stem() only prints to the console; for a picture in the plot
# window that resembles a stem-and-leaf diagram we can use the
# stripchart() function
stripchart(yellow)
# only 14 points appear, which seems like too few, but 14 is the
# number of unique results; points for samples with the same
# number of yellow M&Ms are overlaid on each other; let's look
# at the help file
help(stripchart)
# the method argument defaults to overlaying points; let's use
# the stack option instead and change the offset to 1
stripchart(yellow, method = "stack", offset = 1)
# one argument to the stripchart function is ..., which allows
# us to pass parameters that affect the plot's appearance; see
# the handout on plotting in R for more details, but here is an
# example that changes the symbol used to plot points (pch), the
# color of the points (col), the labels applied to the x-axis
# (xlab) and the y-axis (ylab), and the limits for the x-axis
# (xlim)
stripchart(
  yellow,
  method = "stack",
  offset = 1,
  pch = 19,
  col = "gold",
  xlab = "number of yellow M&Ms in a bag",
  ylab = "frequency",
  xlim = c(0, 25)
)
# to plot a histogram we use the hist() function
hist(yellow)
# let's look at the options for hist()
help(hist)
# and try setting the limits on the bins using breaks, adding
# some color, and altering the main label and the x-axis label;
# the breaks run from 4.5 to 25.5 in steps of 3 (that is, 4.5,
# 7.5, 10.5, ..., 25.5), which we generate with seq() instead of
# typing each value; we will also assign the result to an object
# (using <-, R's conventional assignment operator) so we can
# retrieve the returned values
hist.out <- hist(
  yellow,
  breaks = seq(4.5, 25.5, by = 3),
  col = "yellow",
  main = "histogram of yellow M&Ms",
  xlab = "number of yellow M&Ms in a bag"
)
# entering the object's name prints the values returned by hist()
hist.out
# finally, let's create a boxplot using the boxplot() function
boxplot(yellow)
# let's look at the options for boxplot()
help(boxplot)
# several boxplots can share one plot if we list multiple
# objects
boxplot(yellow, MMdata$blue, MMdata$brown)
# ...but the default labels are not helpful; the help file
# suggests we can pass a vector of names
boxplot(
  yellow, MMdata$blue, MMdata$brown,
  names = c("yellow", "blue", "brown")
)
# we also can pass the relevant columns from a data.frame and it
# will use the column names to label the plot
boxplot(MMdata[, 3:8])
# let's pretty it up a bit by giving each box a fill color, and
# adding a title and a label for the y-axis
boxplot(
  MMdata[, 3:8],
  col = c("blue", "brown", "green", "orange", "red", "yellow"),
  main = "boxplots by color for 1.69-oz bags of M&Ms",
  ylab = "number of M&Ms"
)