Linear Regression
#################################################
# Function: FitLinear
# Outputs summary statistics of linear regression for a given data frame
# Input: dataframe, x values in first column, y values in second column
# Output: r-squared value, slope, and pvalue
FitLinear <- function(data=cbind(runif(10),runif(10))) {
x <- data[,1]
y <- data[,2]
dataFrame <- data.frame(x,y)
regModel <- lm(y~x,data=dataFrame)
myOut<- c(slope = summary(regModel)$coefficients[2,1],
pvalue = summary(regModel)$coefficients[2,4],
Rsquared = summary(regModel)$adj.r.squared)
return(myOut)
}
#######################################################
Contingency Table
###################################################
# Function: ContTable
# Reports the results from the Chi Squared analysis for a given data frame
# Inputs: discrete numerical vectors for both
# Outputs: the results from the Chi Square analysis
ContTable <- function(data=cbind(runif(10),runif(10))) {
x <- data[,1]
y <- data[,2]
dataFrame <- data.frame(x,y)
print(chisq.test(dataFrame))
}
ContTable()
## Warning in chisq.test(dataFrame): Chi-squared approximation may be
## incorrect
##
## Pearson's Chi-squared test
##
## data: dataFrame
## X-squared = 1.6485, df = 9, p-value = 0.9959
ANOVA/My Data
# Function: AnovaData
# Reports F value (test statistic for the variation between sample means) for a given data frame
# Inputs: continuous numeric vector for x and a discrete vector for y
# Outputs: a mosaic plot with p-value and a bar plot
AnovaData <- function(data=cbind(runif(10),runif(10))) {
x <- data[,1]
y <- data[,2]
dataFrame <- data.frame(x,y)
AnovaModel<- aov(y~x,data=dataFrame)
print(summary(AnovaModel))
}
data<- read.table("SporCount.csv",header=TRUE,sep=",")
data1 <- data[,1:2]
head(data1)
## Site SporCount
## 1 AU 124
## 2 AU 185
## 3 AU 125
## 4 AU 113
## 5 AU 128
## 6 AU 127
AnovaData(data=data1)
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 3480 3480 5.529 0.019 *
## Residuals 598 376419 629
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
########################################################################
# Function: GraphAnova
# Graphs box plot with whiskers
# Input: continuous numeric vector for x and discrete vector for y
# Output: box plot with whiskers and F value
GraphAnova <- function(data=cbind(runif(10),runif(10))) {
x <- data[,1]
y <- data[,2]
dataFrame <- data.frame(x,y)
AnovaModel<- aov(y~x,data=dataFrame)
return(boxplot(y~x,data=dataFrame,col=c("purple","limegreen","royalblue")))
}
GraphAnova(data1)
Random Data
str(data1)
## 'data.frame': 600 obs. of 2 variables:
## $ Site : Factor w/ 2 levels "AU","HF": 1 1 1 1 1 1 1 1 1 1 ...
## $ SporCount: int 124 185 125 113 128 127 98 155 150 148 ...
hist(data1[,2])
summary(data1[,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.0 126.0 141.0 142.8 158.0 223.0
sd(data1[,2])
## [1] 25.18378
summary(data1[1:300,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.0 120.0 139.0 140.4 157.0 207.0
sd(data[1:300,2])
## [1] 28.81225
summary(data1[301:600,2])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 78.0 132.0 144.0 145.2 158.0 223.0
sd(data[301:600,2])
## [1] 20.70705
# we can use rnorm with this data
Column <- data1[,1]
Sporocysts <- round(c(rnorm(300,mean=140.4, sd=28.81225),rnorm(300,mean=144.0,sd=20.70705)))
head(Sporocysts)
## [1] 127 111 158 116 114 116
randomData <- cbind.data.frame(Column,Sporocysts)
head(randomData)
## Column Sporocysts
## 1 AU 127
## 2 AU 111
## 3 AU 158
## 4 AU 116
## 5 AU 114
## 6 AU 116
# Now that we have simulated data with similar parameters to the actual data, we can see if we get comparable results when we run ANOVA on the simulated data.
AnovaData(randomData)
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 2522 2521.5 4.419 0.036 *
## Residuals 598 341232 570.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
GraphAnova(randomData)
# Simulated data yielded insignificant results unlike my real data, indicating that there are true differences in sporocysts/gametocyst at HF and AU
Random Data Bling
# One interesting scenario would be that there is little overlap in the histogram of sporocysts/gametocyst at HF and AU.
# We could simulate that data by generating count data that follow a distribution which will be very different between the two sites.
Column1 <- data1[,1]
Sporocysts1 <- round(c(rnorm(300,mean=100, sd=20),rnorm(300,mean=150,sd=20)))
randomDataBling <- cbind.data.frame(Column1,Sporocysts1)
head(randomDataBling)
## Column1 Sporocysts1
## 1 AU 101
## 2 AU 73
## 3 AU 124
## 4 AU 122
## 5 AU 64
## 6 AU 129
# Now that I have created very different, non-overlapping distributions for my two sites, I can do an ANOVA to determine if they are truly different
AnovaData(randomDataBling)
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 327460 327460 745.3 <2e-16 ***
## Residuals 598 262726 439
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
GraphAnova(randomDataBling)