Linear Regression

#################################################
# Function: FitLinear
# Outputs summary statistics of linear regression for a given data frame
# Input: dataframe, x values in first column, y values in second column
# Output: r-squared value, slope, and pvalue

FitLinear <- function(data=cbind(runif(10),runif(10))) {
  x <- data[,1]
  y <- data[,2]
  dataFrame <- data.frame(x,y)

  regModel <- lm(y~x,data=dataFrame)
  
  myOut<- c(slope = summary(regModel)$coefficients[2,1],
            pvalue = summary(regModel)$coefficients[2,4],
            Rsquared = summary(regModel)$adj.r.squared)
            
  
  return(myOut)
}
#######################################################

Contingency Table

###################################################
# Function: ContTable
# Reports the results from the Chi Squared analysis for a given data frame
# Inputs: discrete numerical vectors for both
# Outputs: the results from the Chi Square analysis

ContTable <- function(data=cbind(runif(10),runif(10))) {
  x <- data[,1]
  y <- data[,2]
  dataFrame <- data.frame(x,y)
  
  print(chisq.test(dataFrame))

}
ContTable()

## Warning in chisq.test(dataFrame): Chi-squared approximation may be
## incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  dataFrame
## X-squared = 1.6485, df = 9, p-value = 0.9959

ANOVA/My Data

# Function: AnovaData
# Reports F value (test statistic for the variation between sample means) for a given data frame
# Inputs: continuous numeric vector for x and a discrete vector for y
# Outputs: a mosaic plot with p-value and a bar plot

AnovaData <- function(data=cbind(runif(10),runif(10))) {
  x <- data[,1]
  y <- data[,2]
  dataFrame <- data.frame(x,y)

 AnovaModel<- aov(y~x,data=dataFrame)
  
  print(summary(AnovaModel))

}
data<- read.table("SporCount.csv",header=TRUE,sep=",")
data1 <- data[,1:2]

head(data1)

##   Site SporCount
## 1   AU       124
## 2   AU       185
## 3   AU       125
## 4   AU       113
## 5   AU       128
## 6   AU       127

AnovaData(data=data1)

##              Df Sum Sq Mean Sq F value Pr(>F)  
## x             1   3480    3480   5.529  0.019 *
## Residuals   598 376419     629                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

########################################################################
# Function: GraphAnova
# Graphs box plot with whiskers 
# Input: continuous numeric vector for x and discrete vector for y
# Output: box plot with whiskers and F value

GraphAnova <- function(data=cbind(runif(10),runif(10))) {
  x <- data[,1]
  y <- data[,2]
  dataFrame <- data.frame(x,y)

 AnovaModel<- aov(y~x,data=dataFrame)
  
  
  return(boxplot(y~x,data=dataFrame,col=c("purple","limegreen","royalblue")))
}

GraphAnova(data1)

Random Data

str(data1)

## 'data.frame':    600 obs. of  2 variables:
##  $ Site     : Factor w/ 2 levels "AU","HF": 1 1 1 1 1 1 1 1 1 1 ...
##  $ SporCount: int  124 185 125 113 128 127 98 155 150 148 ...

hist(data1[,2])

summary(data1[,2])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    73.0   126.0   141.0   142.8   158.0   223.0

sd(data1[,2])

## [1] 25.18378

summary(data1[1:300,2])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    73.0   120.0   139.0   140.4   157.0   207.0

sd(data[1:300,2])

## [1] 28.81225

summary(data1[301:600,2])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    78.0   132.0   144.0   145.2   158.0   223.0

sd(data[301:600,2])

## [1] 20.70705

# we can use rnorm with this data

Column <- data1[,1]
Sporocysts <- round(c(rnorm(300,mean=140.4, sd=28.81225),rnorm(300,mean=144.0,sd=20.70705)))
head(Sporocysts)

## [1] 127 111 158 116 114 116

randomData <- cbind.data.frame(Column,Sporocysts)
head(randomData)

##   Column Sporocysts
## 1     AU        127
## 2     AU        111
## 3     AU        158
## 4     AU        116
## 5     AU        114
## 6     AU        116

# Now that we have simulated data with similar parameters to the actual data, we can see if we get comparable results when we run ANOVA on the simulated data.
AnovaData(randomData)

##              Df Sum Sq Mean Sq F value Pr(>F)  
## x             1   2522  2521.5   4.419  0.036 *
## Residuals   598 341232   570.6                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

GraphAnova(randomData)

# Simulated data yielded insignificant results unlike my real data, indicating that there are true differences in sporocysts/gametocyst at HF and AU

Random Data Bling

# One interesting scenario would be that there is little overlap in the histogram of sporocysts/gametocyst at HF and AU.

# We could simulate that data by generating count data that follow a distribution which will be very different between the two sites.

Column1 <- data1[,1]
Sporocysts1 <- round(c(rnorm(300,mean=100, sd=20),rnorm(300,mean=150,sd=20)))
randomDataBling <- cbind.data.frame(Column1,Sporocysts1)
head(randomDataBling)

##   Column1 Sporocysts1
## 1      AU         101
## 2      AU          73
## 3      AU         124
## 4      AU         122
## 5      AU          64
## 6      AU         129

# Now that I have created very different, non-overlapping distributions for my two sites, I can do an ANOVA to determine if they are truly different
AnovaData(randomDataBling)

##              Df Sum Sq Mean Sq F value Pr(>F)    
## x             1 327460  327460   745.3 <2e-16 ***
## Residuals   598 262726     439                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

GraphAnova(randomDataBling)

Assignment09

Erin Keller

March 22, 2017

Linear Regression

Contingency Table

ANOVA/My Data

Random Data

Random Data Bling