
#################################################################
## Introduction to R
#################################################################

## Hi there,

## I highly recommend doing some online tutorials first (on edX or DataCamp), starting with:
## https://www.edx.org/course/data-science-r-basics-harvardx-ph125-1x?utm_source=sailthru&utm_medium=email&utm_campaign=newsletter_themed_salary100k_20171102
## and/or https://www.datacamp.com/courses/free-introduction-to-r
## and/or https://www.edx.org/course/introduction-r-data-science-microsoft-dat204x-6
## You'll understand the basic principles and data types in R.

## After that, you can use this script for some R code that can be useful
## to organize your data, to perform some basic statistical analysis
## and to make some line/scatter/bar graphs.

## Just Google any questions you have. The online R community is large and active.
## You'll usually find answers on stackoverflow.com

## Get Rstudio for easy use of R!
## https://www.rstudio.com/products/RStudio/

## A '#' ensures that any text behind it is not read by R as code. Use it for making notes in your code.

## While you run through this code, open the dataframes we're making by clicking on them in the data block in the upper right of this interface.
## The dataframe will be opened as a tab. Keep checking what happens to that dataframe when you run a line of code.

## Happy programming!
## Eva Thuijsman


#################################################################
## Dataframes
#################################################################

# Start with a character vector holding the names of the participants.
# (The vector is called 'names', just like the base R function names();
# R keeps them apart because one is data and the other is a function.)
names <- c(
  "Joris Pot", "Keiji Jindo", "Thomas Delaune", "Catherine Kiwuka",
  "Marcel Lubbers", "Claus Sebastien", "Jet Drenth", "Vania Beltran",
  "Ilse de Jager", "Na Wang", "Gerrie van de Ven", "Hanna Kool",
  "Paul Ravensbergen", "Fabian Calvo Romero", "Cor Langeveld", "Qin Zhou"
)

# A second vector, in the same order as 'names', with the gender
gender <- c(
  "M", "M", "M", "F", "M", "M", "F", "F",
  "F", "F", "F", "F", "M", "M", "M", "F"
)

# Dataframes are tables: columns are variables, rows are observations
df <- data.frame(names = names, gender = gender)
tab <- table(names, gender) # cross-table counting each name/gender combination

# Add a new column with the room (the single value is repeated for every row)
df[, "room"] <- "Earth"
# Another way to add a column: column bind
room <- "Earth"
df <- cbind(df, room)
# Remove a column by assigning NULL. NOTE: at this point there are two columns
# called 'room'; this removes only the first one, so one 'room' column stays
# in df as column 3. The column numbers used further down rely on that.
df$room <- NULL

# Who of you has some experience in R?
# TRUE/FALSE may also be abbreviated to T/F, but T and F are ordinary variables
# that can be overwritten, so the full words are the safer habit.
df$some_exp <- c(
  FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE,
  FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE
)

# Subsetting with square brackets: df[row, column].
# Leaving the row or the column empty means "all of them".
df[1, 3]            # one cell: row 1, column 3
df[3, ]             # all columns of row 3
df[, 3]             # all rows of column 3
df[, 4]             # all rows of column 4
df[, "names"]       # a column can also be selected by its name
df[2, c(1, 2, 3)]   # several columns of row 2
df[2, 1:3]          # the same, 1:3 is short for c(1, 2, 3)
df[2, c(1, 4)]      # columns do not have to be neighbours

# A logical test can be used to select rows
df[df$gender == "F", ]          # all columns, only rows where gender is "F"
df[df$gender == "F", "gender"]  # only the gender column of those rows

# Now add the group number: start with an empty (NA) column and fill it in
df$group <- NA
df[1:10, "group"] <- "one"          # the first ten rows
df[11:nrow(df), "group"] <- "two"   # row 11 up to the last row

# Save subsets as separate dataframes, and stack them again with rbind (row bind)
female_participants <- df[df$gender == "F", ]
male_participants   <- df[df$gender == "M", ]
all_participants    <- rbind(female_participants, male_participants)

# Counting: TRUE counts as 1 and FALSE as 0, so sum() counts the matching rows
nr_women <- sum(df$gender == "F")
nr_men   <- sum(!(df$gender == "F")) # Note the "!" for negating. Another option would be to test for "M" instead of "F" of course

# If this then this: ifelse(test, value if TRUE, value if FALSE)
df$height <- ifelse(df$gender == "F", # if gender is "F"...
                    170.7,            # ...write this number to column "height"
                    183.8)            # If gender is NOT "F", write this number to column "height"

# A more complicated example (overwriting the column made directly above):
# random, normally distributed heights (based on Dutch average heights).
# For every row i, ifelse() takes element i of the 'yes' vector or element i of
# the 'no' vector. Both vectors therefore need one value per row: shorter
# vectors are silently recycled, which would give several people exactly the
# same "random" height.
n_obs <- nrow(df)
df$height <- ifelse(df$gender == "F",                      # if gender is "F"...
                    rnorm(n_obs, mean = 170.7, sd = 6.3),  # ...use a number drawn for women
                    rnorm(n_obs, mean = 183.8, sd = 7.1))  # If gender is NOT "F", use a number drawn for men

# Round numbers
df$height <- round(df$height, digits = 0)

# Data structure: str() lists every column together with its class
str(df)

# Change classes
df$group <- as.factor(df$group)   # text -> factor (a variable with a fixed set of levels)
df[[4]]  <- as.factor(df[[4]])    # the same for column 4, selected by position. See also as.numeric(), as.character()...
df$group <- factor(df$group, levels = c("one", "two")) # factor() lets you set the order of the levels yourself
## Errors will happen if your data are not recognized as the right class. Keep checking str()!

# Generate some columns with random numbers: rnorm(n, mean, sd).
# The third argument is the standard deviation (not the variance).
# n is 16 because that is the number of rows/observations.
random1 <- round(rnorm(16), digits = 0)                     # default: mean 0, sd 1
random2 <- round(rnorm(16, mean = 100, sd = 20), digits = 0)
random3 <- round(rnorm(16, mean = 70,  sd = 15), digits = 0)
random4 <- round(rnorm(16, mean = 150, sd = 20), digits = 0)
random5 <- round(rnorm(16, mean = 50,  sd = 2),  digits = 0)
randomdata <- cbind(random1, random2, random3, random4, random5) # a numeric matrix

# Change names of columns/variables
colnames(randomdata) <- c("var1", "var2", "var3", "var4", "var5")
# Put the names in front. A matrix can hold only one type of data, so adding
# text turns all the numbers into text as well (this is repaired further down).
randomdata <- cbind(names, randomdata)

# Merge dataframes: rows are matched on the shared column "names"
df <- merge(df, randomdata, by = "names")

# Changing column classes
str(df) ## 'var' columns are recognized as text (or as factors in R versions before 4.0) but they are actually numeric
columns_to_numeric <- paste0("var", 1:5) # "var1" ... "var5"
# Go via as.character() first: as.numeric() on a factor would return the
# internal level numbers instead of the values you see.
df[columns_to_numeric] <- lapply(df[columns_to_numeric],
                                 function(x) as.numeric(as.character(x)))

# Round numbers in multiple columns
colnames(df) # I use this to get the indices of columns of interest
df[, 7:11] <- round(df[, 7:11], digits = 0)

# Bounds to numbers: everything above 100 is set to 100
df$var2[df$var2 > 100] <- 100

# Arithmetics (done row by row)
df$var6 <- df$var1 + df$var2

# Writing functions
# percentage(x, y) returns x as a percentage of y. It works element by element
# on vectors, so it can be used on whole columns at once.
percentage <- function(x, y) {
  ratio <- x / y
  ratio * 100
}
# Use the function on two columns: var5 as a percentage of var6
df$var7 <- percentage(x = df$var5, y = df$var6)

# 'for' loops for iterating actions
for (i in seq_len(10)) { # Example: i takes the values 1 to 10, one after the other
  print(i)
}

# paste0() pastes the text and the number together; the result is used as a
# column name. An action is performed on columns "var5", "var6" and "var7",
# while creating "test5", "test6" and "test7".
for (i in 5:7) {
  source_column <- paste0("var", i)
  target_column <- paste0("test", i)
  df[[target_column]] <- df[[source_column]] / 100
}

# Remove those columns again by selecting what you want
df <- df[, 1:13]      # keep columns 1 to 13
df <- df[, -(14:20)]  # other option: a minus sign drops columns. (Nothing is left to drop here, because the line above already removed them.)

# Search for columns with 'var' in the name, in three flavours
var_column_logi   <- grepl("var", colnames(df))               # TRUE/FALSE for every column
var_column_index  <- which(var_column_logi)                   # positions of the matching columns
var_column_name   <- grep("var", colnames(df), value = TRUE)  # names of the matching columns

# Automatically generate email addresses

# Install tidyr only when it is missing (installing on every run is slow and
# needs an internet connection). library() stops with a clear error if loading
# fails, whereas require() would only return FALSE and let the script run on.
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}
library(tidyr)

# Split "names" into two new columns using a regular expression:
#   ([^ ]+)  first group: one or more (+) characters that are not a space ([^ ])
#   a single space
#   (.*)     second group: everything that is left ('.' is any character, '*' means zero or more)
# tidyr:: makes explicit which package's extract() is meant; other packages
# have a function with the same name.
df <- tidyr::extract(df, names, c("first_name", "last_name"), "([^ ]+) (.*)")

df$email <- paste0(df$first_name, ".", df$last_name, "@wur.nl")
df$email <- tolower(df$email)       # set to lower case
df$email <- gsub(" ", "", df$email) # remove spaces


#################################################################
## Data to and from Excel
#################################################################

## Now let's save these data to Excel.
## To do that we first have to (install and) load some packages
## and set a working directory

## First install these packages for some functions to work
## You can also install packages via the Tools tab.

# Install the packages only when they are missing
for (pkg in c("openxlsx", "xlsx", "rJava")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# library() stops with a clear error when a package cannot be loaded
library(openxlsx)
library(xlsx)
library(rJava)

# Go to Session tab to set the working directory, or use setwd().
# The folder below only exists on the author's computer: change it to a folder
# of your own. If it does not exist, the current working directory is kept
# instead of stopping the script with an error.
tutorial_dir <- "~/Eva/PPS/Some R scripts/R_tutorial_PPS_CSA"
if (dir.exists(tutorial_dir)) {
  setwd(tutorial_dir)
} else {
  message("Folder '", tutorial_dir, "' not found; files are written to ", getwd())
}

# Create Excel file of this dataframe and save it in the working directory.
# Both openxlsx and xlsx contain functions called write.xlsx() and read.xlsx();
# the 'xlsx::' prefix makes sure the xlsx versions are used (their arguments,
# such as sheetIndex, differ from the openxlsx ones).
xlsx::write.xlsx(x = df, file = "test_dataframe.xlsx", row.names = FALSE)

# Now if we want to load this
df_Excel <- xlsx::read.xlsx("test_dataframe.xlsx", sheetIndex = 1)
# For a comma-separated text file you would use read.csv():
# df_csv <- read.csv("test_dataframe.csv", sep = ",")



#################################################################
## Interpreting the data
#################################################################


## Get an impression of some data

summary(df) # Quick summary of every column

# Install plyr only when it is missing; library() stops with a clear error if
# the package cannot be loaded.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr) # For arranging data

# ddply() splits df by "gender" and calculates the summaries for each part.
# Inside plyr's summarise() a new column can use the ones defined before it:
# 'se' uses the 'sd' and 'N' calculated on the lines above it.
overview <- ddply(df, "gender", summarise,
                  N      = sum(!is.na(height)),           # Number of non-missing heights
                  mean   = mean(height, na.rm = TRUE),
                  median = median(height, na.rm = TRUE),
                  sd     = sd(height, na.rm = TRUE),      # Standard deviation
                  se     = sd / sqrt(N))                  # Standard error
overview # Check the console for results


#### Linear model:

# Install the packages only when they are missing
for (pkg in c("lsmeans", "MuMIn")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# library() stops with a clear error when a package cannot be loaded
library(lsmeans)
library(MuMIn)

lm1 <- lm(height ~ gender, data = df) # Is height explained by gender?
anova(lm1)
lsmeans(lm1, pairwise ~ gender)       # Estimated means per gender and their pairwise comparison
r.squaredGLMM(lm1)

# Residual checks. Run this whole block together.
old_par <- par(mfrow = c(1, 3)) # Three figures next to each other. The previous settings are kept in old_par.
std_resid <- residuals(lm1) / sd(residuals(lm1), na.rm = TRUE) # Residuals divided by their standard deviation
hist(std_resid, 30)            # Histogram. Normally distributed?
plot(fitted(lm1), std_resid)   # Residuals cloud without trend?
qqnorm(std_resid)              # QQ-plot straight line?
abline(c(0, 1))                # Reference line with intercept 0 and slope 1
par(old_par)                   # Back to the previous layout, so later plots are not squeezed into a third of the window



#### Mixed model:

# Install the packages only when they are missing
for (pkg in c("lmerTest", "lme4", "predictmeans")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# library() stops with a clear error when a package cannot be loaded
library(lmerTest)
library(predictmeans)
library(lme4)

str(df)

# In case you want to correct for random factors, such as location/group/block, you need a linear mixed model:
lmm1 <- lmer(height ~ gender + some_exp + (1 | group), data = df) # Let's test whether height is explained by gender and/or experience level in R...
anova(lmm1)
lmm2 <- lmer(height ~ gender + some_exp + gender * some_exp + (1 | group), data = df) # Add a nonsensical interaction effect for fun
anova(lmm2)

# Fit the first model again, this time explicitly with lmer() from lme4, and use that fit for predictmeans()
lmm1 <- lme4::lmer(height ~ gender + some_exp + (1 | group), data = df)
predictmeans(lmm1, "gender")

# Residual checks. Run this whole block together.
old_par <- par(mfrow = c(1, 3)) # Three figures next to each other. The previous settings are kept in old_par.
std_resid <- residuals(lmm1) / sd(residuals(lmm1), na.rm = TRUE) # Residuals divided by their standard deviation
hist(std_resid, 30)             # Histogram. Normally distributed?
plot(fitted(lmm1), std_resid)   # Residuals cloud without trend?
qqnorm(std_resid)               # QQ-plot straight line?
abline(c(0, 1))                 # Reference line with intercept 0 and slope 1
par(old_par)                    # Back to the previous layout


#################################################################
## Some plotting
#################################################################

# There are lots of ways to make plots in R. A common way is the use of the plot() function.
# Look it up for all possibilities to make them nicer.
# plot() needs a factor (not plain text) on the x axis to draw one box per
# group; with a text column it stops with an error. factor() converts the
# column and does nothing harmful if it already is a factor.
plot(factor(df$gender), df$height) # Boxplot of height per gender
plot(df$height, df$var3)           # Scatter plot of two numeric columns

# Below I show more advanced plotting

# Package names have to be given to install.packages() as quoted text;
# without the quotes R looks for a variable with that name and stops with
# "object not found". Packages are installed only when they are missing.
for (pkg in c("ggplot2", "plotrix", "gridExtra", "cowplot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
# library() stops with a clear error when a package cannot be loaded
library(ggplot2)   # For figures
library(plotrix)   # For plot functions
library(gridExtra) # To arrange plots in grids
library(cowplot)   # To arrange plots in grids etc

######### for the bar graph I first want to make a smaller dataframe with just the data I need (mean and SE)

# Small helper: returns a one-row dataframe with the mean and the standard
# error of 'heights', labelled with 'gender_label'. Using a function avoids
# writing the same lines twice, and avoids storing a number in a variable
# called 'mean' (which would hide the mean() function's name).
# std.error() comes from the plotrix package loaded above.
summarise_height <- function(heights, gender_label) {
  data.frame(mean   = mean(heights),
             SE     = std.error(heights),
             gender = gender_label,
             stringsAsFactors = FALSE) # keep gender as text in every R version
}

# Height of women
height_F <- summarise_height(df[df$gender == "F", "height"], "F")

# Height of men
height_M <- summarise_height(df[df$gender == "M", "height"], "M")

# Combine
height_df <- rbind(height_F, height_M)
#########


# I always load 'cleanup' and 'texttheme' beforehand.
# You'll see that I refer to them in the scripts for plot later on.

# 'cleanup' removes the ugly standard background and grid of ggplot figures
# and draws the axes as black lines.
cleanup <- theme(
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.background = element_blank(),
  axis.line        = element_line(color = "black")
)

# 'texttheme' is used to make all texts the colour and size you want them.
texttheme <- theme(
  text         = element_text(size = 10, color = "black"), # general text
  axis.text.x  = element_text(size = 9,  color = "black"), # tick labels on the x axis
  axis.text.y  = element_text(size = 9,  color = "black"), # tick labels on the y axis
  legend.title = element_text(size = 10, color = "black"),
  legend.text  = element_text(size = 9,  color = "green")  # labels in the legend
)

## Many of these lines of code you can just leave out. Just play with it!

# Example for making a barplot. The order of commands doesn't matter too much, and you can simply leave lines out.
# - aes(): what's on x and y (use the names of columns in your dataframe).
#   fill = ... colours the bars differently for groups identified in your
#   dataframe (e.g. a column "gender" with levels "F" and "M").
# - geom_bar(stat = "identity"): a bar graph that uses the y values as they are.
# - geom_errorbar(): error bars, here from the columns 'mean' and 'SE'.
#   'width' and 'position' set the width and placement of the error bars.
# - labs(): axis labels.
# - scale_x_discrete(): names of the discrete variables on the x axis (take
#   care of the order! Test by running the code without this line!!)
# - scale_y_continuous(expand = c(0, 0)): makes sure that the axes start at
#   (0,0) (otherwise the bars float a bit).
# - coord_cartesian(): 'zooms in' on a part of the figure by setting the axis
#   limits. You can leave this out, R will choose the size automatically.
# - scale_fill_manual(): name of the 'subgroup' in the legend, its labels and
#   the different colours for the genders.
bar_figure <- ggplot(height_df, aes(x = gender, y = mean, fill = gender)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.8)) +
  geom_errorbar(aes(ymin = mean - SE, ymax = mean + SE),
                width = .15,
                position = position_dodge(width = 0.9)) +
  labs(x = "Gender", y = "Height (cm)") +
  scale_x_discrete(labels = c("Female", "Male")) +
  scale_y_continuous(expand = c(0, 0)) +
  coord_cartesian(ylim = c(0, 250)) +
  scale_fill_manual(name   = "Gender",
                    labels = c("Female", "Male"),
                    values = c("gray25", "gray55")) +
  texttheme +
  cleanup

# Example for making a scatter plot with discrete variables on the x axis.
# - shape = group: different shapes of the scatter dots for the different groups.
# - scale_y_continuous(breaks = ...): where the tick marks on the y axis should be.
# - scale_shape_manual(): gives a name to the legend; the values are the shape
#   numbers that are assigned to the groups.
scatter_figure1 <- ggplot(df, aes(x = gender, y = var3, shape = group)) +
  geom_point(aes(shape = group)) + # Scatter plot
  labs(x = "Gender", y = "Var 3") +
  coord_cartesian(ylim = c(0, 125)) +
  scale_y_continuous(expand = c(0, 0), breaks = seq(0, 125, by = 25)) +
  scale_x_discrete(labels = c("Female", "Male")) +
  scale_shape_manual(name = "Group", values = c(0, 1, 2, 3)) +
  scale_color_manual(values = "black") +
  texttheme +
  cleanup


# Scatter plot (all continuous variables)
# - stat_smooth(method = lm): adds trend lines through the dots (one for each
#   group, and all looking different [dashed or smooth etc]).
# - xlab()/ylab(): superscripts are tricky; the text is written as an
#   expression in which ^ makes a superscript and ~ a space.
# - coord_cartesian(): axis limits are given as two numbers, c(lowest, highest).
#   Note that 30:110 would be the whole series 30, 31, ..., 110 and is not a
#   valid limit.
scatter_figure2 <- ggplot(df, aes(x = var3, y = var4, shape = group)) +
  geom_point() +
  stat_smooth(method = lm, se = FALSE, aes(shape = group, linetype = group), colour = "black", size = 0.6) +
  xlab(Blabla~"(mg "*kg^"-1"*")") +
  ylab(Something~~"(t "*ha^"-1"*")") +
  coord_cartesian(ylim = c(0, 200), xlim = c(30, 110)) +
  scale_y_continuous(expand = c(0, 0), breaks = seq(0, 200, by = 20)) +
  scale_shape_manual(name = "Group", values = c(1, 2)) +
  scale_linetype_manual(name = "Group", values = c(1, 2)) +
  texttheme +
  cleanup

# If you want to put figures next to each other in one new figure, with labels (A, B, C):
grid.figures <- plot_grid(
  bar_figure, scatter_figure1, scatter_figure2,
  labels     = c("A", "B", "C"),
  label_size = 10,
  ncol       = 3, # Three columns and one row: the three figures
  nrow       = 1  # will be placed next to each other
)

## Save high quality: the file is written to the working directory
ggsave("plot_grid_test.png",
       plot   = grid.figures,
       width  = 20,
       height = 6,
       units  = "cm")


