# Introduction to R and RStudio ----
#
# R is a programming language that is often used for statistical computing
# and graphical presentation, to analyze and visualize data.
# RStudio is an integrated development environment (IDE) for R; it is a
# regular desktop application.
#
# Why use R?
#   * It is a great resource for data analysis, data visualization, data
#     science and machine learning.
#   * It provides many statistical techniques (such as statistical tests,
#     classification, clustering and data reduction).
#   * It is easy to draw graphs in R, like pie charts, histograms, box plots,
#     scatter plots, etc.
#   * It works on different platforms (Windows, Mac, Linux).
#   * It is open-source and free.
#   * It has large community support.
#   * It has many packages (libraries of functions) that can be used to solve
#     different problems.

# Commonly used commands ----

## Syntax ----
# Typing a value at the top level prints it:
"Hello World!"

# To do simple calculations, add numbers together:
8 + 8

## Print ----
# Unlike many other programming languages, you can output values in R without
# using a print function:
"Hello World!"

# However, R does have a print() function available if you want to use it.
# This might be useful if you are familiar with other programming languages,
# such as Python, which often use a print() function to produce output.
print("Hello World!")

## Comments ----
# Comments can be used to explain R code and to make it more readable. They
# can also be used to prevent execution when testing alternative code. It is
# recommended to comment your code generously for future reference.
#
# A comment starts with a #. When executing code, R ignores everything from
# the # to the end of that line.
#
# This example uses a comment before a line of code:

# This is a comment
"Hello World!"

# Or a comment can be written at the end of a line of code:
"Hello World!" # This is a comment

# Multiline comments can also be written, one # per line:

# This is a comment
# written in
# more than just one line
"Hello World!"

# Creating Variables ----
# Variables are containers for storing data values.
# R does not have a command for declaring a variable. A variable is created
# the moment you first assign a value to it. To assign a value to a variable,
# use the <- sign. To output (or print) the variable value, just type the
# variable name:
name <- "Denny"
age <- 50

name # output "Denny"
age  # output 50

# Installing packages ----
# install.packages("package_name") installs the package 'package_name' from
# CRAN into the local library (only needed once per machine):
# install.packages("readr")

# library("package_name") loads the package 'package_name' from the local
# library into the current R session.
# Load package readr (skipped with a message when it is not installed, so the
# rest of the tutorial can still be run):
if (requireNamespace("readr", quietly = TRUE)) {
  library("readr")
} else {
  message("Package 'readr' is not installed; run install.packages(\"readr\").")
}

## Install Rtools ----
# Go to https://cran.r-project.org/, click on 'Download R for Windows', then
# 'Rtools', and select the very latest version of Rtools to download.
# After the download has completed, run the installer. Select the default
# options everywhere.

## Install packages hosted on GitHub ----
# install.packages("devtools")
# library(devtools)
# install_github("hadley/dplyr")

# Update ----

## Updating Packages ----
# Update all available packages:
# update.packages()
# Unfortunately this won't update packages installed by
# devtools::install_github().

## Update R ----
# For updating R: please reinstall R by going to
# https://cran.r-project.org/bin/windows/base/ (for Windows) or
# https://cran.r-project.org/bin/macosx/ (for Mac).
# Download the installer file and follow the instructions.

## Update RStudio ----
# For updating RStudio: open RStudio --> Menu --> Help --> Check for updates

# Working Directory setup ----
# setwd() sets the working directory to the given location. Edit lecture_dir
# so that it points at the lecture materials on your own machine.
# NOTE(review): the path below contains a ".dropbox-temp-relocate..." segment
# that looks machine-specific -- confirm the intended folder.
lecture_dir <- file.path(
  "C:/Users/Chiranjit Dutta/Dropbox",
  ".dropbox-temp-relocate3640a06ba127",
  "Chiranjit Dutta",
  "R Tutorial Summer 2022",
  "R Tutorial 2022",
  "Lecture_materials"
)
# setwd() raises an error for a directory that does not exist, so only switch
# when the folder is really there.
if (dir.exists(lecture_dir)) {
  setwd(lecture_dir)
} else {
  message("Working directory not changed; folder not found: ", lecture_dir)
}

# getwd() returns the current working directory.
getwd()

# Data Types ----
# Basic data types in R can be divided into the following types:
#   * numeric   - (10.5, 55, 787)
#   * integer   - (1L, 55L, 100L, where the letter "L" declares this as an
#                 integer)
#   * complex   - (9 + 3i, where "i" is the imaginary part)
#   * character (a.k.a. string) - ("k", "R is exciting", "FALSE", "11.5")
#   * logical (a.k.a. boolean)  - (TRUE or FALSE)
# We can use the class() function to check the data type of a variable:

# numeric
x <- 10.5
class(x)

# integer
x <- 1000L
class(x)

# complex
x <- 9i + 3
class(x)

# character/string
x <- "R is exciting"
class(x)

# logical/boolean
x <- TRUE
class(x)

## Strings ----
# The string below deliberately spans two lines, so that the difference
# between auto-printing and cat() is visible.
# (The variable is called `str`; calls to the function str() keep working
# because R skips non-function objects when looking up a function.)
str <- "I am from India,
India is a beautiful country."

str
cat(str) # print the value of str

# Note that auto-printing shows a "\n" at the position of each line break.
# This is an escape sequence; the n character indicates a new line.
# If you want the line breaks to be rendered at the same position as in the
# code, use the cat() function as shown above.

## R Vectors ----
# A vector is simply a list of items that are of the same type.
# To combine the items into a vector, use the c() function and separate the
# items by a comma.
# In the example below, we create a vector variable called fruits that
# combines strings:

# Vector of strings
fruits <- c("banana", "apple", "orange")

# Print fruits
fruits

# To create a vector with numerical values in a sequence, use the : operator:

# Vector with numerical values in a sequence
numbers <- 1:10
numbers

# The : operator steps by 1 from the start value, so the last number (6.3)
# is not used in numbers1; the sequence stops at 5.5.
numbers1 <- 1.5:6.3
numbers1

# To make bigger or smaller steps in a sequence, use the seq() function:
numbers2 <- seq(from = 0, to = 100, by = 20)
numbers2
# Vector of logical values
log_values <- c(TRUE, FALSE, TRUE, FALSE)
log_values

# To find out how many items a vector has, use the length() function:
fruits <- c("banana", "apple", "orange")
length(fruits)

# To sort items in a vector alphabetically or numerically, use the sort()
# function:
fruits <- c("banana", "apple", "orange", "mango", "lemon")
numbers <- c(13, 3, 5, 7, 20, 2)

sort(fruits)  # Sort strings
sort(numbers) # Sort numbers

# You can access the vector items by referring to their index number inside
# brackets []. The first item has index 1, the second item has index 2, and
# so on:
fruits <- c("banana", "apple", "orange")

# Access the first item (banana)
fruits[1]

# You can also access multiple elements by referring to different index
# positions with the c() function:
fruits <- c("banana", "apple", "orange", "mango", "lemon")

# Access the first and third item (banana and orange)
fruits[c(1, 3)]

# You can also use negative index numbers to access all items except the ones
# specified:
fruits <- c("banana", "apple", "orange", "mango", "lemon")

# Access all items except for the first item
fruits[-1]

# To change the value of a specific item, refer to the index number:
fruits <- c("banana", "apple", "orange", "mango", "lemon")

# Change "banana" to "pear"
fruits[1] <- "pear"

# Print fruits
fruits

# To repeat vectors, use the rep() function:

# Repeat each value
repeat_each <- rep(c(1, 2, 3), each = 3)
repeat_each

# Repeat the sequence of the vector
repeat_times <- rep(c(1, 2, 3), times = 3)
repeat_times

# Repeat each value independently (1 five times, 2 twice, 3 once)
repeat_indepent <- rep(c(1, 2, 3), times = c(5, 2, 1))
repeat_indepent

## Lists ----
# A list in R can contain many different data types inside it. A list is a
# collection of data which is ordered and changeable.
# To create a list, use the list() function:

# List of strings
thislist <- list("apple", "banana", "cherry")

# Print the list
thislist

# You can access the list items by referring to its index number, inside brackets.
# The first item has index 1, the second item has index 2, and so on:
thislist <- list("apple", "banana", "cherry")

# Single brackets return a list holding the selected item(s).
thislist[1]

# To change the value of a specific item, refer to the index number:
thislist <- list("apple", "banana", "cherry")
thislist[1] <- "raspberry"

# Print the updated list
thislist

# To find out how many items a list has, use the length() function:
thislist <- list("apple", "banana", "cherry")
length(thislist)

# To find out if a specified item is present in a list, use the %in%
# operator:
thislist <- list("apple", "banana", "cherry")
"apple" %in% thislist

# To add an item to the end of the list, use the append() function.
# append() returns a new list; thislist itself is left unchanged.
thislist <- list("apple", "banana", "cherry")
append(thislist, "orange")

# To add an item to the right of a specified index, add "after = index
# number" in the append() function:
thislist <- list("apple", "banana", "cherry")
append(thislist, "orange", after = 2)

# Remove list items with a negative index:
thislist <- list("apple", "banana", "cherry")
newlist <- thislist[-1]

# Print the new list
newlist

# You can specify a range of indexes by specifying where to start and where
# to end the range, by using the : operator:
thislist <- list(
  "apple", "banana", "cherry", "orange", "kiwi", "melon", "mango"
)
thislist[2:5]

# You can loop through the list items by using a for loop:
thislist <- list("apple", "banana", "cherry")

# Print all items in the list, one by one
for (x in thislist) {
  print(x)
}

# Join two lists with c():
list1 <- list("a", "b", "c")
list2 <- list(1, 2, 3)
list3 <- c(list1, list2)
list3

## Matrices ----
# A matrix is a two dimensional data set with columns and rows, which
# contains only one data type.
# A column is a vertical representation of data, while a row is a horizontal
# representation of data.
# A matrix can be created with the matrix() function.
# Specify the nrow and ncol parameters to get the amount of rows and columns.
# matrix() fills the values column by column unless byrow = TRUE is given.

# Create a matrix
thismatrix <- matrix(c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2)

# Print the matrix
thismatrix

thismatrix <- matrix(
  c("apple", "banana", "cherry", "orange"),
  nrow = 2, ncol = 2
)
thismatrix

# You can access the items by using [ ] brackets. The first number "1" in the
# bracket specifies the row-position, while the second number "2" specifies
# the column-position:
thismatrix <- matrix(
  c("apple", "banana", "cherry", "orange"),
  nrow = 2, ncol = 2
)
thismatrix[1, 2]

# The whole row can be accessed if you specify a comma after the number in
# the bracket:
thismatrix <- matrix(
  c("apple", "banana", "cherry", "orange"),
  nrow = 2, ncol = 2
)
thismatrix[2, ]

# The whole column can be accessed if you specify a comma before the number
# in the bracket:
thismatrix <- matrix(
  c("apple", "banana", "cherry", "orange"),
  nrow = 2, ncol = 2
)
thismatrix[, 2]

## Data Frames ----
# Data Frames are data displayed in a format as a table.
# Data Frames can have different types of data inside them. While the first
# column can be character, the second and third can be numeric or logical.
# However, each column should have the same type of data.
# Use the data.frame() function to create a data frame:

# Create a data frame
Data_Frame <- data.frame(
  Training = c("Strength", "Stamina", "Other"),
  Pulse = c(100, 150, 120),
  Duration = c(60, 30, 45)
)

# Print the data frame
Data_Frame

### Row/Column binding ----
# Use the rbind() function to add new rows in a Data Frame.
# The new row is given as a one-row data frame so that every column keeps its
# type; appending c("Strength", 110, 110) would be a character vector and
# would turn Pulse and Duration into character columns.

# Add a new row
New_row_DF <- rbind(
  Data_Frame,
  data.frame(Training = "Strength", Pulse = 110, Duration = 110)
)

# Print the data frame with the new row
New_row_DF

# Use the cbind() function to add new columns in a Data Frame:

# Add a new column
New_col_DF <- cbind(Data_Frame, Steps = c(1000, 6000, 2000))

# Print the data frame with the new column
New_col_DF

# Use negative indices to remove rows and columns in a Data Frame:

# Remove the first row and column
Data_Frame_New <- Data_Frame[-1, -1]

# Print the new data frame
Data_Frame_New

# Use the dim() function to find the amount of rows and columns in a Data
# Frame:
dim(Data_Frame)

# You can also use the ncol() function to find the number of columns and
# nrow() to find the number of rows:
ncol(Data_Frame)
nrow(Data_Frame)

# Use the length() function to find the number of columns in a Data Frame
# (similar to ncol()):
length(Data_Frame)

# Use the rbind() function to combine two or more data frames in R
# vertically:
Data_Frame1 <- data.frame(
  Training = c("Strength", "Stamina", "Other"),
  Pulse = c(100, 150, 120),
  Duration = c(60, 30, 45)
)

Data_Frame2 <- data.frame(
  Training = c("Stamina", "Stamina", "Strength"),
  Pulse = c(140, 150, 160),
  Duration = c(30, 30, 20)
)

New_Data_Frame <- rbind(Data_Frame1, Data_Frame2)
New_Data_Frame

# And use the cbind() function to combine two or more data frames in R
# horizontally:
Data_Frame3 <- data.frame(
  Training = c("Strength", "Stamina", "Other"),
  Pulse = c(100, 150, 120),
  Duration = c(60, 30, 45)
)

Data_Frame4 <- data.frame(
  Steps = c(3000, 6000, 2000),
  Calories = c(300, 400, 300)
)

New_Data_Frame1 <- cbind(Data_Frame3, Data_Frame4)
New_Data_Frame1

# Built-in Math Functions ----

## Basic functions ----
# The min() and max() functions can be used to find the
# lowest or highest number in a set:
max(5, 10, 15)
min(5, 10, 15)

# The sqrt() function returns the square root of a number:
sqrt(16)

# The abs() function returns the absolute (positive) value of a number:
abs(-4.7)

# The ceiling() function rounds a number upwards to its nearest integer, and
# the floor() function rounds a number downwards to its nearest integer:
ceiling(1.4)
floor(1.4)

## Matrix Calculations ----

### Matrix addition and subtraction ----
# A and B must have the same dimensions.
A <- matrix(c(-5, 4, 1, -3, 2, 6), nrow = 2, byrow = TRUE)
A
B <- matrix(c(7, -9, 10, 2, 6, -1), nrow = 2, byrow = TRUE)
B
C <- A + B
C
D <- A - B
D

### Matrix multiplication ----
# ncol(A) must equal nrow(B).
A <- matrix(c(5, 4, 1, -3, 2, 6), nrow = 2, byrow = TRUE)
A
B <- matrix(c(7, -3, 2), nrow = 3, byrow = TRUE)
B
M <- A %*% B
M

### Determinant of a square matrix ----
D <- matrix(c(2, -4, 5, 6, -7, 0, 3, 9, 7), nrow = 3, byrow = TRUE)
D
det(D)

### Inverse of a square matrix ----
solve(D)

### Rank of a matrix ----
# 20 values filled column by column into a 5 x 4 matrix.
A <- matrix(
  c(1, 1, 1, 0, 1, 2, 3, 1, 1, 2, 2, 1, 3, -1, 2, -1, -2, 0, -1, -1),
  nrow = 5
)
A
qr(A)$rank

### Eigenvalues and eigenvectors ----
A <- matrix(c(-1, 2, 0, 1, 2, 1, 0, 2, -1), nrow = 3, byrow = TRUE)
A

# Run the decomposition once and reuse the result for both components.
eigen_A <- eigen(A)
eigen_A

evalues <- eigen_A$values
evectors <- eigen_A$vectors # V (one eigenvector per column)
evalues
evectors

### Vector norms ----
# norm() works on matrices, so the vector is first turned into a one-column
# matrix.
v <- c(-1, 2, 1)
norm(as.matrix(v), type = "1") # L-1 norm
norm(as.matrix(v), type = "2") # L-2 norm
norm(as.matrix(v), type = "I") # L-infinity norm

### Frobenius norm of a matrix ----
A <- matrix(c(2, 1, 6, 4, 3, 5), nrow = 2, byrow = TRUE)
norm(A, type = "F")

# Factors ----
# Factors are variables in R which take on a limited number of different
# values; such variables are often referred to as categorical variables.
# Factors represent a very efficient way to store character values, because
# each unique character value is stored only once, and the data itself is
# stored as a vector of integers.
data <- c(1, 2, 2, 3, 1, 2, 3, 3, 1, 2, 3, 3, 1)

# Default factor: the levels are the distinct values of data.
fdata <- factor(data)
fdata

# The same factor with roman numerals as labels.
rdata <- factor(data, labels = c("I", "II", "III"))
rdata

# To rename the levels of the default factor fdata, we use the assignment
# form of the levels() function:
levels(fdata) <- c("cat", "dog", "bird")
fdata

# Type Conversion ----
# You can convert from one type to another with the following functions:
#   * as.numeric()
#   * as.integer()
#   * as.character()
#   * as.factor()
x <- 1L # integer
y <- 2  # numeric

# convert from integer to numeric:
a <- as.numeric(x)

# convert from numeric to integer:
b <- as.integer(y)

# print values of x and y
x
y

# print the class name of a and b
class(a)
class(b)

## Convert numeric to factor ----
num <- c(1, 1, 5, 5, 5, 2, 3, 3, 3, 2, 4, 4, 4, 8, 8)

# print class of num
class(num)

# convert num to factor
fac <- as.factor(num)
fac

# print class of fac
class(fac)

## Convert numeric to character ----
num1 <- c(5, 8, 7, 1, 5, 3, 6, 9, 4, 5, 6)

# print class of num1:
class(num1)

char <- as.character(num1)
char

# print class of char:
class(char)

## Convert factor to numeric (Tricky!) ----
fac <- as.factor(c(1, 1, 5, 5, 5, 2, 3, 3, 3, 2, 4, 4, 4, 8, 8))

# Incorrect conversion: this returns the internal integer codes of the
# levels, not the original values.
as.numeric(fac)

# Correct conversion:
# 1. Convert factor into character
# 2. Then convert character into numeric
char <- as.character(fac)
num <- as.numeric(char)
print(num) # desired final output

# Reading files in R ----
# Edit data_dir so that it points at the folder holding the lecture data on
# your own machine.
data_dir <- "C:/Users/Chiranjit Dutta/Desktop/Lectures/Data"

## Reading csv file ----
# read.csv() reads a csv file and creates a data frame in R. The argument
# 'header' is a logical value (TRUE/FALSE) indicating whether the file
# contains the names of the variables as its first line.
flights_path <- file.path(data_dir, "flights.csv")
if (file.exists(flights_path)) {
  nycflights <- read.csv(file = flights_path, header = TRUE)
  print(head(nycflights))
} else {
  message("File not found, please adjust data_dir: ", flights_path)
}

## Reading excel (xls/xlsx) file ----
# Load the library readxl to use the function read_excel() for reading excel
# files in R. The argument 'sheet' takes a string (sheet name) or an integer
# (sheet position).
planes_path <- file.path(data_dir, "planes.xlsx")
if (!requireNamespace("readxl", quietly = TRUE)) {
  message("Package 'readxl' is not installed; run install.packages(\"readxl\").")
} else {
  library(readxl)
  if (file.exists(planes_path)) {
    print(read_excel(planes_path, sheet = 1))
  } else {
    message("File not found, please adjust data_dir: ", planes_path)
  }
}