Introduction to R programming | Datacamp - Introduction to R programming | R programming for data scientist | Programming language for Data Scientist


An introduction to R from Datacamp

In this chapter, we will learn the basics of R and its most widely used data structures: vectors, matrices, factors, data frames, and lists.

Arithmetic operations:

# An addition
5 + 5 
[1] 10

# A subtraction
5 - 5 
[1] 0

# A multiplication
3 * 5
[1] 15

# A division
(5 + 5) / 2 
[1] 5

# Exponentiation
2 ^ 5
[1] 32

# Modulo
28 %% 6
[1] 4


Variable Assignment:

# Assign a value to the variable my_apples
my_apples <- 5 

# Fix the assignment of my_oranges
my_oranges <- 6 

# Create the variable my_fruit and print it out
my_fruit <- my_apples + my_oranges 
my_fruit
[1] 11


Basic data types in R:

# Declare variables of different types
my_numeric <- 42
my_character <- "universe"
my_logical <- FALSE 

# Check class of my_numeric
class(my_numeric)
[1] "numeric"

# Check class of my_character
class(my_character)
[1] "character"

# Check class of my_logical
class(my_logical)
[1] "logical"


Vectors:

# Creating vectors
# c() combines its comma-separated arguments into a single vector.
numeric_vector <- c(1, 10, 49)
character_vector <- c("a", "b", "c")

# Complete the code for boolean_vector
boolean_vector <- c(TRUE, FALSE, TRUE)


# Naming vectors 1

# Poker winnings from Monday to Friday
# Each day's result is its own element; negative values are losses.
poker_vector <- c(140, -50, 20, -120, 240)

# Roulette winnings from Monday to Friday
roulette_vector <- c(-24, -50, 100, -350, 10)

# Assign days as names of poker_vector
names(poker_vector) <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"
)

# Assign days as names of roulette_vector
names(roulette_vector) <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday", "Friday"
)


# Naming vectors 2

# Poker winnings from Monday to Friday
poker_vector <- c(140, -50, 20, -120, 240)

# Roulette winnings from Monday to Friday
roulette_vector <- c(-24, -50, 100, -350, 10)

# The variable days_vector
# Defined once so both vectors are guaranteed to share identical names.
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")

# Assign the names of the day to roulette_vector and poker_vector
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector


# Calculating total winnings

A_vector <- c(1, 2, 3)
B_vector <- c(4, 5, 6)

# Take the sum of A_vector and B_vector
# `+` is vectorized: elements are added position by position.
total_vector <- A_vector + B_vector

# Print out total_vector
total_vector
[1] 5 7 9


# Calculating total winnings 2

# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector

# Total winnings with poker
total_poker <- sum(poker_vector)

# Total winnings with roulette
total_roulette <- sum(roulette_vector)

# Total winnings overall
total_week <- total_poker + total_roulette

# Print out total_week
total_week
[1] -84


# Comparing total winnings

# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector

# Calculate total gains for poker and roulette
total_poker <- sum(poker_vector)
total_roulette <- sum(roulette_vector)

# Check if you realized higher total gains in poker than in roulette
# The comparison of two scalars yields a single logical value.
checkit <- total_poker > total_roulette
checkit
[1] TRUE


# Vector selection

# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector

# Define a new variable based on a selection
# Positions 2 to 5 correspond to Tuesday through Friday.
roulette_selection_vector <- roulette_vector[2:5]


# Vector selection 2

# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector

# Select poker results for Monday, Tuesday and Wednesday
# Elements are selected by name rather than by position.
poker_start <- poker_vector[c("Monday", "Tuesday", "Wednesday")]

# Calculate the average of the elements in poker_start
mean(poker_start)
[1] 36.66667


# Vector selection by comparison

# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector

# Which days did you make money on poker?
# The comparison is elementwise, giving one logical value per day.
selection_vector <- poker_vector > 0

# Print out selection_vector
selection_vector
   Monday   Tuesday Wednesday  Thursday    Friday 
     TRUE     FALSE      TRUE     FALSE      TRUE 


# Vector selection by comparison 2

# Poker and roulette winnings from Monday to Friday:
poker_vector <- c(140, -50, 20, -120, 240)
roulette_vector <- c(-24, -50, 100, -350, 10)
days_vector <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
names(poker_vector) <- days_vector
names(roulette_vector) <- days_vector

# Which days did you make money on roulette?
selection_vector <- roulette_vector > 0

# Select from roulette_vector these days
# Indexing with a logical vector keeps only the positions that are TRUE.
roulette_winning_days <- roulette_vector[selection_vector]


Matrices:

# Construct a matrix with 3 rows that contain the numbers 1 up to 9
# byrow = TRUE fills the matrix row by row instead of column by column.
matrix(1:9, byrow = TRUE, nrow = 3)
     [,1] [,2] [,3]
[1,]    1    2    3
[2,]    4    5    6
[3,]    7    8    9

# Box office Star Wars (in millions!)
# Each vector holds two figures: US revenue first, then non-US revenue.
new_hope <- c(460.998, 314.4)
empire_strikes <- c(290.475, 247.900)
return_jedi <- c(309.306, 165.8)

# Box office Star Wars (in millions!)
new_hope <- c(460.998, 314.4)
empire_strikes <- c(290.475, 247.900)
return_jedi <- c(309.306, 165.8)

# Construct matrix
# byrow = TRUE places each movie's pair of figures on its own row.
star_wars_matrix <- matrix(
  c(new_hope, empire_strikes, return_jedi),
  nrow = 3, byrow = TRUE
)

# Vectors region and titles, used for naming
region <- c("US", "non-US")
titles <- c("A New Hope", "The Empire Strikes Back", "Return of the Jedi")

# Name the columns with region
colnames(star_wars_matrix) <- region

# Name the rows with titles
rownames(star_wars_matrix) <- titles

# Print out star_wars_matrix
star_wars_matrix
                             US non-US
A New Hope              460.998  314.4
The Empire Strikes Back 290.475  247.9
Return of the Jedi      309.306  165.8

# Calculating the worldwide box office

# Construct star_wars_matrix
box_office <- c(460.998, 314.4, 290.475, 247.900, 309.306, 165.8)
region <- c("US", "non-US")
titles <- c(
  "A New Hope",
  "The Empire Strikes Back",
  "Return of the Jedi"
)

# dimnames takes a list of row names first, then column names.
star_wars_matrix <- matrix(
  box_office,
  nrow = 3, byrow = TRUE,
  dimnames = list(titles, region)
)

# Calculate worldwide box office figures
worldwide_vector <- rowSums(star_wars_matrix)


# Adding a column for the worldwide box office

# Construct star_wars_matrix
box_office <- c(460.998, 314.4, 290.475, 247.900, 309.306, 165.8)
region <- c("US", "non-US")
titles <- c(
  "A New Hope",
  "The Empire Strikes Back",
  "Return of the Jedi"
)

star_wars_matrix <- matrix(
  box_office,
  nrow = 3, byrow = TRUE,
  dimnames = list(titles, region)
)

# The worldwide box office figures
worldwide_vector <- rowSums(star_wars_matrix)

# Bind the new variable worldwide_vector as a column to star_wars_matrix
# cbind() names the new column after the variable that was passed in.
all_wars_matrix <- cbind(star_wars_matrix, worldwide_vector)


# Adding a row

# star_wars_matrix and star_wars_matrix2 are available in your workspace
star_wars_matrix  
star_wars_matrix2 

# Combine both Star Wars trilogies in one matrix
# rbind() stacks the rows of the second matrix below those of the first.
all_wars_matrix <- rbind(star_wars_matrix, star_wars_matrix2)


# The total box office revenue for the entire saga

# all_wars_matrix is available in your workspace
all_wars_matrix

# Total revenue for US and non-US
total_revenue_vector <- colSums(all_wars_matrix)
  
# Print out total_revenue_vector
total_revenue_vector
    US non-US 
2226.3 2087.8 

# Selection of matrix elements

# all_wars_matrix is available in your workspace
all_wars_matrix

# Select the non-US revenue for all movies
non_us_all <- all_wars_matrix[,2]
  
# Average non-US revenue
mean(non_us_all)
  
# Select the non-US revenue for first two movies
non_us_some <- all_wars_matrix[1:2,2]
  
# Average non-US revenue for first two movies
mean(non_us_some)
[1] 281.15


# Arithmetic with matrices

# all_wars_matrix and ticket_prices_matrix are available in your workspace
all_wars_matrix
[1]
all_wars_matrix
US non-US A New Hope 461.0 314.4 The Empire Strikes Back 290.5 247.9 Return of the Jedi 309.3 165.8 The Phantom Menace 474.5 552.5 Attack of the Clones 310.7 338.7 Revenge of the Sith 380.3 468.5
ticket_prices_matrix
[2]
ticket_prices_matrix
US non-US A New Hope 5.0 5.0 The Empire Strikes Back 6.0 6.0 Return of the Jedi 7.0 7.0 The Phantom Menace 4.0 4.0 Attack of the Clones 4.5 4.5 Revenge of the Sith 4.9 4.9

# Estimated number of visitors
visitors <- all_wars_matrix/ticket_prices_matrix

# US visitors
us_visitors <- visitors[,1]

# Average number of US visitors
mean(us_visitors)
[1] 75.01401

Factors:

# Assign to the variable theory what this chapter is about!
theory <- "factors"

# Sex vector
sex_vector <- c("Male", "Female", "Female", "Male", "Male")

# Convert sex_vector to a factor
# factor() sorts the distinct values to form the levels.
factor_sex_vector <- factor(sex_vector)

# Print out factor_sex_vector
factor_sex_vector
[1] Male Female Female Male Male Levels: Female Male


# Animals
# A nominal variable: the categories have no natural ordering.
animals_vector <- c("Elephant", "Giraffe", "Donkey", "Horse")
factor_animals_vector <- factor(animals_vector)
factor_animals_vector
[1] Elephant Giraffe Donkey Horse Levels: Donkey Elephant Giraffe Horse

# Temperature
# An ordinal variable: the levels are listed from lowest to highest.
temperature_vector <- c("High", "Low", "High", "Low", "Medium")
# `ordered` is spelled out in full instead of relying on partial matching.
factor_temperature_vector <- factor(
  temperature_vector,
  ordered = TRUE,
  levels = c("Low", "Medium", "High")
)
factor_temperature_vector
[1] High Low High Low Medium Levels: Low < Medium < High


# Factor levels

# Code to build factor_survey_vector
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)

# Specify the levels of factor_survey_vector
# The default levels are sorted ("F", "M"), so the new labels must be given
# in that same order.
levels(factor_survey_vector) <- c("Female", "Male")

factor_survey_vector
[1] Male Female Female Male Male Levels: Female Male


# Summarizing a factor

# Build factor_survey_vector with clean levels
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)
levels(factor_survey_vector) <- c("Female", "Male")
factor_survey_vector

# Generate summary for survey_vector
# For a character vector, summary() reports only length, class and mode.
summary(survey_vector)

# Generate summary for factor_survey_vector
# For a factor, summary() counts the observations in each level.
summary(factor_survey_vector)
Female Male 2 3

# Battle of the sexes

# Build factor_survey_vector with clean levels
survey_vector <- c("M", "F", "F", "M", "M")
factor_survey_vector <- factor(survey_vector)
levels(factor_survey_vector) <- c("Female", "Male")

# Male
male <- factor_survey_vector[1]

# Female
female <- factor_survey_vector[2]

# Battle of the sexes: Male 'larger' than female?
# The factor is unordered, so R returns NA with a warning that `>` is not
# meaningful for factors.
male > female


# Ordered factors

# Create speed_vector
speed_vector <- c("medium", "slow", "slow", "medium", "fast")

# Convert speed_vector to ordered factor vector
# Levels are listed from slowest to fastest to define the ordering.
factor_speed_vector <- factor(
  speed_vector,
  ordered = TRUE,
  levels = c("slow", "medium", "fast")
)

# Print factor_speed_vector
factor_speed_vector
[1] medium slow slow medium fast Levels: slow < medium < fast

summary(factor_speed_vector)
slow medium fast 2 2 1


# Comparing ordered factors

# Create factor_speed_vector
speed_vector <- c("medium", "slow", "slow", "medium", "fast")
factor_speed_vector <- factor(
  speed_vector,
  ordered = TRUE,
  levels = c("slow", "medium", "fast")
)

# Factor value for second data analyst
da2 <- factor_speed_vector[2]

# Factor value for fifth data analyst
da5 <- factor_speed_vector[5]

# Is data analyst 2 faster than data analyst 5?
# Comparison operators are meaningful here because the factor is ordered.
da2 > da5
[1] FALSE


Data frames:

# Print out built-in R data frame
mtcars 
# Call head() on mtcars
head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1

# finding the structure of the dataset

# Investigate the structure of mtcars
str(mtcars)
'data.frame': 32 obs. of 11 variables: $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ... $ cyl : num 6 6 4 6 8 6 8 4 4 6 ... $ disp: num 160 160 108 258 360 ... $ hp : num 110 110 93 110 175 105 245 62 95 123 ... $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ... $ wt : num 2.62 2.88 2.32 3.21 3.44 ... $ qsec: num 16.5 17 18.6 19.4 17 ... $ vs : num 0 0 1 1 0 1 0 1 1 1 ... $ am : num 1 1 1 0 0 0 0 0 0 0 ... $ gear: num 4 4 4 3 3 3 3 4 4 4 ... $ carb: num 4 4 1 1 2 1 4 2 2 4 ...

# Creating a data frame

# Definition of vectors
# All five vectors have eight elements, one per planet, in the same order.
name <- c(
  "Mercury", "Venus", "Earth", "Mars",
  "Jupiter", "Saturn", "Uranus", "Neptune"
)
type <- c(
  "Terrestrial planet", "Terrestrial planet",
  "Terrestrial planet", "Terrestrial planet",
  "Gas giant", "Gas giant", "Gas giant", "Gas giant"
)
diameter <- c(0.382, 0.949, 1, 0.532, 11.209, 9.449, 4.007, 3.883)
rotation <- c(58.64, -243.02, 1, 1.03, 0.41, 0.43, -0.72, 0.67)
rings <- c(FALSE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE)

# Create a data frame from the vectors
# Column names are taken from the names of the variables passed in.
planets_df <- data.frame(name, type, diameter, rotation, rings)

# Check the structure of planets_df
str(planets_df)
'data.frame': 8 obs. of 5 variables: $ name : chr "Mercury" "Venus" "Earth" "Mars" ... $ type : chr "Terrestrial planet" "Terrestrial planet" "Terrestrial planet" 
"Terrestrial planet"... $ diameter: num 0.382 0.949 1 0.532 11.209 ... $ rotation: num 58.64 -243.02 1 1.03 0.41 ... $ rings : logi FALSE FALSE FALSE FALSE TRUE TRUE ...


# Selection of data frame elements

# The planets_df data frame from the previous exercise is pre-loaded

# Print out diameter of Mercury (row 1, column 3)
planets_df[1,3]
[1] 0.382

# Print out data for Mars (entire fourth row)
planets_df[4,]
name type diameter rotation rings 4 Mars Terrestrial planet 0.532 1.03 FALSE

# Select first 5 values of diameter column
# Row selector first, then the column selector, separated by a comma.
planets_df[1:5, "diameter"]
[1] 0.382 0.949 1.000 0.532 11.209

# Select the rings variable from planets_df
rings_vector <- planets_df$rings
  
# Print out rings_vector
rings_vector
[1] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE

# Adapt the code to select all columns for planets with rings
planets_df[rings_vector, ]
name type diameter rotation rings 5 Jupiter Gas giant 11.209 0.41 TRUE 6 Saturn Gas giant 9.449 0.43 TRUE 7 Uranus Gas giant 4.007 -0.72 TRUE 8 Neptune Gas giant 3.883 0.67 TRUE

# Select planets with diameter < 1
subset(planets_df,diameter < 1)
name type diameter rotation rings 1 Mercury Terrestrial planet 0.382 58.64 FALSE 2 Venus Terrestrial planet 0.949 -243.02 FALSE 4 Mars Terrestrial planet 0.532 1.03 FALSE


# Sorting the data frame

# Use order() to create positions
positions <- order(planets_df$diameter)

# Use positions to sort planets_df
planets_df[positions, ]
name type diameter rotation rings 1 Mercury Terrestrial planet 0.382 58.64 FALSE 4 Mars Terrestrial planet 0.532 1.03 FALSE 2 Venus Terrestrial planet 0.949 -243.02 FALSE 3 Earth Terrestrial planet 1.000 1.00 FALSE 8 Neptune Gas giant 3.883 0.67 TRUE 7 Uranus Gas giant 4.007 -0.72 TRUE 6 Saturn Gas giant 9.449 0.43 TRUE 5 Jupiter Gas giant 11.209 0.41 TRUE


Lists:

# Creating a list

# Vector with numerics from 1 up to 10
my_vector <- 1:10

# Matrix with numerics from 1 up to 9
# With ncol = 3 and the default column-wise fill, this is a 3 x 3 matrix.
my_matrix <- matrix(1:9, ncol = 3)

# First 10 elements of the built-in data frame mtcars
my_df <- mtcars[1:10, ]

# Construct a list with these different elements
my_list <- list(my_vector, my_matrix, my_df)

# Adapt list() call to give the components names, then print out my_list
my_list <- list(vec = my_vector, mat = my_matrix, df = my_df)
my_list
$vec [1] 1 2 3 4 5 6 7 8 9 10 $mat [,1] [,2] [,3] [1,] 1 4 7 [2,] 2 5 8 [3,] 3 6 9 $df mpg cyl disp hp drat wt qsec vs am gear carb Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4


# The variables mov, act and rev are available

# Finish the code to build shining_list
# Each component is given a name so it can later be accessed with `$`.
shining_list <- list(moviename = mov, actors = act, reviews = rev)

# Print out the vector representing the actors
shining_list$actors
[1] "Jack Nicholson" "Shelley Duvall" "Danny Lloyd" "Scatman Crothers" [5] "Barry Nelson"

# Print the second element of the vector representing the actors
shining_list$actors[2]
[1] "Shelley Duvall"

# Creating a new list for another movie

# Use the table from the exercise to define the comments and scores vectors
# Both vectors have five entries so they line up row by row in the data frame.
scores <- c(4.6, 5, 4.8, 5, 4.2)
comments <- c(
  "I would watch it again", "Amazing!", "I liked it",
  "One of the best movies", "Fascinating plot"
)

# Save the average of the scores vector as avg_review
avg_review <- mean(scores)

# Combine scores and comments into the reviews_df data frame
reviews_df <- data.frame(scores, comments)

# Create and print out a list, called departed_list
# The components are unnamed, so the list prints with [[1]] ... [[4]] headers.
departed_list <- list(movie_title, movie_actors, reviews_df, avg_review)
departed_list
[[1]] [1] "The Departed" [[2]] [1] "Leonardo DiCaprio" "Matt Damon" "Jack Nicholson" [4] "Mark Wahlberg" "Vera Farmiga" "Martin Sheen" [[3]] scores comments 1 4.6 I would watch it again 2 5.0 Amazing! 3 4.8 I liked it 4 5.0 One of the best movies 5 4.2 Fascinating plot [[4]] [1] 4.72

Thank you.

Comments

Popular posts

Intermediate R programming | Datacamp- Intermediate R programming | R programming for data scientist | programming language for Data Scientist