This tutorial introduces how to easily manipulate data in R using the tidyverse package. Data manipulation usually involves
-
computing summary statistics.
-
rows filtering (filter()) and ordering (arrange()).
-
renaming (rename()) and selecting certain columns (select()
and adding columns
-
mutate(): compute and add new variables into a data table. It preserves existing variables and adds new columns at the end of your dataset and
-
transmute(): compute new columns and only keep the new columns,
setwd(“wdirectory”) Changes the current working directory to wdirectory.
setwd('C:/Users/USER/Desktop/JUPYTER_NOTEBOOK/A_MYTUTORIALS/MYR')
Load the needed packages
library(devtools)
library(tidyverse)
library(nycflights13)
library(readxl)
Loading required package: usethis
Warning message in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
"non-uniform 'Rounding' sampler used"
-- [1mAttaching packages[22m ------------------------------------------------------------------------------- tidyverse 1.3.1 --
[32mv[39m [34mggplot2[39m 3.3.3 [32mv[39m [34mpurrr [39m 0.3.4
[32mv[39m [34mtibble [39m 3.1.1 [32mv[39m [34mdplyr [39m 1.0.5
[32mv[39m [34mtidyr [39m 1.1.3 [32mv[39m [34mstringr[39m 1.4.0
[32mv[39m [34mreadr [39m 1.4.0 [32mv[39m [34mforcats[39m 0.5.1
Warning message in (function (kind = NULL, normal.kind = NULL, sample.kind = NULL) :
"non-uniform 'Rounding' sampler used"
-- [1mConflicts[22m ---------------------------------------------------------------------------------- tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m masks [34mstats[39m::lag()
Datatset
We will use titanic dataset. This dataset has 1309 observations with 14 variables. To explore the basic data manipulation verbs of dplyr, we start by converting the data into a tibble data frame for easier data manipulation
my_data<-as_tibble(read_xls('data/titanic.xls'))
names(my_data)
Warning message in read_fun(path = enc2native(normalizePath(path)), sheet_i = sheet, :
"Coercing text to numeric in M1306 / R1306C13: '328'"
Variable Selection
we can select or subset variables by names or position.
Under variable selection we will learn how to use
-
select(): allow us to extract variables or variables as a data table and can also be used to remove variables from the data frame.
-
select_if(): Select variabless based on a particular condition.
-
Variabl Selection by position
select from the my_data variable positioned from 1 to 4 inclusive
head(my_data %>% select(1:4))
pclass | survived | name | sex |
---|---|---|---|
<dbl> | <dbl> | <chr> | <chr> |
1 | 1 | Allen, Miss. Elisabeth Walton | female |
1 | 1 | Allison, Master. Hudson Trevor | male |
1 | 0 | Allison, Miss. Helen Loraine | female |
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male |
1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female |
1 | 1 | Anderson, Mr. Harry | male |
select variables positioned at 1, 4,6,7
head(my_data %>% select(1,4,6,7))
pclass | sex | sibsp | parch |
---|---|---|---|
<dbl> | <chr> | <dbl> | <dbl> |
1 | female | 0 | 0 |
1 | male | 1 | 2 |
1 | female | 1 | 2 |
1 | male | 1 | 2 |
1 | female | 1 | 2 |
1 | male | 0 | 0 |
Select varaibles by name
head(select(my_data,pclass,name:age),3)
pclass | name | sex | age |
---|---|---|---|
<dbl> | <chr> | <chr> | <dbl> |
1 | Allen, Miss. Elisabeth Walton | female | 29.0000 |
1 | Allison, Master. Hudson Trevor | male | 0.9167 |
1 | Allison, Miss. Helen Loraine | female | 2.0000 |
Select all variables except variables from survived to cabin
head(select(my_data,-(survived:cabin)),3)
pclass | embarked | boat | body | home.dest |
---|---|---|---|---|
<dbl> | <chr> | <chr> | <dbl> | <chr> |
1 | S | 2 | NA | St Louis, MO |
1 | S | 11 | NA | Montreal, PQ / Chesterville, ON |
1 | S | NA | NA | Montreal, PQ / Chesterville, ON |
select variables whose name starts with bo
head(my_data %>% select(starts_with('bo')),3)
boat | body |
---|---|
<chr> | <dbl> |
2 | NA |
11 | NA |
NA | NA |
select variables whose name ends with t
head(my_data %>% select(ends_with('t')),3)
ticket | boat | home.dest |
---|---|---|
<chr> | <chr> | <chr> |
24160 | 2 | St Louis, MO |
113781 | 11 | Montreal, PQ / Chesterville, ON |
113781 | NA | Montreal, PQ / Chesterville, ON |
Select variables whose names contains “me”
head(my_data %>% select(contains('me')),4)
name | home.dest |
---|---|
<chr> | <chr> |
Allen, Miss. Elisabeth Walton | St Louis, MO |
Allison, Master. Hudson Trevor | Montreal, PQ / Chesterville, ON |
Allison, Miss. Helen Loraine | Montreal, PQ / Chesterville, ON |
Allison, Mr. Hudson Joshua Creighton | Montreal, PQ / Chesterville, ON |
Variable selection based on a condtion
select only character variables
head(my_data %>% select_if(is.character),4)
name | sex | ticket | cabin | embarked | boat | home.dest |
---|---|---|---|---|---|---|
<chr> | <chr> | <chr> | <chr> | <chr> | <chr> | <chr> |
Allen, Miss. Elisabeth Walton | female | 24160 | B5 | S | 2 | St Louis, MO |
Allison, Master. Hudson Trevor | male | 113781 | C22 C26 | S | 11 | Montreal, PQ / Chesterville, ON |
Allison, Miss. Helen Loraine | female | 113781 | C22 C26 | S | NA | Montreal, PQ / Chesterville, ON |
Allison, Mr. Hudson Joshua Creighton | male | 113781 | C22 C26 | S | NA | Montreal, PQ / Chesterville, ON |
selecting only numerical variables
head(select_if(my_data,is.numeric),4)
pclass | survived | age | sibsp | parch | fare | body |
---|---|---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
1 | 1 | 29.0000 | 0 | 0 | 211.3375 | NA |
1 | 1 | 0.9167 | 1 | 2 | 151.5500 | NA |
1 | 0 | 2.0000 | 1 | 2 | 151.5500 | NA |
1 | 0 | 30.0000 | 1 | 2 | 151.5500 | 135 |
Removing columns
For simplicity we will work with only few variables
sub_data<-my_data %>% select(sex,pclass,age,'survived',cabin,'name','sex','age','sibsp')
remove variables named pclass,age and sex
head(sub_data%>% select(-sex,-pclass,-age),3)
survived | cabin | name | sibsp |
---|---|---|---|
<dbl> | <chr> | <chr> | <dbl> |
1 | B5 | Allen, Miss. Elisabeth Walton | 0 |
1 | C22 C26 | Allison, Master. Hudson Trevor | 1 |
0 | C22 C26 | Allison, Miss. Helen Loraine | 1 |
Rows filtering (filter())
This section describes how to subset or extract samples or rows from the dataset based on certain criteria
extract male (sex==’male’) passengers who survived (survived==1) and has sibsp==1 (Number of Siblings/Spouses Aboard)
head(sub_data %>% filter(sex=='male' & sibsp==1 & survived==1),2)
sex | pclass | age | survived | cabin | name | sibsp |
---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> |
male | 1 | 0.9167 | 1 | C22 C26 | Allison, Master. Hudson Trevor | 1 |
male | 1 | 37.0000 | 1 | D35 | Beckwith, Mr. Richard Leonard | 1 |
OR
head(sub_data %>% filter(sex=='male', sibsp==1,survived==1),2)
sex | pclass | age | survived | cabin | name | sibsp |
---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> |
male | 1 | 0.9167 | 1 | C22 C26 | Allison, Master. Hudson Trevor | 1 |
male | 1 | 37.0000 | 1 | D35 | Beckwith, Mr. Richard Leonard | 1 |
extract rows where passengers are male(sex==’male’) or survived (survived==1) or has sibsp==1 or 2 (Number of Siblings/Spouses Aboard)
head(sub_data %>% filter(sex=='male' | sibsp==1 |sibsp==2| survived==1),3)
sex | pclass | age | survived | cabin | name | sibsp |
---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> |
female | 1 | 29.0000 | 1 | B5 | Allen, Miss. Elisabeth Walton | 0 |
male | 1 | 0.9167 | 1 | C22 C26 | Allison, Master. Hudson Trevor | 1 |
female | 1 | 2.0000 | 0 | C22 C26 | Allison, Miss. Helen Loraine | 1 |
select variables sibsp,sex,age and from these variables extract rows where age<10
head(sub_data %>% select(sex,sibsp,age) %>%filter(age<10),3)
sex | sibsp | age |
---|---|---|
<chr> | <dbl> | <dbl> |
male | 1 | 0.9167 |
female | 1 | 2.0000 |
male | 0 | 4.0000 |
Selecting random rows or samples from a dataset
selecting 10 random samples without replacement from the data
head(sub_data %>% sample_n(10,replace = FALSE),2)
sex | pclass | age | survived | cabin | name | sibsp |
---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> |
male | 3 | 20 | 0 | NA | Vendel, Mr. Olof Edvin | 0 |
female | 1 | 35 | 1 | C123 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | 1 |
Select 1% random samples without replacement from the data
sub_data %>% sample_frac(0.01,replace = FALSE)
sex | pclass | age | survived | cabin | name | sibsp |
---|---|---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <chr> | <chr> | <dbl> |
male | 1 | 34 | 1 | NA | Seward, Mr. Frederic Kimber | 0 |
male | 3 | 44 | 0 | NA | Cribb, Mr. John Hatfield | 0 |
male | 3 | 23 | 1 | NA | Asplund, Mr. Johan Charles | 0 |
male | 3 | 26 | 0 | NA | Bostandyeff, Mr. Guentcho | 0 |
male | 2 | 27 | 0 | NA | Pulbaum, Mr. Franz | 0 |
male | 3 | 17 | 0 | NA | Elias, Mr. Joseph Jr | 1 |
male | 1 | 41 | 0 | D21 | Kenyon, Mr. Frederick R | 1 |
male | 3 | 31 | 1 | NA | Stranden, Mr. Juho | 0 |
female | 2 | 25 | 1 | NA | Shelley, Mrs. William (Imanita Parrish Hall) | 0 |
male | 3 | NA | 0 | NA | Petroff, Mr. Pastcho ("Pentcho") | 0 |
male | 3 | NA | 1 | NA | O'Keefe, Mr. Patrick | 0 |
male | 2 | 2 | 1 | NA | Wells, Master. Ralph Lester | 1 |
male | 3 | 13 | 0 | NA | Asplund, Master. Filip Oscar | 4 |
Missing values
Number of missing values in the age variable
sub_data %>% summarise(num_na=sum(is.na(age)))
num_na |
---|
<int> |
263 |
number of missing values in each variable
sub_data %>% purrr::map_df(~sum(is.na(.)))
sex | pclass | age | survived | cabin | name | sibsp |
---|---|---|---|---|---|---|
<int> | <int> | <int> | <int> | <int> | <int> | <int> |
0 | 0 | 263 | 0 | 1014 | 0 | 0 |
nrow(sub_data)
1309
drop samples of the variables age and cabin with nas
no_nas<-sub_data %>% filter_at(vars(age,cabin),all_vars(!is.na(.)))
nrow(no_nas)
272
Adding New Variables
Under this section we will use the housing dataset
housing<-readr::read_csv('data/housing.csv',guess_max = 20)
[36m--[39m [1m[1mColumn specification[1m[22m [36m------------------------------------------------------------------------------------------------[39m
cols(
longitude = [32mcol_double()[39m,
latitude = [32mcol_double()[39m,
housing_median_age = [32mcol_double()[39m,
total_rooms = [32mcol_double()[39m,
total_bedrooms = [32mcol_double()[39m,
population = [32mcol_double()[39m,
households = [32mcol_double()[39m,
median_income = [32mcol_double()[39m,
median_house_value = [32mcol_double()[39m,
ocean_proximity = [31mcol_character()[39m
)
Under this section we will select only few variables needed to created new variables
housing<-housing %>% select(total_rooms,households,total_bedrooms,total_rooms,
population,households,ocean_proximity,median_income)
mutate()
mutate() adds new variables at the end of your dataset
head(housing
%>% mutate( rooms_per_household= total_rooms/households,
bedrooms_per_room=total_bedrooms/total_rooms,
population_per_household=population/households
)
%>% select(-c(total_rooms,households,population,total_bedrooms,median_income))
,3)
ocean_proximity | rooms_per_household | bedrooms_per_room | population_per_household |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
NEAR BAY | 6.984127 | 0.1465909 | 2.555556 |
NEAR BAY | 6.238137 | 0.1557966 | 2.109842 |
NEAR BAY | 8.288136 | 0.1295160 | 2.802260 |
transmute() only keep the new variables created
head(housing %>%
transmute(
rooms_per_household= total_rooms/households,
bedrooms_per_room=total_bedrooms/total_rooms,
population_per_household=population/households
),
3)
rooms_per_household | bedrooms_per_room | population_per_household |
---|---|---|
<dbl> | <dbl> | <dbl> |
6.984127 | 0.1465909 | 2.555556 |
6.238137 | 0.1557966 | 2.109842 |
8.288136 | 0.1295160 | 2.802260 |
Summary Statistics
getmode <- function(n) {
uniqn <- unique(n)
uniqn[which.max(tabulate(match(n, uniqn)))]
}
housing %>% summarise(count=n(),mean_income=mean(median_income,na.rm=TRUE),
mode_income=getmode(median_income))
count | mean_income | mode_income |
---|---|---|
<int> | <dbl> | <dbl> |
20640 | 3.870671 | 3.125 |
Group by one variable
- Note : you can groupe by multiple variables
housing %>%group_by(ocean_proximity) %>% summarise(mean_income=mean(median_income,na.rm=TRUE),
mean_housholde=mean(households))
ocean_proximity | mean_income | mean_housholde |
---|---|---|
<chr> | <dbl> | <dbl> |
<1H OCEAN | 4.230682 | 517.7450 |
INLAND | 3.208996 | 477.4476 |
ISLAND | 2.744420 | 276.6000 |
NEAR BAY | 4.172885 | 488.6162 |
NEAR OCEAN | 4.005785 | 501.2445 |
summary statistics on numerical variables group by ocean proximity
housing%>% select(-population)%>%group_by(ocean_proximity) %>%
summarise_if(is.numeric, mean, na.rm = TRUE)
ocean_proximity | total_rooms | households | total_bedrooms | median_income |
---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <dbl> |
<1H OCEAN | 2628.344 | 517.7450 | 546.5392 | 4.230682 |
INLAND | 2717.743 | 477.4476 | 533.8816 | 3.208996 |
ISLAND | 1574.600 | 276.6000 | 420.4000 | 2.744420 |
NEAR BAY | 2493.590 | 488.6162 | 514.1828 | 4.172885 |
NEAR OCEAN | 2583.701 | 501.2445 | 538.6157 | 4.005785 |