filter
select
arrange
mutate
summarise
(plus group_by
)# load packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Loading the Boston city payroll
payroll <- read.csv("http://andrewbatran.com/ccsu-2017/slides/class6/bostonpayroll2013.csv", stringsAsFactors=F)
# Glimpse the first six rows of data
head(payroll)
## NAME TITLE
## 1 Darosa, Baltazar Police Officer
## 2 Creaven, Jacqueline D Police Lieutenant
## 3 Cawley, Stephen C Police Lieutenant
## 4 Danilecki, John H Police Captain
## 5 Kervin, Timothy M. Police Lieutenant/Hdq Dispatch
## 6 Hosein, Haseeb Police Lieutenant
## DEPARTMENT REGULAR RETRO OTHER OVERTIME INJURED
## 1 Boston Police Department $66,829 $0 $209,237 $10,473 $0
## 2 Boston Police Department $104,661 $0 $79,248 $47,572 $0
## 3 Boston Police Department $44,632 $0 $216,037 $10,158 $0
## 4 Boston Police Department $121,244 $0 $15,663 $71,198 $0
## 5 Boston Police Department $104,588 $0 $13,494 $88,906 $0
## 6 Boston Police Department $105,261 $0 $12,883 $47,437 $0
## DETAIL QUINN TOTAL.EARNINGS Community ZIPCode State X X.1
## 1 $629 $6,724 $293,892 Brockton 2302 Massachusetts
## 2 $33,918 $23,721 $289,121 Boston 2132 Massachusetts
## 3 $4,046 $6,031 $280,904 Quincy 2169 Massachusetts
## 4 $41,943 $15,156 $265,203 Walpole 2081 Massachusetts
## 5 $41,840 $12,988 $261,815 Boston 2135 Massachusetts
## 6 $76,310 $13,129 $255,020 Milton 2186 Massachusetts
tbl_df
creates a “local data frame”
Local data frame is simply a wrapper for a data frame that prints nicely
# convert to local data frame
boston_payroll <- tbl_df(payroll)
# printing only shows 10 rows and as many columns as can fit on your screen
boston_payroll
## # A tibble: 22,297 × 16
## NAME TITLE
## <chr> <chr>
## 1 Darosa, Baltazar Police Officer
## 2 Creaven, Jacqueline D Police Lieutenant
## 3 Cawley, Stephen C Police Lieutenant
## 4 Danilecki, John H Police Captain
## 5 Kervin, Timothy M. Police Lieutenant/Hdq Dispatch
## 6 Hosein, Haseeb Police Lieutenant
## 7 Lee, Thomas F Police Captain/DDC
## 8 Assad, Mark L Police Sergeant (Det)
## 9 Eversley, Eric V Police Lieutenant (Det)
## 10 Crossen, Patrick J Police Captain/DDC
## # ... with 22,287 more rows, and 14 more variables: DEPARTMENT <chr>,
## # REGULAR <chr>, RETRO <chr>, OTHER <chr>, OVERTIME <chr>,
## # INJURED <chr>, DETAIL <chr>, QUINN <chr>, TOTAL.EARNINGS <chr>,
## # Community <chr>, ZIPCode <int>, State <chr>, X <chr>, X.1 <chr>
After looking at the dataframe, you might’ve spotted the dollar signs and commas.
Also, it showed that each of the payroll figures were chr as in characters, when it should be numeric.
So, let’s clean it up before we move on to the math part later on.
boston_payroll$REGULAR <- gsub("\\$", "", boston_payroll$REGULAR)
boston_payroll$REGULAR <- gsub(",", "", boston_payroll$REGULAR)
boston_payroll$REGULAR <- as.numeric(boston_payroll$REGULAR)
boston_payroll$RETRO <- gsub("\\$", "", boston_payroll$RETRO)
boston_payroll$RETRO <- gsub(",", "", boston_payroll$RETRO)
boston_payroll$RETRO <- as.numeric(boston_payroll$RETRO)
boston_payroll$OTHER <- gsub("\\$", "", boston_payroll$OTHER)
boston_payroll$OTHER <- gsub(",", "", boston_payroll$OTHER)
boston_payroll$OTHER <- as.numeric(boston_payroll$OTHER)
boston_payroll$OTHER <- gsub("\\$", "", boston_payroll$OTHER)
boston_payroll$OTHER <- gsub(",", "", boston_payroll$OTHER)
boston_payroll$OTHER <- as.numeric(boston_payroll$OTHER)
boston_payroll$OVERTIME <- gsub("\\$", "", boston_payroll$OVERTIME)
boston_payroll$OVERTIME <- gsub(",", "", boston_payroll$OVERTIME)
boston_payroll$OVERTIME <- as.numeric(boston_payroll$OVERTIME)
boston_payroll$INJURED <- gsub("\\$", "", boston_payroll$INJURED)
boston_payroll$INJURED <- gsub(",", "", boston_payroll$INJURED)
boston_payroll$INJURED <- as.numeric(boston_payroll$INJURED)
boston_payroll$DETAIL <- gsub("\\$", "", boston_payroll$DETAIL)
boston_payroll$DETAIL <- gsub(",", "", boston_payroll$DETAIL)
boston_payroll$DETAIL <- as.numeric(boston_payroll$DETAIL)
boston_payroll$QUINN <- gsub("\\$", "", boston_payroll$QUINN)
boston_payroll$QUINN <- gsub(",", "", boston_payroll$QUINN)
boston_payroll$QUINN <- as.numeric(boston_payroll$QUINN)
boston_payroll$TOTAL.EARNINGS <- gsub("\\$", "", boston_payroll$TOTAL.EARNINGS)
boston_payroll$TOTAL.EARNINGS <- gsub(",", "", boston_payroll$TOTAL.EARNINGS)
boston_payroll$TOTAL.EARNINGS <- as.numeric(boston_payroll$TOTAL.EARNINGS)
Base R approach
# To view all employees who are Special Assistants in the Mayor's Office
boston_payroll[boston_payroll$TITLE=="Special Assistant" & boston_payroll$DEPARTMENT=="Mayor's Office", ]
dplyr approach
# note: you can use comma or ampersand to represent AND condition
filter(boston_payroll, TITLE=="Special Assistant", DEPARTMENT=="Mayor's Office")
## # A tibble: 14 × 16
## NAME TITLE DEPARTMENT REGULAR
## <chr> <chr> <chr> <dbl>
## 1 Harrington, Patrick S Special Assistant Mayor's Office 123000
## 2 Leibowitz, Howard Richard Special Assistant Mayor's Office 118087
## 3 Gannon, Margaret C Special Assistant Mayor's Office 101470
## 4 Pierce, Martha E Special Assistant Mayor's Office 96859
## 5 Osgood, Christopher Special Assistant Mayor's Office 97556
## 6 Carter, Kristopher Special Assistant Mayor's Office 75697
## 7 Lusk, Katharine A Special Assistant Mayor's Office 68268
## 8 Satti, Christopher Special Assistant Mayor's Office 64300
## 9 Depina, Joceline G. Special Assistant Mayor's Office 58019
## 10 Lopez Cortes, Nick Scott Special Assistant Mayor's Office 48769
## 11 Santos, Rui Special Assistant Mayor's Office 26848
## 12 Dickey, Landon Spiller Special Assistant Mayor's Office 25269
## 13 Arellano, Rebecca Special Assistant Mayor's Office 22470
## 14 Passacantilli, Stephen M. Special Assistant Mayor's Office 16962
## # ... with 12 more variables: RETRO <dbl>, OTHER <dbl>, OVERTIME <dbl>,
## # INJURED <dbl>, DETAIL <dbl>, QUINN <dbl>, TOTAL.EARNINGS <dbl>,
## # Community <chr>, ZIPCode <int>, State <chr>, X <chr>, X.1 <chr>
filter(boston_payroll, TITLE=="Special Assistant" | TITLE=="Athletic Assistant")
## # A tibble: 44 × 16
## NAME TITLE DEPARTMENT
## <chr> <chr> <chr>
## 1 Harrington, Patrick S Special Assistant Mayor's Office
## 2 Leibowitz, Howard Richard Special Assistant Mayor's Office
## 3 Frisch, Rebecca A. Special Assistant Administration and Finance
## 4 Gannon, Margaret C Special Assistant Mayor's Office
## 5 Pierce, Martha E Special Assistant Mayor's Office
## 6 Osgood, Christopher Special Assistant Mayor's Office
## 7 Carter, Kristopher Special Assistant Mayor's Office
## 8 Murnane, Edith Special Assistant Environment Department
## 9 Hussain, Benita Del Special Assistant Environment Department
## 10 Zhu, Charles Mihyi Special Assistant Environment Department
## # ... with 34 more rows, and 13 more variables: REGULAR <dbl>,
## # RETRO <dbl>, OTHER <dbl>, OVERTIME <dbl>, INJURED <dbl>, DETAIL <dbl>,
## # QUINN <dbl>, TOTAL.EARNINGS <dbl>, Community <chr>, ZIPCode <int>,
## # State <chr>, X <chr>, X.1 <chr>
base R approach to select Title, Department, and Regular columns
boston_payroll[, c("TITLE", "DEPARTMENT", "REGULAR")]
dplyr method
select(boston_payroll, TITLE, DEPARTMENT, REGULAR)
## # A tibble: 22,297 × 3
## TITLE DEPARTMENT REGULAR
## <chr> <chr> <dbl>
## 1 Police Officer Boston Police Department 66829
## 2 Police Lieutenant Boston Police Department 104661
## 3 Police Lieutenant Boston Police Department 44632
## 4 Police Captain Boston Police Department 121244
## 5 Police Lieutenant/Hdq Dispatch Boston Police Department 104588
## 6 Police Lieutenant Boston Police Department 105261
## 7 Police Captain/DDC Boston Police Department 106048
## 8 Police Sergeant (Det) Boston Police Department 94641
## 9 Police Lieutenant (Det) Boston Police Department 106564
## 10 Police Captain/DDC Boston Police Department 122705
## # ... with 22,287 more rows
Use colon to select multiple contiguous columns, and use contains
to match columns by name note: starts_with
, ends_with
, and matches
(for regular expressions) can also be used to match columns by name select(flights, Year:DayofMonth, contains(“Taxi”), contains(“Delay”))
select(boston_payroll, TITLE:RETRO, contains("O"))
## # A tibble: 22,297 × 9
## TITLE DEPARTMENT REGULAR RETRO
## <chr> <chr> <dbl> <dbl>
## 1 Police Officer Boston Police Department 66829 0
## 2 Police Lieutenant Boston Police Department 104661 0
## 3 Police Lieutenant Boston Police Department 44632 0
## 4 Police Captain Boston Police Department 121244 0
## 5 Police Lieutenant/Hdq Dispatch Boston Police Department 104588 0
## 6 Police Lieutenant Boston Police Department 105261 0
## 7 Police Captain/DDC Boston Police Department 106048 0
## 8 Police Sergeant (Det) Boston Police Department 94641 0
## 9 Police Lieutenant (Det) Boston Police Department 106564 0
## 10 Police Captain/DDC Boston Police Department 122705 0
## # ... with 22,287 more rows, and 5 more variables: OTHER <dbl>,
## # OVERTIME <dbl>, TOTAL.EARNINGS <dbl>, Community <chr>, ZIPCode <int>
Nesting method to select TITLE and REGULAR pay columns and filter for pay more than $150,000.
filter(select(boston_payroll, TITLE, REGULAR), REGULAR > 150000)
## # A tibble: 40 × 2
## TITLE REGULAR
## <chr> <dbl>
## 1 Superintendent 212663
## 2 Dep Fire Chief 159630
## 3 Dep Fire Chief Administration 169593
## 4 Supn-In Chief 162514
## 5 District FireChiefAdmn-AdvTech 155827
## 6 Dep Fire Chief 159630
## 7 President 176626
## 8 Commissioner (Bpd) 156289
## 9 Supn Bpd 154775
## 10 Supn Bpd 154775
## # ... with 30 more rows
Chaining method
boston_payroll %>%
select(TITLE, REGULAR) %>%
filter(REGULAR > 150000)
## # A tibble: 40 × 2
## TITLE REGULAR
## <chr> <dbl>
## 1 Superintendent 212663
## 2 Dep Fire Chief 159630
## 3 Dep Fire Chief Administration 169593
## 4 Supn-In Chief 162514
## 5 District FireChiefAdmn-AdvTech 155827
## 6 Dep Fire Chief 159630
## 7 President 176626
## 8 Commissioner (Bpd) 156289
## 9 Supn Bpd 154775
## 10 Supn Bpd 154775
## # ... with 30 more rows
base R approach to select TITLE and REGULAR pay columns and sort by REGULAR
flights[order(flights$DepDelay), c("UniqueCarrier", "DepDelay")]
dplyr approach
boston_payroll %>%
select(TITLE, REGULAR) %>%
arrange(REGULAR)
## # A tibble: 22,297 × 2
## TITLE REGULAR
## <chr> <dbl>
## 1 FF LEP Title 3 Inspector - ADR -1356
## 2 Dist Fire Chief 0
## 3 District Fire Chief-ADR 0
## 4 Fire Lieutenant-ADR 0
## 5 Fire Lieutenant-ADR 0
## 6 Fire Lieutenant-ADR 0
## 7 FF Master of the Fire Boat-ADR 0
## 8 FF Master of the Fire Boat-ADR 0
## 9 Fire Lieutenant-ADR 0
## 10 FF Master of the Fire Boat-ADR 0
## # ... with 22,287 more rows
Use desc
for descending order.
boston_payroll %>%
select(TITLE, REGULAR) %>%
arrange(desc(REGULAR))
## # A tibble: 22,297 × 2
## TITLE REGULAR
## <chr> <dbl>
## 1 Superintendent 212663
## 2 Deputy Superintendent 177704
## 3 President 176626
## 4 Dep Fire Chief Administration 175428
## 5 Dep Fire Chief Administration 175428
## 6 Dep Fire Chief Administration 175158
## 7 Dep Fire Chief Administration 175129
## 8 Mayor 175000
## 9 Commissioner (BFD) 174200
## 10 Deputy Chief Academic Officer 170355
## # ... with 22,287 more rows
Create new variables that are functions of existing variables
base R approach to create a new variable RegularAndOvertime
boston_payroll$RegularAndOvertime <- boston_payroll$REGULAR + boston_payroll$OVERTIME
boston_payroll[, c("REGULAR", "OVERTIME", "RegularAndOvertime")]
dplyr approach
boston_payroll %>%
select(REGULAR, OVERTIME) %>%
mutate(RegularAndOvertime=REGULAR+OVERTIME)
## # A tibble: 22,297 × 3
## REGULAR OVERTIME RegularAndOvertime
## <dbl> <dbl> <dbl>
## 1 66829 10473 77302
## 2 104661 47572 152233
## 3 44632 10158 54790
## 4 121244 71198 192442
## 5 104588 88906 193494
## 6 105261 47437 152698
## 7 106048 56418 162466
## 8 94641 79482 174123
## 9 106564 66888 173452
## 10 122705 90319 213024
## # ... with 22,287 more rows
boston_payroll <- boston_payroll %>%
mutate(RegularAndOvertime=REGULAR+OVERTIME)
group_by
creates the groups that will be operated onsummarise
uses the provided aggregation function to summarise each groupbase R approaches to calculate the average pay per department
head(with(boston_payroll, tapply(REGULAR, DEPARTMENT, mean, na.rm=TRUE)))
head(aggregate(REGULAR ~ DEPARTMENT, boston_payroll, mean))
dplyr approach: create a table grouped by DEPARTMENT and then summarise each group by taking the mean of REGULAR pay
boston_payroll %>%
group_by(DEPARTMENT) %>%
summarize(AVG_PAY=mean(REGULAR, na.rm=T))
## # A tibble: 51 × 2
## DEPARTMENT AVG_PAY
## <chr> <dbl>
## 1 Administration and Finance 79809.62
## 2 Arts & Cultural Development 34036.95
## 3 ASD Graphic Arts 0.00
## 4 ASD Human Resources 55284.21
## 5 ASD Intergvernmtl Relations 56367.86
## 6 ASD Office of Budget Mangmnt 65715.45
## 7 ASD Office Of Labor Relation 51245.29
## 8 ASD Purchasing Division 63156.71
## 9 Assessing Department 60384.08
## 10 Auditing Department 50754.40
## # ... with 41 more rows
summarise_each
allows you to apply the same summary function to multiple columns at oncemutate_each
is also availableThis code is a simpler version
boston_payroll %>%
group_by(TITLE) %>%
summarise_each(funs(mean), REGULAR, OVERTIME)
Than this, but they do the same thing
boston_payroll %>%
group_by(TITLE) %>%
summarise(REGULAR=mean(REGULAR, na.rm=T), OVERTIME=mean(OVERTIME, na.rm=T))
## # A tibble: 1,414 × 3
## TITLE REGULAR OVERTIME
## <chr> <dbl> <dbl>
## 1 ABA Specialist 28133.04 0.00000
## 2 Academic Superintendent 40842.67 0.00000
## 3 Academy Director (Basas 10B) 35643.33 0.00000
## 4 Academy Director (Basas 12) 43444.00 0.00000
## 5 Academy Leader 41345.00 0.00000
## 6 ACC - Attorney 57970.38 33.19231
## 7 ACC - Management 91665.67 0.00000
## 8 ACC - Sr Attorney 83023.33 148.66667
## 9 Accountant 29447.00 0.00000
## 10 Accounting Manager 74536.00 0.00000
## # ... with 1,404 more rows
For each department, calculate the minimum and maximum REGULAR pay
boston_payroll %>%
group_by(DEPARTMENT) %>%
summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)), matches("REGULAR"))
boston_payroll %>%
group_by(DEPARTMENT) %>%
summarise(min=min(REGULAR, na.rm=TRUE), max=max(REGULAR, na.rm=TRUE))
## # A tibble: 51 × 3
## DEPARTMENT min max
## <chr> <dbl> <dbl>
## 1 Administration and Finance 46470 123997
## 2 Arts & Cultural Development 528 94208
## 3 ASD Graphic Arts 0 0
## 4 ASD Human Resources 0 113797
## 5 ASD Intergvernmtl Relations 0 113797
## 6 ASD Office of Budget Mangmnt 0 115310
## 7 ASD Office Of Labor Relation 1956 114050
## 8 ASD Purchasing Division 0 111022
## 9 Assessing Department 0 141301
## 10 Auditing Department 0 123347
## # ... with 41 more rows
n()
counts the number of rows in a groupn_distinct(vector)
counts the number of unique items in that vectorFor each department, count the total number employees in descending order
boston_payroll %>%
group_by(DEPARTMENT) %>%
summarise(employee_count = n()) %>%
arrange(desc(employee_count))
## # A tibble: 51 × 2
## DEPARTMENT employee_count
## <chr> <int>
## 1 BPS 13266
## 2 Boston Police Department 3074
## 3 Boston Fire Department 1687
## 4 Boston Cntr - Youth & Families 618
## 5 Boston Public Library 563
## 6 Public Works Department 464
## 7 Transportation Department 372
## 8 Parks Department 300
## 9 Inspectional Services Dept 237
## 10 Property Management 226
## # ... with 41 more rows
Rewrite more simply with the tally
function
boston_payroll %>%
group_by(DEPARTMENT) %>%
tally(sort = TRUE)
## # A tibble: 51 × 2
## DEPARTMENT n
## <chr> <int>
## 1 BPS 13266
## 2 Boston Police Department 3074
## 3 Boston Fire Department 1687
## 4 Boston Cntr - Youth & Families 618
## 5 Boston Public Library 563
## 6 Public Works Department 464
## 7 Transportation Department 372
## 8 Parks Department 300
## 9 Inspectional Services Dept 237
## 10 Property Management 226
## # ... with 41 more rows
For each DEPARTMENT, count the total number of employees and the number of distinct titles
boston_payroll %>%
group_by(DEPARTMENT) %>%
summarise(employee_count = n(), title_count = n_distinct(TITLE))
## # A tibble: 51 × 3
## DEPARTMENT employee_count title_count
## <chr> <int> <int>
## 1 Administration and Finance 8 7
## 2 Arts & Cultural Development 39 10
## 3 ASD Graphic Arts 1 1
## 4 ASD Human Resources 58 36
## 5 ASD Intergvernmtl Relations 14 6
## 6 ASD Office of Budget Mangmnt 29 15
## 7 ASD Office Of Labor Relation 14 6
## 8 ASD Purchasing Division 24 13
## 9 Assessing Department 92 30
## 10 Auditing Department 45 20
## # ... with 41 more rows
base R approach to view the structure of an object
str(boston_payroll)
## Classes 'tbl_df', 'tbl' and 'data.frame': 22297 obs. of 17 variables:
## $ NAME : chr "Darosa, Baltazar" "Creaven, Jacqueline D" "Cawley, Stephen C" "Danilecki, John H" ...
## $ TITLE : chr "Police Officer" "Police Lieutenant" "Police Lieutenant" "Police Captain" ...
## $ DEPARTMENT : chr "Boston Police Department" "Boston Police Department" "Boston Police Department" "Boston Police Department" ...
## $ REGULAR : num 66829 104661 44632 121244 104588 ...
## $ RETRO : num 0 0 0 0 0 0 0 0 0 0 ...
## $ OTHER : num 209237 79248 216037 15663 13494 ...
## $ OVERTIME : num 10473 47572 10158 71198 88906 ...
## $ INJURED : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DETAIL : num 629 33918 4046 41943 41840 ...
## $ QUINN : num 6724 23721 6031 15156 12988 ...
## $ TOTAL.EARNINGS : num 293892 289121 280904 265203 261815 ...
## $ Community : chr "Brockton" "Boston" "Quincy" "Walpole" ...
## $ ZIPCode : int 2302 2132 2169 2081 2135 2186 2339 2176 1702 2043 ...
## $ State : chr "Massachusetts" "Massachusetts" "Massachusetts" "Massachusetts" ...
## $ X : chr "" "" "" "" ...
## $ X.1 : chr "" "" "" "" ...
## $ RegularAndOvertime: num 77302 152233 54790 192442 193494 ...
dplyr approach: better formatting, and adapts to your screen width
glimpse(boston_payroll)
## Observations: 22,297
## Variables: 17
## $ NAME <chr> "Darosa, Baltazar", "Creaven, Jacqueline D"...
## $ TITLE <chr> "Police Officer", "Police Lieutenant", "Pol...
## $ DEPARTMENT <chr> "Boston Police Department", "Boston Police ...
## $ REGULAR <dbl> 66829, 104661, 44632, 121244, 104588, 10526...
## $ RETRO <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ OTHER <dbl> 209237, 79248, 216037, 15663, 13494, 12883,...
## $ OVERTIME <dbl> 10473, 47572, 10158, 71198, 88906, 47437, 5...
## $ INJURED <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7...
## $ DETAIL <dbl> 629, 33918, 4046, 41943, 41840, 76310, 8619...
## $ QUINN <dbl> 6724, 23721, 6031, 15156, 12988, 13129, 131...
## $ TOTAL.EARNINGS <dbl> 293892, 289121, 280904, 265203, 261815, 255...
## $ Community <chr> "Brockton", "Boston", "Quincy", "Walpole", ...
## $ ZIPCode <int> 2302, 2132, 2169, 2081, 2135, 2186, 2339, 2...
## $ State <chr> "Massachusetts", "Massachusetts", "Massachu...
## $ X <chr> "", "", "", "", "", "", "", "", "", "", "",...
## $ X.1 <chr> "", "", "", "", "", "", "", "", "", "", "",...
## $ RegularAndOvertime <dbl> 77302, 152233, 54790, 192442, 193494, 15269...