23 분 소요

test

dplyr

  • Tidyverse의 한 library로, dataframe 정리에 사용
  • main function
    • select
    • filter
    • arrange
    • mutate
    • summarise
    • group_by
  • 사용할 데이터로, nycflights13 사용

데이터 불러오기, library setting

library(nycflights13)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
print(flights)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

데이터 확인하기

print(summary(flights))
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 

Select

특정 열을 선택해주는 함수

용법 : select(dataframe, columns(벡터로 입력해도 되고, 하나의 값으로 입력가능하고, :이나 - 사용 가능))

select(flights, year, month, day)
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
select(flights, c(year, month, day))
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
select(flights, year:day)
## # A tibble: 336,776 x 3
##     year month   day
##    <int> <int> <int>
##  1  2013     1     1
##  2  2013     1     1
##  3  2013     1     1
##  4  2013     1     1
##  5  2013     1     1
##  6  2013     1     1
##  7  2013     1     1
##  8  2013     1     1
##  9  2013     1     1
## 10  2013     1     1
## # ... with 336,766 more rows
select(flights, -year)
## # A tibble: 336,776 x 18
##    month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1     1     1      517            515         2      830            819
##  2     1     1      533            529         4      850            830
##  3     1     1      542            540         2      923            850
##  4     1     1      544            545        -1     1004           1022
##  5     1     1      554            600        -6      812            837
##  6     1     1      554            558        -4      740            728
##  7     1     1      555            600        -5      913            854
##  8     1     1      557            600        -3      709            723
##  9     1     1      557            600        -3      838            846
## 10     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

특정 문자열 찾아내기

  • starts_with(‘문자’) : ’문자’로 시작하는 단어 모두 찾기
  • ends_with(‘문자’) : ’문자’로 끝나는 단어 모두 찾기
  • matches(‘문자’) : ’문자’를 포함하는 단어 모두 찾기
  • contains(‘문자’) : ’문자’를 포함하는 단어 모두 찾기
select(flights, starts_with('dep'))
## # A tibble: 336,776 x 2
##    dep_time dep_delay
##       <int>     <dbl>
##  1      517         2
##  2      533         4
##  3      542         2
##  4      544        -1
##  5      554        -6
##  6      554        -4
##  7      555        -5
##  8      557        -3
##  9      557        -3
## 10      558        -2
## # ... with 336,766 more rows
select(flights, ends_with('delay'))
## # A tibble: 336,776 x 2
##    dep_delay arr_delay
##        <dbl>     <dbl>
##  1         2        11
##  2         4        20
##  3         2        33
##  4        -1       -18
##  5        -6       -25
##  6        -4        12
##  7        -5        19
##  8        -3       -14
##  9        -3        -8
## 10        -2         8
## # ... with 336,766 more rows
select(flights, matches('time'))
## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
##       <int>          <int>    <int>          <int>    <dbl> <dttm>             
##  1      517            515      830            819      227 2013-01-01 05:00:00
##  2      533            529      850            830      227 2013-01-01 05:00:00
##  3      542            540      923            850      160 2013-01-01 05:00:00
##  4      544            545     1004           1022      183 2013-01-01 05:00:00
##  5      554            600      812            837      116 2013-01-01 06:00:00
##  6      554            558      740            728      150 2013-01-01 05:00:00
##  7      555            600      913            854      158 2013-01-01 06:00:00
##  8      557            600      709            723       53 2013-01-01 06:00:00
##  9      557            600      838            846      140 2013-01-01 06:00:00
## 10      558            600      753            745      138 2013-01-01 06:00:00
## # ... with 336,766 more rows
select(flights, contains('time'))
## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
##       <int>          <int>    <int>          <int>    <dbl> <dttm>             
##  1      517            515      830            819      227 2013-01-01 05:00:00
##  2      533            529      850            830      227 2013-01-01 05:00:00
##  3      542            540      923            850      160 2013-01-01 05:00:00
##  4      544            545     1004           1022      183 2013-01-01 05:00:00
##  5      554            600      812            837      116 2013-01-01 06:00:00
##  6      554            558      740            728      150 2013-01-01 05:00:00
##  7      555            600      913            854      158 2013-01-01 06:00:00
##  8      557            600      709            723       53 2013-01-01 06:00:00
##  9      557            600      838            846      140 2013-01-01 06:00:00
## 10      558            600      753            745      138 2013-01-01 06:00:00
## # ... with 336,766 more rows

Filter

특정 행을 선택해주는 함수

용법 : fliter(dataframe, 조건)

filter(flights, carrier=='US')
## # A tibble: 20,536 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      622            630        -8     1017           1014
##  2  2013     1     1      627            630        -3     1018           1018
##  3  2013     1     1      629            630        -1      824            833
##  4  2013     1     1      643            645        -2      837            848
##  5  2013     1     1      752            759        -7      955            959
##  6  2013     1     1      811            815        -4     1026           1016
##  7  2013     1     1      823            825        -2     1019           1024
##  8  2013     1     1      908            915        -7     1004           1033
##  9  2013     1     1      955           1000        -5     1336           1325
## 10  2013     1     1      959           1000        -1     1151           1206
## # ... with 20,526 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month>=3, month<=5)
## # A tibble: 85,960 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     3     1        4           2159       125      318             56
##  2  2013     3     1       50           2358        52      526            438
##  3  2013     3     1      117           2245       152      223           2354
##  4  2013     3     1      454            500        -6      633            648
##  5  2013     3     1      505            515       -10      746            810
##  6  2013     3     1      521            530        -9      813            827
##  7  2013     3     1      537            540        -3      856            850
##  8  2013     3     1      541            545        -4     1014           1023
##  9  2013     3     1      549            600       -11      639            703
## 10  2013     3     1      550            600       -10      747            801
## # ... with 85,950 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month>=3 & month<=5)
## # A tibble: 85,960 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     3     1        4           2159       125      318             56
##  2  2013     3     1       50           2358        52      526            438
##  3  2013     3     1      117           2245       152      223           2354
##  4  2013     3     1      454            500        -6      633            648
##  5  2013     3     1      505            515       -10      746            810
##  6  2013     3     1      521            530        -9      813            827
##  7  2013     3     1      537            540        -3      856            850
##  8  2013     3     1      541            545        -4     1014           1023
##  9  2013     3     1      549            600       -11      639            703
## 10  2013     3     1      550            600       -10      747            801
## # ... with 85,950 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Arrange

dataframe의 배열을 바꿔주는 함수

용법 : arrange(dataframe, column1, column2, … )

해당하는 column 순서대로 오름차순 or 내림차순으로 정렬해줌(default : 오름차순)

내림차순 하는 방법 : desc(columns)

# year에 대해 오름차순으로 정렬

arrange(flights, year)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# year에 대해 오름차순으로 정렬하고, month에 대해 오름차순으로 정렬한 후, day에 대해 오름차순으로 정렬

arrange(flights, year, month, day)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# year에 대해 오름차순으로 정렬하고, month에 대해 내림차순으로 정렬한 후, day에 대해 오름차순으로 정렬

arrange(flights, year, desc(month), day)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     1       13           2359        14      446            445
##  2  2013    12     1       17           2359        18      443            437
##  3  2013    12     1      453            500        -7      636            651
##  4  2013    12     1      520            515         5      749            808
##  5  2013    12     1      536            540        -4      845            850
##  6  2013    12     1      540            550       -10     1005           1027
##  7  2013    12     1      541            545        -4      734            755
##  8  2013    12     1      546            545         1      826            835
##  9  2013    12     1      549            600       -11      648            659
## 10  2013    12     1      550            600       -10      825            854
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Mutate

dataframe의 열을 추가 or 바꿔주는 함수

용법 : mutate(dataframe, column_name = 조건)

만약 새로운 column_name을 사용하였다면, dataframe에 새로운 column이 추가 됨

만약 기존에 있던 column_name을 사용하였다면, 원래 있던 column의 데이터가 변화됨.

mutate(flights, new_time = hour+(minute/60))
## # A tibble: 336,776 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   new_time <dbl>
mutate(flights, hour = hour+(minute/60))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

transmute

mutate할 열만 보여주는 함수

transmute(flights, new_time = hour+(minute/60))
## # A tibble: 336,776 x 1
##    new_time
##       <dbl>
##  1     5.25
##  2     5.48
##  3     5.67
##  4     5.75
##  5     6   
##  6     5.97
##  7     6   
##  8     6   
##  9     6   
## 10     6   
## # ... with 336,766 more rows
transmute(flights, hour = hour+(minute/60))
## # A tibble: 336,776 x 1
##     hour
##    <dbl>
##  1  5.25
##  2  5.48
##  3  5.67
##  4  5.75
##  5  6   
##  6  5.97
##  7  6   
##  8  6   
##  9  6   
## 10  6   
## # ... with 336,766 more rows

Summarise

dataframe의 열의 특징을 요약해주는 함수

용법 : summarise(dataframe, name = function results)

summarise(flights, dep_delay=mean(dep_delay, na.rm=T), arr_delay=mean(arr_delay, na.rm=T))
## # A tibble: 1 x 2
##   dep_delay arr_delay
##       <dbl>     <dbl>
## 1      12.6      6.90

Group_by

dataframe의 행들을 group화 시켜주는 함수

용법 : group_by(dataframe, column)

해당 column에 속하는 성분을 이용하여 grouping을 진행

group_by(flights, origin)
## # A tibble: 336,776 x 19
## # Groups:   origin [3]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
group_by(flights, origin, tailnum)
## # A tibble: 336,776 x 19
## # Groups:   origin, tailnum [7,944]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

위 작업만 진행시 데이터프레임의 큰 변화가 없음(group만 표시해줌) 이 후 다른 작업 진행시 group을 인식한 상태로 작업 진행

grouping and summarise

grouping 후 summarise 진행 시, group에 대해 summarise를 진행함

a=group_by(flights, origin)

summarise(a, delay=mean(dep_delay, na.rm=T))
## # A tibble: 3 x 2
##   origin delay
##   <chr>  <dbl>
## 1 EWR     15.1
## 2 JFK     12.1
## 3 LGA     10.3

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day flights
##    <int> <int> <int>   <int>
##  1  2013     1     1     842
##  2  2013     1     2     943
##  3  2013     1     3     914
##  4  2013     1     4     915
##  5  2013     1     5     720
##  6  2013     1     6     832
##  7  2013     1     7     933
##  8  2013     1     8     899
##  9  2013     1     9     902
## 10  2013     1    10     932
## # ... with 355 more rows

## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

## # A tibble: 12 x 3
## # Groups:   year [1]
##     year month flights
##    <int> <int>   <int>
##  1  2013     1   27004
##  2  2013     2   24951
##  3  2013     3   28834
##  4  2013     4   28330
##  5  2013     5   28796
##  6  2013     6   28243
##  7  2013     7   29425
##  8  2013     8   29327
##  9  2013     9   27574
## 10  2013    10   28889
## 11  2013    11   27268
## 12  2013    12   28135

## # A tibble: 1 x 2
##    year flights
##   <int>   <int>
## 1  2013  336776

summarise에 사용되는 여러 함수

  • sum, mean, sd, var, … : 기본적인 통계량 측정 함수(합, 평균, 표준편차, 분산)
  • n() : count 함수
  • n_distinct() : unique한 값 개수
  • first(x), last(x), nth(x, n) : x vector에서의 첫번째, 마지막, nth value
a=group_by(flights, tailnum)
summarise(a, dep_delay_mean=mean(dep_delay, na.rm=T), dep_delay_sd=sd(dep_delay, na.rm=T), count=n())
## # A tibble: 4,044 x 4
##    tailnum dep_delay_mean dep_delay_sd count
##    <chr>            <dbl>        <dbl> <int>
##  1 D942DN          31.5          30.9      4
##  2 N0EGMQ           8.49         33.3    371
##  3 N10156          17.8          36.3    153
##  4 N102UW           8            46.3     48
##  5 N103US          -3.20          4.29    46
##  6 N104UW           9.94         44.4     47
##  7 N10575          22.7          48.5    289
##  8 N105UW           2.58         22.2     45
##  9 N107US          -0.463        17.1     41
## 10 N108UW           4.22         25.7     60
## # ... with 4,034 more rows

Using pipeline

  • pipeline (%>%)을 이용하여 coding을 간결하게 진행할 수 있음
  • 작업 1 %>% 작업 2 - 작업 1의 결과에서 작업 2를 시작하게 됨
  • 일일히 dataframe에 이름을 지정해주지 않고 바로 coding 가능해짐
# r group_summarise 1

a=group_by(flights, origin)

summarise(a, delay=mean(dep_delay, na.rm=T))
## # A tibble: 3 x 2
##   origin delay
##   <chr>  <dbl>
## 1 EWR     15.1
## 2 JFK     12.1
## 3 LGA     10.3
# using pipeline

flights %>% group_by(origin) %>% summarise(delay=mean(dep_delay, na.rm=T))
## # A tibble: 3 x 2
##   origin delay
##   <chr>  <dbl>
## 1 EWR     15.1
## 2 JFK     12.1
## 3 LGA     10.3
# 같은 결과가 나오는 것을 알 수 있음
# r group_summarise 2 example
daily=group_by(flights, year, month, day)

per_day = summarise(daily, flights=n())
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
per_day
## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day flights
##    <int> <int> <int>   <int>
##  1  2013     1     1     842
##  2  2013     1     2     943
##  3  2013     1     3     914
##  4  2013     1     4     915
##  5  2013     1     5     720
##  6  2013     1     6     832
##  7  2013     1     7     933
##  8  2013     1     8     899
##  9  2013     1     9     902
## 10  2013     1    10     932
## # ... with 355 more rows
#가장 낮은 level인 day의 flight 계산

per_month = summarise(per_day, flights=sum(flights))
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
per_month
## # A tibble: 12 x 3
## # Groups:   year [1]
##     year month flights
##    <int> <int>   <int>
##  1  2013     1   27004
##  2  2013     2   24951
##  3  2013     3   28834
##  4  2013     4   28330
##  5  2013     5   28796
##  6  2013     6   28243
##  7  2013     7   29425
##  8  2013     8   29327
##  9  2013     9   27574
## 10  2013    10   28889
## 11  2013    11   27268
## 12  2013    12   28135
# 그 다음 level인 month의 flight summarise

per_year = summarise(per_month, flights=sum(flights))
per_year
## # A tibble: 1 x 2
##    year flights
##   <int>   <int>
## 1  2013  336776
# using pipeline

flights %>% group_by(year, month, day) %>% summarise(flights=n())
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day flights
##    <int> <int> <int>   <int>
##  1  2013     1     1     842
##  2  2013     1     2     943
##  3  2013     1     3     914
##  4  2013     1     4     915
##  5  2013     1     5     720
##  6  2013     1     6     832
##  7  2013     1     7     933
##  8  2013     1     8     899
##  9  2013     1     9     902
## 10  2013     1    10     932
## # ... with 355 more rows
flights %>% group_by(year, month, day) %>% summarise(flights=n()) %>% summarise(flights = sum(flights))
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

## # A tibble: 12 x 3
## # Groups:   year [1]
##     year month flights
##    <int> <int>   <int>
##  1  2013     1   27004
##  2  2013     2   24951
##  3  2013     3   28834
##  4  2013     4   28330
##  5  2013     5   28796
##  6  2013     6   28243
##  7  2013     7   29425
##  8  2013     8   29327
##  9  2013     9   27574
## 10  2013    10   28889
## 11  2013    11   27268
## 12  2013    12   28135
flights %>% group_by(year, month, day) %>% summarise(flights=n()) %>% summarise(flights = sum(flights)) %>%
  summarise(flights = sum(flights))
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.

## # A tibble: 1 x 2
##    year flights
##   <int>   <int>
## 1  2013  336776
a1 = group_by(flights, year, month, day)
a2 = select(a1, arr_delay, dep_delay)
## Adding missing grouping variables: `year`, `month`, `day`
a3 = summarise(a2, 
               arr = mean(arr_delay, na.rm=T),
               dep = mean(dep_delay, na.rm=T))
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
a4 = filter(a3, arr > 30 | dep > 30)

print(a4)
## # A tibble: 49 x 5
## # Groups:   year, month [11]
##     year month   day   arr   dep
##    <int> <int> <int> <dbl> <dbl>
##  1  2013     1    16  34.2  24.6
##  2  2013     1    31  32.6  28.7
##  3  2013     2    11  36.3  39.1
##  4  2013     2    27  31.3  37.8
##  5  2013     3     8  85.9  83.5
##  6  2013     3    18  41.3  30.1
##  7  2013     4    10  38.4  33.0
##  8  2013     4    12  36.0  34.8
##  9  2013     4    18  36.0  34.9
## 10  2013     4    19  47.9  46.1
## # ... with 39 more rows
# using pipeline

flights %>% group_by(year, month, day) %>% select(arr_delay, dep_delay) %>%
  summarise(arr=mean(arr_delay, na.rm=T), dep=mean(dep_delay, na.rm=T)) %>%
  filter(arr > 30 | dep >30)
## Adding missing grouping variables: `year`, `month`, `day`

## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

## # A tibble: 49 x 5
## # Groups:   year, month [11]
##     year month   day   arr   dep
##    <int> <int> <int> <dbl> <dbl>
##  1  2013     1    16  34.2  24.6
##  2  2013     1    31  32.6  28.7
##  3  2013     2    11  36.3  39.1
##  4  2013     2    27  31.3  37.8
##  5  2013     3     8  85.9  83.5
##  6  2013     3    18  41.3  30.1
##  7  2013     4    10  38.4  33.0
##  8  2013     4    12  36.0  34.8
##  9  2013     4    18  36.0  34.9
## 10  2013     4    19  47.9  46.1
## # ... with 39 more rows

Other pipeline - using magrittr

  • %<>% : 최종 결과값을 처음에 지정한 dataframe에 저장할 것
  • %$% : dataframe의 특정 열만 사용하고 싶을 때 사용
library(magrittr)
## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract
# %>% vs %<>%

x=1

x %<>% multiply_by(0.5) %>% print
## [1] 0.5
x %>% multiply_by(0.5) %>% print
## [1] 0.25
print(x)
## [1] 0.5
# %$%

flights %>% filter(dep_delay>0 & arr_delay>0) %$% cor(dep_delay, arr_delay) 
## [1] 0.9418739

업데이트:

댓글남기기