Dingrui(Frank) Zhang

R data.table的几个小技巧(二)

Frank / 2020-06-19


首先再次说明一下,本系列只包括一些个人的小技巧,一些比较简单的应用就不再赘述了。

系列文章

R data.table的几个小技巧(一)
R data.table的几个小技巧(二)

本期内容

导入 data.table

library(data.table)

创建example data

set.seed(42)
dt <- data.table(
  A1 = sample(1:100,10),
  A2 = sample(1:100,10),
  A3 = sample(1:100,10),
  B1 = sample(1:100,10),
  B2 = sample(1:100,10),
  B3 = sample(1:100,10),
  C  = sample(1:100,10)
)
A1 A2 A3 B1 B2 B3 C
49 37 34 58 99 2 84
65 20 92 8 88 58 9
25 26 3 36 87 10 35
74 3 58 68 49 40 93
18 41 42 86 26 5 16
100 89 24 18 6 33 92
47 27 30 92 95 49 69
24 36 43 69 2 73 95
71 5 15 4 3 29 2
89 84 22 50 21 76 82

Filter多列

所有列都要大于20

# 当有NA时注意使用 na.rm = TRUE
dt[rowMeans(dt>20)==1]
##    A1 A2 A3 B1 B2 B3  C
## 1: 47 27 30 92 95 49 69
## 2: 89 84 22 50 21 76 82

任意列>90

# 当有NA时注意使用 na.rm = TRUE
dt[rowSums(dt>90)>0]
##     A1 A2 A3 B1 B2 B3  C
## 1:  49 37 34 58 99  2 84
## 2:  65 20 92  8 88 58  9
## 3:  74  3 58 68 49 40 93
## 4: 100 89 24 18  6 33 92
## 5:  47 27 30 92 95 49 69
## 6:  24 36 43 69  2 73 95

所有列(除了C)> C

dt[dt[,Reduce("&",lapply(.SD,">",C)),.SDcols = !c("C")]]
##    A1 A2 A3 B1 B2 B3 C
## 1: 71  5 15  4  3 29 2

任意列(除了C)> C

dt[dt[,Reduce("|",lapply(.SD,">",C)),.SDcols = !c("C")]]
##     A1 A2 A3 B1 B2 B3  C
## 1:  49 37 34 58 99  2 84
## 2:  65 20 92  8 88 58  9
## 3:  25 26  3 36 87 10 35
## 4:  18 41 42 86 26  5 16
## 5: 100 89 24 18  6 33 92
## 6:  47 27 30 92 95 49 69
## 7:  71  5 15  4  3 29  2
## 8:  89 84 22 50 21 76 82

cbind

Base R

dt2 <- copy(dt)

# base R
cbind(dt,dt2)
##      A1 A2 A3 B1 B2 B3  C  A1 A2 A3 B1 B2 B3  C
##  1:  49 37 34 58 99  2 84  49 37 34 58 99  2 84
##  2:  65 20 92  8 88 58  9  65 20 92  8 88 58  9
##  3:  25 26  3 36 87 10 35  25 26  3 36 87 10 35
##  4:  74  3 58 68 49 40 93  74  3 58 68 49 40 93
##  5:  18 41 42 86 26  5 16  18 41 42 86 26  5 16
##  6: 100 89 24 18  6 33 92 100 89 24 18  6 33 92
##  7:  47 27 30 92 95 49 69  47 27 30 92 95 49 69
##  8:  24 36 43 69  2 73 95  24 36 43 69  2 73 95
##  9:  71  5 15  4  3 29  2  71  5 15  4  3 29  2
## 10:  89 84 22 50 21 76 82  89 84 22 50 21 76 82

data.table way

setDT(unlist(list(dt,dt2),recursive = FALSE),check.names = TRUE)[]
##      A1 A2 A3 B1 B2 B3  C A1.1 A2.1 A3.1 B1.1 B2.1 B3.1 C.1
##  1:  49 37 34 58 99  2 84   49   37   34   58   99    2  84
##  2:  65 20 92  8 88 58  9   65   20   92    8   88   58   9
##  3:  25 26  3 36 87 10 35   25   26    3   36   87   10  35
##  4:  74  3 58 68 49 40 93   74    3   58   68   49   40  93
##  5:  18 41 42 86 26  5 16   18   41   42   86   26    5  16
##  6: 100 89 24 18  6 33 92  100   89   24   18    6   33  92
##  7:  47 27 30 92 95 49 69   47   27   30   92   95   49  69
##  8:  24 36 43 69  2 73 95   24   36   43   69    2   73  95
##  9:  71  5 15  4  3 29  2   71    5   15    4    3   29   2
## 10:  89 84 22 50 21 76 82   89   84   22   50   21   76  82

生成多列

使用set

#注意区别于lapply使用的场景

for (i in 1:3) {
  set(dt2,j = paste0("diff","_",i),value = dt[[paste0("A",i)]] - dt[[paste0("B",i)]])
}
dt2
##      A1 A2 A3 B1 B2 B3  C diff_1 diff_2 diff_3
##  1:  49 37 34 58 99  2 84     -9    -62     32
##  2:  65 20 92  8 88 58  9     57    -68     34
##  3:  25 26  3 36 87 10 35    -11    -61     -7
##  4:  74  3 58 68 49 40 93      6    -46     18
##  5:  18 41 42 86 26  5 16    -68     15     37
##  6: 100 89 24 18  6 33 92     82     83     -9
##  7:  47 27 30 92 95 49 69    -45    -68    -19
##  8:  24 36 43 69  2 73 95    -45     34    -30
##  9:  71  5 15  4  3 29  2     67      2    -14
## 10:  89 84 22 50 21 76 82     39     63    -54

使用Map

A_cols <- grep("A",names(dt),value = TRUE)
B_cols <- grep("B",names(dt),value = TRUE)
new_diff_cols <- paste0("diff","_",1:3)

dt2[,(new_diff_cols) := Map("-",mget(A_cols),mget(B_cols))][]
##      A1 A2 A3 B1 B2 B3  C diff_1 diff_2 diff_3
##  1:  49 37 34 58 99  2 84     -9    -62     32
##  2:  65 20 92  8 88 58  9     57    -68     34
##  3:  25 26  3 36 87 10 35    -11    -61     -7
##  4:  74  3 58 68 49 40 93      6    -46     18
##  5:  18 41 42 86 26  5 16    -68     15     37
##  6: 100 89 24 18  6 33 92     82     83     -9
##  7:  47 27 30 92 95 49 69    -45    -68    -19
##  8:  24 36 43 69  2 73 95    -45     34    -30
##  9:  71  5 15  4  3 29  2     67      2    -14
## 10:  89 84 22 50 21 76 82     39     63    -54