Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: tidyft
Title: Tidy Verbs for Fast Data Operations by Reference
Version: 0.4.5
Authors@R:
Authors@R:
person(given = "Tian-Yuan",
family = "Huang",
role = c("aut", "cre"),
Expand All @@ -18,13 +18,13 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.1.0
Imports:
Imports:
data.table (>= 1.12.8),
stringr (>= 1.4.0),
fst (>= 0.9.0)
Suggests:
Suggests:
bench,
knitr,
rmarkdown,
profvis,
dplyr
VignetteBuilder: knitr
123 changes: 54 additions & 69 deletions vignettes/Introduction.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class(b)

# convert codes
lapply(ls(),get) %>%
lapply(setDT) %>%
lapply(setDT) %>%
invisible()

# after
Expand Down Expand Up @@ -80,7 +80,7 @@ ls() # only the ft exists
`as_fst` saves any data.frame as a ".fst" file in a temporary location and parses it back as an fst table. An fst table takes up very little RAM, yet if you want any part of the data.frame, you can get it in almost no time:

```{r}
ft %>%
ft %>%
slice_fst(5555:6666) # get 5555 to 6666 row
```

Expand All @@ -91,17 +91,17 @@ Except for `slice_fst`, there are also other functions for subsetting the data,
```{r}

sys_time_print({
res = ft %>%
select_fst(Species,Sepal.Length,Sepal.Width) %>%
rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
arrange(group,sl) %>%
filter(sl > 5) %>%
distinct(sl,.keep_all = TRUE) %>%
res = ft %>%
select_fst(Species,Sepal.Length,Sepal.Width) %>%
rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
arrange(group,sl) %>%
filter(sl > 5) %>%
distinct(sl,.keep_all = TRUE) %>%
summarise(sw = max(sw),by = group)
})

res

```

This should be pretty fast. Because the data in an fst table is never loaded until we use a function with the "_fst" suffix, the tidyft functions never modify the data in the fst file or the fst table. That is to say, we do not have to worry about modification by reference any more. No copies are made, so this is as fast as it gets.
Expand All @@ -113,7 +113,6 @@ The fst workflow could also be working with other tools, though less efficient.

rm(list = ls())

library(profvis)
library(data.table)
library(dplyr)
library(dtplyr)
Expand All @@ -128,57 +127,53 @@ dim(dt)
as_fst(dt) -> ft
# remove the data.frame from RAM
rm(dt)

profvis({
res1 = ft %>%
select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
dplyr::select(-Petal.Length) %>%
dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
dplyr::arrange(group,sl) %>%
dplyr::filter(sl > 5) %>%
dplyr::distinct(sl,.keep_all = TRUE) %>%
dplyr::group_by(group) %>%
dplyr::summarise(sw = max(sw))
res2 = ft %>%
select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
lazy_dt() %>%
dplyr::select(-Petal.Length) %>%
dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
dplyr::arrange(group,sl) %>%
dplyr::filter(sl > 5) %>%
dplyr::distinct(sl,.keep_all = TRUE) %>%
dplyr::group_by(group) %>%
dplyr::summarise(sw = max(sw)) %>%
as.data.table()
res3 = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>%


bench::mark(

dplyr = ft %>%
select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
dplyr::select(-Petal.Length) %>%
dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
dplyr::arrange(group,sl) %>%
dplyr::filter(sl > 5) %>%
dplyr::distinct(sl,.keep_all = TRUE) %>%
dplyr::group_by(group) %>%
dplyr::summarise(sw = max(sw)),

dtplyr = ft %>%
select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
lazy_dt() %>%
dplyr::select(-Petal.Length) %>%
dplyr::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
dplyr::arrange(group,sl) %>%
dplyr::filter(sl > 5) %>%
dplyr::distinct(sl,.keep_all = TRUE) %>%
dplyr::group_by(group) %>%
dplyr::summarise(sw = max(sw)) %>%
as.data.table(),

data.table = ft[,c("Species","Sepal.Length","Sepal.Width","Petal.Length")] %>%
setDT() %>%
.[,.SD,.SDcols = -"Petal.Length"] %>%
.[,.SD,.SDcols = -"Petal.Length"] %>%
setnames(old =c("Species","Sepal.Length","Sepal.Width"),
new = c("group","sl","sw")) %>%
setorder(group,sl) %>%
.[sl>5] %>% unique(by = "sl") %>%
.[,.(sw = max(sw)),by = group]


res4 = ft %>%
tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
tidyft::select(-Petal.Length) %>%
tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
tidyft::arrange(group,sl) %>%
tidyft::filter(sl > 5) %>%
tidyft::distinct(sl,.keep_all = TRUE) %>%
tidyft::summarise(sw = max(sw),by = group)


})

setequal(res1,res2)
setequal(res2,res3)
setequal(res3,res4)
new = c("group","sl","sw")) %>%
setorder(group,sl) %>%
.[sl>5] %>% unique(by = "sl") %>%
.[,.(sw = max(sw)),by = group],


tidyft = ft %>%
tidyft::select_fst(Species,Sepal.Length,Sepal.Width,Petal.Length) %>%
tidyft::select(-Petal.Length) %>%
tidyft::rename(group = Species,sl = Sepal.Length,sw = Sepal.Width) %>%
tidyft::arrange(group,sl) %>%
tidyft::filter(sl > 5) %>%
tidyft::distinct(sl,.keep_all = TRUE) %>%
tidyft::summarise(sw = max(sw),by = group),

check = setequal
)

```

Expand All @@ -189,13 +184,3 @@ Because tidyft is based on data.table, therefore, if you always use data.table c
```{r}
sessionInfo()
```