diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c3082b5..ec5da9c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer @@ -10,21 +10,21 @@ repos: - id: check-yaml - id: check-added-large-files - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 6.0.1 hooks: - id: isort name: isort (python) - - repo: https://github.com/pre-commit/mirrors-autopep8 - rev: v1.5.7 + - repo: https://github.com/hhatto/autopep8 + rev: v2.3.2 hooks: - id: autopep8 args: ["--max-line-length=120", "-i"] - repo: https://github.com/pre-commit/pygrep-hooks - rev: v1.9.0 + rev: v1.10.0 hooks: - id: python-use-type-annotations - repo: https://github.com/lorenzwalthert/precommit - rev: v0.1.3.9135 + rev: v0.4.3.9011 hooks: - id: style-files args: [--style_pkg=styler, --style_fun=tidyverse_style] @@ -43,4 +43,3 @@ repos: # language: system # types: [python] # args: ["--fail-under=7.0","--max-line-length=120"] - diff --git a/docs/overview.Rmd b/docs/overview.Rmd index b5b1e09..4b6d283 100644 --- a/docs/overview.Rmd +++ b/docs/overview.Rmd @@ -1,7 +1,7 @@ --- title: "General statistical overviews of FILTER data" date: "`r Sys.Date()`" -output: +output: html_notebook: code_folding: hide toc: yes @@ -11,7 +11,7 @@ output: --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(message=FALSE,dpi=300,fig.retina=2,fig.width=8) +knitr::opts_chunk$set(message = FALSE, dpi = 300, fig.retina = 2, fig.width = 8) source(here::here("src/common_basis.R")) tmap_mode("plot") ``` @@ -19,53 +19,53 @@ tmap_mode("plot") # Temporal overview ```{r} -p_year %>% - inner_join(poems,by=c("p_id")) %>% - count(collection,year) %>% - mutate(measure="yearly count") %>% +p_year %>% + inner_join(poems, by = c("p_id")) %>% + count(collection, year) %>% + mutate(measure = "yearly count") %>% union_all( p_year %>% # 10 year rolling mean - distinct(year) %>% - left_join(p_year %>% distinct(year),sql_on="RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>% - inner_join(p_year,by=c("year.y"="year")) %>% - inner_join(poems,by=c("p_id")) %>% - group_by(collection=collection,year=year.x) %>% - summarize(n=n()/n_distinct(year.y),.groups="drop") %>% - mutate(measure="10 year rolling mean") + distinct(year) %>% + left_join(p_year %>% distinct(year), sql_on = "RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>% + inner_join(p_year, by = c("year.y" = "year")) %>% + inner_join(poems, by = c("p_id")) %>% + group_by(collection = collection, year = year.x) %>% + summarize(n = n() / n_distinct(year.y), .groups = "drop") %>% + mutate(measure = "10 year rolling mean") ) %>% - filter(collection!="literary",!year %in% c(0,9999)) %>% - mutate(year=if_else(year>=1800,year,1780)) %>% - group_by(collection,measure,year) %>% - summarise(n=sum(n),.groups="drop") %>% + filter(collection != "literary", !year %in% c(0, 9999)) %>% + mutate(year = if_else(year >= 1800, year, 1780)) %>% + group_by(collection, measure, year) %>% + summarise(n = sum(n), .groups = "drop") %>% collect() %>% - complete(year,collection,measure,fill=list(n=0)) %>% - mutate(collection=fct_relevel(str_to_upper(collection),"ERAB","SKVR","JR")) %>% - group_by(collection,measure) %>% + complete(year, collection, measure, fill = list(n = 0)) %>% + mutate(collection = fct_relevel(str_to_upper(collection), "ERAB", "SKVR", "JR")) %>% + group_by(collection, measure) %>% arrange(year) %>% - filter(n!=0 | lag(n)!=0 | lead(n)!=0) %>% + filter(n != 0 | lag(n) != 0 | lead(n) != 0) %>% ungroup() %>% - mutate(youtlier=n>4600,xoutlier=year<1800) %>% - ggplot(aes(x=year,y=n,color=collection)) + - geom_point(data=~.x %>% filter(measure=="yearly count",youtlier==FALSE),size=0.5) + - geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=5000) + - geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=5000, show.legend=FALSE) + - geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + - geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + mutate(youtlier = n > 4600, xoutlier = year < 1800) %>% + ggplot(aes(x = year, y = n, color = collection)) + + geom_point(data = ~ .x %>% filter(measure == "yearly count", youtlier == FALSE), size = 0.5) + + geom_point(data = ~ .x %>% filter(youtlier == TRUE), aes(x = year), y = 5000) + + geom_text_repel(data = ~ .x %>% filter(youtlier == TRUE), aes(x = year, label = scales::number(n)), y = 5000, show.legend = FALSE) + + geom_point(data = ~ .x %>% filter(xoutlier == TRUE, measure == "yearly count"), aes(x = year, y = n)) + + geom_text_repel(data = ~ .x %>% filter(xoutlier == TRUE, measure == "yearly count"), aes(x = year, y = n, label = scales::number(n)), show.legend = FALSE) + + geom_line(data = ~ .x %>% filter(xoutlier == FALSE, measure == "10 year rolling mean")) + + theme_hsci_discrete(base_family = "Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), - legend.key=element_blank() - ) + - labs(color=NULL) + - coord_cartesian(ylim=c(0,4600),xlim=c(1800,1970),clip="off") + - scale_y_continuous(breaks=seq(0,20000,by=1000),labels=scales::number) + -# ylab("Poems") + + legend.justification = c(0, 1), + legend.position = c(0.02, 0.98), + legend.background = element_blank(), + legend.key = element_blank() + ) + + labs(color = NULL) + + coord_cartesian(ylim = c(0, 4600), xlim = c(1800, 1970), clip = "off") + + scale_y_continuous(breaks = seq(0, 20000, by = 1000), labels = scales::number) + + # ylab("Poems") + ylab("Runojen määrä") + - scale_x_continuous(breaks=seq(1000,2000,by=10)) + -# xlab("Year") + + scale_x_continuous(breaks = seq(1000, 2000, by = 10)) + + # xlab("Year") + xlab("Vuosi") + ggtitle("") # ggtitle("Runojen määrä vuosittain ja kokoelmittain") @@ -73,97 +73,97 @@ p_year %>% ``` ```{r} -top_top_themes <- poem_theme %>% +top_top_themes <- poem_theme %>% inner_join(poems) %>% inner_join(themes_to_top_level_themes) %>% count(collection, ancestor_t_id) %>% group_by(collection) %>% - slice_max(n,n=9) %>% - ungroup() %>% - mutate(top_theme=TRUE) %>% - select(ancestor_t_id,top_theme) %>% - compute_a(temporary=TRUE, overwrite=TRUE) + slice_max(n, n = 9) %>% + ungroup() %>% + mutate(top_theme = TRUE) %>% + select(ancestor_t_id, top_theme) %>% + compute_a(temporary = TRUE, overwrite = TRUE) ``` ```{r} -d <- p_year %>% - inner_join(poems,by=c("p_id")) %>% +d <- p_year %>% + inner_join(poems, by = c("p_id")) %>% inner_join(poem_theme %>% - inner_join(themes_to_top_level_themes %>% - inner_join(themes %>% - filter(!str_detect(theme_id,"^erab_orig")) %>% - select(ancestor_t_id=t_id,ancestor_theme_name=name)))) %>% + inner_join(themes_to_top_level_themes %>% + inner_join(themes %>% + filter(!str_detect(theme_id, "^erab_orig")) %>% + select(ancestor_t_id = t_id, ancestor_theme_name = name)))) %>% left_join(top_top_themes) %>% mutate( - ancestor_theme_name=if_else(!is.na(top_theme),ancestor_theme_name,"Muut"), - ancestor_t_id=if_else(!is.na(top_theme),ancestor_t_id,-1), - ) %>% - replace_na(list(ancestor_theme_name="Tuntematon", ancestor_t_id=-2)) %>% - distinct(ancestor_t_id,ancestor_theme_name, collection, year, p_id) %>% - count(ancestor_t_id,ancestor_theme_name, collection, year) %>% - mutate(measure="yearly count") %>% + ancestor_theme_name = if_else(!is.na(top_theme), ancestor_theme_name, "Muut"), + ancestor_t_id = if_else(!is.na(top_theme), ancestor_t_id, -1), + ) %>% + replace_na(list(ancestor_theme_name = "Tuntematon", ancestor_t_id = -2)) %>% + distinct(ancestor_t_id, ancestor_theme_name, collection, year, p_id) %>% + count(ancestor_t_id, ancestor_theme_name, collection, year) %>% + mutate(measure = "yearly count") %>% union_all( p_year %>% # 10 year rolling mean - distinct(year) %>% - left_join(p_year %>% distinct(year),sql_on="RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>% - inner_join(p_year,by=c("year.y"="year")) %>% - inner_join(poems,by=c("p_id")) %>% + distinct(year) %>% + left_join(p_year %>% distinct(year), sql_on = "RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>% + inner_join(p_year, by = c("year.y" = "year")) %>% + inner_join(poems, by = c("p_id")) %>% inner_join(poem_theme %>% - inner_join(themes_to_top_level_themes %>% - inner_join(themes %>% - filter(!str_detect(theme_id,"^erab_orig")) %>% - select(ancestor_t_id=t_id,ancestor_theme_name=name)))) %>% + inner_join(themes_to_top_level_themes %>% + inner_join(themes %>% + filter(!str_detect(theme_id, "^erab_orig")) %>% + select(ancestor_t_id = t_id, ancestor_theme_name = name)))) %>% left_join(top_top_themes) %>% mutate( - ancestor_theme_name=if_else(!is.na(top_theme),ancestor_theme_name,"Muut"), - ancestor_t_id=if_else(!is.na(top_theme),ancestor_t_id,-1), - ) %>% - replace_na(list(ancestor_theme_name="Tuntematon", ancestor_t_id=-2)) %>% - distinct(ancestor_t_id,ancestor_theme_name, collection, year.x, year.y, p_id) %>% - group_by(ancestor_t_id,ancestor_theme_name, collection, year=year.x) %>% - summarize(n=n()/n_distinct(year.y),.groups="drop") %>% - mutate(measure="10 year rolling mean") + ancestor_theme_name = if_else(!is.na(top_theme), ancestor_theme_name, "Muut"), + ancestor_t_id = if_else(!is.na(top_theme), ancestor_t_id, -1), + ) %>% + replace_na(list(ancestor_theme_name = "Tuntematon", ancestor_t_id = -2)) %>% + distinct(ancestor_t_id, ancestor_theme_name, collection, year.x, year.y, p_id) %>% + group_by(ancestor_t_id, ancestor_theme_name, collection, year = year.x) %>% + summarize(n = n() / n_distinct(year.y), .groups = "drop") %>% + mutate(measure = "10 year rolling mean") ) %>% - filter(collection!="literary",!year %in% c(0L,9999L)) %>% - mutate(year=if_else(year>=1800L,year,1780L)) %>% + filter(collection != "literary", !year %in% c(0L, 9999L)) %>% + mutate(year = if_else(year >= 1800L, year, 1780L)) %>% group_by(ancestor_theme_name, collection, measure, year) %>% - summarise(n=sum(n),.groups="drop") %>% + summarise(n = sum(n), .groups = "drop") %>% collect() ``` ```{r} d %>% - mutate(collection=fct_relevel(str_to_upper(collection),"SKVR","ERAB","JR")) %>% - filter(collection=="SKVR") %>% - complete(ancestor_theme_name, year,collection,measure,fill=list(n=0)) %>% - group_by(ancestor_theme_name, collection,measure) %>% + mutate(collection = fct_relevel(str_to_upper(collection), "SKVR", "ERAB", "JR")) %>% + filter(collection == "SKVR") %>% + complete(ancestor_theme_name, year, collection, measure, fill = list(n = 0)) %>% + group_by(ancestor_theme_name, collection, measure) %>% arrange(year) %>% - filter(n!=0 | lag(n)!=0 | lead(n)!=0) %>% + filter(n != 0 | lag(n) != 0 | lead(n) != 0) %>% ungroup() %>% - mutate(youtlier=n>1300,xoutlier=year<1800) %>% - ggplot(aes(x=year,y=n,color=ancestor_theme_name)) + -# facet_wrap(~collection) + - geom_point(data=~.x %>% filter(measure=="yearly count",youtlier==FALSE,xoutlier==FALSE),size=0.5) + - geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=1400) + - geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=1400, show.legend=FALSE) + - geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + - geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + mutate(youtlier = n > 1300, xoutlier = year < 1800) %>% + ggplot(aes(x = year, y = n, color = ancestor_theme_name)) + + # facet_wrap(~collection) + + geom_point(data = ~ .x %>% filter(measure == "yearly count", youtlier == FALSE, xoutlier == FALSE), size = 0.5) + + geom_point(data = ~ .x %>% filter(youtlier == TRUE), aes(x = year), y = 1400) + + geom_text_repel(data = ~ .x %>% filter(youtlier == TRUE), aes(x = year, label = scales::number(n)), y = 1400, show.legend = FALSE) + + geom_point(data = ~ .x %>% filter(xoutlier == TRUE, measure == "yearly count"), aes(x = 1785, y = n)) + + geom_text_repel(data = ~ .x %>% filter(xoutlier == TRUE, measure == "yearly count"), aes(x = 1785, y = n, label = scales::number(n)), show.legend = FALSE) + + geom_line(data = ~ .x %>% filter(xoutlier == FALSE, measure == "10 year rolling mean")) + + theme_hsci_discrete(base_family = "Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), - legend.key=element_blank() - ) + - labs(color=NULL) + - coord_cartesian(ylim=c(0,1300),xlim=c(1800,1940),clip="off") + - scale_y_continuous(breaks=seq(0,20000,by=500),labels=scales::number) + -# ylab("Poems") + + legend.justification = c(0, 1), + legend.position = c(0.02, 0.98), + legend.background = element_blank(), + legend.key = element_blank() + ) + + labs(color = NULL) + + coord_cartesian(ylim = c(0, 1300), xlim = c(1800, 1940), clip = "off") + + scale_y_continuous(breaks = seq(0, 20000, by = 500), labels = scales::number) + + # ylab("Poems") + ylab("Runojen määrä") + - scale_x_continuous(breaks=seq(1000,2000,by=10)) + -# xlab("Year") + + scale_x_continuous(breaks = seq(1000, 2000, by = 10)) + + # xlab("Year") + xlab("Vuosi") + ggtitle("") # ggtitle("Runojen määrä vuosittain ja kokoelmittain") @@ -172,15 +172,16 @@ d %>% ```{r} d %>% - mutate(collection=fct_relevel(str_to_upper(collection),"SKVR","ERAB")) %>% - filter(collection=="ERAB") %>% - complete(ancestor_theme_name, year,collection,measure,fill=list(n=0)) %>% - group_by(ancestor_theme_name, collection,measure) %>% + mutate(collection = fct_relevel(str_to_upper(collection), "SKVR", "ERAB")) %>% + filter(collection == "ERAB") %>% + complete(ancestor_theme_name, year, collection, measure, fill = list(n = 0)) %>% + group_by(ancestor_theme_name, collection, measure) %>% arrange(year) %>% - filter(n!=0 | lag(n)!=0 | lead(n)!=0) %>% + filter(n != 0 | lag(n) != 0 | lead(n) != 0) %>% ungroup() %>% - mutate(youtlier=n>820,xoutlier=year<1800) %>% - mutate(ancestor_theme_name=case_match(ancestor_theme_name, + mutate(youtlier = n > 820, xoutlier = year < 1800) %>% + mutate(ancestor_theme_name = case_match( + ancestor_theme_name, "Laulud noorrahva elust" ~ "Laulut nuorison elämästä (Laulud noorrahva elust)", "Muut" ~ "Muut (sisältää 17 luokkaa)", "Laulud meelelahutamiseks" ~ "Viihdytyslaulut (Laulud meelelahutamiseks)", @@ -193,31 +194,31 @@ d %>% "Laulud abielust" ~ "Laulut avioelämästä (Laulud abielust)", "Kalendrilaulud" ~ "Kalendaarilaulut (Kalendrilaulud)" )) %>% - mutate(ancestor_theme_name=fct_reorder(ancestor_theme_name,n,.fun=sum,.desc=TRUE)) %>% - mutate(ancestor_theme_name=fct_relevel(ancestor_theme_name, "Muut (sisältää 17 luokkaa)", after=Inf)) %>% - ggplot(aes(x=year,y=n,color=ancestor_theme_name)) + -# facet_wrap(~collection) + - geom_point(data=~.x %>% filter(measure=="yearly count",youtlier==FALSE,xoutlier==FALSE),size=0.5) + - geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=900) + - geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=900, show.legend=FALSE) + - geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + - geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + mutate(ancestor_theme_name = fct_reorder(ancestor_theme_name, n, .fun = sum, .desc = TRUE)) %>% + mutate(ancestor_theme_name = fct_relevel(ancestor_theme_name, "Muut (sisältää 17 luokkaa)", after = Inf)) %>% + ggplot(aes(x = year, y = n, color = ancestor_theme_name)) + + # facet_wrap(~collection) + + geom_point(data = ~ .x %>% filter(measure == "yearly count", youtlier == FALSE, xoutlier == FALSE), size = 0.5) + + geom_point(data = ~ .x %>% filter(youtlier == TRUE), aes(x = year), y = 900) + + geom_text_repel(data = ~ .x %>% filter(youtlier == TRUE), aes(x = year, label = scales::number(n)), y = 900, show.legend = FALSE) + + geom_point(data = ~ .x %>% filter(xoutlier == TRUE, measure == "yearly count"), aes(x = 1785, y = n)) + + geom_text_repel(data = ~ .x %>% filter(xoutlier == TRUE, measure == "yearly count"), aes(x = 1785, y = n, label = scales::number(n)), show.legend = FALSE) + + geom_line(data = ~ .x %>% filter(xoutlier == FALSE, measure == "10 year rolling mean")) + + theme_hsci_discrete(base_family = "Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), - legend.key=element_blank() - ) + - labs(color=NULL) + - coord_cartesian(ylim=c(0,820),xlim=c(1820,1950),clip="off") + - scale_y_continuous(breaks=seq(0,20000,by=200),labels=scales::number) + -# ylab("Poems") + + legend.justification = c(0, 1), + legend.position = c(0.02, 0.98), + legend.background = element_blank(), + legend.key = element_blank() + ) + + labs(color = NULL) + + coord_cartesian(ylim = c(0, 820), xlim = c(1820, 1950), clip = "off") + + scale_y_continuous(breaks = seq(0, 20000, by = 200), labels = scales::number) + + # ylab("Poems") + ylab("Runojen määrä") + -# guides(color=guide_legend(nrow=2)) + - scale_x_continuous(breaks=seq(1000,2000,by=10)) + -# xlab("Year") + + # guides(color=guide_legend(nrow=2)) + + scale_x_continuous(breaks = seq(1000, 2000, by = 10)) + + # xlab("Year") + xlab("Vuosi") + ggtitle("") # ggtitle("Runojen määrä vuosittain ja kokoelmittain") @@ -226,36 +227,36 @@ d %>% ```{r} d %>% - mutate(collection=fct_relevel(str_to_upper(collection),"SKVR","ERAB","JR")) %>% - filter(collection=="JR") %>% - complete(ancestor_theme_name, year,collection,measure,fill=list(n=0)) %>% - group_by(ancestor_theme_name, collection,measure) %>% + mutate(collection = fct_relevel(str_to_upper(collection), "SKVR", "ERAB", "JR")) %>% + filter(collection == "JR") %>% + complete(ancestor_theme_name, year, collection, measure, fill = list(n = 0)) %>% + group_by(ancestor_theme_name, collection, measure) %>% arrange(year) %>% - filter(n!=0 | lag(n)!=0 | lead(n)!=0) %>% + filter(n != 0 | lag(n) != 0 | lead(n) != 0) %>% ungroup() %>% - mutate(youtlier=n>6500,xoutlier=year<1800) %>% - ggplot(aes(x=year,y=n,color=ancestor_theme_name)) + -# facet_wrap(~collection) + - geom_point(data=~.x %>% filter(measure=="yearly count",youtlier==FALSE),size=0.5) + - geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=7200) + - geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=7200, show.legend=FALSE) + - geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + - geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + mutate(youtlier = n > 6500, xoutlier = year < 1800) %>% + ggplot(aes(x = year, y = n, color = ancestor_theme_name)) + + # facet_wrap(~collection) + + geom_point(data = ~ .x %>% filter(measure == "yearly count", youtlier == FALSE), size = 0.5) + + geom_point(data = ~ .x %>% filter(youtlier == TRUE), aes(x = year), y = 7200) + + geom_text_repel(data = ~ .x %>% filter(youtlier == TRUE), aes(x = year, label = scales::number(n)), y = 7200, show.legend = FALSE) + + geom_point(data = ~ .x %>% filter(xoutlier == TRUE, measure == "yearly count"), aes(x = year, y = n)) + + geom_text_repel(data = ~ .x %>% filter(xoutlier == TRUE, measure == "yearly count"), aes(x = year, y = n, label = scales::number(n)), show.legend = FALSE) + + geom_line(data = ~ .x %>% filter(xoutlier == FALSE, measure == "10 year rolling mean")) + + theme_hsci_discrete(base_family = "Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), - legend.key=element_blank() - ) + - labs(color=NULL) + - coord_cartesian(ylim=c(0,6500),xlim=c(1800,1960),clip="off") + - scale_y_continuous(breaks=seq(0,20000,by=1000),labels=scales::number) + -# ylab("Poems") + + legend.justification = c(0, 1), + legend.position = c(0.02, 0.98), + legend.background = element_blank(), + legend.key = element_blank() + ) + + labs(color = NULL) + + coord_cartesian(ylim = c(0, 6500), xlim = c(1800, 1960), clip = "off") + + scale_y_continuous(breaks = seq(0, 20000, by = 1000), labels = scales::number) + + # ylab("Poems") + ylab("Runojen määrä") + - scale_x_continuous(breaks=seq(1000,2000,by=10)) + -# xlab("Year") + + scale_x_continuous(breaks = seq(1000, 2000, by = 10)) + + # xlab("Year") + xlab("Vuosi") + ggtitle("") # ggtitle("Runojen määrä vuosittain ja kokoelmittain") @@ -264,49 +265,49 @@ d %>% ```{r} -p_year %>% - filter(year %in% c(0,9999)) %>% - left_join(poems) %>% - count(collection,year) %>% +p_year %>% + filter(year %in% c(0, 9999)) %>% + left_join(poems) %>% + count(collection, year) %>% ungroup() %>% gt() %>% - tab_header(title="Abnormal years") %>% + tab_header(title = "Abnormal years") %>% fmt_integer(n) ``` # Overview of collectors ```{r collectors_overview, fig.width=8, fig.height=11} -poems %>% +poems %>% distinct(collection) %>% pull() %>% - map(~p_col %>% - inner_join(poems %>% filter(collection==.x),by=c("p_id")) %>% + map(~ p_col %>% + inner_join(poems %>% filter(collection == .x), by = c("p_id")) %>% count(col_id) %>% - left_join(collectors,by=c("col_id")) %>% - select(col_id,name,n) %>% + left_join(collectors, by = c("col_id")) %>% + select(col_id, name, n) %>% collect() %>% - mutate(col_id=fct_reorder(str_c(col_id,"|",name),n)) %>% - mutate(col_id=fct_lump_n(col_id,n=100,w=n)) %>% - mutate(col_id=fct_relevel(col_id,"Other")) %>% + mutate(col_id = fct_reorder(str_c(col_id, "|", name), n)) %>% + mutate(col_id = fct_lump_n(col_id, n = 100, w = n)) %>% + mutate(col_id = fct_relevel(col_id, "Other")) %>% group_by(col_id) %>% - tally(wt=n) %>% { - ggplot(.,aes(x=col_id,y=n)) + - geom_col() + - geom_text(aes(label=p(n)),hjust='left',nudge_y = 100) + - theme_hsci_discrete(base_family="Arial") + - coord_flip() + - labs(title=str_c("Collectors in ",.x)) - } - ) + tally(wt = n) %>% + { + ggplot(., aes(x = col_id, y = n)) + + geom_col() + + geom_text(aes(label = p(n)), hjust = "left", nudge_y = 100) + + theme_hsci_discrete(base_family = "Arial") + + coord_flip() + + labs(title = str_c("Collectors in ", .x)) + }) ``` ```{r} -p_col %>% +p_col %>% anti_join(collectors) %>% count(col_id) %>% gt() %>% - tab_header(title="Collectors without a name") %>% + tab_header(title = "Collectors without a name") %>% fmt_integer(n) ``` @@ -314,73 +315,72 @@ p_col %>% p_col %>% inner_join(collectors) %>% inner_join(poems) %>% - filter(collection!="literary") %>% - mutate(collection=str_to_upper(collection)) %>% - count(collection,col_id,name) %>% + filter(collection != "literary") %>% + mutate(collection = str_to_upper(collection)) %>% + count(collection, col_id, name) %>% group_by(collection) %>% - slice_max(order_by=n,n=10) %>% + slice_max(order_by = n, n = 10) %>% ungroup() %>% select(-col_id) %>% gt(groupname_col = "collection", rowname_col = "name") %>% - row_group_order(c("ERAB","SKVR","JR")) %>% - fmt_integer(n,sep_mark=" ") + row_group_order(c("ERAB", "SKVR", "JR")) %>% + fmt_integer(n, sep_mark = " ") ``` ```{r,fig.height=3} p_col %>% inner_join(collectors) %>% inner_join(poems) %>% - filter(collection!="literary") %>% - mutate(collection=str_to_upper(collection)) %>% - count(collection,col_id,name) %>% + filter(collection != "literary") %>% + mutate(collection = str_to_upper(collection)) %>% + count(collection, col_id, name) %>% group_by(collection) %>% - slice_max(order_by=n,n=10) %>% + slice_max(order_by = n, n = 10) %>% ungroup() %>% inner_join(p_col) %>% inner_join(poem_theme) %>% inner_join(themes_to_top_level_themes) %>% - count(collection, col_id, t_id=ancestor_t_id) %>% - inner_join(themes %>% rename(theme_name=name)) %>% + count(collection, col_id, t_id = ancestor_t_id) %>% + inner_join(themes %>% rename(theme_name = name)) %>% inner_join(collectors) %>% - filter(collection=="SKVR") %>% - collect() %>% -# group_by(collection) %>% -# mutate(theme_name=fct_lump_n(theme_name, n, n=5, other_level="Muut")) %>% -# ungroup() %>% + filter(collection == "SKVR") %>% + collect() %>% + # group_by(collection) %>% + # mutate(theme_name=fct_lump_n(theme_name, n, n=5, other_level="Muut")) %>% + # ungroup() %>% group_by(col_id) %>% - mutate(tn=sum(n)) %>% + mutate(tn = sum(n)) %>% ungroup() %>% - mutate(name=fct_reorder(name,tn)) %>% - ggplot(aes(x=name,fill=theme_name,y=n)) + -# facet_wrap(~collection,scales="free",ncol=1) + + mutate(name = fct_reorder(name, tn)) %>% + ggplot(aes(x = name, fill = theme_name, y = n)) + + # facet_wrap(~collection,scales="free",ncol=1) + geom_col() + theme_hsci_discrete() + coord_flip() + - scale_y_continuous(labels=scales::number) + - labs(fill="Päätyyppi") + + scale_y_continuous(labels = scales::number) + + labs(fill = "Päätyyppi") + xlab("Kerääjä") + ylab("Tyyppimerkintöjä") - ``` # Geographical overview ```{r} -d <- p_loc %>% - count(loc_id) %>% +d <- p_loc %>% + count(loc_id) %>% inner_join(locations) %>% - select(name,n) %>% + select(name, n) %>% collect() -poems_without_location <- poems %>% - anti_join(p_loc) %>% - count() %>% +poems_without_location <- poems %>% + anti_join(p_loc) %>% + count() %>% pull() unprojected_locations <- d %>% anti_join(polygons) %>% - add_row(name=NA,n=poems_without_location) + add_row(name = NA, n = poems_without_location) ``` @@ -388,8 +388,8 @@ unprojected_locations <- d %>% polygons %>% left_join(d) %>% tm_shape() + - tm_polygons(col='n', id='name', style='fisher', palette='plasma') + - tm_layout(title=str_c("Geographical overview. Missing ",unprojected_locations %>% tally(wt=n) %>% pull() %>% p," poems.")) + tm_polygons(col = "n", id = "name", style = "fisher", palette = "plasma") + + tm_layout(title = str_c("Geographical overview. Missing ", unprojected_locations %>% tally(wt = n) %>% pull() %>% p(), " poems.")) ``` ## Poem locations not mapped @@ -405,19 +405,19 @@ unprojected_locations %>% ## Geographical overview by collection ```{r} -d <- p_loc %>% +d <- p_loc %>% left_join(poems) %>% - count(collection,loc_id) %>% + count(collection, loc_id) %>% ungroup() %>% inner_join(locations) %>% - select(collection,name,n) %>% + select(collection, name, n) %>% collect() -poems_without_location <- poems %>% - anti_join(p_loc) %>% - count(collection) %>% +poems_without_location <- poems %>% + anti_join(p_loc) %>% + count(collection) %>% collect() %>% - mutate(name=NA_character_) + mutate(name = NA_character_) unprojected_locations <- d %>% anti_join(polygons) %>% @@ -425,52 +425,50 @@ unprojected_locations <- d %>% ``` ```{r, fig.height=11} -poems %>% +poems %>% distinct(collection) %>% pull() %>% map(~ tm_shape( polygons %>% left_join( - p_loc %>% - inner_join(poems %>% filter(collection==.x),by=c("p_id")) %>% - count(loc_id) %>% + p_loc %>% + inner_join(poems %>% filter(collection == .x), by = c("p_id")) %>% + count(loc_id) %>% inner_join(locations) %>% - select(name,n) %>% + select(name, n) %>% collect() ) ) + - tm_polygons(col='n', id='name', style='fisher', palette='plasma') + - tm_layout(title=str_c("Geography of ",.x,". Missing ",unprojected_locations %>% filter(collection==.x) %>% tally(wt=n) %>% pull() %>% p," poems.")) - ) + tm_polygons(col = "n", id = "name", style = "fisher", palette = "plasma") + + tm_layout(title = str_c("Geography of ", .x, ". Missing ", unprojected_locations %>% filter(collection == .x) %>% tally(wt = n) %>% pull() %>% p(), " poems."))) ``` ## Poem locations not mapped by collection ```{r, results="asis"} -poems %>% +poems %>% distinct(collection) %>% pull() %>% map(~ unprojected_locations %>% - filter(collection==.x) %>% + filter(collection == .x) %>% arrange(desc(n)) %>% select(-collection) %>% gt() %>% - tab_header(str_c("Poem locations not mapped in ",.x)) %>% - fmt_integer(n) - ) + tab_header(str_c("Poem locations not mapped in ", .x)) %>% + fmt_integer(n)) ``` # Informants ```{r} -raw_meta %>% - filter(field=="INF") %>% - mutate(value_c=str_replace_all(value,"^\\s*[A-Za-zÅÄÖåäö][a-zåäö][a-zåäö]+\\.","")) %>% - mutate(name=str_replace_all(value_c,"\\s*\"*([A-Za-zÅÄÖåäö]?[a-zåäö]?[a-zåäö]?\\.?[^;.,]+)(.|\\n)*","\\1")) %>% - group_by(name) %>% - summarise(origs=str_flatten(sql("distinct value"),collapse="|"),n=n(),.groups="drop") %>% +raw_meta %>% + filter(field == "INF") %>% + mutate(value_c = str_replace_all(value, "^\\s*[A-Za-zÅÄÖåäö][a-zåäö][a-zåäö]+\\.", "")) %>% + mutate(name = str_replace_all(value_c, "\\s*\"*([A-Za-zÅÄÖåäö]?[a-zåäö]?[a-zåäö]?\\.?[^;.,]+)(.|\\n)*", "\\1")) %>% + group_by(name) %>% + summarise(origs = str_flatten(sql("distinct value"), collapse = "|"), n = n(), .groups = "drop") %>% collect() ``` @@ -478,26 +476,27 @@ raw_meta %>% # Poem types ```{r} -poems %>% - filter(collection!="literary") %>% - left_join(poem_theme %>% filter(is_minor==0) %>% inner_join(themes %>% mutate(theme_type=if_else(str_detect(theme_id,"^erab_orig"),"Non-unified","Unified")))) %>% - group_by(collection,p_id) %>% - summarise(theme_type=case_when( - any(theme_type=="Unified") ~ "Systematisoituja", - any(theme_type=="Non-unified") ~ "Vain ei-systematisoituja", - T ~ "Ei annotointeja"), .groups="drop") %>% - count(collection,theme_type) %>% +poems %>% + filter(collection != "literary") %>% + left_join(poem_theme %>% filter(is_minor == 0) %>% inner_join(themes %>% mutate(theme_type = if_else(str_detect(theme_id, "^erab_orig"), "Non-unified", "Unified")))) %>% + group_by(collection, p_id) %>% + summarise(theme_type = case_when( + any(theme_type == "Unified") ~ "Systematisoituja", + any(theme_type == "Non-unified") ~ "Vain ei-systematisoituja", + T ~ "Ei annotointeja" + ), .groups = "drop") %>% + count(collection, theme_type) %>% collect() %>% - mutate(collection=fct_rev(fct_relevel(str_to_upper(collection),"ERAB","SKVR","JR")),theme_type=fct_rev(fct_relevel(theme_type,"Systematisoituja","Vain ei-systematisoituja","Ei annotointeja"))) %>% - ggplot(aes(x=collection,y=n,fill=theme_type)) + - geom_col() + - theme_hsci_discrete() + + mutate(collection = fct_rev(fct_relevel(str_to_upper(collection), "ERAB", "SKVR", "JR")), theme_type = fct_rev(fct_relevel(theme_type, "Systematisoituja", "Vain ei-systematisoituja", "Ei annotointeja"))) %>% + ggplot(aes(x = collection, y = n, fill = theme_type)) + + geom_col() + + theme_hsci_discrete() + xlab("Kokoelma") + ylab("Runoja") + - labs(fill="Runotyyppiannotaatiot") + - theme(legend.position="bottom") + + labs(fill = "Runotyyppiannotaatiot") + + theme(legend.position = "bottom") + guides(fill = guide_legend(reverse = TRUE)) + - scale_y_continuous(labels=scales::number) + + scale_y_continuous(labels = scales::number) + coord_flip() ``` @@ -506,26 +505,26 @@ poems %>% ```{r} d <- poems %>% - left_join(p_year %>% mutate(year=if_else(year %in% c(0L,9999L),NA,year))) %>% + left_join(p_year %>% mutate(year = if_else(year %in% c(0L, 9999L), NA, year))) %>% collect() %>% - mutate(year_ntile=ntile(year,11)) %>% + mutate(year_ntile = ntile(year, 11)) %>% group_by(year_ntile) %>% - mutate(years=str_c(min(year),"-",max(year))) %>% + mutate(years = str_c(min(year), "-", max(year))) %>% ungroup() %>% - left_join(p_loc %>% collect()) %>% - count(years,loc_id) %>% + left_join(p_loc %>% collect()) %>% + count(years, loc_id) %>% ungroup() %>% - left_join(locations %>% select(loc_id,name) %>% collect()) + left_join(locations %>% select(loc_id, name) %>% collect()) ``` ```{r,fig.height=11, results="asis"} -polygons %>% - left_join(d %>% complete(name,years)) %>% +polygons %>% + left_join(d %>% complete(name, years)) %>% tm_shape() + - tm_polygons(col='n', id='name', style='fisher', palette='plasma') + - tm_layout(main.title="Geographical overviews by time",legend.outside.size=0.1) + - tm_facets(by="years",ncol=4) + tm_polygons(col = "n", id = "name", style = "fisher", palette = "plasma") + + tm_layout(main.title = "Geographical overviews by time", legend.outside.size = 0.1) + + tm_facets(by = "years", ncol = 4) ``` # Poem length statistics @@ -534,36 +533,36 @@ polygons %>% ```{r} poem_stats %>% - filter(nverses<=75) %>% + filter(nverses <= 75) %>% inner_join(poems) %>% - count(collection,nverses) %>% + count(collection, nverses) %>% ungroup() %>% - ggplot(aes(x=nverses,y=n)) + - geom_col(width=1) + - facet_wrap(~collection,scales="free_y") + - theme_hsci_discrete(base_family="Arial") + - scale_y_continuous(labels=scales::comma_format()) + + ggplot(aes(x = nverses, y = n)) + + geom_col(width = 1) + + facet_wrap(~collection, scales = "free_y") + + theme_hsci_discrete(base_family = "Arial") + + scale_y_continuous(labels = scales::comma_format()) + xlab("Number of verse lines") + ylab("Poems") + - labs(title="Number of verse lines") + labs(title = "Number of verse lines") ``` ```{r} poem_stats %>% inner_join(poems) %>% - count(collection,nverses) %>% + count(collection, nverses) %>% ungroup() %>% group_by(collection) %>% - mutate(prop=n/sum(n)) %>% + mutate(prop = n / sum(n)) %>% ungroup() %>% - filter(nverses<=75) %>% - ggplot(aes(x=nverses,y=collection,fill=collection,height=prop)) + - geom_density_ridges(stat='identity') + - theme_hsci_discrete(base_family="Arial") + -# scale_y_continuous(labels=scales::percent_format()) + + filter(nverses <= 75) %>% + ggplot(aes(x = nverses, y = collection, fill = collection, height = prop)) + + geom_density_ridges(stat = "identity") + + theme_hsci_discrete(base_family = "Arial") + + # scale_y_continuous(labels=scales::percent_format()) + xlab("Number of verse lines") + ylab("Poems") + - labs(title="Number of verse lines") + labs(title = "Number of verse lines") ``` ### Poems with more than 75 verse lines @@ -571,13 +570,13 @@ poem_stats %>% ```{r} poem_stats %>% inner_join(poems) %>% - count(collection,nverses) %>% - mutate(nl=if_else(nverses>75,n,0L)) %>% + count(collection, nverses) %>% + mutate(nl = if_else(nverses > 75, n, 0L)) %>% group_by(collection) %>% - summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups="drop") %>% + summarise(lines = sum(nl), proportion = sum(nl) / sum(n), .groups = "drop") %>% arrange(desc(lines)) %>% gt() %>% - tab_header(title="Poems with more than 75 verse lines") %>% + tab_header(title = "Poems with more than 75 verse lines") %>% fmt_integer(lines) %>% fmt_percent(proportion) ``` @@ -585,27 +584,27 @@ poem_stats %>% ## By county ```{r,fig.height=11} -poem_stats %>% - left_join(p_loc) %>% - left_join(locations) %>% - left_join(locations,by=c("par_id"="loc_id")) %>% - mutate(name=if_else(type.x=="county",name.x,name.y)) %>% - count(name,nverses) %>% +poem_stats %>% + left_join(p_loc) %>% + left_join(locations) %>% + left_join(locations, by = c("par_id" = "loc_id")) %>% + mutate(name = if_else(type.x == "county", name.x, name.y)) %>% + count(name, nverses) %>% ungroup() %>% group_by(name) %>% - mutate(prop=n/sum(n)) %>% + mutate(prop = n / sum(n)) %>% ungroup() %>% - filter(nverses<=40,name!="Ahvenanmaa") %>% + filter(nverses <= 40, name != "Ahvenanmaa") %>% collect() %>% - mutate(name=fct_reorder(name,prop,.fun=max)) %>% - ggplot(aes(x=nverses,y=name,height=prop)) + - geom_density_ridges(stat='identity') + - theme_hsci_continuous(base_family="Arial") + -# scale_y_continuous(labels=scales::percent_format()) + + mutate(name = fct_reorder(name, prop, .fun = max)) %>% + ggplot(aes(x = nverses, y = name, height = prop)) + + geom_density_ridges(stat = "identity") + + theme_hsci_continuous(base_family = "Arial") + + # scale_y_continuous(labels=scales::percent_format()) + xlab("Number of verse lines") + ylab("Poems") + - guides(fill="none") + - labs(title="Number of verse lines by county") + guides(fill = "none") + + labs(title = "Number of verse lines by county") ``` # Poem verse statistics @@ -615,20 +614,20 @@ poem_stats %>% ### Line types ```{r} -d <- verses %>% - left_join(verse_poem) %>% - left_join(poems) %>% - count(collection,type) %>% +d <- verses %>% + left_join(verse_poem) %>% + left_join(poems) %>% + count(collection, type) %>% ungroup() %>% - arrange(collection,desc(n)) %>% + arrange(collection, desc(n)) %>% collect() ``` ```{r} -d %>% +d %>% group_by(collection) %>% - mutate(proportion=n/sum(n)) %>% + mutate(proportion = n / sum(n)) %>% gt() %>% fmt_integer(n) %>% fmt_percent(proportion) @@ -638,64 +637,64 @@ d %>% ```{r} d_nr_characters <- verses_cl %>% - mutate(nr_characters=str_length(text)) %>% - left_join(verse_poem) %>% - left_join(poems) %>% - count(collection,nr_characters) %>% + mutate(nr_characters = str_length(text)) %>% + left_join(verse_poem) %>% + left_join(poems) %>% + count(collection, nr_characters) %>% ungroup() %>% - arrange(collection,desc(n)) %>% + arrange(collection, desc(n)) %>% collect() d_nr_words <- word_occ %>% group_by(v_id) %>% - summarise(nr_words=max(pos),.groups="drop") %>% + summarise(nr_words = max(pos), .groups = "drop") %>% left_join(verse_poem) %>% - left_join(poems) %>% - count(collection,nr_words) %>% + left_join(poems) %>% + count(collection, nr_words) %>% ungroup() %>% - arrange(collection,desc(n)) %>% + arrange(collection, desc(n)) %>% collect() ``` #### Verse line lengths in characters ```{r} -d_nr_characters %>% - filter(nr_characters<=60) %>% - ggplot(aes(x=nr_characters,y=n)) + - geom_col(width=1) + - facet_wrap(~collection,scales="free_y") + - theme_hsci_discrete(base_family="Arial") + - scale_y_continuous(labels=scales::comma_format()) + +d_nr_characters %>% + filter(nr_characters <= 60) %>% + ggplot(aes(x = nr_characters, y = n)) + + geom_col(width = 1) + + facet_wrap(~collection, scales = "free_y") + + theme_hsci_discrete(base_family = "Arial") + + scale_y_continuous(labels = scales::comma_format()) + xlab("Number of characters") + ylab("Verses") + - labs(title="Number of characters in verse lines") + labs(title = "Number of characters in verse lines") ``` ```{r} -d_nr_characters %>% +d_nr_characters %>% group_by(collection) %>% - mutate(prop=n/sum(n)) %>% + mutate(prop = n / sum(n)) %>% ungroup() %>% - filter(nr_characters<=60) %>% - ggplot(aes(x=nr_characters,y=collection,fill=collection,height=prop)) + - geom_density_ridges(stat='identity') + - theme_hsci_discrete(base_family="Arial") + -# scale_y_continuous(labels=scales::percent_format()) + + filter(nr_characters <= 60) %>% + ggplot(aes(x = nr_characters, y = collection, fill = collection, height = prop)) + + geom_density_ridges(stat = "identity") + + theme_hsci_discrete(base_family = "Arial") + + # scale_y_continuous(labels=scales::percent_format()) + xlab("Number of characters") + ylab("Verses") + - labs(title="Number of characters in verse lines") + labs(title = "Number of characters in verse lines") ``` #### Verse lines with more than 60 characters ```{r} -d_nr_characters %>% - mutate(nl=if_else(nr_characters>60,n,0L)) %>% +d_nr_characters %>% + mutate(nl = if_else(nr_characters > 60, n, 0L)) %>% group_by(collection) %>% - summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups="drop") %>% + summarise(lines = sum(nl), proportion = sum(nl) / sum(n), .groups = "drop") %>% arrange(desc(lines)) %>% gt() %>% - tab_header(title="Verse lines with more than 60 characters") %>% + tab_header(title = "Verse lines with more than 60 characters") %>% fmt_integer(lines) %>% fmt_percent(proportion) ``` @@ -703,64 +702,64 @@ d_nr_characters %>% #### Verse line lengths in words ```{r} -d_nr_words %>% - filter(nr_words<=10) %>% - ggplot(aes(x=nr_words,y=n)) + - geom_col(width=1) + - facet_wrap(~collection,scales="free_y") + - scale_x_continuous(breaks=seq(0,10,by=2)) + - scale_y_continuous(labels=scales::comma_format()) + - theme_hsci_discrete(base_family="Arial") + +d_nr_words %>% + filter(nr_words <= 10) %>% + ggplot(aes(x = nr_words, y = n)) + + geom_col(width = 1) + + facet_wrap(~collection, scales = "free_y") + + scale_x_continuous(breaks = seq(0, 10, by = 2)) + + scale_y_continuous(labels = scales::comma_format()) + + theme_hsci_discrete(base_family = "Arial") + xlab("Number of words") + ylab("Verses") + - labs(title="Number of words in verse lines") + labs(title = "Number of words in verse lines") ``` ```{r} -d_nr_words %>% - filter(nr_words<=10) %>% +d_nr_words %>% + filter(nr_words <= 10) %>% uncount(n) %>% - ggplot(aes(x=nr_words,y=collection,fill=collection)) + - stat_binline(binwidth=1) + - theme_hsci_discrete(base_family="Arial") + - scale_x_continuous(breaks=seq(0,10,by=2)) + + ggplot(aes(x = nr_words, y = collection, fill = collection)) + + stat_binline(binwidth = 1) + + theme_hsci_discrete(base_family = "Arial") + + scale_x_continuous(breaks = seq(0, 10, by = 2)) + xlab("Number of words") + ylab("Verses") + -# scale_y_continuous(labels=scales::percent_format()) + - labs(title="Number of words in verse lines") + # scale_y_continuous(labels=scales::percent_format()) + + labs(title = "Number of words in verse lines") ``` #### Verse lines with more than 10 words ```{r} -d_nr_words %>% - mutate(nl=if_else(nr_words>10,n,0L)) %>% +d_nr_words %>% + mutate(nl = if_else(nr_words > 10, n, 0L)) %>% group_by(collection) %>% - summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups="drop") %>% + summarise(lines = sum(nl), proportion = sum(nl) / sum(n), .groups = "drop") %>% arrange(desc(lines)) %>% gt() %>% - tab_header(title="Verse lines with more than 10 words") %>% + tab_header(title = "Verse lines with more than 10 words") %>% fmt_integer(lines) %>% fmt_percent(proportion) ``` ```{r} -verse_nr_words <- word_occ %>% +verse_nr_words <- word_occ %>% group_by(v_id) %>% - summarise(nr_words=max(pos)) %>% - compute_a(unique_indexes=list(c("v_id","nr_words"))) + summarise(nr_words = max(pos)) %>% + compute_a(unique_indexes = list(c("v_id", "nr_words"))) word_nr_characters <- words %>% - mutate(nr_characters=str_length(text)) %>% - select(w_id,nr_characters) %>% - compute_a(unique_indexes=list(c("w_id","nr_characters"))) + mutate(nr_characters = str_length(text)) %>% + select(w_id, nr_characters) %>% + compute_a(unique_indexes = list(c("w_id", "nr_characters"))) d <- word_occ %>% left_join(word_nr_characters) %>% left_join(verse_nr_words) %>% - left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% - left_join(poems) %>% - count(collection,nr_words,pos,nr_characters) %>% + left_join(verse_poem %>% select(-pos), by = c("v_id")) %>% + left_join(poems) %>% + count(collection, nr_words, pos, nr_characters) %>% collect() ``` @@ -770,126 +769,126 @@ d <- word_occ %>% ```{r} d_nr_characters <- verses_cl %>% - mutate(nr_characters=str_length(text)) %>% - left_join(verse_poem) %>% - left_join(p_loc) %>% - left_join(locations) %>% - left_join(locations,by=c("par_id"="loc_id")) %>% - mutate(name=if_else(type.x=="county",name.x,name.y)) %>% - count(name,nr_characters) %>% + mutate(nr_characters = str_length(text)) %>% + left_join(verse_poem) %>% + left_join(p_loc) %>% + left_join(locations) %>% + left_join(locations, by = c("par_id" = "loc_id")) %>% + mutate(name = if_else(type.x == "county", name.x, name.y)) %>% + count(name, nr_characters) %>% ungroup() %>% - arrange(name,desc(n)) %>% + arrange(name, desc(n)) %>% collect() d_nr_words <- word_occ %>% group_by(v_id) %>% - summarise(nr_words=max(pos),.groups="drop") %>% + summarise(nr_words = max(pos), .groups = "drop") %>% left_join(verse_poem) %>% - left_join(p_loc) %>% - left_join(locations) %>% - left_join(locations,by=c("par_id"="loc_id")) %>% - mutate(name=if_else(type.x=="county",name.x,name.y)) %>% - count(name,nr_words) %>% + left_join(p_loc) %>% + left_join(locations) %>% + left_join(locations, by = c("par_id" = "loc_id")) %>% + mutate(name = if_else(type.x == "county", name.x, name.y)) %>% + count(name, nr_words) %>% ungroup() %>% - arrange(name,desc(n)) %>% + arrange(name, desc(n)) %>% collect() ``` #### Verse line lengths in characters ```{r} -d_nr_characters %>% +d_nr_characters %>% group_by(name) %>% - mutate(prop=n/sum(n)) %>% + mutate(prop = n / sum(n)) %>% ungroup() %>% - filter(nr_characters<=40,name!="Ahvenanmaa") %>% - mutate(name=fct_reorder(name,prop,.fun=max)) %>% - ggplot(aes(x=nr_characters,y=name,height=prop)) + - geom_density_ridges(stat='identity') + - theme_hsci_discrete(base_family="Arial") + -# scale_y_continuous(labels=scales::percent_format()) + + filter(nr_characters <= 40, name != "Ahvenanmaa") %>% + mutate(name = fct_reorder(name, prop, .fun = max)) %>% + ggplot(aes(x = nr_characters, y = name, height = prop)) + + geom_density_ridges(stat = "identity") + + theme_hsci_discrete(base_family = "Arial") + + # scale_y_continuous(labels=scales::percent_format()) + xlab("Number of characters") + ylab("Verses") + - labs(title="Number of characters in verse lines") + labs(title = "Number of characters in verse lines") ``` #### Verse line lengths in words ```{r,fig.height=11} -d_nr_words %>% - filter(nr_words<8,name!="Ahvenanmaa") %>% - mutate(name=fct_reorder(name,n,.fun=max)) %>% +d_nr_words %>% + filter(nr_words < 8, name != "Ahvenanmaa") %>% + mutate(name = fct_reorder(name, n, .fun = max)) %>% uncount(n) %>% - ggplot(aes(x=nr_words,y=name)) + - stat_binline(binwidth=1,scale=0.9) + - theme_hsci_discrete(base_family="Arial") + - scale_x_continuous(breaks=seq(0,10,by=2)) + + ggplot(aes(x = nr_words, y = name)) + + stat_binline(binwidth = 1, scale = 0.9) + + theme_hsci_discrete(base_family = "Arial") + + scale_x_continuous(breaks = seq(0, 10, by = 2)) + xlab("Number of words") + ylab("Verses") + -# scale_y_continuous(labels=scales::percent_format()) + - labs(title="Number of words in verse lines") + # scale_y_continuous(labels=scales::percent_format()) + + labs(title = "Number of words in verse lines") ``` ## Number of characters in words by their position ```{r} -verse_nr_words <- word_occ %>% +verse_nr_words <- word_occ %>% group_by(v_id) %>% - summarise(nr_words=max(pos)) %>% - compute_a(unique_indexes=list(c("v_id","nr_words"))) + summarise(nr_words = max(pos)) %>% + compute_a(unique_indexes = list(c("v_id", "nr_words"))) word_nr_characters <- words %>% - mutate(nr_characters=str_length(text)) %>% - select(w_id,nr_characters) %>% - compute_a(unique_indexes=list(c("w_id","nr_characters"))) + mutate(nr_characters = str_length(text)) %>% + select(w_id, nr_characters) %>% + compute_a(unique_indexes = list(c("w_id", "nr_characters"))) d <- word_occ %>% left_join(word_nr_characters) %>% left_join(verse_nr_words) %>% - left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% - left_join(poems) %>% - count(collection,nr_words,pos,nr_characters) %>% + left_join(verse_poem %>% select(-pos), by = c("v_id")) %>% + left_join(poems) %>% + count(collection, nr_words, pos, nr_characters) %>% collect() ``` ```{r} d %>% - group_by(collection,nr_words,pos) %>% - mutate(prop=n/sum(n)) %>% + group_by(collection, nr_words, pos) %>% + mutate(prop = n / sum(n)) %>% ungroup() %>% - filter(nr_words>=2L,nr_words<=5L) %>% - mutate(nr_words=as_factor(nr_words),pos=as_factor(pos)) %>% + filter(nr_words >= 2L, nr_words <= 5L) %>% + mutate(nr_words = as_factor(nr_words), pos = as_factor(pos)) %>% uncount(n) %>% - ggplot(aes(x=nr_characters,y=nr_words,fill=pos)) + - stat_binline(binwidth=1) + - facet_grid(collection~pos,labeller = labeller(pos=label_both)) + + ggplot(aes(x = nr_characters, y = nr_words, fill = pos)) + + stat_binline(binwidth = 1) + + facet_grid(collection ~ pos, labeller = labeller(pos = label_both)) + xlab("Number of characters in word") + ylab("Number of words in verse") + labs( - title="Number of characters in words by their position", - subtitle="According to length of verse and collection" - ) + - guides(fill="none") + - theme_hsci_discrete(base_family="Arial") + title = "Number of characters in words by their position", + subtitle = "According to length of verse and collection" + ) + + guides(fill = "none") + + theme_hsci_discrete(base_family = "Arial") ``` ```{r} d %>% - group_by(collection,nr_words,pos) %>% - mutate(prop=n/sum(n)) %>% + group_by(collection, nr_words, pos) %>% + mutate(prop = n / sum(n)) %>% ungroup() %>% - filter(nr_words>=2L,nr_words<=5L) %>% - mutate(nr_words=as_factor(nr_words),pos=as_factor(pos)) %>% + filter(nr_words >= 2L, nr_words <= 5L) %>% + mutate(nr_words = as_factor(nr_words), pos = as_factor(pos)) %>% uncount(n) %>% - ggplot(aes(x=nr_characters,y=pos,fill=nr_words)) + - stat_binline(binwidth=1) + - facet_grid(collection~nr_words,labeller = labeller(nr_words=label_both)) + + ggplot(aes(x = nr_characters, y = pos, fill = nr_words)) + + stat_binline(binwidth = 1) + + facet_grid(collection ~ nr_words, labeller = labeller(nr_words = label_both)) + xlab("Number of characters in word") + ylab("Position") + labs( - title="Number of characters in words by their position", - subtitle="According to length of verse and collection" - ) + - guides(fill="none") + - theme_hsci_discrete(base_family="Arial") + title = "Number of characters in words by their position", + subtitle = "According to length of verse and collection" + ) + + guides(fill = "none") + + theme_hsci_discrete(base_family = "Arial") ``` diff --git a/docs/overview.html b/docs/overview.html index 987ec10..35b555d 100644 --- a/docs/overview.html +++ b/docs/overview.html @@ -491,13 +491,13 @@

2024-04-18

Temporal overview

-
p_year %>% 
+
p_year %>%
   inner_join(poems,by=c("p_id")) %>%
   count(collection,year) %>%
   mutate(measure="yearly count") %>%
   union_all(
     p_year %>% # 10 year rolling mean
-      distinct(year) %>% 
+      distinct(year) %>%
       left_join(p_year %>% distinct(year),sql_on="RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>%
       inner_join(p_year,by=c("year.y"="year")) %>%
       inner_join(poems,by=c("p_id")) %>%
@@ -522,15 +522,15 @@ 

Temporal overview

geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=5000) + geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=5000, show.legend=FALSE) + geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + + geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + theme_hsci_discrete(base_family="Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), + legend.justification=c(0,1), + legend.position=c(0.02, 0.98), + legend.background = element_blank(), legend.key=element_blank() - ) + + ) + labs(color=NULL) + coord_cartesian(ylim=c(0,4600),xlim=c(1800,1970),clip="off") + scale_y_continuous(breaks=seq(0,20000,by=1000),labels=scales::number) + @@ -553,15 +553,15 @@

Temporal overview

count(collection, ancestor_t_id) %>% group_by(collection) %>% slice_max(n,n=9) %>% - ungroup() %>% + ungroup() %>% mutate(top_type=TRUE) %>% select(ancestor_t_id,top_type) %>% compute_a(temporary=TRUE, overwrite=TRUE)
-
d <- p_year %>% 
+
d <- p_year %>%
   inner_join(poems,by=c("p_id")) %>%
   left_join(p_typ %>%
               filter(!is_minor) %>%
-    inner_join(types_to_top_level_types %>% 
+    inner_join(types_to_top_level_types %>%
                  inner_join(types %>%
                               filter(!str_detect(type_orig_id,"^erab_orig")) %>%
                         select(ancestor_t_id=t_id,ancestor_type_name=name))) %>%
@@ -577,14 +577,14 @@ 

Temporal overview

mutate(measure="yearly count") %>% union_all( p_year %>% # 10 year rolling mean - distinct(year) %>% + distinct(year) %>% left_join(p_year %>% distinct(year),sql_on="RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>% inner_join(p_year,by=c("year.y"="year")) %>% inner_join(poems,by=c("p_id")) %>% left_join(p_typ %>% - inner_join(types_to_top_level_types %>% - inner_join(types %>% - filter(!str_detect(type_orig_id,"^erab_orig")) %>% + inner_join(types_to_top_level_types %>% + inner_join(types %>% + filter(!str_detect(type_orig_id,"^erab_orig")) %>% select(ancestor_t_id=t_id,ancestor_type_name=name))) %>% left_join(top_top_types) %>% mutate( @@ -618,15 +618,15 @@

Temporal overview

geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=1400) + geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=1400, show.legend=FALSE) + geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + + geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + theme_hsci_discrete(base_family="Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), + legend.justification=c(0,1), + legend.position=c(0.02, 0.98), + legend.background = element_blank(), legend.key=element_blank() - ) + + ) + labs(color=NULL) + coord_cartesian(ylim=c(0,1300),xlim=c(1800,1940),clip="off") + scale_y_continuous(breaks=seq(0,20000,by=500),labels=scales::number) + @@ -671,15 +671,15 @@

Temporal overview

geom_point(data=. %>% filter(youtlier==TRUE),aes(x=year),y=youtlier_limit+100) + geom_text_repel(data=. %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=youtlier_limit+100, show.legend=FALSE) + geom_point(data=. %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n)) + - geom_text_repel(data=. %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + + geom_text_repel(data=. %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + geom_line(data=. %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + theme_hsci_discrete(base_family="Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), + legend.justification=c(0,1), + legend.position=c(0.02, 0.98), + legend.background = element_blank(), legend.key=element_blank() - ) + + ) + labs(color=NULL) + coord_cartesian(ylim=c(0,youtlier_limit),xlim=c(1800,1950),clip="off") + scale_y_continuous(breaks=seq(0,20000,by=200),labels=scales::number) + @@ -708,15 +708,15 @@

Temporal overview

geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=7200) + geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=7200, show.legend=FALSE) + geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + + geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + theme_hsci_discrete(base_family="Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), + legend.justification=c(0,1), + legend.position=c(0.02, 0.98), + legend.background = element_blank(), legend.key=element_blank() - ) + + ) + labs(color=NULL) + coord_cartesian(ylim=c(0,6500),xlim=c(1800,1960),clip="off") + scale_y_continuous(breaks=seq(0,20000,by=1000),labels=scales::number) + @@ -729,9 +729,9 @@

Temporal overview

#  ggtitle("Runojen määrä vuosittain ja kokoelmittain")
 #  ggtitle("Number of poems by year and collection")
-
p_year %>% 
-  filter(year %in% c(0,9999)) %>% 
-  left_join(poems) %>% 
+
p_year %>%
+  filter(year %in% c(0,9999)) %>%
+  left_join(poems) %>%
   count(collection,year) %>%
   ungroup() %>%
   gt() %>%
@@ -1131,7 +1131,7 @@ 

Temporal overview

Abnormal years - + collection year @@ -1146,17 +1146,17 @@

Temporal overview

0 6,670 - - + +

Overview of collectors

-
poems %>% 
+
poems %>%
   distinct(collection) %>%
   pull() %>%
-  map(~p_col %>% 
+  map(~p_col %>%
     inner_join(poems %>% filter(collection==.x),by=c("p_id")) %>%
     count(col_id) %>%
     left_join(collectors,by=c("col_id")) %>%
@@ -1181,16 +1181,16 @@ 

Overview of collectors

## ! 1 unknown level in `f`: Other
## [[1]]

-
## 
+
##
 ## [[2]]

-
## 
+
##
 ## [[3]]

-
## 
+
##
 ## [[4]]

-
p_col %>% 
+
p_col %>%
   anti_join(collectors) %>%
   count(col_id) %>%
   gt() %>%
@@ -1590,7 +1590,7 @@ 

Overview of collectors

Collectors without a name - + col_id n @@ -1602,8 +1602,8 @@

Overview of collectors

7650 1 - - + +
p_col %>%
@@ -2010,7 +2010,7 @@ 

Overview of collectors

- + @@ -2087,8 +2087,8 @@

Overview of collectors

- - + +
n
Lönnrot, Elias 1 239
p_col %>%
@@ -2107,7 +2107,7 @@ 

Overview of collectors

inner_join(types %>% rename(type_name=name)) %>% inner_join(collectors) %>% filter(collection=="SKVR") %>% - collect() %>% + collect() %>% # group_by(collection) %>% # mutate(type_name=fct_lump_n(type_name, n, n=5, other_level="Muut")) %>% # ungroup() %>% @@ -2128,26 +2128,26 @@

Overview of collectors

Geographical overview

-
d <- p_pl %>% 
+
d <- p_pl %>%
   inner_join(poems %>% filter(collection!="literary")) %>%
-  count(collection, pl_id) %>% 
+  count(collection, pl_id) %>%
   inner_join(places) %>%
   select(collection, pl_id, name,n) %>%
   collect()
 
-poems_without_location <- poems %>% 
+poems_without_location <- poems %>%
   filter(collection!="literary") %>%
-  anti_join(p_pl, join_by(p_id)) %>% 
-  count() %>% 
+  anti_join(p_pl, join_by(p_id)) %>%
+  count() %>%
   pull()
 
 unprojected_locations <- d %>%
   anti_join(polygons, join_by(pl_id)) %>%
   add_row(name=NA,n=poems_without_location)
 
-best_polygons <- polygons %>% 
-  filter(is_primary==T) %>% 
-  group_by(pl_id) %>% 
+best_polygons <- polygons %>%
+  filter(is_primary==T) %>%
+  group_by(pl_id) %>%
   filter(map_id==min(map_id)) %>%
   ungroup()
tm_shape(polygons %>%
@@ -2161,19 +2161,19 @@ 

Geographical overview

library(SpatialKDE)
 md <- best_polygons %>%
-    inner_join(d, join_by(pl_id)) 
+    inner_join(d, join_by(pl_id))
 rp <- md %>%
   group_by(collection) %>%
   group_map(~st_sample(.x,size=.x %>% pull(n), progress=TRUE) %>% st_sf())
 names(rp) = md %>%
-  group_by(collection) %>% 
+  group_by(collection) %>%
   group_keys() %>% pull()
library(mappp)
 cell_size <- 15000
 band_width <- 15000
 grid <- md %>% create_grid_hexagonal(cell_size = cell_size, side_offset = band_width)
 kde2 <- rp %>% mappp(~kde(.x, band_width = band_width,kernel = "quartic", grid = grid), parallel=TRUE)
-
## 
+
##
   |                                          |   0%, ETA NA
kde2 %>% imap(~
 tm_shape(.x %>%
@@ -2190,10 +2190,10 @@ 

Geographical overview

## all geometries
## $erab

-
## 
+
##
 ## $jr

-
## 
+
##
 ## $skvr

@@ -2597,7 +2597,7 @@

Poem locations not mapped

Poem locations not mapped - + collection pl_id @@ -2699,45 +2699,45 @@

Poem locations not mapped

Pohjois-Varanki 1 - - + +

Geographical overview by collection

-
d <- p_pl %>% 
+
d <- p_pl %>%
   inner_join(poems %>% filter(collection!="literary")) %>%
-  count(collection, pl_id) %>% 
+  count(collection, pl_id) %>%
   inner_join(places) %>%
   select(collection, pl_id, name,n) %>%
   collect()
 
-poems_without_location <- poems %>% 
+poems_without_location <- poems %>%
   filter(collection!="literary") %>%
-  anti_join(p_pl, join_by(p_id)) %>% 
-  count() %>% 
+  anti_join(p_pl, join_by(p_id)) %>%
+  count() %>%
   pull()
 
 unprojected_locations <- d %>%
   anti_join(polygons, join_by(pl_id)) %>%
   add_row(name=NA,n=poems_without_location)
 
-best_polygons <- polygons %>% 
-  filter(is_primary==T) %>% 
-  group_by(pl_id) %>% 
+best_polygons <- polygons %>%
+  filter(is_primary==T) %>%
+  group_by(pl_id) %>%
   filter(map_id==min(map_id)) %>%
   ungroup()
-
poems %>% 
+
poems %>%
   distinct(collection) %>%
   pull() %>%
   map(~
     tm_shape(
       polygons %>%
         left_join(
-          p_pl %>% 
+          p_pl %>%
             inner_join(poems %>% filter(collection==.x),by=c("p_id")) %>%
-            count(pl_id) %>% 
+            count(pl_id) %>%
             inner_join(places) %>%
             select(pl_id,n) %>%
             collect()
@@ -2748,19 +2748,19 @@ 

Geographical overview by collection

)
## [[1]]

-
## 
+
##
 ## [[2]]

-
## 
+
##
 ## [[3]]

-
## 
+
##
 ## [[4]]

Poem locations not mapped by collection

-
poems %>% 
+
poems %>%
   distinct(collection) %>%
   pull() %>%
   map(~
@@ -4670,12 +4670,12 @@ 

Poem locations not mapped by collection

Informants

-
raw_meta %>% 
+
raw_meta %>%
   filter(field=="INF") %>%
-  mutate(value_c=str_replace_all(value,"^\\s*[A-Za-zÅÄÖåäö][a-zåäö][a-zåäö]+\\.","")) %>% 
-  mutate(name=str_replace_all(value_c,"\\s*\"*([A-Za-zÅÄÖåäö]?[a-zåäö]?[a-zåäö]?\\.?[^;.,]+)(.|\\n)*","\\1")) %>% 
-  group_by(name) %>% 
-  summarise(origs=str_flatten(sql("distinct value"),collapse="|"),n=n(),.groups="drop") %>% 
+  mutate(value_c=str_replace_all(value,"^\\s*[A-Za-zÅÄÖåäö][a-zåäö][a-zåäö]+\\.","")) %>%
+  mutate(name=str_replace_all(value_c,"\\s*\"*([A-Za-zÅÄÖåäö]?[a-zåäö]?[a-zåäö]?\\.?[^;.,]+)(.|\\n)*","\\1")) %>%
+  group_by(name) %>%
+  summarise(origs=str_flatten(sql("distinct value"),collapse="|"),n=n(),.groups="drop") %>%
   collect() %>%
   arrange(desc(n))
## # A tibble: 15,276 × 3
@@ -4695,20 +4695,20 @@ 

Informants

Poem types

-
poems %>% 
+
poems %>%
   filter(collection!="literary") %>%
-  left_join(p_typ %>% filter(is_minor==0) %>% inner_join(types %>% mutate(type_type=if_else(str_detect(t_id,"^erab_orig"),"Non-unified","Unified")))) %>% 
+  left_join(p_typ %>% filter(is_minor==0) %>% inner_join(types %>% mutate(type_type=if_else(str_detect(t_id,"^erab_orig"),"Non-unified","Unified")))) %>%
   group_by(collection,p_id) %>%
   summarise(type_type=case_when(
-    any(type_type=="Unified") ~ "Systematisoituja", 
-    any(type_type=="Non-unified") ~ "Vain ei-systematisoituja", 
+    any(type_type=="Unified") ~ "Systematisoituja",
+    any(type_type=="Non-unified") ~ "Vain ei-systematisoituja",
     T ~ "Ei annotointeja"), .groups="drop") %>%
   count(collection,type_type) %>%
   collect() %>%
   mutate(collection=fct_rev(fct_relevel(str_to_upper(collection),"ERAB","SKVR","JR")),type_type=fct_rev(fct_relevel(type_type,"Systematisoituja","Vain ei-systematisoituja","Ei annotointeja"))) %>%
-  ggplot(aes(x=collection,y=n,fill=type_type)) + 
-  geom_col() + 
-  theme_hsci_discrete() + 
+  ggplot(aes(x=collection,y=n,fill=type_type)) +
+  geom_col() +
+  theme_hsci_discrete() +
   xlab("Kokoelma") +
   ylab("Runoja") +
   labs(fill="Runotyyppiannotaatiot") +
@@ -4728,17 +4728,17 @@ 

Poem types

Spatiotemporal overview

d <- poems %>%
-  left_join(p_year %>% mutate(year=if_else(year %in% c(0L,9999L),NA,year))) %>% 
+  left_join(p_year %>% mutate(year=if_else(year %in% c(0L,9999L),NA,year))) %>%
   collect() %>%
   mutate(year_ntile=ntile(year,11)) %>%
   group_by(year_ntile) %>%
   mutate(years=str_c(min(year),"-",max(year))) %>%
   ungroup() %>%
-  left_join(p_pl %>% collect()) %>% 
-  count(years,pl_id) %>% 
+  left_join(p_pl %>% collect()) %>%
+  count(years,pl_id) %>%
   ungroup() %>%
   left_join(places %>% select(pl_id,name) %>% collect())
-
best_polygons %>% 
+
best_polygons %>%
   left_join(d %>% complete(pl_id,years), join_by(pl_id)) %>%
   tm_shape() +
   tm_polygons(col='n', id='name', style='fisher', palette='plasma') +
@@ -5187,7 +5187,7 @@ 

Poems with more than 75 verse lines

Poems with more than 75 verse lines - + collection lines @@ -5208,15 +5208,15 @@

Poems with more than 75 verse lines

395 3.75% - - + +

By county

-
poem_stats %>% 
+
poem_stats %>%
   left_join(p_pl %>% inner_join(places_to_counties) %>% inner_join(places, join_by(county_id==pl_id))) %>%
   count(name,nverses) %>%
   ungroup() %>%
@@ -5243,14 +5243,14 @@ 

Poem verse statistics

By collection

Line types

-
d <- verses %>% 
-  left_join(verse_poem) %>% 
-  left_join(poems) %>% 
-  count(collection,type) %>% 
+
d <- verses %>%
+  left_join(verse_poem) %>%
+  left_join(poems) %>%
+  count(collection,type) %>%
   ungroup() %>%
   arrange(collection,desc(n)) %>%
   collect()
-
d %>% 
+
d %>%
   group_by(collection) %>%
   mutate(proportion=n/sum(n)) %>%
   gt() %>%
@@ -5647,7 +5647,7 @@ 

Line types

- + @@ -5740,8 +5740,8 @@

Line types

- - + +
type n 2,727 1.28%
@@ -5749,9 +5749,9 @@

Line types

Verse line lengths

d_nr_characters <- verses_cl %>%
   mutate(nr_characters=str_length(text)) %>%
-  left_join(verse_poem) %>% 
-  left_join(poems) %>% 
-  count(collection,nr_characters) %>% 
+  left_join(verse_poem) %>%
+  left_join(poems) %>%
+  count(collection,nr_characters) %>%
   ungroup() %>%
   arrange(collection,desc(n)) %>%
   collect()
@@ -5760,14 +5760,14 @@ 

Verse line lengths

group_by(v_id) %>% summarise(nr_words=max(pos),.groups="drop") %>% left_join(verse_poem) %>% - left_join(poems) %>% - count(collection,nr_words) %>% + left_join(poems) %>% + count(collection,nr_words) %>% ungroup() %>% arrange(collection,desc(n)) %>% collect()

Verse line lengths in characters

-
d_nr_characters %>% 
+
d_nr_characters %>%
   filter(nr_characters<=60) %>%
   ggplot(aes(x=nr_characters,y=n)) +
   geom_col(width=1) +
@@ -5778,7 +5778,7 @@ 

Verse line lengths in characters

ylab("Verses") + labs(title="Number of characters in verse lines")

-
d_nr_characters %>% 
+
d_nr_characters %>%
   group_by(collection) %>%
   mutate(prop=n/sum(n)) %>%
   ungroup() %>%
@@ -5794,7 +5794,7 @@ 

Verse line lengths in characters

Verse lines with more than 60 characters

-
d_nr_characters %>% 
+
d_nr_characters %>%
   mutate(nl=if_else(nr_characters>60,n,0L)) %>%
   group_by(collection) %>%
   summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups="drop") %>%
@@ -6197,7 +6197,7 @@ 

Verse lines with more than 60 characters

Verse lines with more than 60 characters - + collection lines @@ -6218,14 +6218,14 @@

Verse lines with more than 60 characters

350 0.02% - - + +

Verse line lengths in words

-
d_nr_words %>% 
+
d_nr_words %>%
   filter(nr_words<=10) %>%
   ggplot(aes(x=nr_words,y=n)) +
   geom_col(width=1) +
@@ -6237,7 +6237,7 @@ 

Verse line lengths in words

ylab("Verses") + labs(title="Number of words in verse lines")

-
d_nr_words %>% 
+
d_nr_words %>%
   filter(nr_words<=10) %>%
   uncount(n) %>%
   ggplot(aes(x=nr_words,y=collection,fill=collection)) +
@@ -6252,7 +6252,7 @@ 

Verse line lengths in words

Verse lines with more than 10 words

-
d_nr_words %>% 
+
d_nr_words %>%
   mutate(nl=if_else(nr_words>10,n,0L)) %>%
   group_by(collection) %>%
   summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups="drop") %>%
@@ -6655,7 +6655,7 @@ 

Verse lines with more than 10 words

Verse lines with more than 10 words - + collection lines @@ -6676,11 +6676,11 @@

Verse lines with more than 10 words

306 0.02% - - + +
-
verse_nr_words <- word_occ %>% 
+
verse_nr_words <- word_occ %>%
   group_by(v_id) %>%
   summarise(nr_words=max(pos)) %>%
   compute_a(unique_indexes=list(c("v_id","nr_words")))
@@ -6693,8 +6693,8 @@ 

Verse lines with more than 10 words

d <- word_occ %>% left_join(word_nr_characters) %>% left_join(verse_nr_words) %>% - left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% - left_join(poems) %>% + left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% + left_join(poems) %>% count(collection,nr_words,pos,nr_characters) %>% collect()
@@ -6706,9 +6706,9 @@

By county

Verse line lengths

d_nr_characters <- verses_cl %>%
   mutate(nr_characters=str_length(text)) %>%
-  left_join(verse_poem) %>% 
+  left_join(verse_poem) %>%
   left_join(p_pl %>% inner_join(places_to_counties) %>% inner_join(places, join_by(county_id==pl_id))) %>%
-  count(name,nr_characters) %>% 
+  count(name,nr_characters) %>%
   ungroup() %>%
   arrange(name,desc(n)) %>%
   collect()
@@ -6718,7 +6718,7 @@ 

Verse line lengths

summarise(nr_words=max(pos),.groups="drop") %>% left_join(verse_poem) %>% left_join(p_pl %>% inner_join(places_to_counties) %>% inner_join(places, join_by(county_id==pl_id))) %>% - count(name,nr_words) %>% + count(name,nr_words) %>% ungroup() %>% arrange(name,desc(n)) %>% collect()
@@ -6727,7 +6727,7 @@

Verse line lengths

## This warning is displayed once every 8 hours.

Verse line lengths in characters

-
d_nr_characters %>% 
+
d_nr_characters %>%
   group_by(name) %>%
   mutate(prop=n/sum(n)) %>%
   ungroup() %>%
@@ -6744,7 +6744,7 @@ 

Verse line lengths in characters

Verse line lengths in words

-
d_nr_words %>% 
+
d_nr_words %>%
   filter(nr_words<8,name!="Ahvenanmaa") %>%
   mutate(name=fct_reorder(name,n,.fun=max)) %>%
   uncount(n) %>%
@@ -6762,7 +6762,7 @@ 

Verse line lengths in words

Number of characters in words by their position

-
verse_nr_words <- word_occ %>% 
+
verse_nr_words <- word_occ %>%
   group_by(v_id) %>%
   summarise(nr_words=max(pos)) %>%
   compute_a(unique_indexes=list(c("v_id","nr_words")))
@@ -6777,8 +6777,8 @@

Number of characters in words by their position

d <- word_occ %>% left_join(word_nr_characters) %>% left_join(verse_nr_words) %>% - left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% - left_join(poems) %>% + left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% + left_join(poems) %>% count(collection,nr_words,pos,nr_characters) %>% collect()
d %>%
@@ -6790,7 +6790,7 @@ 

Number of characters in words by their position

uncount(n) %>% ggplot(aes(x=nr_characters,y=nr_words,fill=pos)) + stat_binline(binwidth=1) + - facet_grid(collection~pos,labeller = labeller(pos=label_both)) + + facet_grid(collection~pos,labeller = labeller(pos=label_both)) + xlab("Number of characters in word") + ylab("Number of words in verse") + labs( @@ -6809,7 +6809,7 @@

Number of characters in words by their position

uncount(n) %>% ggplot(aes(x=nr_characters,y=pos,fill=nr_words)) + stat_binline(binwidth=1) + - facet_grid(collection~nr_words,labeller = labeller(nr_words=label_both)) + + facet_grid(collection~nr_words,labeller = labeller(nr_words=label_both)) + xlab("Number of characters in word") + ylab("Position") + labs( diff --git a/docs/overview.nb.html b/docs/overview.nb.html index 00bcde0..037bd06 100644 --- a/docs/overview.nb.html +++ b/docs/overview.nb.html @@ -1807,13 +1807,13 @@

Temporal overview

-
p_year %>% 
+
p_year %>%
   inner_join(poems,by=c("p_id")) %>%
   count(collection,year) %>%
   mutate(measure="yearly count") %>%
   union_all(
     p_year %>% # 10 year rolling mean
-      distinct(year) %>% 
+      distinct(year) %>%
       left_join(p_year %>% distinct(year),sql_on="RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>%
       inner_join(p_year,by=c("year.y"="year")) %>%
       inner_join(poems,by=c("p_id")) %>%
@@ -1838,15 +1838,15 @@ 

Temporal overview

geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=5000) + geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=5000, show.legend=FALSE) + geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + + geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + theme_hsci_discrete(base_family="Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), + legend.justification=c(0,1), + legend.position=c(0.02, 0.98), + legend.background = element_blank(), legend.key=element_blank() - ) + + ) + labs(color=NULL) + coord_cartesian(ylim=c(0,4600),xlim=c(1800,1970),clip="off") + scale_y_continuous(breaks=seq(0,20000,by=1000),labels=scales::number) + @@ -1869,13 +1869,13 @@

Temporal overview

-
top_top_themes <- poem_theme %>% 
+
top_top_themes <- poem_theme %>%
   inner_join(poems) %>%
   inner_join(themes_to_top_level_themes) %>%
   count(collection, ancestor_t_id) %>%
   group_by(collection) %>%
   slice_max(n,n=9) %>%
-  ungroup() %>% 
+  ungroup() %>%
   mutate(top_theme=TRUE) %>%
   select(ancestor_t_id,top_theme) %>%
   compute_a(temporary=TRUE, overwrite=TRUE)
@@ -1885,10 +1885,10 @@

Temporal overview

-
d <- p_year %>% 
+
d <- p_year %>%
   inner_join(poems,by=c("p_id")) %>%
   inner_join(poem_theme %>%
-    inner_join(themes_to_top_level_themes %>% 
+    inner_join(themes_to_top_level_themes %>%
                  inner_join(themes %>%
                               filter(!str_detect(theme_id,"^erab_orig")) %>%
                         select(ancestor_t_id=t_id,ancestor_theme_name=name)))) %>%
@@ -1903,14 +1903,14 @@ 

Temporal overview

mutate(measure="yearly count") %>% union_all( p_year %>% # 10 year rolling mean - distinct(year) %>% + distinct(year) %>% left_join(p_year %>% distinct(year),sql_on="RHS.year BETWEEN LHS.year-5 AND LHS.year+4") %>% inner_join(p_year,by=c("year.y"="year")) %>% inner_join(poems,by=c("p_id")) %>% inner_join(poem_theme %>% - inner_join(themes_to_top_level_themes %>% - inner_join(themes %>% - filter(!str_detect(theme_id,"^erab_orig")) %>% + inner_join(themes_to_top_level_themes %>% + inner_join(themes %>% + filter(!str_detect(theme_id,"^erab_orig")) %>% select(ancestor_t_id=t_id,ancestor_theme_name=name)))) %>% left_join(top_top_themes) %>% mutate( @@ -1953,15 +1953,15 @@

Temporal overview

geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=1400) + geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=1400, show.legend=FALSE) + geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + + geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + theme_hsci_discrete(base_family="Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), + legend.justification=c(0,1), + legend.position=c(0.02, 0.98), + legend.background = element_blank(), legend.key=element_blank() - ) + + ) + labs(color=NULL) + coord_cartesian(ylim=c(0,1300),xlim=c(1800,1940),clip="off") + scale_y_continuous(breaks=seq(0,20000,by=500),labels=scales::number) + @@ -2021,16 +2021,16 @@

Temporal overview

geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=900) + geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=900, show.legend=FALSE) + geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + + geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=1785,y=n,label=scales::number(n)), show.legend=FALSE) + geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + theme_hsci_discrete(base_family="Arial") + theme( legend.box.just = "top", - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), + legend.justification=c(0,1), + legend.position=c(0.02, 0.98), + legend.background = element_blank(), legend.key=element_blank() - ) + + ) + labs(color=NULL) + coord_cartesian(ylim=c(0,820),xlim=c(1820,1950),clip="off") + scale_y_continuous(breaks=seq(0,20000,by=200),labels=scales::number) + @@ -2069,15 +2069,15 @@

Temporal overview

geom_point(data=~.x %>% filter(youtlier==TRUE),aes(x=year),y=7200) + geom_text_repel(data=~.x %>% filter(youtlier==TRUE),aes(x=year,label=scales::number(n)),y=7200, show.legend=FALSE) + geom_point(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n)) + - geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + + geom_text_repel(data=~.x %>% filter(xoutlier==TRUE,measure=="yearly count"),aes(x=year,y=n,label=scales::number(n)), show.legend=FALSE) + geom_line(data=~.x %>% filter(xoutlier==FALSE,measure=="10 year rolling mean")) + - theme_hsci_discrete(base_family="Arial") + + theme_hsci_discrete(base_family="Arial") + theme( - legend.justification=c(0,1), - legend.position=c(0.02, 0.98), - legend.background = element_blank(), + legend.justification=c(0,1), + legend.position=c(0.02, 0.98), + legend.background = element_blank(), legend.key=element_blank() - ) + + ) + labs(color=NULL) + coord_cartesian(ylim=c(0,6500),xlim=c(1800,1960),clip="off") + scale_y_continuous(breaks=seq(0,20000,by=1000),labels=scales::number) + @@ -2100,9 +2100,9 @@

Temporal overview

-
p_year %>% 
-  filter(year %in% c(0,9999)) %>% 
-  left_join(poems) %>% 
+
p_year %>%
+  filter(year %in% c(0,9999)) %>%
+  left_join(poems) %>%
   count(collection,year) %>%
   ungroup() %>%
   gt() %>%
@@ -2117,10 +2117,10 @@ 

Overview of collectors

-
poems %>% 
+
poems %>%
   distinct(collection) %>%
   pull() %>%
-  map(~p_col %>% 
+  map(~p_col %>%
     inner_join(poems %>% filter(collection==.x),by=c(\p_id\)) %>%
     count(col_id) %>%
     left_join(collectors,by=c(\col_id\)) %>%
@@ -2172,7 +2172,7 @@ 

Overview of collectors

-
p_col %>% 
+
p_col %>%
   anti_join(collectors) %>%
   count(col_id) %>%
   gt() %>%
@@ -2592,7 +2592,7 @@ 

Overview of collectors

- + @@ -2669,8 +2669,8 @@

Overview of collectors

- - + +
n
Lönnrot, Elias 1 239
@@ -2698,7 +2698,7 @@

Overview of collectors

inner_join(themes %>% rename(theme_name=name)) %>% inner_join(collectors) %>% filter(collection=="SKVR") %>% - collect() %>% + collect() %>% # group_by(collection) %>% # mutate(theme_name=fct_lump_n(theme_name, n, n=5, other_level="Muut")) %>% # ungroup() %>% @@ -2728,15 +2728,15 @@

Geographical overview

-
d <- p_loc %>% 
-  count(loc_id) %>% 
+
d <- p_loc %>%
+  count(loc_id) %>%
   inner_join(locations) %>%
   select(name,n) %>%
   collect()
 
-poems_without_location <- poems %>% 
-  anti_join(p_loc) %>% 
-  count() %>% 
+poems_without_location <- poems %>%
+  anti_join(p_loc) %>%
+  count() %>%
   pull()
 
 unprojected_locations <- d %>%
@@ -3701,17 +3701,17 @@ 

Geographical overview by collection

-
d <- p_loc %>% 
+
d <- p_loc %>%
   left_join(poems) %>%
-  count(collection,loc_id) %>% 
+  count(collection,loc_id) %>%
   ungroup() %>%
   inner_join(locations) %>%
   select(collection,name,n) %>%
   collect()
 
-poems_without_location <- poems %>% 
-  anti_join(p_loc) %>% 
-  count(collection) %>% 
+poems_without_location <- poems %>%
+  anti_join(p_loc) %>%
+  count(collection) %>%
   collect() %>%
   mutate(name=NA_character_)
 
@@ -3724,16 +3724,16 @@ 

Geographical overview by collection

-
poems %>% 
+
poems %>%
   distinct(collection) %>%
   pull() %>%
   map(~
     tm_shape(
       polygons %>%
         left_join(
-          p_loc %>% 
+          p_loc %>%
             inner_join(poems %>% filter(collection==.x),by=c(\p_id\)) %>%
-            count(loc_id) %>% 
+            count(loc_id) %>%
             inner_join(locations) %>%
             select(name,n) %>%
             collect()
@@ -3778,7 +3778,7 @@ 

Poem locations not mapped by collection

-
poems %>% 
+
poems %>%
   distinct(collection) %>%
   pull() %>%
   map(~
@@ -3800,12 +3800,12 @@ 

Informants

-
raw_meta %>% 
+
raw_meta %>%
   filter(field=="INF") %>%
-  mutate(value_c=str_replace_all(value,"^\\s*[A-Za-zÅÄÖåäö][a-zåäö][a-zåäö]+\\.","")) %>% 
-  mutate(name=str_replace_all(value_c,"\\s*\"*([A-Za-zÅÄÖåäö]?[a-zåäö]?[a-zåäö]?\\.?[^;.,]+)(.|\\n)*","\\1")) %>% 
-  group_by(name) %>% 
-  summarise(origs=str_flatten(sql("distinct value"),collapse="|"),n=n(),.groups="drop") %>% 
+  mutate(value_c=str_replace_all(value,"^\\s*[A-Za-zÅÄÖåäö][a-zåäö][a-zåäö]+\\.","")) %>%
+  mutate(name=str_replace_all(value_c,"\\s*\"*([A-Za-zÅÄÖåäö]?[a-zåäö]?[a-zåäö]?\\.?[^;.,]+)(.|\\n)*","\\1")) %>%
+  group_by(name) %>%
+  summarise(origs=str_flatten(sql("distinct value"),collapse="|"),n=n(),.groups="drop") %>%
   collect()
@@ -3816,20 +3816,20 @@

Poem types

-
poems %>% 
+
poems %>%
   filter(collection!="literary") %>%
-  left_join(poem_theme %>% filter(is_minor==0) %>% inner_join(themes %>% mutate(theme_type=if_else(str_detect(theme_id,"^erab_orig"),"Non-unified","Unified")))) %>% 
+  left_join(poem_theme %>% filter(is_minor==0) %>% inner_join(themes %>% mutate(theme_type=if_else(str_detect(theme_id,"^erab_orig"),"Non-unified","Unified")))) %>%
   group_by(collection,p_id) %>%
   summarise(theme_type=case_when(
-    any(theme_type=="Unified") ~ "Systematisoituja", 
-    any(theme_type=="Non-unified") ~ "Vain ei-systematisoituja", 
+    any(theme_type=="Unified") ~ "Systematisoituja",
+    any(theme_type=="Non-unified") ~ "Vain ei-systematisoituja",
     T ~ "Ei annotointeja"), .groups="drop") %>%
   count(collection,theme_type) %>%
   collect() %>%
   mutate(collection=fct_rev(fct_relevel(str_to_upper(collection),"ERAB","SKVR","JR")),theme_type=fct_rev(fct_relevel(theme_type,"Systematisoituja","Vain ei-systematisoituja","Ei annotointeja"))) %>%
-  ggplot(aes(x=collection,y=n,fill=theme_type)) + 
-  geom_col() + 
-  theme_hsci_discrete() + 
+  ggplot(aes(x=collection,y=n,fill=theme_type)) +
+  geom_col() +
+  theme_hsci_discrete() +
   xlab("Kokoelma") +
   ylab("Runoja") +
   labs(fill="Runotyyppiannotaatiot") +
@@ -3850,14 +3850,14 @@ 

Spatiotemporal overview

d <- poems %>%
-  left_join(p_year %>% mutate(year=if_else(year %in% c(0L,9999L),NA,year))) %>% 
+  left_join(p_year %>% mutate(year=if_else(year %in% c(0L,9999L),NA,year))) %>%
   collect() %>%
   mutate(year_ntile=ntile(year,11)) %>%
   group_by(year_ntile) %>%
   mutate(years=str_c(min(year),\-\,max(year))) %>%
   ungroup() %>%
-  left_join(p_loc %>% collect()) %>% 
-  count(years,loc_id) %>% 
+  left_join(p_loc %>% collect()) %>%
+  count(years,loc_id) %>%
   ungroup() %>%
   left_join(locations %>% select(loc_id,name) %>% collect())
@@ -3866,7 +3866,7 @@

Spatiotemporal overview

-
polygons %>% 
+
polygons %>%
   left_join(d %>% complete(name,years)) %>%
   tm_shape() +
   tm_polygons(col='n', id='name', style='fisher', palette='plasma') +
@@ -4314,7 +4314,7 @@ 

Poems with more than 75 verse lines

Poems with more than 75 verses - + @@ -4337,8 +4337,8 @@

Poems with more than 75 verse lines

114 15.24% - - + + @@ -4354,10 +4354,10 @@

By county

-
poem_stats %>% 
-  left_join(p_loc) %>% 
-  left_join(locations) %>% 
-  left_join(locations,by=c("par_id"="loc_id")) %>% 
+
poem_stats %>%
+  left_join(p_loc) %>%
+  left_join(locations) %>%
+  left_join(locations,by=c("par_id"="loc_id")) %>%
   mutate(name=if_else(type.x=="county",name.x,name.y)) %>%
   count(name,nverses) %>%
   ungroup() %>%
@@ -4392,10 +4392,10 @@ 

Line types

-
d <- verses %>% 
-  left_join(verse_poem) %>% 
-  left_join(poems) %>% 
-  count(collection,type) %>% 
+
d <- verses %>%
+  left_join(verse_poem) %>%
+  left_join(poems) %>%
+  count(collection,type) %>%
   ungroup() %>%
   arrange(collection,desc(n)) %>%
   collect()
@@ -4405,7 +4405,7 @@

Line types

-
d %>% 
+
d %>%
   group_by(collection) %>%
   mutate(proportion=n/sum(n)) %>%
   gt() %>%
@@ -4776,7 +4776,7 @@ 

Line types

} - + @@ -4870,8 +4870,8 @@

Line types

- - + +
type 87 0.10%
@@ -4888,9 +4888,9 @@

Verse line lengths

d_nr_characters <- verses_cl %>%
   mutate(nr_characters=str_length(text)) %>%
-  left_join(verse_poem) %>% 
-  left_join(poems) %>% 
-  count(collection,nr_characters) %>% 
+  left_join(verse_poem) %>%
+  left_join(poems) %>%
+  count(collection,nr_characters) %>%
   ungroup() %>%
   arrange(collection,desc(n)) %>%
   collect()
@@ -4899,8 +4899,8 @@ 

Verse line lengths

group_by(v_id) %>% summarise(nr_words=max(pos),.groups="drop") %>% left_join(verse_poem) %>% - left_join(poems) %>% - count(collection,nr_words) %>% + left_join(poems) %>% + count(collection,nr_words) %>% ungroup() %>% arrange(collection,desc(n)) %>% collect()
@@ -4912,7 +4912,7 @@

Verse line lengths in characters

-
d_nr_characters %>% 
+
d_nr_characters %>%
   filter(nr_characters<=60) %>%
   ggplot(aes(x=nr_characters,y=n)) +
   geom_col(width=1) +
@@ -4931,7 +4931,7 @@ 

Verse line lengths in characters

-
d_nr_characters %>% 
+
d_nr_characters %>%
   group_by(collection) %>%
   mutate(prop=n/sum(n)) %>%
   ungroup() %>%
@@ -4955,7 +4955,7 @@ 

Verse lines with more than 60 characters

-
d_nr_characters %>% 
+
d_nr_characters %>%
   mutate(nl=if_else(nr_characters>60,n,0L)) %>%
   group_by(collection) %>%
   summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups=\drop\) %>%
@@ -4973,7 +4973,7 @@ 

Verse line lengths in words

-
d_nr_words %>% 
+
d_nr_words %>%
   filter(nr_words<=10) %>%
   ggplot(aes(x=nr_words,y=n)) +
   geom_col(width=1) +
@@ -4993,7 +4993,7 @@ 

Verse line lengths in words

-
d_nr_words %>% 
+
d_nr_words %>%
   filter(nr_words<=10) %>%
   uncount(n) %>%
   ggplot(aes(x=nr_words,y=collection,fill=collection)) +
@@ -5016,7 +5016,7 @@ 

Verse lines with more than 10 words

-
d_nr_words %>% 
+
d_nr_words %>%
   mutate(nl=if_else(nr_words>10,n,0L)) %>%
   group_by(collection) %>%
   summarise(lines=sum(nl),proportion=sum(nl)/sum(n),.groups=\drop\) %>%
@@ -5031,7 +5031,7 @@ 

Verse lines with more than 10 words

-
verse_nr_words <- word_occ %>% 
+
verse_nr_words <- word_occ %>%
   group_by(v_id) %>%
   summarise(nr_words=max(pos)) %>%
   compute_a(unique_indexes=list(c(\v_id\,\nr_words\)))
@@ -5044,8 +5044,8 @@ 

Verse lines with more than 10 words

d <- word_occ %>% left_join(word_nr_characters) %>% left_join(verse_nr_words) %>% - left_join(verse_poem %>% select(-pos),by=c(\v_id\)) %>% - left_join(poems) %>% + left_join(verse_poem %>% select(-pos),by=c(\v_id\)) %>% + left_join(poems) %>% count(collection,nr_words,pos,nr_characters) %>% collect()
@@ -5063,12 +5063,12 @@

Verse line lengths

d_nr_characters <- verses_cl %>%
   mutate(nr_characters=str_length(text)) %>%
-  left_join(verse_poem) %>% 
-  left_join(p_loc) %>% 
-  left_join(locations) %>% 
-  left_join(locations,by=c("par_id"="loc_id")) %>% 
+  left_join(verse_poem) %>%
+  left_join(p_loc) %>%
+  left_join(locations) %>%
+  left_join(locations,by=c("par_id"="loc_id")) %>%
   mutate(name=if_else(type.x=="county",name.x,name.y)) %>%
-  count(name,nr_characters) %>% 
+  count(name,nr_characters) %>%
   ungroup() %>%
   arrange(name,desc(n)) %>%
   collect()
@@ -5077,11 +5077,11 @@ 

Verse line lengths

group_by(v_id) %>% summarise(nr_words=max(pos),.groups="drop") %>% left_join(verse_poem) %>% - left_join(p_loc) %>% - left_join(locations) %>% - left_join(locations,by=c("par_id"="loc_id")) %>% + left_join(p_loc) %>% + left_join(locations) %>% + left_join(locations,by=c("par_id"="loc_id")) %>% mutate(name=if_else(type.x=="county",name.x,name.y)) %>% - count(name,nr_words) %>% + count(name,nr_words) %>% ungroup() %>% arrange(name,desc(n)) %>% collect()
@@ -5093,7 +5093,7 @@

Verse line lengths in characters

-
d_nr_characters %>% 
+
d_nr_characters %>%
   group_by(name) %>%
   mutate(prop=n/sum(n)) %>%
   ungroup() %>%
@@ -5118,7 +5118,7 @@ 

Verse line lengths in words

-
d_nr_words %>% 
+
d_nr_words %>%
   filter(nr_words<8,name!="Ahvenanmaa") %>%
   mutate(name=fct_reorder(name,n,.fun=max)) %>%
   uncount(n) %>%
@@ -5144,7 +5144,7 @@ 

Number of characters in words by their position

-
verse_nr_words <- word_occ %>% 
+
verse_nr_words <- word_occ %>%
   group_by(v_id) %>%
   summarise(nr_words=max(pos)) %>%
   compute_a(unique_indexes=list(c("v_id","nr_words")))
@@ -5157,8 +5157,8 @@ 

Number of characters in words by their position

d <- word_occ %>% left_join(word_nr_characters) %>% left_join(verse_nr_words) %>% - left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% - left_join(poems) %>% + left_join(verse_poem %>% select(-pos),by=c("v_id")) %>% + left_join(poems) %>% count(collection,nr_words,pos,nr_characters) %>% collect()
@@ -5176,7 +5176,7 @@

Number of characters in words by their position

uncount(n) %>% ggplot(aes(x=nr_characters,y=nr_words,fill=pos)) + stat_binline(binwidth=1) + - facet_grid(collection~pos,labeller = labeller(pos=label_both)) + + facet_grid(collection~pos,labeller = labeller(pos=label_both)) + xlab(\Number of characters in word\) + ylab(\Number of words in verse\) + labs( @@ -5203,7 +5203,7 @@

Number of characters in words by their position

uncount(n) %>% ggplot(aes(x=nr_characters,y=pos,fill=nr_words)) + stat_binline(binwidth=1) + - facet_grid(collection~nr_words,labeller = labeller(nr_words=label_both)) + + facet_grid(collection~nr_words,labeller = labeller(nr_words=label_both)) + xlab(\Number of characters in word\) + ylab(\Position\) + labs( diff --git a/map-experiments.Rmd b/map-experiments.Rmd index 07cd571..1a518be 100644 --- a/map-experiments.Rmd +++ b/map-experiments.Rmd @@ -1,7 +1,7 @@ --- title: "General statistical overviews of FILTER data" date: "`r Sys.Date()`" -output: +output: html_notebook: code_folding: hide toc: yes @@ -11,110 +11,111 @@ output: --- ```{r setup, include=FALSE} -knitr::opts_chunk$set(message=FALSE,dpi=300,fig.retina=2,fig.width=8) +knitr::opts_chunk$set(message = FALSE, dpi = 300, fig.retina = 2, fig.width = 8) source(here::here("src/common_basis.R")) tmap_mode("plot") ``` ```{r} -polygons %>% - filter(map_id==1) %>% +polygons %>% + filter(map_id == 1) %>% left_join(p_pl %>% count(pl_id) %>% collect()) %>% - replace_na(list(n=0)) %>% + replace_na(list(n = 0)) %>% group_by(pol_id, geometry) %>% - summarise(n=sum(n)) %>% + summarise(n = sum(n)) %>% tm_shape() + - tm_polygons(col='n', id='name', style='fisher', palette='plasma') + - tm_layout(main.title="Exp-1") + tm_polygons(col = "n", id = "name", style = "fisher", palette = "plasma") + + tm_layout(main.title = "Exp-1") ``` ```{r} -polygons %>% - filter(map_id==0) %>% +polygons %>% + filter(map_id == 0) %>% left_join(p_pl %>% count(pl_id) %>% collect()) %>% group_by(pol_id, geometry) %>% - summarise(n=sum(n)) %>% + summarise(n = sum(n)) %>% tm_shape() + - tm_polygons(col='n', id='name', style='fisher', palette='plasma') + - tm_layout(main.title="Exp-1") + tm_polygons(col = "n", id = "name", style = "fisher", palette = "plasma") + + tm_layout(main.title = "Exp-1") ``` ```{r} -polygons %>% - filter(map_id==0) %>% +polygons %>% + filter(map_id == 0) %>% left_join(p_pl %>% count(pl_id) %>% collect()) %>% group_by(pol_id, geometry) %>% - summarise(n=sum(n)) %>% + summarise(n = sum(n)) %>% tm_shape() + tm_polygons() + - tm_bubbles(size = "n", col="n") + - tm_layout(main.title="Exp-1") + tm_bubbles(size = "n", col = "n") + + tm_layout(main.title = "Exp-1") ``` ```{r} -polygons %>% - filter(map_id==1) %>% +polygons %>% + filter(map_id == 1) %>% left_join(p_pl %>% count(pl_id) %>% collect()) %>% - replace_na(list(n=0)) %>% + replace_na(list(n = 0)) %>% group_by(pol_id, geometry) %>% - summarise(n=sum(n)) %>% - tm_shape(point.per="largest") + + summarise(n = sum(n)) %>% + tm_shape(point.per = "largest") + tm_polygons() + - tm_bubbles(size = "n", col="n") + - tm_layout(main.title="Exp-1") + tm_bubbles(size = "n", col = "n") + + tm_layout(main.title = "Exp-1") ``` ```{r} -d <- polygons %>% - filter(map_id==0) %>% +d <- polygons %>% + filter(map_id == 0) %>% left_join(p_pl %>% inner_join(poems) %>% count(collection, pl_id) %>% collect()) %>% - replace_na(list(n=0)) %>% + replace_na(list(n = 0)) %>% group_by(collection, pol_id, geometry) %>% - summarise(n=sum(n)) %>% + summarise(n = sum(n)) %>% group_by(pol_id) %>% - mutate(tn=sum(n)) %>% + mutate(tn = sum(n)) %>% ungroup() %>% - mutate(pol_id=str_c("p", pol_id), collection=as_factor(collection)) + mutate(pol_id = str_c("p", pol_id), collection = as_factor(collection)) plot_cols <- coloropt_pal()(4) -grps <- d %>% -# filter(!is.na(n)) %>% - complete(nesting(pol_id,tn,geometry),collection,fill=list(n=0)) %>% - group_by(pol_id) -grobs <- grps %>% - group_map(~ggplotGrob(ggplot(.x, aes(x="", y=n, fill=collection)) + - geom_col(width=1) + - scale_y_continuous(expand=c(0,0)) + - scale_fill_manual(values=plot_cols) + - theme_ps(plot.axes = FALSE)) -) %>% +grps <- d %>% + # filter(!is.na(n)) %>% + complete(nesting(pol_id, tn, geometry), collection, fill = list(n = 0)) %>% + group_by(pol_id) +grobs <- grps %>% + group_map(~ ggplotGrob(ggplot(.x, aes(x = "", y = n, fill = collection)) + + geom_col(width = 1) + + scale_y_continuous(expand = c(0, 0)) + + scale_fill_manual(values = plot_cols) + + theme_ps(plot.axes = FALSE))) %>% set_names(group_keys(grps) %>% pull()) ``` ```{r,fig.width=10,fig.height=14} d %>% - distinct(pol_id,geometry,tn) %>% - tm_shape(point.per="largest") + + distinct(pol_id, geometry, tn) %>% + tm_shape(point.per = "largest") + tm_polygons() + - tm_symbols(shape="pol_id", - shapes=grobs, - size="tn", - legend.shape.show = FALSE, - legend.size.is.portrait = TRUE, - shapes.legend = 22) + - tm_add_legend(type="fill", - col=plot_cols, - labels=c("erab", "jr", "literary", "skvr"), - title="Collection") - + tm_symbols( + shape = "pol_id", + shapes = grobs, + size = "tn", + legend.shape.show = FALSE, + legend.size.is.portrait = TRUE, + shapes.legend = 22 + ) + + tm_add_legend( + type = "fill", + col = plot_cols, + labels = c("erab", "jr", "literary", "skvr"), + title = "Collection" + ) ``` ```{r} d %>% tm_shape() + tm_polygons() + - tm_bubbles(size="n") + - tm_facets(by="collection",ncol=4) + tm_bubbles(size = "n") + + tm_facets(by = "collection", ncol = 4) ``` - diff --git a/renv/activate.R b/renv/activate.R index 9b2e7f1..fcabd68 100644 --- a/renv/activate.R +++ b/renv/activate.R @@ -95,24 +95,24 @@ local({ if ("renv" %in% loadedNamespaces()) unloadNamespace("renv") - # load bootstrap tools + # load bootstrap tools `%||%` <- function(x, y) { if (is.null(x)) y else x } - + catf <- function(fmt, ..., appendLF = TRUE) { - + quiet <- getOption("renv.bootstrap.quiet", default = FALSE) if (quiet) return(invisible()) - + msg <- sprintf(fmt, ...) cat(msg, file = stdout(), sep = if (appendLF) "\n" else "") - + invisible(msg) - + } - + header <- function(label, ..., prefix = "#", @@ -123,22 +123,22 @@ local({ n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L) if (n <= 0) return(paste(prefix, label)) - + tail <- paste(rep.int(suffix, n), collapse = "") paste0(prefix, " ", label, " ", tail) - + } - + startswith <- function(string, prefix) { substring(string, 1, nchar(prefix)) == prefix } - + bootstrap <- function(version, library) { - + friendly <- renv_bootstrap_version_friendly(version) section <- header(sprintf("Bootstrapping renv %s", friendly)) catf(section) - + # attempt to download renv catf("- Downloading renv ... ", appendLF = FALSE) withCallingHandlers( @@ -150,7 +150,7 @@ local({ ) catf("OK") on.exit(unlink(tarball), add = TRUE) - + # now attempt to install catf("- Installing renv ... ", appendLF = FALSE) withCallingHandlers( @@ -161,174 +161,174 @@ local({ } ) catf("OK") - + # add empty line to break up bootstrapping from normal output catf("") - + return(invisible()) } - + renv_bootstrap_tests_running <- function() { getOption("renv.tests.running", default = FALSE) } - + renv_bootstrap_repos <- function() { - + # get CRAN repository cran <- getOption("renv.repos.cran", "https://cloud.r-project.org") - + # check for repos override repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) if (!is.na(repos)) { - + # check for RSPM; if set, use a fallback repository for renv rspm <- Sys.getenv("RSPM", unset = NA) if (identical(rspm, repos)) repos <- c(RSPM = rspm, CRAN = cran) - + return(repos) - + } - + # check for lockfile repositories repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) if (!inherits(repos, "error") && length(repos)) return(repos) - + # retrieve current repos repos <- getOption("repos") - + # ensure @CRAN@ entries are resolved repos[repos == "@CRAN@"] <- cran - + # add in renv.bootstrap.repos if set default <- c(FALLBACK = "https://cloud.r-project.org") extra <- getOption("renv.bootstrap.repos", default = default) repos <- c(repos, extra) - + # remove duplicates that might've snuck in dupes <- duplicated(repos) | duplicated(names(repos)) repos[!dupes] - + } - + renv_bootstrap_repos_lockfile <- function() { - + lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") if (!file.exists(lockpath)) return(NULL) - + lockfile <- tryCatch(renv_json_read(lockpath), error = identity) if (inherits(lockfile, "error")) { warning(lockfile) return(NULL) } - + repos <- lockfile$R$Repositories if (length(repos) == 0) return(NULL) - + keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) names(vals) <- keys - + return(vals) - + } - + renv_bootstrap_download <- function(version) { - + sha <- attr(version, "sha", exact = TRUE) - + methods <- if (!is.null(sha)) { - + # attempting to bootstrap a development version of renv c( function() renv_bootstrap_download_tarball(sha), function() renv_bootstrap_download_github(sha) ) - + } else { - + # attempting to bootstrap a release version of renv c( function() renv_bootstrap_download_tarball(version), function() renv_bootstrap_download_cran_latest(version), function() renv_bootstrap_download_cran_archive(version) ) - + } - + for (method in methods) { path <- tryCatch(method(), error = identity) if (is.character(path) && file.exists(path)) return(path) } - + stop("All download methods failed") - + } - + renv_bootstrap_download_impl <- function(url, destfile) { - + mode <- "wb" - + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 fixup <- Sys.info()[["sysname"]] == "Windows" && substring(url, 1L, 5L) == "file:" - + if (fixup) mode <- "w+b" - + args <- list( url = url, destfile = destfile, mode = mode, quiet = TRUE ) - + if ("headers" %in% names(formals(utils::download.file))) args$headers <- renv_bootstrap_download_custom_headers(url) - + do.call(utils::download.file, args) - + } - + renv_bootstrap_download_custom_headers <- function(url) { - + headers <- getOption("renv.download.headers") if (is.null(headers)) return(character()) - + if (!is.function(headers)) stopf("'renv.download.headers' is not a function") - + headers <- headers(url) if (length(headers) == 0L) return(character()) - + if (is.list(headers)) headers <- unlist(headers, recursive = FALSE, use.names = TRUE) - + ok <- is.character(headers) && is.character(names(headers)) && all(nzchar(names(headers))) - + if (!ok) stop("invocation of 'renv.download.headers' did not return a named character vector") - + headers - + } - + renv_bootstrap_download_cran_latest <- function(version) { - + spec <- renv_bootstrap_download_cran_latest_find(version) type <- spec$type repos <- spec$repos - + baseurl <- utils::contrib.url(repos = repos, type = type) ext <- if (identical(type, "source")) ".tar.gz" @@ -338,36 +338,36 @@ local({ ".tgz" name <- sprintf("renv_%s%s", version, ext) url <- paste(baseurl, name, sep = "/") - + destfile <- file.path(tempdir(), name) status <- tryCatch( renv_bootstrap_download_impl(url, destfile), condition = identity ) - + if (inherits(status, "condition")) return(FALSE) - + # report success and return destfile - + } - + renv_bootstrap_download_cran_latest_find <- function(version) { - + # check whether binaries are supported on this system binary <- getOption("renv.bootstrap.binary", default = TRUE) && !identical(.Platform$pkgType, "source") && !identical(getOption("pkgType"), "source") && Sys.info()[["sysname"]] %in% c("Darwin", "Windows") - + types <- c(if (binary) "binary", "source") - + # iterate over types + repositories for (type in types) { for (repos in renv_bootstrap_repos()) { - + # retrieve package database db <- tryCatch( as.data.frame( @@ -376,89 +376,89 @@ local({ ), error = identity ) - + if (inherits(db, "error")) next - + # check for compatible entry entry <- db[db$Package %in% "renv" & db$Version %in% version, ] if (nrow(entry) == 0) next - + # found it; return spec to caller spec <- list(entry = entry, type = type, repos = repos) return(spec) - + } } - + # if we got here, we failed to find renv fmt <- "renv %s is not available from your declared package repositories" stop(sprintf(fmt, version)) - + } - + renv_bootstrap_download_cran_archive <- function(version) { - + name <- sprintf("renv_%s.tar.gz", version) repos <- renv_bootstrap_repos() urls <- file.path(repos, "src/contrib/Archive/renv", name) destfile <- file.path(tempdir(), name) - + for (url in urls) { - + status <- tryCatch( renv_bootstrap_download_impl(url, destfile), condition = identity ) - + if (identical(status, 0L)) return(destfile) - + } - + return(FALSE) - + } - + renv_bootstrap_download_tarball <- function(version) { - + # if the user has provided the path to a tarball via # an environment variable, then use it tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA) if (is.na(tarball)) return() - + # allow directories if (dir.exists(tarball)) { name <- sprintf("renv_%s.tar.gz", version) tarball <- file.path(tarball, name) } - + # bail if it doesn't exist if (!file.exists(tarball)) { - + # let the user know we weren't able to honour their request fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." msg <- sprintf(fmt, tarball) warning(msg) - + # bail return() - + } - + catf("- Using local tarball '%s'.", tarball) tarball - + } - + renv_bootstrap_download_github <- function(version) { - + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") if (!identical(enabled, "TRUE")) return(FALSE) - + # prepare download options pat <- Sys.getenv("GITHUB_PAT") if (nzchar(Sys.which("curl")) && nzchar(pat)) { @@ -474,25 +474,25 @@ local({ options(download.file.method = "wget", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) } - + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) name <- sprintf("renv_%s.tar.gz", version) destfile <- file.path(tempdir(), name) - + status <- tryCatch( renv_bootstrap_download_impl(url, destfile), condition = identity ) - + if (!identical(status, 0L)) return(FALSE) - + renv_bootstrap_download_augment(destfile) - + return(destfile) - + } - + # Add Sha to DESCRIPTION. This is stop gap until #890, after which we # can use renv::install() to fully capture metadata. renv_bootstrap_download_augment <- function(destfile) { @@ -500,13 +500,13 @@ local({ if (is.null(sha)) { return() } - + # Untar tempdir <- tempfile("renv-github-") on.exit(unlink(tempdir, recursive = TRUE), add = TRUE) untar(destfile, exdir = tempdir) pkgdir <- dir(tempdir, full.names = TRUE)[[1]] - + # Modify description desc_path <- file.path(pkgdir, "DESCRIPTION") desc_lines <- readLines(desc_path) @@ -520,170 +520,170 @@ local({ paste("RemoteSha: ", sha) ) writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path) - + # Re-tar local({ old <- setwd(tempdir) on.exit(setwd(old), add = TRUE) - + tar(destfile, compression = "gzip") }) invisible() } - + # Extract the commit hash from a git archive. Git archives include the SHA1 # hash as the comment field of the tarball pax extended header # (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html) # For GitHub archives this should be the first header after the default one # (512 byte) header. renv_bootstrap_git_extract_sha1_tar <- function(bundle) { - + # open the bundle for reading # We use gzcon for everything because (from ?gzcon) # > Reading from a connection which does not supply a 'gzip' magic # > header is equivalent to reading from the original connection conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) on.exit(close(conn)) - + # The default pax header is 512 bytes long and the first pax extended header # with the comment should be 51 bytes long # `52 comment=` (11 chars) + 40 byte SHA1 hash len <- 0x200 + 0x33 res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len]) - + if (grepl("^52 comment=", res)) { sub("52 comment=", "", res) } else { NULL } } - + renv_bootstrap_install <- function(version, tarball, library) { - + # attempt to install it into project library dir.create(library, showWarnings = FALSE, recursive = TRUE) output <- renv_bootstrap_install_impl(library, tarball) - + # check for successful install status <- attr(output, "status") if (is.null(status) || identical(status, 0L)) return(status) - + # an error occurred; report it header <- "installation of renv failed" lines <- paste(rep.int("=", nchar(header)), collapse = "") text <- paste(c(header, lines, output), collapse = "\n") stop(text) - + } - + renv_bootstrap_install_impl <- function(library, tarball) { - + # invoke using system2 so we can capture and report output bin <- R.home("bin") exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" R <- file.path(bin, exe) - + args <- c( "--vanilla", "CMD", "INSTALL", "--no-multiarch", "-l", shQuote(path.expand(library)), shQuote(path.expand(tarball)) ) - + system2(R, args, stdout = TRUE, stderr = TRUE) - + } - + renv_bootstrap_platform_prefix <- function() { - + # construct version prefix version <- paste(R.version$major, R.version$minor, sep = ".") prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") - + # include SVN revision for development versions of R # (to avoid sharing platform-specific artefacts with released versions of R) devel <- identical(R.version[["status"]], "Under development (unstable)") || identical(R.version[["nickname"]], "Unsuffered Consequences") - + if (devel) prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") - + # build list of path components components <- c(prefix, R.version$platform) - + # include prefix if provided by user prefix <- renv_bootstrap_platform_prefix_impl() if (!is.na(prefix) && nzchar(prefix)) components <- c(prefix, components) - + # build prefix paste(components, collapse = "/") - + } - + renv_bootstrap_platform_prefix_impl <- function() { - + # if an explicit prefix has been supplied, use it prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) if (!is.na(prefix)) return(prefix) - + # if the user has requested an automatic prefix, generate it auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) if (auto %in% c("TRUE", "True", "true", "1")) return(renv_bootstrap_platform_prefix_auto()) - + # empty string on failure "" - + } - + renv_bootstrap_platform_prefix_auto <- function() { - + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) if (inherits(prefix, "error") || prefix %in% "unknown") { - + msg <- paste( "failed to infer current operating system", "please file a bug report at https://github.com/rstudio/renv/issues", sep = "; " ) - + warning(msg) - + } - + prefix - + } - + renv_bootstrap_platform_os <- function() { - + sysinfo <- Sys.info() sysname <- sysinfo[["sysname"]] - + # handle Windows + macOS up front if (sysname == "Windows") return("windows") else if (sysname == "Darwin") return("macos") - + # check for os-release files for (file in c("/etc/os-release", "/usr/lib/os-release")) if (file.exists(file)) return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) - + # check for redhat-release files if (file.exists("/etc/redhat-release")) return(renv_bootstrap_platform_os_via_redhat_release()) - + "unknown" - + } - + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { - + # read /etc/os-release release <- utils::read.table( file = file, @@ -693,13 +693,13 @@ local({ comment.char = "#", stringsAsFactors = FALSE ) - + vars <- as.list(release$Value) names(vars) <- release$Key - + # get os name os <- tolower(sysinfo[["sysname"]]) - + # read id id <- "unknown" for (field in c("ID", "ID_LIKE")) { @@ -708,7 +708,7 @@ local({ break } } - + # read version version <- "unknown" for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { @@ -717,17 +717,17 @@ local({ break } } - + # join together paste(c(os, id, version), collapse = "-") - + } - + renv_bootstrap_platform_os_via_redhat_release <- function() { - + # read /etc/redhat-release contents <- readLines("/etc/redhat-release", warn = FALSE) - + # infer id id <- if (grepl("centos", contents, ignore.case = TRUE)) "centos" @@ -735,73 +735,73 @@ local({ "redhat" else "unknown" - + # try to find a version component (very hacky) version <- "unknown" - + parts <- strsplit(contents, "[[:space:]]")[[1L]] for (part in parts) { - + nv <- tryCatch(numeric_version(part), error = identity) if (inherits(nv, "error")) next - + version <- nv[1, 1] break - + } - + paste(c("linux", id, version), collapse = "-") - + } - + renv_bootstrap_library_root_name <- function(project) { - + # use project name as-is if requested asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") if (asis) return(basename(project)) - + # otherwise, disambiguate based on project's path id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) paste(basename(project), id, sep = "-") - + } - + renv_bootstrap_library_root <- function(project) { - + prefix <- renv_bootstrap_profile_prefix() - + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) if (!is.na(path)) return(paste(c(path, prefix), collapse = "/")) - + path <- renv_bootstrap_library_root_impl(project) if (!is.null(path)) { name <- renv_bootstrap_library_root_name(project) return(paste(c(path, prefix, name), collapse = "/")) } - + renv_bootstrap_paths_renv("library", project = project) - + } - + renv_bootstrap_library_root_impl <- function(project) { - + root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) if (!is.na(root)) return(root) - + type <- renv_bootstrap_project_type(project) if (identical(type, "package")) { userdir <- renv_bootstrap_user_dir() return(file.path(userdir, "library")) } - + } - + renv_bootstrap_validate_version <- function(version, description = NULL) { - + # resolve description file # # avoid passing lib.loc to `packageDescription()` below, since R will @@ -809,17 +809,17 @@ local({ # this function should only be called after 'renv' is loaded # https://github.com/rstudio/renv/issues/1625 description <- description %||% packageDescription("renv") - + # check whether requested version 'version' matches loaded version of renv sha <- attr(version, "sha", exact = TRUE) valid <- if (!is.null(sha)) renv_bootstrap_validate_version_dev(sha, description) else renv_bootstrap_validate_version_release(version, description) - + if (valid) return(TRUE) - + # the loaded version of renv doesn't match the requested version; # give the user instructions on how to proceed remote <- if (!is.null(description[["RemoteSha"]])) { @@ -827,13 +827,13 @@ local({ } else { paste("renv", description[["Version"]], sep = "@") } - + # display both loaded version + sha if available friendly <- renv_bootstrap_version_friendly( version = description[["Version"]], sha = description[["RemoteSha"]] ) - + fmt <- paste( "renv %1$s was loaded from project library, but this project is configured to use renv %2$s.", "- Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.", @@ -841,90 +841,90 @@ local({ sep = "\n" ) catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) - + FALSE - + } - + renv_bootstrap_validate_version_dev <- function(version, description) { expected <- description[["RemoteSha"]] is.character(expected) && startswith(expected, version) } - + renv_bootstrap_validate_version_release <- function(version, description) { expected <- description[["Version"]] is.character(expected) && identical(expected, version) } - + renv_bootstrap_hash_text <- function(text) { - + hashfile <- tempfile("renv-hash-") on.exit(unlink(hashfile), add = TRUE) - + writeLines(text, con = hashfile) tools::md5sum(hashfile) - + } - + renv_bootstrap_load <- function(project, libpath, version) { - + # try to load renv from the project library if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) return(FALSE) - + # warn if the version of renv loaded does not match renv_bootstrap_validate_version(version) - + # execute renv load hooks, if any hooks <- getHook("renv::autoload") for (hook in hooks) if (is.function(hook)) tryCatch(hook(), error = warnify) - + # load the project renv::load(project) - + TRUE - + } - + renv_bootstrap_profile_load <- function(project) { - + # if RENV_PROFILE is already set, just use that profile <- Sys.getenv("RENV_PROFILE", unset = NA) if (!is.na(profile) && nzchar(profile)) return(profile) - + # check for a profile file (nothing to do if it doesn't exist) path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project) if (!file.exists(path)) return(NULL) - + # read the profile, and set it if it exists contents <- readLines(path, warn = FALSE) if (length(contents) == 0L) return(NULL) - + # set RENV_PROFILE profile <- contents[[1L]] if (!profile %in% c("", "default")) Sys.setenv(RENV_PROFILE = profile) - + profile - + } - + renv_bootstrap_profile_prefix <- function() { profile <- renv_bootstrap_profile_get() if (!is.null(profile)) return(file.path("profiles", profile, "renv")) } - + renv_bootstrap_profile_get <- function() { profile <- Sys.getenv("RENV_PROFILE", unset = "") renv_bootstrap_profile_normalize(profile) } - + renv_bootstrap_profile_set <- function(profile) { profile <- renv_bootstrap_profile_normalize(profile) if (is.null(profile)) @@ -932,25 +932,25 @@ local({ else Sys.setenv(RENV_PROFILE = profile) } - + renv_bootstrap_profile_normalize <- function(profile) { - + if (is.null(profile) || profile %in% c("", "default")) return(NULL) - + profile - + } - + renv_bootstrap_path_absolute <- function(path) { - + substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( substr(path, 1L, 1L) %in% c(letters, LETTERS) && substr(path, 2L, 3L) %in% c(":/", ":\\") ) - + } - + renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") root <- if (renv_bootstrap_path_absolute(renv)) NULL else project @@ -958,50 +958,50 @@ local({ components <- c(root, renv, prefix, ...) paste(components, collapse = "/") } - + renv_bootstrap_project_type <- function(path) { - + descpath <- file.path(path, "DESCRIPTION") if (!file.exists(descpath)) return("unknown") - + desc <- tryCatch( read.dcf(descpath, all = TRUE), error = identity ) - + if (inherits(desc, "error")) return("unknown") - + type <- desc$Type if (!is.null(type)) return(tolower(type)) - + package <- desc$Package if (!is.null(package)) return("package") - + "unknown" - + } - + renv_bootstrap_user_dir <- function() { dir <- renv_bootstrap_user_dir_impl() path.expand(chartr("\\", "/", dir)) } - + renv_bootstrap_user_dir_impl <- function() { - + # use local override if set override <- getOption("renv.userdir.override") if (!is.null(override)) return(override) - + # use R_user_dir if available tools <- asNamespace("tools") if (is.function(tools$R_user_dir)) return(tools$R_user_dir("renv", "cache")) - + # try using our own backfill for older versions of R envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") for (envvar in envvars) { @@ -1009,7 +1009,7 @@ local({ if (!is.na(root)) return(file.path(root, "R/renv")) } - + # use platform-specific default fallbacks if (Sys.info()[["sysname"]] == "Windows") file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") @@ -1017,109 +1017,109 @@ local({ "~/Library/Caches/org.R-project.R/R/renv" else "~/.cache/R/renv" - + } - + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { sha <- sha %||% attr(version, "sha", exact = TRUE) parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) paste(parts, collapse = "") } - + renv_bootstrap_exec <- function(project, libpath, version) { if (!renv_bootstrap_load(project, libpath, version)) renv_bootstrap_run(version, libpath) } - + renv_bootstrap_run <- function(version, libpath) { - + # perform bootstrap bootstrap(version, libpath) - + # exit early if we're just testing bootstrap if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) return(TRUE) - + # try again to load if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { return(renv::load(project = getwd())) } - + # failed to download or load renv; warn the user msg <- c( "Failed to find an renv installation: the project will not be loaded.", "Use `renv::activate()` to re-initialize the project." ) - + warning(paste(msg, collapse = "\n"), call. = FALSE) - + } - + renv_json_read <- function(file = NULL, text = NULL) { - + jlerr <- NULL - + # if jsonlite is loaded, use that instead if ("jsonlite" %in% loadedNamespaces()) { - + json <- tryCatch(renv_json_read_jsonlite(file, text), error = identity) if (!inherits(json, "error")) return(json) - + jlerr <- json - + } - + # otherwise, fall back to the default JSON reader json <- tryCatch(renv_json_read_default(file, text), error = identity) if (!inherits(json, "error")) return(json) - + # report an error if (!is.null(jlerr)) stop(jlerr) else stop(json) - + } - + renv_json_read_jsonlite <- function(file = NULL, text = NULL) { text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") jsonlite::fromJSON(txt = text, simplifyVector = FALSE) } - + renv_json_read_default <- function(file = NULL, text = NULL) { - + # find strings in the JSON text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' locs <- gregexpr(pattern, text, perl = TRUE)[[1]] - + # if any are found, replace them with placeholders replaced <- text strings <- character() replacements <- character() - + if (!identical(c(locs), -1L)) { - + # get the string values starts <- locs ends <- locs + attr(locs, "match.length") - 1L strings <- substring(text, starts, ends) - + # only keep those requiring escaping strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE) - + # compute replacements replacements <- sprintf('"\032%i\032"', seq_along(strings)) - + # replace the strings mapply(function(string, replacement) { replaced <<- sub(string, replacement, replaced, fixed = TRUE) }, strings, replacements) - + } - + # transform the JSON into something the R parser understands transformed <- replaced transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE) @@ -1127,38 +1127,38 @@ local({ transformed <- gsub("[]}]", ")", transformed, perl = TRUE) transformed <- gsub(":", "=", transformed, fixed = TRUE) text <- paste(transformed, collapse = "\n") - + # parse it json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]] - + # construct map between source strings, replaced strings map <- as.character(parse(text = strings)) names(map) <- as.character(parse(text = replacements)) - + # convert to list map <- as.list(map) - + # remap strings in object remapped <- renv_json_read_remap(json, map) - + # evaluate eval(remapped, envir = baseenv()) - + } - + renv_json_read_remap <- function(json, map) { - + # fix names if (!is.null(names(json))) { lhs <- match(names(json), names(map), nomatch = 0L) rhs <- match(names(map), names(json), nomatch = 0L) names(json)[rhs] <- map[lhs] } - + # fix values if (is.character(json)) return(map[[json]] %||% json) - + # handle true, false, null if (is.name(json)) { text <- as.character(json) @@ -1169,16 +1169,16 @@ local({ else if (text == "null") return(NULL) } - + # recurse if (is.recursive(json)) { for (i in seq_along(json)) { json[i] <- list(renv_json_read_remap(json[[i]], map)) } } - + json - + } # load the renv profile, if any diff --git a/src/common_basis.R b/src/common_basis.R index 4a865bb..602dcfa 100644 --- a/src/common_basis.R +++ b/src/common_basis.R @@ -30,7 +30,7 @@ con <- get_connection() register_tables(con, "filter") register_tables(con, "filter_statistical_overview") -types_to_top_level_types <- tbl(con,sql(" +types_to_top_level_types <- tbl(con, sql(" WITH RECURSIVE type_ancestor AS ( SELECT t_id, t_id AS ancestor_t_id FROM filter.types WHERE par_id = 23959 @@ -38,13 +38,13 @@ WITH RECURSIVE type_ancestor AS ( SELECT types.t_id, ancestor_t_id FROM type_ancestor, filter.types WHERE types.par_id=type_ancestor.t_id -) +) SELECT * FROM type_ancestor ")) -polygons <- st_read(con, query=" -SELECT pol_id, ST_AsBinary(geometry) AS geometry -FROM filter.polygons -WHERE geometry IS NOT NULL", geometry_column='geometry') -st_crs(polygons) <- 'urn:ogc:def:crs:EPSG::3067' +polygons <- st_read(con, query = " +SELECT pol_id, ST_AsBinary(geometry) AS geometry +FROM filter.polygons +WHERE geometry IS NOT NULL", geometry_column = "geometry") +st_crs(polygons) <- "urn:ogc:def:crs:EPSG::3067" polygons <- st_make_valid(polygons) %>% inner_join(map_pol %>% inner_join(maps) %>% inner_join(pol_pl) %>% collect())