diff --git a/notebooks/__marimo__/session/poem-excerpt-length.py.json b/notebooks/__marimo__/session/poem-excerpt-length.py.json
new file mode 100644
index 00000000..a66abedd
--- /dev/null
+++ b/notebooks/__marimo__/session/poem-excerpt-length.py.json
@@ -0,0 +1,568 @@
+{
+ "version": "1",
+ "metadata": {
+ "marimo_version": "0.20.4",
+ "script_metadata_hash": null
+ },
+ "cells": [
+ {
+ "id": "Hbol",
+ "code_hash": "537723cf2f1e49893574f4ece30a379b",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "PPA found poems \u2014 poem/excerpt length over time
\nMM expects that excerpts get shorter over time as the books get smaller. We also know that poems are also getting shorter over this time. What evidence of that can we find our found poem excerpt data?"
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "MJUe",
+ "code_hash": "55094988fafe294d48983b0eedf34186",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/plain": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "vblA",
+ "code_hash": "c032e1213a9c7f05204da2ebf8542066",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "Poems cited in PPA
\nWe don't currently have dates in our poem metadata, but as a starting point we can look at the lengths of being poems quoted in PPA over time."
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "bkHC",
+ "code_hash": "0072cc53b4f26995f9cf0c3d7079b9db",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "lEQa",
+ "code_hash": "8108e62b1ecbf8367fc2d21c40d52eaf",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "PKri",
+ "code_hash": "06c322d2883a22179f240429eb34c9c4",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "Xref",
+ "code_hash": "8cb17d90872cb18eb5cac983334a24fe",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "SFPL",
+ "code_hash": "d15013d0d549daa18f4d8e9785d61899",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/plain": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "BYtC",
+ "code_hash": "c6d35f07f604231ca522bdf1730262e7",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "RGSE",
+ "code_hash": "58673bd27359a8e6f13a0e1624d17666",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "Kclp",
+ "code_hash": "d8727917223475e25d87c71ab9cf943c",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/plain": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "emfo",
+ "code_hash": "835db7c67cde58aa447a06d1d5f78109",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "We can plot poem length by number of words or number of characters - but the general trend looks the same across those measurements."
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "Hstk",
+ "code_hash": "42ae9af5a9b53dc30d8cde4ecc093331",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "nWHF",
+ "code_hash": "d793988107eb1dad0c1e74b6802a139a",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "iLit",
+ "code_hash": "4eb43305aff3b54d271d9e720180cbc8",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "Poem length by first appearance in PPA
"
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "ZHCJ",
+ "code_hash": "9c04bd15a4c7e8bedf0d1e5108a3cb67",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "ROlb",
+ "code_hash": "e4cdd78d539c32943f968f8e0740b401",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "qnkX",
+ "code_hash": "467a9a28e0f7cdcf531fd919e96726c2",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "TqIu",
+ "code_hash": "78bcfb287fe28dc35883d7999f91b114",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "Vxnm",
+ "code_hash": "79801d8494b6cd7d66641920a78f39a3",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "DnEU",
+ "code_hash": "d8d76843bb8fc986d7894d921a180a49",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "ulZA",
+ "code_hash": "452865be80ac4e839565ecb781228b1f",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "As a way of checking & inspecting the above charts, what are the longest poems in each decade, based on first appearance in PPA?"
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "ecfG",
+ "code_hash": "46f3745a0b42e11b5b35ce54fb8bae64",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "Pvdt",
+ "code_hash": "3122ae9d7c639d31915fc01a690fc215",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "How can Shelley's \"The Cenci\" be quoted in 1532 ? Is this really in our data?"
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "ZBYS",
+ "code_hash": "7740d19eee35ec93a4baad46ff5ffb2f",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "aLJB",
+ "code_hash": "c05ef44ae7ce10991059fc328d3424f7",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "Answer: yes, it is in our data. Passim matched this line from PPA:\n\nof this questyon (who dyd the dede) so whan there is no doubt but that the\n
\nwith this line of Shelley:\n\nother Lurking among the rocks; there is no doubt But that the\n
\nCommon text? there is no doubt but that the"
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "nHfw",
+ "code_hash": "5d65f5f26776ee93748a4de50f1aea57",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "xXTn",
+ "code_hash": "131233381e523a5abb22d4d084dfc6d7",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "AjVT",
+ "code_hash": "422225a885003b34649fe7f3e09c29f8",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "Identified 47,447 suspect excerpts (PPA work publication year < poem author birth year + 10)"
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "pHFh",
+ "code_hash": "191aed126dea5ceeb1cbe4f1c397d3bb",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "NCOB",
+ "code_hash": "1fad41b59e48ec393f14646b8c1cf6e2",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "aqbW",
+ "code_hash": "dae86dde69fba9b0bb97157264f5f2bc",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "TRpd",
+ "code_hash": "c1f01d42a0bc4ba39bcd88bc47ec39ca",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "TXez",
+ "code_hash": "12e732823558a51c8f3440185766e959",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "Poem excerpt length
\nHow much of a poem is cited in a PPA work, and how does that change over time?\nTo simplify our measurement and avoid counting duplicate excerpts or excerpts split by page range, we aggregate excerpts\nand collapse the reference spans by PPA work, to determine the total length of each poem cited in each work."
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "dNNg",
+ "code_hash": "56fedfa9c8f0a25428bdcf153c50bd90",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "yCnT",
+ "code_hash": "a959f2ecd67a48395ba7b6db2cd4b2ef",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "Which poems are quoted from the most? (Sorting by sum of reference span lengths)"
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "wlCL",
+ "code_hash": "62ad7747b34064a6ea6034aae3359e73",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "kqZH",
+ "code_hash": "988d1b87ec49df18a36280d2c0b527d2",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "5,566 poems are quoted in full \n(based on total reference span length and percentage of poem length, which may not match exactly)"
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "wAgl",
+ "code_hash": "58cc6ed28ddc1b0d3171ff37074955c8",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "rEll",
+ "code_hash": "c1246514a9874b5283bf1d467b8c7530",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "dGlV",
+ "code_hash": "49dcd4711711c6bfb3a849b0d8ba5d66",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/markdown": "We can graph the min/max, but the maximum length is quite large and changes the scale substantially."
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "SdmI",
+ "code_hash": "8eeead6d6a17a8c53d38815ae7216ea6",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ },
+ {
+ "id": "lgWD",
+ "code_hash": "e4bc5ee65ce01047d193db2c7bce6174",
+ "outputs": [
+ {
+ "type": "data",
+ "data": {
+ "text/html": ""
+ }
+ }
+ ],
+ "console": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/notebooks/poem-excerpt-length.py b/notebooks/poem-excerpt-length.py
new file mode 100644
index 00000000..7974406f
--- /dev/null
+++ b/notebooks/poem-excerpt-length.py
@@ -0,0 +1,904 @@
+import marimo
+
+__generated_with = "0.20.4"
+app = marimo.App(width="medium")
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ # PPA found poems — poem/excerpt length over time
+
+ MM expects that excerpts get shorter over time as the books get smaller. We also know that poems are getting shorter over this period. What evidence of that can we find in our found poem excerpt data?
+ """)
+ return
+
+
+@app.cell
+def _():
+ import pathlib
+
+ import altair as alt
+ import marimo as mo
+ import polars as pl
+
+ from corppa.config import get_config
+ from corppa.poetry_detection.polars_utils import load_excerpts_df
+
+ return alt, get_config, load_excerpts_df, mo, pathlib, pl
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Poems cited in PPA
+
+ We don't currently have dates in our poem metadata, but as a starting point we can look at the lengths of poems being quoted in PPA over time.
+ """)
+ return
+
+
+@app.cell
+def _(get_config, load_excerpts_df, pathlib, pl):
+ config_opts = get_config()
+ data_dir = pathlib.Path(config_opts["compiled_dataset"]["data_dir"])
+
+ # Create a dictionary of data files for lookup based on file base name without any extension
+ # so that excerpts data can be .csv or compressed .csv.gz
+ data_paths = {
+ data_file.stem.split(".", 1)[0]: data_file for data_file in data_dir.iterdir()
+ }
+
+ # use the existing method to do the maximal load and join excerpts with ppa and poem metadata, and then subset/combine
+ excerpts_df = (
+ load_excerpts_df(
+ data_paths["excerpts"],
+ ppa_works_meta=data_paths["ppa_work_metadata"],
+ ref_poems_meta=data_paths["poem_meta"],
+ )
+ .with_columns(
+ # round years to decade
+ ppa_pub_decade=pl.col("ppa_pub_year").floordiv(10).mul(10),
+ )
+ .cast(
+ # convert all the length measures to numeric so we can calculate stats
+ {
+ "poem_num_lines": pl.Int32,
+ "poem_num_words": pl.Int32,
+ "poem_char_len": pl.Int32,
+ }
+ )
+ )
+
+ excerpts_df
+ return (excerpts_df,)
+
+
+@app.cell
+def _(excerpts_df):
+ excerpts_df.select("ppa_work_id", "ppa_pub_year", "ppa_pub_decade")
+ return
+
+
+@app.cell
+def _(excerpts_df):
+ # filter down to unique pairs of works + poems with decade and poem length field
+ works_poems_df = excerpts_df.select(
+ "ppa_work_id",
+ "ppa_pub_decade",
+ "poem_id",
+ "poem_num_lines",
+ "poem_num_words",
+ "poem_char_len",
+ ).unique()
+ works_poems_df
+ return (works_poems_df,)
+
+
+@app.cell
+def _(pl, works_poems_df):
+ # aggregate by decade and calculate min/max/average for all poem length measurements
+ work_poem_decade_stats_df = works_poems_df.group_by("ppa_pub_decade").agg(
+ count=pl.len(),
+ # number of lines
+ min_lines=pl.col("poem_num_lines").min(),
+ max_lines=pl.col("poem_num_lines").max(),
+ mean_lines=pl.col("poem_num_lines").mean(),
+ lines_Q1=pl.col("poem_num_lines").quantile(0.25),
+ median_lines=pl.col("poem_num_lines").quantile(0.5), # Q2 = median
+ lines_Q3=pl.col("poem_num_lines").quantile(0.75),
+ # number of words
+ min_words=pl.col("poem_num_words").min(),
+ max_words=pl.col("poem_num_words").max(),
+ mean_words=pl.col("poem_num_words").mean(),
+ words_Q1=pl.col("poem_num_words").quantile(0.25),
+ median_words=pl.col("poem_num_words").quantile(0.5),
+ words_Q3=pl.col("poem_num_words").quantile(0.75),
+ # number of characters poem_char_len
+ min_chars=pl.col("poem_char_len").min(),
+ max_chars=pl.col("poem_char_len").max(),
+ mean_chars=pl.col("poem_char_len").mean(),
+ chars_Q1=pl.col("poem_char_len").quantile(0.25),
+ median_chars=pl.col("poem_char_len").quantile(0.5),
+ chars_Q3=pl.col("poem_char_len").quantile(0.75),
+ )
+ work_poem_decade_stats_df
+ return (work_poem_decade_stats_df,)
+
+
+@app.cell
+def _(alt):
+    def plot_quartiles(df, x_field, x_field_title, stat_field, stat_noun):
+        # Build a layered chart: a shaded band between Q1 and Q3, with lines
+        # for Q1, mean, median and Q3 drawn on top.
+        #
+        # df must provide x_field plus the columns "{stat_field}_Q1",
+        # "mean_{stat_field}", "median_{stat_field}" and "{stat_field}_Q3".
+        # x_field_title is used as the x-axis title; stat_noun is the
+        # human-readable measure name used to build the y-axis title.
+
+        # statistics drawn as lines and listed in the band's tooltip
+        stats_fields = [
+            f"{stat_field}_Q1",
+            f"mean_{stat_field}",
+            f"median_{stat_field}",
+            f"{stat_field}_Q3",
+        ]
+        # unpivot to long form so a single line mark can draw every statistic,
+        # colored by the resulting "variable" column
+        stats_df = df.unpivot(on=stats_fields, index=x_field)
+
+        # return a layered chart with area and lines
+        return alt.layer(
+            alt.Chart(df)
+            .mark_area(
+                opacity=0.4,
+                color="#f05b69",
+            )
+            .encode(
+                x=alt.X(x_field, title=x_field_title)
+                .axis(format="r")
+                .scale(zero=False),
+                y=alt.Y(
+                    f"{stat_field}_Q3",
+                    # name only the statistics this chart actually draws;
+                    # max is not plotted here
+                    title=f"{stat_noun} (Q1, median, Q3, mean)",
+                ),
+                y2=f"{stat_field}_Q1",
+                tooltip=stats_fields,
+            ),
+            alt.Chart(stats_df)
+            .mark_line()
+            .encode(x=x_field, y="value", color="variable"),
+        )
+
+    return (plot_quartiles,)
+
+
+@app.cell
+def _(mo, plot_quartiles, work_poem_decade_stats_df):
+ mo.ui.altair_chart(
+ plot_quartiles(
+ work_poem_decade_stats_df,
+ "ppa_pub_decade",
+ "PPA Publication decade",
+ "lines",
+ "Number of lines",
+ ).properties(
+ title="Mean and quartile poem length in lines for poems cited in PPA works by decade"
+ )
+ )
+ return
+
+
+@app.cell
+def _(custom_boxplot, mo, work_poem_decade_stats_df):
+ mo.ui.altair_chart(
+ custom_boxplot(
+ work_poem_decade_stats_df,
+ "ppa_pub_decade",
+ "PPA Publication decade",
+ "lines",
+ "Number of lines",
+ )
+ .properties(
+ title="Distribution of poem length for all poems quoted in PPA by decade"
+ )
+ .interactive()
+ )
+ return
+
+
+@app.cell
+def _(alt):
+ # define a custom box plot method using layered plots,
+ # so that we can quickly generate plots from statistics generated by polars
+ # adapted from prior work https://princeton-cdh.github.io/simulating-risk/notebooks/hawkdovemulti-noadjust/
+
+ def custom_boxplot(df, x_field, x_field_title, stat_field, stat_noun):
+ # df must provide x_field plus min_/mean_/median_/max_{stat_field}
+ # and {stat_field}_Q1 / {stat_field}_Q3 columns (listed below for the tooltip)
+ stats_fields = [
+ f"min_{stat_field}",
+ f"{stat_field}_Q1",
+ f"mean_{stat_field}",
+ f"median_{stat_field}",
+ f"{stat_field}_Q3",
+ f"max_{stat_field}",
+ ]
+
+ # create base chart to use across layers
+ base_chart = alt.Chart(df)
+
+ # rectangle (the "box") spanning Q1 to Q3
+ area_chart = base_chart.mark_rect(width=15).encode(
+ y=alt.Y(f"{stat_field}_Q1").axis(
+ offset=12
+ ), # add offset so axis does not crowd rectangle
+ y2=f"{stat_field}_Q3",
+ x=alt.X(x_field, title=x_field_title).axis(format="r"),
+ tooltip=stats_fields,
+ )
+ stroke_color = "orange"
+ # line chart for min-max spread
+ # specifying a stroke for point on the line only adds the min point
+ # NOTE(review): y2 is documented for area/bar/rect/rule marks; on a line mark
+ # it is presumably ignored, so this likely draws a line through the minima
+ # rather than a min-max whisker — confirm, and consider mark_rule
+ minmax_line_chart = base_chart.mark_line(
+ point=alt.OverlayMarkDef(
+ filled=False, shape="stroke", color=stroke_color, strokeWidth=2
+ ),
+ color=stroke_color,
+ ).encode(alt.Y(f"min_{stat_field}"), alt.Y2(f"max_{stat_field}"), x=x_field)
+ # add a stroke for the max
+ max_marks = base_chart.mark_point(
+ shape="stroke", size=55, color=stroke_color
+ ).encode(
+ y=alt.Y(f"max_{stat_field}"),
+ x=x_field,
+ )
+ # add a stroke for the median
+ median_marks = base_chart.mark_point(
+ shape="stroke", size=100, strokeWidth=1, color=stroke_color
+ ).encode(y=f"median_{stat_field}", x=x_field)
+
+ # line through the means (monotone interpolation); its y title labels the axis with stat_noun
+ mean_line_chart = base_chart.mark_line(
+ interpolate="monotone", color="yellow", opacity=0.5
+ ).encode(
+ x=alt.X(x_field),
+ y=alt.Y(f"mean_{stat_field}", title=f"{stat_noun} (mean)").scale(
+ zero=False
+ ),
+ )
+
+ # layers are drawn in the order given, so the mean line is at the back
+ return alt.layer(
+ mean_line_chart, minmax_line_chart, area_chart, median_marks, max_marks
+ ).resolve_axis("shared")
+
+ return (custom_boxplot,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ We can plot poem length by number of words or number of characters - but the general trend looks the same across those measurements.
+ """)
+ return
+
+
+@app.cell
+def _(mo, plot_quartiles, work_poem_decade_stats_df):
+ mo.ui.altair_chart(
+ plot_quartiles(
+ work_poem_decade_stats_df,
+ "ppa_pub_decade",
+ "PPA Publication decade",
+ "words",
+ "Number of words",
+ ).properties(
+ title="Mean and quartile poem length by number of words for poems cited in PPA works by decade"
+ )
+ )
+ return
+
+
+@app.cell
+def _(mo, plot_quartiles, work_poem_decade_stats_df):
+ mo.ui.altair_chart(
+ plot_quartiles(
+ work_poem_decade_stats_df,
+ "ppa_pub_decade",
+ "PPA Publication decade",
+ "chars",
+ "Number of characters",
+ ).properties(
+ title="Mean and quartile poem length by number of characters for poems cited in PPA works by decade"
+ )
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Poem length by first appearance in PPA
+ """)
+ return
+
+
+@app.cell
+def _(excerpts_df, pl):
+ # instead of filtering to unique pairs of works + poems with decade and poem length field,
+ # aggregate by poem id and get the earliest date it is quoted in the PPA
+ # (result: one row per poem_id, with values taken from its earliest-published PPA work)
+ poems_firstquoted_df = (
+ # filter out the few PPA works with no publication date, then sort by publication year
+ excerpts_df.filter(~pl.col.ppa_pub_year.is_null())
+ .sort("ppa_pub_year")
+ # group by poem but maintain order so we can get the earliest PPA work a poem is found in
+ .group_by("poem_id", maintain_order=True)
+ .agg(
+ pl.first("ppa_pub_decade"),
+ pl.first("poem_num_lines"),
+ pl.first("poem_num_words"),
+ pl.first("poem_char_len"),
+ pl.first("ppa_work_id"),
+ pl.first("ppa_pub_year"),
+ pl.first("poem_title"),
+ pl.first("poem_author"),
+ )
+ )
+ poems_firstquoted_df
+ return (poems_firstquoted_df,)
+
+
+@app.cell
+def _(pl, poems_firstquoted_df):
+ # now generate stats
+ # aggregate by decade and calculate min/max/average for all poem length measurements
+ poems_firstquoted_stats_df = poems_firstquoted_df.group_by("ppa_pub_decade").agg(
+ count=pl.len(), # number of poems
+ # number of lines
+ min_lines=pl.col("poem_num_lines").min(),
+ max_lines=pl.col("poem_num_lines").max(),
+ mean_lines=pl.col("poem_num_lines").mean(),
+ lines_Q1=pl.col("poem_num_lines").quantile(0.25),
+ median_lines=pl.col("poem_num_lines").quantile(0.5),
+ lines_Q3=pl.col("poem_num_lines").quantile(0.75),
+ # number of words
+ min_words=pl.col("poem_num_words").min(),
+ max_words=pl.col("poem_num_words").max(),
+ mean_words=pl.col("poem_num_words").mean(),
+ words_Q1=pl.col("poem_num_words").quantile(0.25),
+ median_words=pl.col("poem_num_words").quantile(0.5),
+ words_Q3=pl.col("poem_num_words").quantile(0.75),
+ # number of characters poem_char_len
+ min_chars=pl.col("poem_char_len").min(),
+ max_chars=pl.col("poem_char_len").max(),
+ mean_chars=pl.col("poem_char_len").mean(),
+ chars_Q1=pl.col("poem_char_len").quantile(0.25),
+ median_chars=pl.col("poem_char_len").quantile(0.5),
+ chars_Q3=pl.col("poem_char_len").quantile(0.75),
+ )
+ poems_firstquoted_stats_df
+ return (poems_firstquoted_stats_df,)
+
+
+@app.cell
+def _(pl, poems_firstquoted_df):
+ # what is that early outlier skewing the graphs?
+ poems_firstquoted_df.filter(pl.col.ppa_pub_decade.lt(1600)).sort(
+ "poem_num_lines", descending=True
+ )
+ return
+
+
+@app.cell
+def _(pl, poems_firstquoted_stats_df):
+ poems_firstquoted_stats_df.filter(pl.col.median_lines.gt(100))
+ return
+
+
+@app.cell
+def _(mo, plot_quartiles, poems_firstquoted_stats_df):
+ mo.ui.altair_chart(
+ plot_quartiles(
+ poems_firstquoted_stats_df,
+ "ppa_pub_decade",
+ "PPA Publication decade",
+ "lines",
+ "Number of lines",
+ ).properties(
+ title="Mean and quartile poem length by number of lines for poems first appearance in PPA"
+ )
+ )
+ return
+
+
+@app.cell
+def _(custom_boxplot, mo, poems_firstquoted_stats_df):
+ mo.ui.altair_chart(
+ custom_boxplot(
+ poems_firstquoted_stats_df,
+ "ppa_pub_decade",
+ "PPA Publication decade",
+ "lines",
+ "Number of lines",
+ )
+ .properties(
+ title="Distribution of poem length based on poem first appearance in PPA by decade"
+ )
+ .interactive()
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ As a way of checking & inspecting the above charts, what are the longest poems in each decade, based on first appearance in PPA?
+ """)
+ return
+
+
+@app.cell
+def _(mo, pl, poems_firstquoted_df):
+ mo.ui.table(
+ poems_firstquoted_df.sort(
+ "ppa_pub_decade",
+ "poem_num_lines",
+ descending=[False, True],
+ nulls_last=True,
+ )
+ .group_by("ppa_pub_decade", maintain_order=True)
+ .agg(
+ pl.first("poem_title"),
+ pl.first("poem_author"),
+ pl.first("poem_num_lines"),
+ pl.first("ppa_work_id"),
+ pl.first("poem_id"),
+ )
+ .cast({"ppa_pub_decade": pl.String}), # cast decade to str for readability
+ label="Longest poem for each decade first quoted in PPA",
+ page_size=40,
+ selection=None,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ How can Shelley's "The Cenci" be quoted in 1532? Is this really in our data?
+ """)
+ return
+
+
+@app.cell
+def _(excerpts_df, pl):
+ excerpts_df.filter(pl.col.poem_id.eq("Z200484006")).sort("ppa_pub_year")
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Answer: yes, it is in our data. Passim matched this line from PPA:
+
+ > of this questyon (who dyd the dede) so whan there is no doubt but that the
+
+ with this line of Shelley:
+
+ > other Lurking among the rocks; there is no doubt But that the
+
+
+ Common text? **there is no doubt but that the**
+ """)
+ return
+
+
+@app.cell
+def _(pl):
+ ### Refine poem first quoted date by poet dates
+
+ poet_meta_df = pl.read_csv("data/ref-corpora/ch_poets_meta.csv").with_columns(
+ # construct combined author field for matching with poem metadata
+ author=pl.concat_str(
+ [pl.col("author_firstname"), pl.col("author_lastname")],
+ separator=" ",
+ ),
+ )
+ poet_meta_df
+ return (poet_meta_df,)
+
+
+@app.cell
+def _(excerpts_df, pl, poet_meta_df):
+ # join poet metadata with excerpts to get author birth year
+ # for now maybe we limit to numerics? (undate later ;-P )
+
+ # filter to subset with numeric birth year
+ poet_birthyear_df = poet_meta_df.with_columns(
+ # convert year to number; convert to null if can't be converted
+ poet_birthyear=pl.col.author_birth.str.to_integer(strict=False),
+ ).filter(pl.col.poet_birthyear.is_not_null()) # drop nulls
+
+ excerpts_poets_df = excerpts_df.join(
+ poet_birthyear_df.select("author", "poet_birthyear"),
+ left_on="poem_author",
+ right_on="author",
+ )
+ excerpts_poets_df
+ return (excerpts_poets_df,)
+
+
+@app.cell
+def _(excerpts_poets_df, mo, pl):
+ # how many excerpts are temporally suspect based on poet birth date and ppa publication date?
+ # (or maybe cases where quotation is going in the other direction, possibly through an intermediary....)
+
+ suspect_excerpts_df = excerpts_poets_df.filter(
+ pl.col("ppa_pub_year").lt(pl.col("poet_birthyear").add(10))
+ )
+
+ mo.md(
+ f"Identified {suspect_excerpts_df.height:,} suspect excerpts (PPA work publication year < poem author birth year + 10)"
+ )
+ return (suspect_excerpts_df,)
+
+
+@app.cell
+def _(mo, suspect_excerpts_df):
+ mo.ui.table(
+ suspect_excerpts_df.select(
+ "ppa_work_id",
+ "ppa_title",
+ "ppa_pub_year",
+ "poem_title",
+ "poem_author",
+ "poet_birthyear",
+ "ppa_span_text",
+ "ref_span_text",
+ ),
+ page_size=25,
+ selection=None,
+ )
+ return
+
+
+@app.cell
+def _(excerpts_poets_df, mo, pl, plot_quartiles):
+    # Drop the temporally suspect excerpts and rerun the first-citation logic.
+    # An excerpt is suspect when PPA publication year < poet birth year + 10,
+    # so the set to keep is the exact complement of that test:
+    # publication year >= poet birth year + 10.
+    filtered_excerpts_df = excerpts_poets_df.filter(
+        pl.col("ppa_pub_year").ge(pl.col("poet_birthyear").add(10))
+    )
+
+    # repeat above logic to identify earliest quote, but on the filtered set
+    filtered_poems_firstquoted_df = (
+        # filter out the few PPA works with no publication date, then sort by publication year
+        filtered_excerpts_df.filter(~pl.col.ppa_pub_year.is_null())
+        .sort("ppa_pub_year")
+        # group by poem but maintain order so we can get the earliest PPA work a poem is found in
+        .group_by("poem_id", maintain_order=True)
+        .agg(
+            pl.first("ppa_pub_decade"),
+            pl.first("poem_num_lines"),
+            pl.first("poem_num_words"),
+            pl.first("poem_char_len"),
+            pl.first("ppa_work_id"),
+            pl.first("ppa_pub_year"),
+            pl.first("poem_title"),
+            pl.first("poem_author"),
+        )
+    )
+
+    # aggregate by decade and calculate min/max/average for all poem length measurements
+    filtered_poems_firstquoted_stats_df = filtered_poems_firstquoted_df.group_by(
+        "ppa_pub_decade"
+    ).agg(
+        count=pl.len(),  # number of poems
+        # number of lines
+        min_lines=pl.col("poem_num_lines").min(),
+        max_lines=pl.col("poem_num_lines").max(),
+        mean_lines=pl.col("poem_num_lines").mean(),
+        lines_Q1=pl.col("poem_num_lines").quantile(0.25),
+        median_lines=pl.col("poem_num_lines").quantile(0.5),
+        lines_Q3=pl.col("poem_num_lines").quantile(0.75),
+        # number of words
+        min_words=pl.col("poem_num_words").min(),
+        max_words=pl.col("poem_num_words").max(),
+        mean_words=pl.col("poem_num_words").mean(),
+        words_Q1=pl.col("poem_num_words").quantile(0.25),
+        median_words=pl.col("poem_num_words").quantile(0.5),
+        words_Q3=pl.col("poem_num_words").quantile(0.75),
+        # number of characters poem_char_len
+        min_chars=pl.col("poem_char_len").min(),
+        max_chars=pl.col("poem_char_len").max(),
+        mean_chars=pl.col("poem_char_len").mean(),
+        chars_Q1=pl.col("poem_char_len").quantile(0.25),
+        median_chars=pl.col("poem_char_len").quantile(0.5),
+        chars_Q3=pl.col("poem_char_len").quantile(0.75),
+    )
+
+    # the chart is the cell's last expression, so it is what the cell displays
+    mo.ui.altair_chart(
+        plot_quartiles(
+            filtered_poems_firstquoted_stats_df,
+            "ppa_pub_decade",
+            "PPA Publication decade",
+            "lines",
+            "Number of lines",
+        ).properties(
+            title="Mean and quartile poem length by number of lines for poems first appearance in PPA"
+        )
+    )
+    return filtered_poems_firstquoted_df, filtered_poems_firstquoted_stats_df
+
+
+@app.cell
+def _(custom_boxplot, filtered_poems_firstquoted_stats_df, mo):
+ mo.ui.altair_chart(
+ custom_boxplot(
+ filtered_poems_firstquoted_stats_df,
+ "ppa_pub_decade",
+ "PPA Publication decade",
+ "lines",
+ "Number of lines",
+ )
+ .properties(
+ title="Distribution of poem length based on poem first appearance in PPA by decade"
+ )
+ .interactive()
+ )
+ return
+
+
+@app.cell
+def _(filtered_poems_firstquoted_df, mo, pl):
+ mo.ui.table(
+ filtered_poems_firstquoted_df.sort(
+ "ppa_pub_decade",
+ "poem_num_lines",
+ descending=[False, True],
+ nulls_last=True,
+ )
+ .group_by("ppa_pub_decade", maintain_order=True)
+ .agg(
+ pl.first("poem_title"),
+ pl.first("poem_author"),
+ pl.first("poem_num_lines"),
+ pl.first("ppa_work_id"),
+ pl.first("poem_id"),
+ )
+ .cast({"ppa_pub_decade": pl.String}), # cast decade to str for readability
+ label="Longest poem for each decade first quoted in PPA (filtered set, PPA publication > poet birth year + 10)",
+ page_size=40,
+ selection=None,
+ )
+ return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ ## Poem excerpt length
+
+ How much of a poem is cited in a PPA work, and how does that change over time?
+
+ To simplify our measurement and avoid counting duplicate excerpts or excerpts split by page range, we aggregate excerpts
+ and collapse the reference spans by PPA work, to determine the total length of each poem cited in each work.
+ """)
+ return
+
+
+@app.cell
+def _(excerpts_df, pl):
+ # in the percent ppa poetry notebook, we collapsed overlapping spans for the ppa text, for each page of ppa
+ # here, we want to collapse reference spans for each poem within each ppa work
+
+ # collapse excerpts with any overlap to a single span so we can calculate the total number of characters
+ # covered by any of the merged spans
+
+ ref_merged_excerpts_df = (
+ # sort by work, poem, and reference span start
+ excerpts_df.sort("ppa_work_id", "poem_id", "ref_span_start")
+ .select(
+ "ppa_work_id",
+ "ppa_pub_decade",
+ "poem_id",
+ "ref_span_start",
+ "ref_span_end",
+ "poem_author",
+ "poem_title",
+ "poem_char_len",
+ )
+ .with_columns(
+ # Use shift and cumulative max to determine if current span
+ # has any overlap with previous spans or is the beginning of a new group.
+ # shift(1) gets previous row; fill null for first row (which has no previous row),
+ # and calculate current max span end for this work/poem pair.
+ # NOTE: we use >= because span end is exclusive (i.e., is not included in the range)
+ new_group=(
+ pl.col("ref_span_start")
+ >= pl.col("ref_span_end").shift(1).fill_null(-1).cum_max()
+ )
+ .cast(pl.Int32) # cast to int gives 1 or 0 to indicate new group
+ .over(
+ "ppa_work_id", "poem_id"
+ ) # limit to spans to a single poem quoted in a single work
+ )
+ .with_columns(
+ # because new_group is 1 or 0, cumulative sum gives each merged group
+ # within a work/poem pair a unique group id
+ pl.col("new_group")
+ .cum_sum()
+ .alias("group_id")
+ .over("ppa_work_id", "poem_id")
+ )
+ .group_by("ppa_work_id", "poem_id", "group_id")
+ .agg(
+ # group by work, poem, and group id and get the smallest start and largest end
+ # to get the outer bounds of the overlapping spans
+ pl.col("ref_span_start").min(),
+ pl.col("ref_span_end").max(),
+ pl.col("ppa_pub_decade").first(),
+ pl.col("poem_title").first(),
+ pl.col("poem_author").first(),
+ pl.col("poem_char_len").first(),
+ )
+ # calculate length of the consolidated reference
+ .with_columns(ref_span_len=pl.col.ref_span_end - pl.col.ref_span_start)
+ # calculate the fraction of the poem that is quoted (1.0 == whole poem; not multiplied by 100)
+ .with_columns(ref_percent=pl.col.ref_span_len.truediv(pl.col.poem_char_len))
+ .drop("group_id")
+ )
+
+ # based on the merged reference spans, calculate how much of each poem is quoted in each work
+ # NOTE(review): this per-work total is displayed here but not returned; the returned
+ # ref_merged_excerpts_df has one row per merged span, so its ref_span_len / ref_percent
+ # are per-span values, not per-work totals — confirm which one downstream stats should use
+
+ excerpt_poem_chars_df = ref_merged_excerpts_df.group_by(
+ "ppa_work_id", "poem_id"
+ ).agg(
+ # calculate the number of characters covered by all merged spans for each poem
+ ref_char_len=(pl.col("ref_span_end") - pl.col("ref_span_start")).sum(),
+ ppa_pub_decade=pl.col.ppa_pub_decade.first(),
+ )
+
+ excerpt_poem_chars_df
+ return (ref_merged_excerpts_df,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ Which poems are quoted from the most? (Sorting by sum of reference span lengths)
+ """)
+ return
+
+
+@app.cell
+def _(ref_merged_excerpts_df):
+ ref_merged_excerpts_df.sort(
+ "ref_span_len", descending=True, nulls_last=True
+ ).select(
+ "ppa_work_id",
+ "poem_id",
+ "poem_author",
+ "poem_title",
+ "poem_char_len",
+ "ref_span_len",
+ "ref_percent",
+ )
+ return
+
+
+@app.cell
+def _(mo, pl, ref_merged_excerpts_df):
+ fully_quoted_poems = (
+ ref_merged_excerpts_df.filter(pl.col.ref_percent.ge(1))
+ .select("poem_id")
+ .unique()
+ .height
+ )
+
+ mo.md(
+ f"""{fully_quoted_poems:,} poems are quoted in full
+
+ (based on total reference span length and percentage of poem length, which may not match exactly)"""
+ )
+ return
+
+
+@app.cell
+def _(ref_merged_excerpts_df):
+ # which ones are quoted most?
+ # we have numbers of 100% here - guessing this is due to lack of alignment / different ways of counting characters
+
+ ref_merged_excerpts_df.sort("ref_percent", descending=True, nulls_last=True).select(
+ "ppa_work_id",
+ "poem_id",
+ "poem_author",
+ "poem_title",
+ "poem_char_len",
+ "ref_span_len",
+ "ref_percent",
+ )
+ return
+
+
+@app.cell
+def _(alt, mo, pl, plot_quartiles, ref_merged_excerpts_df):
+    # aggregate merged reference spans to get statistics over PPA works by decade
+    # (`alt` stays in the cell signature so the cell's interface is unchanged,
+    # although nothing in this cell uses it directly)
+
+    ref_excerpts_stats_df = ref_merged_excerpts_df.group_by("ppa_pub_decade").agg(
+        count=pl.len(),
+        # number of characters quoted from a poem, based on combined reference span length
+        min_chars=pl.col("ref_span_len").min(),
+        max_chars=pl.col("ref_span_len").max(),
+        mean_chars=pl.col("ref_span_len").mean(),
+        chars_Q1=pl.col("ref_span_len").quantile(0.25),
+        median_chars=pl.col("ref_span_len").quantile(0.5),
+        chars_Q3=pl.col("ref_span_len").quantile(0.75),
+        # fraction of poem quoted, by character length
+        mean_percent=pl.col("ref_percent").mean(),
+        percent_Q1=pl.col("ref_percent").quantile(0.25),
+        median_percent=pl.col("ref_percent").quantile(0.5),
+        percent_Q3=pl.col("ref_percent").quantile(0.75),
+    )
+
+    # plot_quartiles unpivots the statistics itself, so no separate
+    # mean/median chart needs to be built here; the chart is the cell's
+    # last expression and is therefore what the cell displays
+    mo.ui.altair_chart(
+        plot_quartiles(
+            ref_excerpts_stats_df,
+            "ppa_pub_decade",
+            "PPA Publication decade",
+            "chars",
+            "Number of characters",
+        ).properties(
+            title="Mean and quartile poem quotation length by number of characters for poems found in PPA"
+        )
+    )
+    return (ref_excerpts_stats_df,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+ mo.md(r"""
+ We can graph the min/max, but the maximum length is quite large and changes the scale substantially.
+ """)
+ return
+
+
+@app.cell
+def _(alt, mo, ref_excerpts_stats_df):
+ mo.ui.altair_chart(
+ alt.Chart(ref_excerpts_stats_df)
+ .mark_area(
+ opacity=0.4,
+ color="#6252a0",
+ )
+ .encode(
+ x=alt.X("ppa_pub_decade", title="PPA Publication decade").axis(format="r"),
+ y=alt.Y("min_chars", title="Poem characters quoted (min/max length)"),
+ y2="max_chars",
+ )
+ )
+ return
+
+
+@app.cell
+def _(mo, plot_quartiles, ref_excerpts_stats_df):
+ # what percent of poems are quoted over time?
+
+ mo.ui.altair_chart(
+ plot_quartiles(
+ ref_excerpts_stats_df,
+ "ppa_pub_decade",
+ "PPA Publication decade",
+ "percent",
+ "Percent of poem",
+ ).properties(title="Percent of poem quoted in a single work")
+ )
+ return
+
+
+if __name__ == "__main__":
+ app.run()
diff --git a/src/corppa/poetry_detection/polars_utils.py b/src/corppa/poetry_detection/polars_utils.py
index b9c9de70..fdf205cd 100644
--- a/src/corppa/poetry_detection/polars_utils.py
+++ b/src/corppa/poetry_detection/polars_utils.py
@@ -38,6 +38,9 @@
"author": "poem_author",
"title": "poem_title",
"ref_corpus": "ref_corpus",
+ "num_lines": "poem_num_lines",
+ "num_words": "poem_num_words",
+ "char_len": "poem_char_len",
}