diff --git a/.gitignore b/.gitignore index 1c7f1330..8f463653 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,10 @@ __pycache__/ .test_* *.lcov + +/docs/**/article.md +/docs/2.developers/?.API-docs +/docs/2.developers/?.documentation +.yarn +*.csv +!output_stream.csv diff --git a/docs/2.developers/4.user-guide/.changelog.md b/docs/2.developers/4.user-guide/.changelog.md new file mode 100644 index 00000000..ef1950a1 --- /dev/null +++ b/docs/2.developers/4.user-guide/.changelog.md @@ -0,0 +1,6 @@ +--- +title: Changelog +description: 'This page references the Changelog of Pathway' +toc: false +--- + diff --git a/docs/2.developers/4.user-guide/.gitignore b/docs/2.developers/4.user-guide/.gitignore new file mode 100644 index 00000000..a8b1a044 --- /dev/null +++ b/docs/2.developers/4.user-guide/.gitignore @@ -0,0 +1 @@ +120.development/20.changelog.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/.sql/__init__.py b/docs/2.developers/4.user-guide/.sql/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/.sql/article.py b/docs/2.developers/4.user-guide/.sql/article.py new file mode 100644 index 00000000..06645ce5 --- /dev/null +++ b/docs/2.developers/4.user-guide/.sql/article.py @@ -0,0 +1,246 @@ +# --- +# title: pw.sql +# description: 'Using SQL commands with Pathway using pw.sql function.' +# notebook_export_path: notebooks/tutorials/sql_api.ipynb +# --- + +# # Using SQL with Pathway +# Perform SQL commands using Pathway's `pw.sql` function. +# +# --- +# +# Pathway provides a very simple way to use SQL commands directly in your Pathway application: the use of `pw.sql`. +# Pathway is significantly different from a usual SQL database, and not all SQL operations are available in Pathway. +# In the following, we present the SQL operations which are compatible with Pathway and how to use `pw.sql`. +# +# **This article is a summary of dos and don'ts on how to use Pathway to execute SQL queries, this is not an introduction to SQL.** +# +# ## Usage +# You can very easily execute a SQL command by doing the following: +# +# ```python +# pw.sql(query, tab=t) +# ``` +# +# This will execute the SQL command `query` where the Pathway table `t` (Python local variable) can be referred to as `tab` (SQL table name) inside `query`. +# More generally, you can pass an arbitrary number of tables associations `name, table` using `**kwargs`: `pw.sql(query, tab1=t1, tab2=t2,.., tabn=tn)`. +# +# ## Example + +# + +import pathway as pw + +t = pw.debug.table_from_markdown( + """ + | a | b + 1 | 1 | 2 + 2 | 4 | 3 + 3 | 4 | 7 + """ +) + +ret = pw.sql("SELECT * FROM tab WHERE a2", tab=t) +pw.debug.compute_and_print(result_where) + +# ### Boolean and Arithmetic Expressions +# With the `SELECT ...` and `WHERE ...` clauses, you can use the following operators: +# - boolean operators: `AND`, `OR`, `NOT` +# - arithmetic operators: `+`, `-`, `*`, `/`, `DIV`, `MOD`, `==`, `!=`, `<`, `>`, `<=`, `>=`, `<>` +# - NULL + +result_bool = pw.sql("SELECT a,b FROM tab WHERE b-a>0 AND a>3", tab=t) +pw.debug.compute_and_print(result_bool) + +# Both `!=` and `<>` can be used to check non-equality. 
+ +result_neq = pw.sql("SELECT a,b FROM tab WHERE a != 4 OR b <> 3", tab=t) +pw.debug.compute_and_print(result_neq) + +# `NULL` can be used to filter out rows with missing values: + +# + +t_null = pw.debug.table_from_markdown( + """ + | a | b + 1 | 1 | 2 + 2 | 4 | + 3 | 4 | 7 + """ +) + +result_null = pw.sql("SELECT a, b FROM tab WHERE b IS NOT NULL ", tab=t_null) +pw.debug.compute_and_print(result_null) +# - + +# You can use single row result subqueries in the `WHERE` clause to filter a table based on the subquery results: + +t_subqueries = pw.debug.table_from_markdown( + """ + | employee | salary + 1 | 1 | 10 + 2 | 2 | 11 + 3 | 3 | 12 + """ +) +result_subqueries = pw.sql( + "SELECT employee, salary FROM t WHERE salary >= (SELECT AVG(salary) FROM t)", + t=t_subqueries, +) +pw.debug.compute_and_print(result_subqueries) + +# ⚠️ For now, only single row result subqueries are supported. +# Correlated subqueries and the associated operations `ANY`, `NONE`, and `EVERY` (or its alias `ALL`) are currently not supported. + +# ### `GROUP BY` +# You can use `GROUP BY` to group rows with the same value for a given column, and to use an aggregate function over the grouped rows. + +result_groupby = pw.sql("SELECT a, SUM(b) FROM tab GROUP BY a", tab=t) +pw.debug.compute_and_print(result_groupby) + +# ⚠️ `GROUP BY` and `JOIN` should not be used together in a single `SELECT`. + +# #### Aggregation functions +# With `GROUP BY`, you can use the following aggregation functions: +# - `AVG` +# - `COUNT` +# - `MAX` +# - `MIN` +# - `SUM` +# +# ⚠️ Pathway reducers (`pw.count`, `pw.sum`, etc.) aggregate over `None` values, while traditional SQL aggregate functions skip `NULL` values: be careful to remove all the undefined values before using an aggregate function. + +# ### `HAVING` + +result_having = pw.sql("SELECT a, SUM(b) FROM tab GROUP BY a HAVING SUM(b)>5", tab=t) +pw.debug.compute_and_print(result_having) + +# ### `AS` (alias) +# Pathway supports both notations: `old_name as new_name` and `old_name new_name`. + +result_alias = pw.sql("SELECT b, a AS c FROM tab", tab=t) +pw.debug.compute_and_print(result_alias) + +result_alias = pw.sql("SELECT b, a c FROM tab", tab=t) +pw.debug.compute_and_print(result_alias) + +# ### `UNION` +# Pathway provides the standard `UNION` SQL operator. +# Note that `UNION` requires matching column names. + +# + +t_union = pw.debug.table_from_markdown( + """ + | a | b + 4 | 9 | 3 + 5 | 2 | 7 + """ +) + +result_union = pw.sql("SELECT * FROM tab UNION SELECT * FROM tab2", tab=t, tab2=t_union) +pw.debug.compute_and_print(result_union) +# - + +# ### `INTERSECT` +# Pathway provides the standard `INTERSECT` SQL operator. +# Note that `INTERSECT` requires matching column names. + +# + +t_inter = pw.debug.table_from_markdown( + """ + | a | b + 4 | 9 | 3 + 5 | 2 | 7 + 6 | 1 | 2 + """ +) + +result_inter = pw.sql( + "SELECT * FROM tab INTERSECT SELECT * FROM tab2", tab=t, tab2=t_inter +) +pw.debug.compute_and_print(result_inter) +# - + +# ⚠️ `INTERSECT` does not support `INTERSECT ALL` (coming soon). + +# ### `JOIN` +# Pathway provides different join operations: `INNER JOIN`, `LEFT JOIN` (or `LEFT OUTER JOIN`), `RIGHT JOIN` (or `RIGHT OUTER JOIN`), `SELF JOIN`, and `CROSS JOIN`. 
+ +t_join = pw.debug.table_from_markdown( + """ + | b | c + 4 | 4 | 9 + 5 | 3 | 4 + 6 | 7 | 5 + """ +) +result_join = pw.sql( + "SELECT * FROM left_table INNER JOIN right_table ON left_table.b==right_table.b", + left_table=t, + right_table=t_join, +) +pw.debug.compute_and_print(result_join) + +# ⚠️ `GROUP BY` and `JOIN` should not be used together in a single `SELECT`. + +# ⚠️ `NATURAL JOIN` and `FULL JOIN` are not supported (coming soon). + +# ### `WITH` +# In addition to being placed inside a `WHERE` clause, subqueries can also be performed using the `WITH` keyword: + +result_with = pw.sql( + "WITH group_table (a, sumB) AS (SELECT a, SUM(b) FROM tab GROUP BY a) SELECT sumB FROM group_table", + tab=t, +) +pw.debug.compute_and_print(result_with) + +# ## Differences from the SQL standard +# First of all, not all SQL queries can be executed in Pathway. +# This stems mainly from the fact that Pathway is built to process streaming and dynamic data efficiently. +# +# ### No ordering +# In Pathway, indexes are separately generated and maintained by the engine, which does not guarantee any row order: SQL operations like `LIMIT`, `ORDER BY` or `SELECT TOP` don't always make sense in this context. In the future, we will support an `ORDER BY ... LIMIT ...` keyword combination, which is typically meaningful in Pathway. +# The column `id` is reserved and should not be used as a column name, this column is not captured by `*` expressions. +# +# Furthermore, there is no order on the columns and the column order used in a `SELECT` query need not be preserved. +# +# ### Immutability +# Pathway tables are immutable: operations such as `INSERT INTO` are not supported. +# +# ### Limits +# Correlated subqueries are currently not supported and keywords such as `LIKE`, `ANY`, `ALL`, or `EXISTS` are not supported. +# `COALESCE` and`IFNULL` are not supported but should be soon. +# We strongly suggest not to use anonymous columns: they might work but we cannot guarantee their behavior. +# +# ## Conclusion +# Pathway provides a powerful API to ease the transition of SQL data transformations and pipelines into Pathway. +# However, Pathway and SQL serve different purposes. To benefit from all the possibilities Pathway has to offer we strongly encourage you to use the Python syntax directly, as much as you can. Most of the time, this syntax is at least as easy to follow as SQL - see for example our [join](/developers/user-guide/data-transformation/join-manual/) and [groupby](/developers/user-guide/data-transformation/groupby-reduce-manual/) manu[a]ls. diff --git a/docs/2.developers/4.user-guide/010.introduction/00.welcome.md b/docs/2.developers/4.user-guide/010.introduction/00.welcome.md new file mode 100644 index 00000000..b07dc23d --- /dev/null +++ b/docs/2.developers/4.user-guide/010.introduction/00.welcome.md @@ -0,0 +1,137 @@ +--- +title: "Getting started" +description: "Welcome to the Pathway developer hub" +navigation: true +heading: false +toc: false +--- + +# Welcome to Pathway Developer Documentation! + +Pathway is a Python data processing framework for analytics and AI pipelines over data streams. +It’s the ideal solution for real-time processing use cases like streaming ETL or RAG pipelines for unstructured data. + +Pathway comes with an **easy-to-use Python API**, allowing you to seamlessly integrate your favorite Python ML libraries. +Pathway syntax is simple and intuitive, and **you can use the same code for both batch and streaming processing**. 
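+
+To give a first, concrete taste of the API, here is a minimal sketch of a pipeline; the CSV directory, the schema, and the column name are placeholders to adapt to your own data:
+
+```python
+import pathway as pw
+
+
+class InputSchema(pw.Schema):
+    value: int
+
+
+# Read a directory of CSV files (Kafka, CDC, and many other connectors are available).
+input_table = pw.io.csv.read("./input_data/", schema=InputSchema)
+# Maintain a running sum, updated whenever new data arrives.
+sum_table = input_table.reduce(total=pw.reducers.sum(input_table.value))
+pw.io.csv.write(sum_table, "./output.csv")
+pw.run()
+```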
+ +Pathway is powered by a **scalable Rust engine** based on Differential Dataflow and performing incremental computation. +Your Pathway code, despite being written in Python, is run by the engine, enabling multithreading, multiprocessing, and distributed computations. +All the pipeline is kept in memory and can be easily deployed with **Docker and Kubernetes**. + +You can install Pathway with a simple pip command: +::pip-install +:: + +::article-img +--- +src: '/assets/content/documentation/pathway-introduction.svg' +alt: 'Pathway code example.' +class: 'mx-auto' +--- +:: + +::container{.flex .gap-8 .items-center .w-full .justify-center} + ::pathway-button{href="/developers/user-guide/introduction/first_realtime_app_with_pathway"} + Your first pipeline + :: + + ::pathway-button{type="secondary" href="https://colab.research.google.com/github/pathwaycom/pathway/blob/main/examples/notebooks/pathway_intro.ipynb"} + Try it in a Jupyter notebook + :: +:: + +## Use cases +::container{.flex .gap-8 .items-center .w-full .justify-center} + ::flat-tile + --- + title: "Real-time processing over data streams" + --- + #default + - [Real-time ETL.](/developers/showcases/kafka-etl) + - [Event-driven pipelines with alerting.](/developers/showcases/realtime-log-monitoring) + - [Realtime analytics.](/developers/showcases/linear_regression_with_kafka) + - [Switch from batch to streaming.](/developers/user-guide/connecting-to-data/switch-from-batch-to-streaming) + :: + ::flat-tile + --- + title: "Live Data AI Pipelines" + --- + #default + - [Data indexing pipeline and RAG.](/developers/user-guide/llm-xpack/vectorstore_pipeline) + - [LLM-powered data pipeline.](/developers/showcases/llm-alert-pathway) + - [Unstructured data to SQL on-the-fly.](/developers/showcases/unstructured-to-structured) + :: +:: + +## Learn more + +::card-grid{:gapless="true" .mt-8} + ::flat-tile + --- + title: "Key concepts" + icon: "heroicons:bolt-solid" + button: + href: "/developers/user-guide/introduction/concepts" + text: "Read more" + --- + #default + Learn more about how Pathway's engine and what makes it powerful. + :: + ::flat-tile + --- + title: "API docs" + icon: "heroicons:book-open-20-solid" + button: + href: "/developers/api-docs/pathway" + text: "See the API docs" + --- + #default + Not sure how to use a specific feature of Pathway? The answer to your question is likely in the API docs. + :: + ::flat-tile + --- + title: "Tutorials" + icon: "heroicons:academic-cap-solid" + button: + href: "/developers/user-guide/data-transformation/table-operations" + text: "See the tutorials" + --- + #default + Learn how to use Pathway with our tutorials. For beginners and veterans alike, they cover most of Pathway's features. + :: +:: +::card-grid{:gapless="true" .!mt-0} + ::flat-tile + --- + title: "See the repo" + icon: "uil:github" + button: + href: "https://github.com/pathwaycom/pathway" + text: "Go to Github" + --- + #default + Curious about how Pathway works? Don't hesitate to take a look at the sources and clone the repo. + :: + ::flat-tile + --- + title: "Deployment" + icon: "heroicons:rocket-launch-20-solid" + button: + href: "/features/" + text: "Pathway for Enterprise" + --- + #default + Do you want to scale up your Pathway app? Pathway for Enterprise can help you deploy your application on the cloud. + :: + ::flat-tile + --- + title: "What's new?" 
+ icon: "heroicons:light-bulb-solid" + button: + href: "/developers/user-guide/development/changelog" + text: "See the Changelog" + --- + #default + See the latest available features in the Changelog. + :: +:: diff --git a/docs/2.developers/4.user-guide/010.introduction/10.first_realtime_app_with_pathway.md b/docs/2.developers/4.user-guide/010.introduction/10.first_realtime_app_with_pathway.md new file mode 100644 index 00000000..6f3d52e2 --- /dev/null +++ b/docs/2.developers/4.user-guide/010.introduction/10.first_realtime_app_with_pathway.md @@ -0,0 +1,172 @@ +--- +title: 'Your first real-time pipeline' +description: 'A step-by-step guide to build a real-time pipeline with Pathway' +--- +# Your first real-time pipeline: a step-by-step guide +In this article, you will learn how to start your journey with Pathway. + +To build your first real-time pipeline with Pathway, you need to follow these steps: +1. [Install with pip](#step-1-install-with-pip) +2. [Import Pathway](#step-2-import-pathway) +3. [Connect to you data sources with input connectors](#step-3-connect-to-your-data) +4. [Create your pipeline](#step-4-create-your-pipeline) +5. [Send the results with output connectors](#step-5-output-your-results) +6. [Run your pipeline](#step-6-run-your-pipeline) + +## Step 1: Install with pip + +You can download the current Pathway release, which is now available on a free-to-use [license](https://pathway.com/license): +::pip-install +:: +on a Python 3.10+ installation, and you are ready to roll! + +⚠️ Pathway is available on MacOS and Linux. Pathway is currently not supported on Windows. +Windows users may want to use Windows Subsystem for Linux (WSL), docker, or a VM. + +You can also try these steps in an online notebook environment like [Colab](https://colab.research.google.com/). + +> To jumpstart a Pathway project, quickly check our [cookiecutter template](https://github.com/pathwaycom/cookiecutter-pathway). + + +## Step 2: import Pathway + +To use Pathway, you need to import it: + +```python +import pathway as pw +``` + +## Step 3: connect to your data + +Before building your pipeline, you need to connect to your data sources by using [input connectors](/developers/user-guide/connecting-to-data/connectors/). +Input connectors pull the data from the data sources to the Pathway tables. + +Pathway provides a wide range of input connectors. +For example, you can easily connect to Kafka using Pathway [Kafka connector](/developers/user-guide/connectors/kafka_connectors/): + + +```python +class ExampleSchema: + value: int + + +input_table = pw.io.kafka.read( + rdkafka_settings, + topic="topic_name", + format="json", + schema=ExampleSchema +) +``` + +> 💡 To connect to a data source, you need to specify the [schema](/developers/user-guide/types-in-pathway/schema/) of the incoming data. + +You can learn more about the connectors and see the full list of input connectors [here](/developers/user-guide/connecting-to-data/connectors/). + + +If you don't have any data source ready but still want to try out Pathway, you can use the [`pw.demo`](/developers/user-guide/connecting-to-data/artificial-streams/) library to generate an artificial data source. + + + +## Step 4: Create your pipeline +Now that you have data, you can process it as you want! +Joins, temporal windows, filtering... +You can have a glimpse of the available operations in our [basic operations guide](/developers/user-guide/data-transformation/table-operations/). 
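+
+For instance, keeping only the positive values of `input_table` is a one-liner (the threshold below is purely illustrative):
+
+```python
+positive_values = input_table.filter(input_table.value > 0)
+```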
+ +For simplicity, let's start with a simple sum: + +```python +sum_table = input_table.reduce(sum=pw.reducers.sum(input_table.value)) +``` + +The `sum_table` contains a single column `sum` with a single entry that is the sum of all the values in the table `input_table`. +This value is automatically updated by Pathway when new values are added to the `input_table` table. + +## Step 5: Output your results + +Now that your pipeline is ready, you need to specify what to do with the results. +Pathway uses [output connectors](/developers/user-guide/connecting-to-data/connectors/) to output the data outside of Pathway. + +Let's output the results in a new CSV file by using the [`pw.io.csv`](/developers/api-docs/pathway-io/csv/) connector. + +```python +pw.io.csv.write(sum_table, "output_stream.csv") +``` + +## Step 6: Run your pipeline +Now that everything is ready, you can easily run while letting Pathway handle the updates. +To launch the computation on streaming data you need to add `pw.run()`: + +```python +pw.run() +``` + +Don't forget to run the computation with `pw.run()`, otherwise the pipeline is built, but no data is ingested so no computation happens. + +That's it! + + +With `pw.run()`, the computation is launched. +Each update in the input data streams will automatically trigger the update of the whole pipeline. +Pathway will poll for new updates until the process is terminated: the computation runs forever until the process gets killed. +**This is the normal behavior of Pathway.** + +If you want to test your pipeline on static and finite data, Pathway also provides a static mode. +You can learn more about both modes in our [dedicated article](/developers/user-guide/connecting-to-data/streaming-and-static-modes/). + + +## Understanding the output + +Suppose the previous example of a simple sum on the values received on the Kafka topic with the following input: +``` +{"value":1} +{"value":2} +``` + +We assume the first value was received at time 1, and the second at time 2. +Here is the resulting CSV file `output_stream.csv` in which all the successive updates of the sum values are displayed: + +```shellscript +sum,time,diff +1,1,1 +1,2,-1 +3,2,1 +``` + +Let's explain the meaning of those lines. +Each row contains: +* the columns of the output table, here you only have `sum`. +* `time`, which represents the time at which the update has happened. In practice, the time is a timestamp. +* `diff`, which represents whether the row represents an addition or a deletion. An update is represented by two rows: one to remove the old value, one to add the new values. Those two rows have the same time to ensure the atomicity of the operation. + + +Hence, the first line represents the insertion (`diff=1`) of a new value (`sum=0`), which has happened at time 1. +Then a value is added containing the new value 1, changing the value from 0 to 1, resulting in both the deletion (`diff=-1`) of the previous value (`sum=0`) which has happened at time 1 AND an addition (`diff=1`) of a new value (`sum=1`) at the same time (`time=2`). + + +## Bonus: using the CLI + +In addition to `python my_script`, you can use Pathway CLI to launch your computation: + +``` +pathway spawn python realtime_sum.py +``` + +With the CLI, you can use more advanced features such as multiprocessing. + +### Single machine using multiprocessing +While being a requirement for efficiency, doing multiprocessing and multithreading with Python can be tedious. 
+Pathway natively provides multiprocessing and multithreading: no extra library nor GIL! +You only need to use the CLI and specify the number of processes and the number of threads per process by using the `--processes` (`-n`) and the `--threads` (`-t`) options! +To launch your application with 2 processes, having 3 threads each, you can do as follows: + +```bash +pathway spawn --processes 2 --threads 3 python realtime_sum.py +``` + + +## Next steps +Congratulations, you are now ready to do more advanced pipelines. +Why not try to connect to CSV files using the [CSV connector](/developers/user-guide/connectors/csv_connectors/) or do [a linear regression on Kafka](/developers/showcases/linear_regression_with_kafka)? +You can also take a look at Pathway [basic operations guide](/developers/user-guide/data-transformation/table-operations/), describing the elements of Pathway you are most likely to use from day one. +If you want to learn more about how Pathway works, you can read our article about [Pathway concepts](/developers/user-guide/introduction/concepts). +If you encounter an issue, don't hesitate to check-out our [get-help page](/developers/user-guide/development/get-help/). diff --git a/docs/2.developers/4.user-guide/010.introduction/15.concepts.md b/docs/2.developers/4.user-guide/010.introduction/15.concepts.md new file mode 100644 index 00000000..b5b7e0c6 --- /dev/null +++ b/docs/2.developers/4.user-guide/010.introduction/15.concepts.md @@ -0,0 +1,374 @@ +--- +title: 'Core concepts' +description: 'A review of the core concepts behind the Pathway programming framework.' +--- + + +# Core concepts +A review of the core concepts behind data representation and data transformation in the programming layer of Pathway. + +These are the core concepts you need to know about Pathway: +- [Input connectors](#connect-to-your-data-sources-with-input-connectors) +- [Tables](#tables-dynamic-content-with-static-schema) +- [Transformations](#processing-the-data-with-transformations) +- [Output connectors](#send-the-results-to-external-systems-using-output-connectors) +- [Dataflow](#dataflow) +- [Runtime](#running-the-computation-with-the-rust-engine) + + + +## Pipeline construction with connectors and transformations + +In Pathway, there is a clear divide between the definition of the pipeline and the execution of the computation. +The pipeline defines the sources of the data, the transformations performed on the data, and where the results are sent. +The pipeline is a recipe of your processing, defining the ingredients (the data sources) and the different operations (the transformations): it does not contain any actual food (data). +Once your pipeline is built, you can run the computation to ingest and process the data. + + +::article-img +--- +src: '/assets/content/documentation/concepts/pathway-pipeline.svg' +alt: 'Pathway pipeline' +zoomable: true +--- +:: + +In this section, you will learn how to build your data pipeline. + +### Connect to your data sources with input connectors + +**Connectors** are Pathway's interface with external systems, for both extracting the input data and sending the output data. +Extracting input data from external data sources is done using **input connectors**. 
+ + +::article-img +--- +src: '/assets/content/documentation/concepts/input-connector.svg' +alt: 'Input connector ingests the data into Pathway' +zoomable: true +--- +:: + +Input connectors return a Pathway table which represents a snapshot of the input stream (more information on tables [below](#tables-dynamic-content-with-static-schema)). + +You need to provide a [schema](/developers/user-guide/types-in-pathway/schema/) to the input connector. +For example, let's consider a data stream made of events containing two fields: a primary key `name` that is a string and an integer value `age`. + +```python +class InputSchema(pw.Schema): + name: str = pw.column_definition(primary_key=True) + age: int +``` + +Using the schema, the input connector knows how to format the data. +Pathway comes with many input connectors: + +::code-group-tabs +```python [CSV] +input_table = pw.io.csv.read("./input_dir/", schema=InputSchema) +``` +```python [Kafka] +input_table = pw.io.kafka.read(rdkafka_settings, schema=InputSchema, topic="topic1", format="json") +``` +```python [CDC] +input_table = pw.io.debezium.read(rdkafka_settings, schema=InputSchema, topic_name="pets") +``` +:: + +The connector listens for incoming data and updates the resulting table accordingly. + +You can find more information about the available connectors [here](/developers/user-guide/connecting-to-data/connectors). + + + +### Tables: dynamic content with static schema + +**In Pathway, data is modeled as tables, representing snapshots of the data streams.** + +All the data is modeled as tables, similar to classical relational tables, organized into columns and rows. +Tables have a static [schema](/developers/user-guide/types-in-pathway/schema/), but their content is dynamic and is updated whenever a new event is received. +A table is a snapshot of a data stream and is the latest state of all the events that have been received up to the current processing time. + +For example, if two events have been received: + + +::article-img +--- +src: '/assets/content/documentation/concepts/event-stream-table-init.svg' +alt: 'A table is a summary of the event stream at the current time.' +zoomable: true +--- +:: + +There is one row per entry. Each new entry is seen as an addition and represented by a new row. + + +::article-img +--- +src: '/assets/content/documentation/concepts/event-stream-table-add.svg' +alt: 'A new entry results in the addition of a new row to the table.' +zoomable: true +--- +:: + +Pathway also supports the removal and the update of entries. +An update is represented by removing the previous entry and adding the new version at the same time: + + +::article-img +--- +src: '/assets/content/documentation/concepts/event-stream-table-update.svg' +alt: 'A new update is a removal followed by an addition.' +zoomable: true +--- +:: + +Since the column `name` is the primary key, the reception of a new event with an already existing `name` value (`Bob` here) is an update of the associated row in the table. + +As Pathway handles potentially infinite and ever-changing streaming data, the number of rows and the content changes with time as new data comes into the system. +You can learn more about the tables in the dedicated [article](/developers/user-guide/diving-deeper/table-in-pathway/). + +### Processing the data with transformations + +Pathway provides operators to modify the tables such as `select` or `join`. +These operators are called *transformations*. 
+Pathway has a functional programming approach: each transformation returns a new table, leaving the input table unchanged. +Using the transformations, you can define your processing pipeline, sequentially specifying the transformations your data will go through. + + +::article-img +--- +src: '/assets/content/documentation/concepts/transformation-table.svg' +alt: 'A transformation returns a new table.' +zoomable: true +--- +:: + +For example, you can define a pipeline filtering on the column `age`, keeping only the entries with a positive value. +Then, you sum all the values and store the result in a single-row table with a single column `sum_age`. +Here is a way to do it with Pathway, assuming a correct input table called `input_table`: + +```python +filtered_table = input_table.filter(input_table.age >= 0) +result_table = filtered_table.reduce(sum_age = pw.reducers.sum(filtered_table.age)) +``` + +It's okay if you don't understand everything for now. +Here are the takeaways: +- Each line produces a new table from the previous one. The first line filters the values and the second does the sum. +- Running this code snippet does not run any computation on data. + +The last point, which may seem counter-intuitive at first, is discussed [later in this article](#dataflow). + +Don't hesitate to read our article about [Pathway basic transformations](/developers/user-guide/data-transformation/table-operations/). + +#### External functions and LLMs + +Pathway provides many ready-to-use transformations, but it may not be enough for your project. +If you don't find what you need, don't worry you can use any Python function in your pipeline. +**Pathway allows you to seamlessly integrate with Python Machine Learning libraries, use LLM's, and call into synchronous and asynchronous API's.** + +### Send the results to external systems using output connectors. + +The data is sent out of Pathway using **output connectors**. + + +::article-img +--- +src: '/assets/content/documentation/concepts/output-connector.svg' +alt: 'Output connectors sent the data out of Pathway.' +zoomable: true +--- +:: + + +Output connectors are used to configure the connection to the chosen location (Kafka, PostgreSQL, etc.): + +::code-group +```python [CSV] +pw.io.csv.write(table,"./output_file.csv") +``` +```python [Kafka] +pw.io.kafka.write(table, rdkafka_settings, topic="topic2", format="json") +``` +```python [PostgreSQL] +pw.io.postgres.write(table, psql_setting, "table_name") +``` +:: + +The connector forwards the changes to the table. + +Pathway comes with many output connectors, you can learn more about them in [our dedicated article](/developers/user-guide/connecting-to-data/connectors/). + +Output connectors send the table to the chosen location (Kafka, PostgreSQL, etc.) as a stream of updates. + +#### The output is a data stream. + +**The tables are produced out of the system as data streams.** + +The tables you want to send out of Pathway to external systems are also dynamic. +They are updated due to new events entering the system. +These changes are forwarded to external systems as a data stream: the updates are represented by new events. +As previously, the event can represent the addition or the removal of an entry and an update is represented by a removal of the old entry and its addition with the new value. + +**Pathway handles the data in an incremental way.** +Instead of sending the entire version of the table whenever there is an update, Pathway only sends the changes to the table to be more efficient. 
+Only the rows that are affected by the changes are sent to the external system. + +For example, consider the case of a Pathway pipeline computing the sum of the ages. +This value is stored in a single-row table and at $t_2$, before Bob's update, the value of the sum is $90$. +Upon reception of Bob's new value, the sum is updated to $40$. +This update is propagated to external systems (Kafka, PostgreSQL, etc.) as the removal of the old entry and the insertion of the new one: + + +::article-img +--- +src: '/assets/content/documentation/concepts/event-stream-output.svg' +alt: 'The changes to the output are sent as an event stream.' +zoomable: true +--- +:: + +The `diff` column represents whether the value has been added (`diff=1`) or removed (`diff=-1`). +Both events are issued at the same time (`t_3` in this example) with no distinctive order. +The time of emission is also included in a column `time`. + +In practice, not all systems support data streams. +Pathway output connectors adapt the updates to match the system constraints. +For example, Pathway PostgreSQL connector sends the output into a PostgreSQL table: **it will not insert the `1` and `-1` values in a separate column**, it will update the table directly in real time. +On the other hand, if the results are outputted to a CSV file, the new events will be appended to the end of the file with the columns `diff` and `time`. +This choice of outputting the results as logs into the CSV file allows having an incremental approach and avoid removing and rewriting the entire CSV file at each update. +This is why, in this case, the `diff` and `time` columns are added. + + +::article-img +--- +src: '/assets/content/documentation/concepts/event-stream-output-systems.svg' +alt: 'Different systems will handle the event stream differently.' +zoomable: true +--- +:: + + +Note that for readability of the CSV output, only the previous value $90$ at time $t_2$ is shown. In practice, all the previous updates are written. + +### Dataflow + +**Transformations and connectors are used to define a pipeline: they are used to build the pipeline, but they do not trigger any computation.** + +In Pathway, the processing pipeline is modeled using a graph. +This graph, called the **dataflow**, models the different transformation steps performed on the data. +Each table is a node, linked with other nodes (tables) by transformations. + +For example, the previous Pathway pipeline is represented as follows: + + +::article-img +--- +src: '/assets/content/documentation/concepts/computation-graph.svg' +alt: 'Dataflow of our example before being run.' +zoomable: true +--- +:: + + + +This dataflow is the core of Pathway. +The user creates a pipeline which is translated into a dataflow by Pathway. +The graph is built by the calls to Pathway operators but, at that point, no computations are done: there is simply no data. + +## Running the computation with the Rust engine + +### Run the computation with `pw.run()` +Now that your pipeline is fully ready, with both connectors and transformations, you can run the computation with the command run: + +```python +pw.run() +``` + +And that's it! With this, running your code will launch the computation. +Each update in the input data streams will automatically trigger the update of the relevant data in the pipeline. 
+ +For example, consider our complete example: + +```python +import pathway as pw + + +class InputSchema(pw.Schema): + name: str = pw.column_definition(primary_key=True) + age: int + + +input_table = pw.io.kafka.read(kafka_settings, schema=InputSchema, topic="topic1", format="json") +filtered_table = input_table.filter(input_table.age >= 0) +result_table = filtered_table.reduce(sum_age = pw.reducers.sum(filtered_table.age)) +pw.io.kafka.write(table, kafka_settings, topic="topic2", format="json") + +pw.run() +``` + + +::article-img +--- +src: '/assets/content/documentation/concepts/computation-graph-run.svg' +alt: 'Dataflow of our example' +zoomable: true +--- +:: + +The reception of a new value in Kafka triggers the insertion of a new row in `input_table`. +This then triggers the update of `filtered_table` and possibly of `result_table`. +The changes are propagated until they have no impact anymore, the altered rows are filtered out by a `filter`, or until they reach the output connector. +In the latter case, the changes are forwarded to the external system. + + +::article-img +--- +src: '/assets/content/documentation/concepts/computation-graph-run-update.svg' +alt: 'The reception of the update is propagated in the dataflow.' +zoomable: true +--- +:: + +Pathway listens to the data sources for new updates until the process is terminated: **the computation runs forever until the process gets killed**. +**This is the normal behavior of Pathway**. + +During the whole run, the dataflow maintains the latest version of the data in order to enable quick updates: **instead of ingesting all the data from scratch in the graph, only the relevant parts are updated to take into account the new data.** + +In our example, at the reception of a new value, the sum is not recomputed from scratch but only incremented by the said value, making the computation faster. + +This dataflow allows the user to focus on the intended behavior of the processing pipeline, as if the data were static, and Pathway handles the updates on its own using the dataflow. + +### Fast in-memory computations thanks to a powerful Rust Engine + +In Pathway, both the storage of the tables and the computations are done in memory. +This can raise legitimate concerns about memory and speed. +Indeed, Python is not known to be the most efficient language: it is a dynamically typed and interpreted language. +Worse, its infamous GIL limits parallelism and concurrency... + +Fortunately, Pathway comes with a powerful Rust engine which takes over once the pipeline is ready. +Python is used for its accessibility to describe the (typed) pipeline, but the dataflow is built and maintained by Pathway engine. +Pathway engine removes those limits associated with Python. +In particular, **Pathway natively supports multithreading and multiprocessing, and can be distributed using Kubernetes.** +The content of the tables is handled by Pathway Rust engine, making it very memory-efficient. +Similarly, most of the transformations are handled at the Rust level, making the processing very fast. + +If you add the incremental nature of the computations, you end-up with the [fastest data processing engine on the market](/blog/streaming-benchmarks-pathway-fastest-engine-on-the-market). + + +### Static mode + +With Pathway, it doesn't matter if you are dealing with static or streaming data. +The same pipeline can be used for both kinds of data, Pathway's engine provides consistent outputs in both cases. 
+You can combine real-time and historical data in the same code logic. + +In the static mode, all the data is loaded and processed at once and then the process terminates. +It does not wait for new data unlike in the streaming mode. + +You can learn more about the streaming and static mode in [our dedicated article](/developers/user-guide/connecting-to-data/streaming-and-static-modes). \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/010.introduction/20.what-you-get.md b/docs/2.developers/4.user-guide/010.introduction/20.what-you-get.md new file mode 100644 index 00000000..31fb6912 --- /dev/null +++ b/docs/2.developers/4.user-guide/010.introduction/20.what-you-get.md @@ -0,0 +1,140 @@ +--- +title: What you get +description: 'What to expect when using Pathway: experience and performance' +--- + +# What you get with Pathway + +:article-toc + +## Python + Rust: the best of both worlds + +Pathway provides a Python interface and experience created with data developers in mind. You can easily build pipelines by manipulating Pathway tables and rely on the vast resources and libraries of the Python ecosystem. Also, Pathway can seamlessly be integrated into your CI/CD chain as it is inherently compatible with popular tools such as [mypy](https://mypy-lang.org/) or [pytest](https://pytest.org/). +Your Pathway pipelines can be automatically tested, built, and deployed, like any other Python workflow. + +Pathway also relies on a powerful Rust engine to ensure high performance for your pipelines, no matter if you are dealing with batch or streaming data. +Pathway engine makes the utmost of Rust speed and memory safety to provide efficient parallel and distributed processing without being limited by Python's [GIL](https://en.wikipedia.org/w/index.php?title=Global_interpreter_lock&oldid=1144836295). + +Pathway takes the best of both worlds and efficiently associates the convenience of Python with the power of Rust. + +![Pathway: simplicity of Python + power of Rust](/assets/content/documentation/pathway_python_rust.svg) + +## Incremental computation + +Pathway's engine incrementally processes data updates. This means that the minimum work needed by any algorithm or transformation is performed to refresh its results when fresh data arrives. + +![Pathway runs on an incremental data processing engine](/assets/content/documentation/pathway-streaming-engine-30p.gif) + + +## An ML-friendly code life cycle + +### Writing your code + +As a Pathway user, you write code in Python, with Pathway imported as a Python module. +Pathway provides a [cookiecutter template](https://github.com/pathwaycom/cookiecutter-pathway) to help start Pathway projects. + +Data manipulation syntax is built around a "[Table API](/developers/user-guide/data-transformation/table-operations/#starting-with-data)", which closely resembles the [DataFrame API of PySpark](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.DataFrame.html) (which in turn resembles [DataFrames in pandas](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)). + +The same code developed with the Pathway module can be used for batch and streaming use cases, depending only on [data connector settings and method of launching](/developers/user-guide/connecting-to-data/streaming-and-static-modes/). +For many operations, Pathway returns exactly the same results when run in batch and streaming modes. 
+Exceptions are out-of-order data for which streaming mode may optionally ignore and functions that explicitly depend on processing time. + +### Running code and prototyping + +During rapid prototyping, Pathway code can be written and tested interactively, without waiting for compilation or deployment, with the dataflow being built in the background. +This is helpful for instance during data science work in Jupyter notebooks. + +The developed code prototype can then run on streaming data sources. +The dataflow is handed down to the runtime engine when executing the line `pathway.run()`. + +In terms of the interplay between interactive usability and launching compute graphs, Pathway takes direct inspiration from TensorFlow and [PyTorch](https://github.com/pytorch/pytorch): just like TensorFlow, we explicitly represent computations as graphs that are executed with a `run()` command, however, similarly to PyTorch, we aim to offer the best in class interactive work environment for hands-on, data-driven algorithm design.\ +\ +(Footnote: Our advisor and Business Angel, Lukasz Kaiser, was a co-author of [TensorFlow](https://github.com/tensorflow/tensorflow), and our CTO, Jan Chorowski, was a contributor to [Theano](https://github.com/Theano/Theano)). + +Pathway calls into the same Rust runtime in any deployment - on either streaming data sources or in interactive mode. + +## Consistency of results + +Pathway computes consistent results: each produced output is computed based on data contained in prefixes of the input data streams. +For most of the operations, exactly the same outputs are produced: + +1. If the input data was sent to the engine all at once in batch mode. +2. If the inputs were sent in several smaller batches, or in streaming mode. + +This equivalence of stream and batch processing facilitates easy development of data processing logic: one can reason in terms of steady-state and consistent results and not worry about all intermediate and wrong states through which an eventually consistent system may pass. + +Further details about consistency, coupled with a practical example, can be found [here](/developers/user-guide/persistence-consistency-scaling/consistency/). + +## Containerized deployments + +Pathway is meant to be deployed in a containerized manner. + +Single-machine deployments can easily be achieved using Docker. +The deployment can run concurrently on multiple cores using multiple processes or threads. + +We provide a [`pathway spawn`](/developers/user-guide/introduction/first_realtime_app_with_pathway#bonus-using-the-cli) command to aid in launching multi-process and multi-threaded jobs. + +The choice between threads and multiple processes depends on the nature of the computation. +While communication between threads is faster, Python-heavy workloads may require multiprocess parallelism to bypass the GIL. + +For using Pathway on large workloads beyond a single machine, see [Distributed deployment](/developers/user-guide/persistence-consistency-scaling/worker-architecture/#distributed-deployment). + +## Easy testing and CI/CD + +Pathway tests on offline data snippets can be run locally in any CI/CD pipeline with Python. +Tests can cover the handling of temporal (late, out of order) aspects of data by comparing results on multiple revisions. +Pathway supports several session-replay mechanisms, such as the [demo API](/developers/api-docs/pathway-demo). +These allow recreating streaming scenarios predictably within standard CI/CD pipelines (Jenkins, GitHub Actions, etc.) 
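+
+As an illustration, a unit test of a small aggregation step can be as compact as the sketch below. The data is a static snippet standing in for the live stream, and `pw.debug.table_to_pandas` is assumed here as the debug helper that materializes the result for the assertion; adapt it to the utilities available in your version:
+
+```python
+import pathway as pw
+
+
+def test_sum_of_valid_ages():
+    # Static snippet replacing the streaming input during the test.
+    input_table = pw.debug.table_from_markdown(
+        """
+        name  | age
+        Alice | 25
+        Bob   | -1
+        Carol | 30
+        """
+    )
+    filtered = input_table.filter(input_table.age >= 0)
+    result = filtered.reduce(sum_age=pw.reducers.sum(filtered.age))
+    # Materialize the single-row result and check its value.
+    df = pw.debug.table_to_pandas(result)
+    assert df["sum_age"].iloc[0] == 55
+```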
+ + + +::landing-section +#title +Showcases + +#default + + ::card-grid + #default + ::landing-card + --- + icon: 'heroicons:document-magnifying-glass-solid' + button: + text: 'Build an LLM app' + href: '/developers/showcases/llm-alert-pathway/' + --- + #title + Realtime Alerts + + #default + Get realtime alerts with changes in Google Docs using Pathway and LLMs. + :: + ::landing-card + --- + icon: 'heroicons:share-solid' + button: + text: 'Our Gaussian filter' + href: '/developers/showcases/gaussian_filtering_python' + --- + #title + Realtime Analytics + + #default + Perform signal processing on data streams in realtime, even with out-of-order data. + :: + ::landing-card + --- + icon: 'heroicons:document-magnifying-glass-20-solid' + button: + text: 'Learn more' + href: '/developers/showcases/realtime-log-monitoring' + --- + #title + Realtime Log Monitoring + + #default + Monitor your server logs, process your logs, and do realtime anomaly detection. + :: + :: +:: + diff --git a/docs/2.developers/4.user-guide/010.introduction/30.batch-processing-vs-stream-processing.md b/docs/2.developers/4.user-guide/010.introduction/30.batch-processing-vs-stream-processing.md new file mode 100644 index 00000000..0b3b710c --- /dev/null +++ b/docs/2.developers/4.user-guide/010.introduction/30.batch-processing-vs-stream-processing.md @@ -0,0 +1,357 @@ +--- +title: "Batch processing vs stream processing" +description: "Batch Processing vs Stream Processing: Why Stream Processing Leaves Batch in the Dust" +--- + +# Batch Processing vs Stream Processing: Why Stream Processing Leaves Batch in the Dust + +Batch processing and stream processing are the two distinct approaches to handling data: while batch processing involves processing data in discrete chunks or batches, stream processing deals with real-time data processing as data flows continuously through the system. +While batch has become the de facto way of processing data, stream processing is not widely used, even when the processed data is a data stream from Apache Kafka. +Why so? What is so complex about stream processing that people are mainly using batch processing? + + + +**Why would you care about streaming?** + + +In this article, I will show you that **stream processing matters** by answering all the questions you have about batch processing vs stream processing: + +:article-toc-without-title + +By the end of this article, you shall be convinced that stream processing is more than the future of data processing! + +## What are batch processing and stream processing anyway? +First, before answering more complex questions, let's define exactly what we refer to by "batch" and "streaming" processing. + +### Batch processing. + +Batch processing consists of collecting data over a period of time and then processing it all at once. +Instead of handling each transaction or task individually as soon as it arrives, batch processing collects them into a batch or group and processes them together at a scheduled interval. + +In batch processing, the data is called **batch data** and is supposed complete and static: it will not change during the whole computation. +All the batch data is separated into batches that are then processed sequentially. +A batch is a collection of data processed together as a single unit. +This grouping allows for more efficient processing, as multiple items can be handled simultaneously rather than individually. + +In batch processing, all the data should be available at the time of the computation. 
+If the data is unbounded, only the available data at the time of the start of the computation is considered. +This data is considered as complete and processed as a whole without time consideration. +Whenever new data comes into the system, a new processing job should be done from scratch. +It's commonly used in scenarios where data can be accumulated and processed offline, such as overnight or during periods of low system activity. +This is usually used for tasks like large-scale data analysis, reporting, and ETL (Extract, Transform, Load) processes where it's more efficient to process large amounts of data in batches rather than one at a time. + +::article-img +--- +src: '/assets/content/documentation/batch-vs-stream/batch-processing.svg' +alt: 'Batch processing' +zoomable: true +class: 'mx-auto' +style: 'width: 580px' +--- +:: + + +Batch processing is usually used to periodically compute statistics over data that has been gathered during the two computations, such as daily analytics, which are computed overnight. + + +### Stream processing. +Stream processing, also called data stream processing or streaming processing, is the **continuous processing of data records as they arrive**, in real-time. +We say "real-time," but the exact term is "near real-time," as some latency is impossible to avoid. +Stream processing aims to deliver the lowest latency possible between the arrival of new data and the update of the system. + + + + +In stream processing, the data is called **streaming data** or **data streams** and is processed as it arrives, often in small, manageable chunks or streams, without the need to store the entire dataset before processing. +This approach enables immediate analysis, response, and decision-making based on the most recent data, making it well-suited for applications like real-time analytics, monitoring, and event-driven processing. + + +::article-img +--- +src: '/assets/content/documentation/batch-vs-stream/stream-processing.svg' +alt: 'Stream processing' +zoomable: true +class: 'mx-auto' +style: 'width: 580px' +--- +:: + + + +Stream processing is usually used for operational analytics. It allows monitoring of the system's evolution and triggering alerts in real time. For example, you may want to monitor live metrics using a dashboard during a product launch to adapt your strategy (republish, update the content, etc.) based on what happens. In those use cases, the timing is primordial. + +### Comparison: batch processing vs stream processing + +The key difference between batch processing and stream processing lies in their handling of data: batch processing operates on finite datasets, processing them in discrete chunks or batches at scheduled intervals, while stream processing deals with data in real-time, handling it incrementally as it arrives, enabling immediate analysis and action without the need for storing the entire dataset beforehand. + + +| | **Batch Processing** | **Stream Processing** | +|--------------------|-------------------------------------------------------------------------------------------------------|-----| +| Nature of the data | Data considered as complete, without time consideration. | Data with time. | +| Data Handling | Processes data in discrete batches at scheduled intervals. | Processes data in real-time as it arrives. | +| Processing Time | Typically involves longer processing times due to batch accumulation. | Processes data immediately, resulting in faster processing times. 
| +| Data Storage | Requires storing the entire dataset before processing. | Does not require storing the entire dataset beforehand. | +| Use Cases | Suitable for periodic analytics. | Ideal for real-time analytics, monitoring, and event-driven processing. | +| Latency | Higher latency as processing occurs at scheduled intervals. | Lower latency as processing is immediate upon data arrival. | +| Resource Utilization | May require significant resources during batch processing intervals. | Resource-efficient, utilizing resources as data arrives. | +| Typical use case | Periodic reports for marketing. | Operational analytics. | + +#### Time and consistency + +By handling real time live data, stream processing introduces a new notion in the computation: the time. +The time of arrival of a data point has an impact on the system: late and out-of-order data points may invalidate previous computations. +For example, suppose you want to send an alert if you receive 10 or more data points in less than one minute. +A few minutes ago, you received 9 data points so you didn't trigger an alert. However, you receive, now, a data point that should have been received before: do you send an alert or not? +In stream processing, late data points can have an impact and invalidate the results previously computed. +If you were to redo the same computation over the same data stream but with different times of arrival, the results may not be consistent. + +In batch processing, none of this exists as the data is supposed to be complete at the start of the computation. +Data points are reordered and late data points will not be considered in this computation: the results are consistent according to the data you had at this time. + + +## Why would I use streaming? +Given the previous definitions of batch and streaming, you could think that what you are currently doing, which is most likely batch, is perfect for your use case: you gather your data in your data lake or warehouse and periodically launch processing on the data you have gathered. + + +Then, why would you care about stream processing when everything in your project screams "batch"? + +Actually, things are not that simple. Batch and streaming are more similar than they look at first. +Let's explore and debunk the most common misconceptions about the differences between batch and streaming. + +### Why would I use streaming with my batch data? +Well… is your data really batch? +This is the first question you might ask yourself. Why would you go streaming if your data is not a data stream? + +But is your data really static? **When you think about it, all the data we interact with is, in essence, a data stream**. +Data is typically generated and updated over time. +The only really static datasets are the ones frozen in time that are shared for reproducibility and Kaggle competitions. +**There are only unbounded data streams**. + +Batch processing is simply a way to process large amounts of data at once as if the data were bounded, while stream processing embraces the inherent unboundedness of the data to process it live with as little latency as possible. +New data stream processing frameworks such as Pathway use incrementality and micro-batching optimizations to avoid redundant computations while providing both high throughput with minimal latency. + + +In batch processing, new points are gathered before being processed, leading to potentially high latency. On the other hand, stream processing has low latency as new data points are processed upon reception. 
+ +::article-img +--- +src: '/assets/content/documentation/batch-vs-stream/batch-vs-stream-latency.svg' +alt: 'Different latency for batch processing vs stream processing.' +zoomable: true +class: 'mx-auto' +--- +:: + + + +### Why would I use something as complex as streaming? +Even if your data is a data stream, batch processing seems appealing: it saves you from handling time and consistency concerns. +It's because of this simplicity that batch processing is the de facto standard for data processing. +Why would you use something as complex as stream processing? + + + +While batch processing may appear simpler initially, it is not the case anymore. **Stream processing frameworks have greatly matured, significantly lowering the learning curve for stream processing using simplified APIs**. +All the challenges of stream processing, such as handling late and out-of-order data, are handled by the engine and hidden from the user. +**Thanks to those new frameworks, stream processing is now as easy as batch processing from the user's perspective**. + + + + + +Time and consistency are still important in stream processing and those frameworks let you configure consistency, i.e., how to deal with late and out-of-order data but their default configuration is similar to batch processing. +For instance, **the default behavior of Pathway is to return an output in real-time, which is what you would have if you were processing the received data using batch processing**. +In particular, Pathway updates its former results whenever a data point arrives late. +You can manually change it so that data points received after a cut-off are ignored, but the default configuration lets you ignore all the time considerations. + +Stream processing even makes some tasks easier by providing temporal operations such as [as-of-join](/developers/user-guide/temporal-data/asof-join) and [temporal windows](/developers/user-guide/temporal-data/windows-manual). +Of course, those are technically possible in batch processing but at a greater development cost. +**Stream processing offers temporal operations that make development way easier**. + + + + +::article-img +--- +src: '/assets/content/documentation/batch-vs-stream/batch-vs-stream-windows.svg' +alt: 'How to detect alerts in batch processing vs stream processing.' +zoomable: true +class: 'mx-auto' +--- +:: + + + +Triggering an alert whenever more than 10 data points are received in less than a minute is easy with a sliding window. + +### Why would I break my architecture to use streaming? + +What if you have already deployed a working batch-processing pipeline? +Migrating to stream processing might look like a lot of effort. +Your co-workers (or project manager) are not gonna easily accept a change in the architecture. + +Fortunately, streaming architecture is closer to batch than you may think: **your system is likely to be halfway there**. + +- **Your data is already a data stream**. +As previously explained, data sources generate data streams, so you are likely already dealing with data streams. +In fact, 75% of all Fortune 500 companies are using Apache Kafka. +**Yes, people are doing batch processing on data streams**. +If you are using a regular DBMS, you can easily add a Change Data Capture (CDC) mechanism to send the changes happening in your database to your processing engine as a data stream. 
+ +::article-img +--- +src: '/assets/content/documentation/batch-vs-stream/batch-vs-stream-architecture.svg' +alt: 'Different architecture for batch processing vs streaming processing.' +zoomable: true +class: 'mx-auto' +--- +:: + + +- **Streaming does not require an extra layer in your architecture**. +The days of the lambda architecture are over! With a single data processing framework such as Pathway, you can now handle both batch and streaming. +Moreover, existing tools provide APIs to the most commonly used languages, such as Python and SQL. +Using Python may simplify your architecture as Python is one of the most used languages for ETL, and it is the AI/ML science language. +It's highly probable that people already have some Python data engineers and some part of the pipeline in Python: in that case, using the same language for a larger part of the pipeline makes it easier to develop, test, and maintain. + +Switching to streaming has many benefits as it saves you from translating the data stream into batch data instead of processing it in a streaming fashion. +Furthermore, if you already have a hybrid approach to handle both batch and streaming, using a unified framework can only make things easier for you. + + +::article-img +--- +src: '/assets/content/documentation/batch-vs-stream/batch-vs-stream-architecture-lambda.svg' +alt: 'Using stream processing is simpler than using the Lambda architecture.' +zoomable: true +class: 'mx-auto' +--- +:: + + + +## Why would you still use batch? It seems quite expensive… +Now, you should be convinced that streaming is relevant and easy. However, the trade-off might still favor batch processing, especially if you already have a deployed batch processing pipeline. If everyone is happy with the current batch processing, why would you change? + +The real question is, "Why would you keep using batch?". +**The main reason for switching to streaming is the cost. In the long run, batch processing will cost more in both direct and indirect costs.** + +### Why computing, again and again, the same metrics over the same (sample) of data? +The basis of batch processing is to periodically do the same computation on all the available data. +When doing a new computation, the data that was already present in the previous one will be reprocessed. +While this is not necessarily the case, in practice, you want to include them to correct previous analyses (wrong statistics because of late data, for example). +So you end up processing the same data again and again. +**This is a waste of resources.** + +Of course, you could optimize your implementation and only process the relevant data… this is exactly what streaming is about! +Stream processing processes data incrementally and in real-time as it arrives: it only reprocesses the relevant data, leaving the rest untouched for better resource usage. +Data stream processing frameworks such as Pathway now use advanced optimization techniques, including incrementality and micro-batching, providing better performance than their batch counterparts. +**Streaming optimizes the code for you, allowing you to use fewer resources.** +And this may have an important impact on your cloud bill. + +### What is the cost of using outdated/wrong analytics? +The fundamental difference between batch and streaming is how the computations are done: batch relies on periodic "once for all" computations while streaming outputs in real-time a continuous flow of updates. 
+You may wonder what happens between two successive batch computations: your data may change while your results don't.
+In other words, **your results are possibly outdated or totally wrong between two batch computations**.
+In general, the whole point of the data pipeline is to serve those results, be it analytics, alerts, or reports.
+The quality of those results should be your top priority.
+**Providing results based on outdated or wrong data can be harmful and have a cost to your business**.
+
+A simple solution would be to increase the frequency of the processing to be sure that the results are based on relatively fresh data.
+Yes, sure, but how often should you relaunch your computation?
+Finding the correct frequency depends on many factors, such as the business logic or the size of the data.
+**Finding the sweet spot is hard. Stream processing handles this for you by keeping the results up-to-date whenever data comes in.**
+
+
+::article-img
+---
+src: '/assets/content/documentation/batch-vs-stream/batch-vs-stream-outdated.svg'
+alt: 'The results of batch processing can be outdated, unlike stream processing.'
+zoomable: true
+class: 'mx-auto'
+style: 'width: 580px'
+---
+::
+
+
+### Why pick one when you can have both?
+A standard scenario is to start with a daily computation and then shorten the time between refreshes more and more as the data load increases.
+Batch processing was a reasonable choice at the beginning, but running this job more and more often, on more and more data, will be difficult if you stick to batch.
+The use case has slowly migrated from batch to streaming!
+You will either end up doing poorly optimized streaming with your batch implementation (see the point above) or switch to a proper streaming tool.
+Either way, it'll be painful.
+You don't have to wait before switching to a streaming framework.
+Indeed, **with unified data processing frameworks such as Pathway, the same code can be used for both batch processing and stream processing**.
+Those frameworks rely on powerful engines suitable for both scenarios.
+You can first use your code to compute your analytics as a batch processing job, and then seamlessly switch to a streaming setup whenever it makes sense, without changing anything in your code.
+
+
+## Batch processing vs stream processing: stream processing for the win!
+
+Batch processing has become the de facto standard of data processing because of stream processing's inherent complexity.
+While streaming is more expressive and adapted to many use cases, people often prefer batch processing's apparent simplicity.
+Fortunately, stream processing tools have matured, making stream processing as easy as batch processing.
+
+Unified data processing frameworks such as Pathway allow you to use the same code for batch and streaming. All the complexity, including late data and consistency, is automatically handled and hidden from the user. Those frameworks provide advanced streaming operations, such as temporal windows, while keeping the simplicity of batch processing. This makes the pipelines easier to develop and maintain while optimizing the resources used. The short, hedged sketch below illustrates this batch and streaming parity.
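+
+In the sketch below, the file name, the schema, and the aggregation are placeholders; the point is that the processing logic is written once, and only the input connector decides whether it runs as a bounded batch job or as a streaming job.
+
+```python
+import pathway as pw
+
+
+class InputSchema(pw.Schema):
+    value: int
+
+
+fname = "data.csv"  # placeholder CSV file with a single `value` column
+
+# Batch: read the file as a static, bounded dataset...
+# data = pw.io.csv.read(fname, schema=InputSchema, mode="static")
+# ...or streaming: replay the very same file as an unbounded data stream.
+data = pw.demo.replay_csv(fname, schema=InputSchema, input_rate=100)
+
+# The processing logic does not change between the two modes.
+result = data.reduce(total=pw.reducers.sum(data.value))
+
+pw.io.csv.write(result, "output_stream.csv")
+pw.run()
+```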
+ +| | **Batch Processing** | **stream processing** | +|--------------------------|----------------------|-----------------------| +| Nature of the data | Streaming data | Streaming data | +| Simplicity of use | ✅ | ✅ | +| Ease of deployment | ❌ | ✅ | +| Freshness of the results | ❌ | ✅ | +| Cost | ❌ | ✅ | + + +**Batch processing was only a workaround to avoid the complexity of stream processing.** +The incoherence of the results between two computations was a small price to pay to avoid the complex deployment of stream processing. +Now that new tools have made stream processing as accessible as batch processing: why would you keep paying this price? diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step1/.gitignore b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step1/.gitignore new file mode 100644 index 00000000..05fac093 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step1/.gitignore @@ -0,0 +1 @@ +ticker.csv diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step1/article.py b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step1/article.py new file mode 100644 index 00000000..8da13b8c --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step1/article.py @@ -0,0 +1,195 @@ +# --- +# title: "Part 1: Static data exploration in Jupyter" +# description: '' +# notebook_export_path: projects/from_jupyter_to_deploy/part1_jupyter_exploration.ipynb +# author: pathway +# article: +# date: '2023-11-29' +# thumbnail: '' +# tags: [] +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Part 1: Static data exploration in Jupyter +# This notebook is part of the first part of the tutorial [From interactive data exploration to deployment](/developers/user-guide/exploring-pathway/from-jupyter-to-deploy/#part-1-static-data-exploration-in-jupyter). 
+ + +# %% +# Download CSV file +# !wget -nc https://gist.githubusercontent.com/janchorowski/e351af72ecd8d206a34763a428826ab7/raw/ticker.csv + +# %% [markdown] +# ## Loading data + +# %% +import datetime + +import pathway as pw + +fname = "ticker.csv" +schema = pw.schema_from_csv(fname) +data = pw.io.csv.read(fname, schema=schema, mode="static") +# _MD_SHOW_data + +# %% +data = data.with_columns(t=data.t.dt.utc_from_timestamp(unit="ms")) +# _MD_SHOW_data + +# %% +import bokeh.plotting + + +def vwap_history(src): + fig = bokeh.plotting.figure( + height=400, + width=600, + title="Volume-weighted average price", + x_axis_type="datetime", + ) + fig.line("t", "vwap", source=src) + return fig + + +# %% +data.plot(vwap_history, sorting_col="t") + +# %% [markdown] +# ## Designing the algorithm + +# %% +minute_20_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.sliding( + hop=datetime.timedelta(minutes=1), duration=datetime.timedelta(minutes=20) + ), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + transact_total2=pw.reducers.sum(pw.this.volume * pw.this.vwap**2), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) + .with_columns( + vwstd=(pw.this.transact_total2 / pw.this.volume - pw.this.vwap**2) ** 0.5 + ) + .with_columns( + bollinger_upper=pw.this.vwap + 2 * pw.this.vwstd, + bollinger_lower=pw.this.vwap - 2 * pw.this.vwstd, + ) +) +# _MD_SHOW_minute_20_stats + +# %% +minute_1_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.tumbling(datetime.timedelta(minutes=1)), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) +) +# _MD_SHOW_minute_1_stats + +# %% +joint_stats = ( + minute_1_stats.join( + minute_20_stats, pw.left.t == pw.right.t, pw.left.ticker == pw.right.ticker + ) + .select( + *pw.left, + bollinger_lower=pw.right.bollinger_lower, + bollinger_upper=pw.right.bollinger_upper, + ) + .with_columns( + is_alert=( + (pw.this.volume > 10000) + & ( + (pw.this.vwap > pw.this.bollinger_upper) + | (pw.this.vwap < pw.this.bollinger_lower) + ) + ) + ) + .with_columns( + action=pw.if_else( + pw.this.is_alert, + pw.if_else(pw.this.vwap > pw.this.bollinger_upper, "sell", "buy"), + "hodl", + ) + ) +) +# _MD_SHOW_joint_stats + +# %% +alerts = joint_stats.filter(pw.this.is_alert).select( + pw.this.ticker, pw.this.t, pw.this.vwap, pw.this.action +) +# _MD_SHOW_alerts + +# %% [markdown] +# ## Plotting Bollinger Bands + +# %% +import bokeh.models + + +def stats_plotter(src): + actions = ["buy", "sell", "hodl"] + color_map = bokeh.models.CategoricalColorMapper( + factors=actions, palette=("#00ff00", "#ff0000", "#00000000") + ) + + fig = bokeh.plotting.figure( + height=400, + width=600, + title="20 minutes Bollinger bands with last 1 minute average", + x_axis_type="datetime", + ) + + fig.line("t", "vwap", source=src) + + fig.line("t", "bollinger_lower", source=src, line_alpha=0.3) + fig.line("t", "bollinger_upper", source=src, line_alpha=0.3) + fig.varea( + x="t", + y1="bollinger_lower", + y2="bollinger_upper", + fill_alpha=0.3, + fill_color="gray", + source=src, + ) + + fig.scatter( + "t", + "vwap", + size=10, + marker="circle", + color={"field": "action", "transform": color_map}, + source=src, + ) 
+ + return fig + + +# _MD_SHOW_joint_stats.plot(stats_plotter, sorting_col="t") diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step2/.gitignore b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step2/.gitignore new file mode 100644 index 00000000..05fac093 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step2/.gitignore @@ -0,0 +1 @@ +ticker.csv diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step2/article.py b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step2/article.py new file mode 100644 index 00000000..3932eebd --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step2/article.py @@ -0,0 +1,183 @@ +# --- +# title: "Part 2: From static data exploration to interactive dashboard" +# description: '' +# notebook_export_path: projects/from_jupyter_to_deploy/part2_interactive_dashboard.ipynb +# author: pathway +# article: +# date: '2023-11-29' +# thumbnail: '' +# tags: [] +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Part 2: From static data exploration to interactive dashboard prototyping +# This notebook is part of the second part of the tutorial [From interactive data exploration to deployment](https://pathway.com/developers/user-guide/exploring-pathway/from-jupyter-to-deploy/#part-2-from-static-data-exploration-to-interactive-dashboard-prototyping). + +# %% +# Download CSV file +# !wget -nc https://gist.githubusercontent.com/janchorowski/e351af72ecd8d206a34763a428826ab7/raw/ticker.csv + +# %% [markdown] +# ## Switching to streaming data + +# %% +import datetime + +import pathway as pw + +fname = "ticker.csv" +schema = pw.schema_from_csv(fname) +data = pw.demo.replay_csv(fname, schema=schema, input_rate=1000) + +# %% +data = data.with_columns(t=data.t.dt.utc_from_timestamp(unit="ms")) + + +# %% [markdown] +# ## Defining behaviors for streaming windows + +# %% +minute_20_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.sliding( + hop=datetime.timedelta(minutes=1), duration=datetime.timedelta(minutes=20) + ), + behavior=pw.temporal.exactly_once_behavior(), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + transact_total2=pw.reducers.sum(pw.this.volume * pw.this.vwap**2), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) + .with_columns( + vwstd=(pw.this.transact_total2 / pw.this.volume - pw.this.vwap**2) ** 0.5 + ) + .with_columns( + bollinger_upper=pw.this.vwap + 2 * pw.this.vwstd, + bollinger_lower=pw.this.vwap - 2 * pw.this.vwstd, + ) +) + +# %% +minute_1_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.tumbling(datetime.timedelta(minutes=1)), + behavior=pw.temporal.exactly_once_behavior(), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) +) + +# %% +joint_stats = ( + minute_1_stats.join( + 
minute_20_stats, pw.left.t == pw.right.t, pw.left.ticker == pw.right.ticker + ) + .select( + *pw.left, + bollinger_lower=pw.right.bollinger_lower, + bollinger_upper=pw.right.bollinger_upper, + ) + .with_columns( + is_alert=( + (pw.this.volume > 10000) + & ( + (pw.this.vwap > pw.this.bollinger_upper) + | (pw.this.vwap < pw.this.bollinger_lower) + ) + ) + ) + .with_columns( + action=pw.if_else( + pw.this.is_alert, + pw.if_else(pw.this.vwap > pw.this.bollinger_upper, "sell", "buy"), + "hodl", + ) + ) +) + +# %% +alerts = joint_stats.filter(pw.this.is_alert).select( + pw.this.ticker, pw.this.t, pw.this.vwap, pw.this.action +) + +# %% +import bokeh.models + + +def stats_plotter(src): + actions = ["buy", "sell", "hodl"] + color_map = bokeh.models.CategoricalColorMapper( + factors=actions, palette=("#00ff00", "#ff0000", "#00000000") + ) + + fig = bokeh.plotting.figure( + height=400, + width=600, + title="20 minutes Bollinger bands with last 1 minute average", + x_axis_type="datetime", + ) + + fig.line("t", "vwap", source=src) + + fig.line("t", "bollinger_lower", source=src, line_alpha=0.3) + fig.line("t", "bollinger_upper", source=src, line_alpha=0.3) + fig.varea( + x="t", + y1="bollinger_lower", + y2="bollinger_upper", + fill_alpha=0.3, + fill_color="gray", + source=src, + ) + + fig.scatter( + "t", + "vwap", + size=10, + marker="circle", + color={"field": "action", "transform": color_map}, + source=src, + ) + + return fig + + +# %% [markdown] +# ## Running the dashboard + +# %% +import panel as pn + +viz = pn.Row( + joint_stats.plot(stats_plotter, sorting_col="t"), + alerts.show(include_id=False, sorters=[{"field": "t", "dir": "desc"}]), +) +# _MD_SHOW_viz + +# %% +# _MD_SHOW_pw.run() diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3/.gitignore b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3/.gitignore new file mode 100644 index 00000000..05fac093 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3/.gitignore @@ -0,0 +1 @@ +ticker.csv diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3/article.py b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3/article.py new file mode 100644 index 00000000..07b4de5e --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3/article.py @@ -0,0 +1,226 @@ +# --- +# title: "Part 3: Kafka integration and alerts forwarding" +# description: '' +# notebook_export_path: projects/from_jupyter_to_deploy/part3_kafka_and_alerts.ipynb +# author: pathway +# article: +# date: '2023-11-29' +# thumbnail: '' +# tags: [] +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Part 3: Kafka integration and alerts forwarding (Consumer) +# This notebook is part of the third part of the tutorial [From interactive data exploration to deployment](https://pathway.com/developers/user-guide/exploring-pathway/from-jupyter-to-deploy/#part-3-kafka-integration-and-alerts-forwarding). 
+ +# %% [markdown] +# ## Reading messages from Kafka + +# %% +import datetime + +import pathway as pw + +# TODO Please set appropriate values for KAFKA_ENDPOINT, KAFKA_USERNAME, and KAFKA_PASSWORD +rdkafka_consumer_settings = { + "bootstrap.servers": "KAFKA_ENDPOINT:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "sasl.username": "KAFKA_USERNAME", + "sasl.password": "KAFKA_PASSWORD", + "group.id": "kafka-group-0", + "auto.offset.reset": "earliest", +} + + +# The schema definition is autogenerated +class DataSchema(pw.Schema): + ticker: str + open: float + high: float + low: float + close: float + volume: float + vwap: float + t: int + transactions: int + otc: str + + +data = pw.io.kafka.read( + rdkafka_consumer_settings, topic="ticker", format="json", schema=DataSchema +) + +# %% +data = data.with_columns(t=data.t.dt.utc_from_timestamp(unit="ms")) + +# %% +minute_20_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.sliding( + hop=datetime.timedelta(minutes=1), duration=datetime.timedelta(minutes=20) + ), + behavior=pw.temporal.exactly_once_behavior(), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + transact_total2=pw.reducers.sum(pw.this.volume * pw.this.vwap**2), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) + .with_columns( + vwstd=(pw.this.transact_total2 / pw.this.volume - pw.this.vwap**2) ** 0.5 + ) + .with_columns( + bollinger_upper=pw.this.vwap + 2 * pw.this.vwstd, + bollinger_lower=pw.this.vwap - 2 * pw.this.vwstd, + ) +) + +# %% +minute_1_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.tumbling(datetime.timedelta(minutes=1)), + behavior=pw.temporal.exactly_once_behavior(), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) +) + +# %% +joint_stats = ( + minute_1_stats.join( + minute_20_stats, pw.left.t == pw.right.t, pw.left.ticker == pw.right.ticker + ) + .select( + *pw.left, + bollinger_lower=pw.right.bollinger_lower, + bollinger_upper=pw.right.bollinger_upper, + ) + .with_columns( + is_alert=( + (pw.this.volume > 10000) + & ( + (pw.this.vwap > pw.this.bollinger_upper) + | (pw.this.vwap < pw.this.bollinger_lower) + ) + ) + ) + .with_columns( + action=pw.if_else( + pw.this.is_alert, + pw.if_else(pw.this.vwap > pw.this.bollinger_upper, "sell", "buy"), + "hodl", + ) + ) +) + +# %% +alerts = joint_stats.filter(pw.this.is_alert).select( + pw.this.ticker, pw.this.t, pw.this.vwap, pw.this.action +) + +# %% +import bokeh.models + + +def stats_plotter(src): + actions = ["buy", "sell", "hodl"] + color_map = bokeh.models.CategoricalColorMapper( + factors=actions, palette=("#00ff00", "#ff0000", "#00000000") + ) + + fig = bokeh.plotting.figure( + height=400, + width=600, + title="20 minutes Bollinger bands with last 1 minute average", + x_axis_type="datetime", + ) + + fig.line("t", "vwap", source=src) + + fig.line("t", "bollinger_lower", source=src, line_alpha=0.3) + fig.line("t", "bollinger_upper", source=src, line_alpha=0.3) + fig.varea( + x="t", + y1="bollinger_lower", + y2="bollinger_upper", + fill_alpha=0.3, + fill_color="gray", + source=src, + ) + + fig.scatter( + "t", + "vwap", + size=10, + marker="circle", + color={"field": 
"action", "transform": color_map}, + source=src, + ) + + return fig + + +# %% +import panel as pn + +viz = pn.Row( + joint_stats.plot(stats_plotter, sorting_col="t"), + alerts.show(include_id=False, sorters=[{"field": "t", "dir": "desc"}]), +) +# _MD_SHOW_viz + +# %% [markdown] +# ## Alerts forwarding to Slack + +# %% +import requests + +# TODO Please set appropriate values for SLACK_CHANNEL_ID, and SLACK_TOKEN +slack_alert_channel_id = "SLACK_CHANNEL_ID" +slack_alert_token = "SLACK_TOKEN" + + +def send_slack_alert(key, row, time, is_addition): + if not is_addition: + return + alert_message = f'Please {row["action"]} {row["ticker"]}' + print(f'Sending alert "{alert_message}"') + requests.post( + "https://slack.com/api/chat.postMessage", + data="text={}&channel={}".format(alert_message, slack_alert_channel_id), + headers={ + "Authorization": "Bearer {}".format(slack_alert_token), + "Content-Type": "application/x-www-form-urlencoded", + }, + ).raise_for_status() + + +pw.io.subscribe(alerts, send_slack_alert) + +# %% +# _MD_SHOW_pw.run() diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3_streamer/.gitignore b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3_streamer/.gitignore new file mode 100644 index 00000000..05fac093 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3_streamer/.gitignore @@ -0,0 +1 @@ +ticker.csv diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3_streamer/article.py b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3_streamer/article.py new file mode 100644 index 00000000..187554d8 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from-jupyter-to-deploy/.step3_streamer/article.py @@ -0,0 +1,77 @@ +# --- +# title: "Part 3: Kafka data streamer" +# description: '' +# notebook_export_path: projects/from_jupyter_to_deploy/part3_kafka_data_streamer.ipynb +# author: pathway +# article: +# date: '2023-11-29' +# thumbnail: '' +# tags: [] +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Part 3: Kafka integration and alerts forwarding (Producer) +# +# This notebook is a helper notebook for the third part of the tutorial [From interactive data exploration to deployment](https://pathway.com/developers/user-guide/exploring-pathway/from-jupyter-to-deploy/#part-3-kafka-integration-and-alerts-forwarding). 
+ +# %% +# Download CSV file +# !wget -nc https://gist.githubusercontent.com/janchorowski/e351af72ecd8d206a34763a428826ab7/raw/ticker.csv + +# %% [markdown] +# ## Writing messages to Kafka + +# %% +import pathway as pw + +fname = "ticker.csv" +schema = pw.schema_from_csv(fname) + + +# %% +print(schema.generate_class(class_name="DataSchema")) + + +# %% +# The schema definition is autogenerated +class DataSchema(pw.Schema): + ticker: str + open: float + high: float + low: float + close: float + volume: float + vwap: float + t: int + transactions: int + otc: str + + +data = pw.demo.replay_csv(fname, schema=DataSchema, input_rate=1000) + +# %% +# TODO: please set appropriaye values for KAFKA_ENDPOINT, KAFKA_USERNAME, and KAFKA_PASSWORD +rdkafka_producer_settings = { + "bootstrap.servers": "KAFKA_ENDPOINT:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "sasl.username": "KAFKA_USERNAME", + "sasl.password": "KAFKA_PASSWORD", +} + +pw.io.kafka.write(data, rdkafka_producer_settings, topic_name="ticker") + +# %% +# _MD_SHOW_pw.run() diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from_pandas/__init__.py b/docs/2.developers/4.user-guide/020.exploring-pathway/.from_pandas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from_pandas/article.py b/docs/2.developers/4.user-guide/020.exploring-pathway/.from_pandas/article.py new file mode 100644 index 00000000..a16732a8 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from_pandas/article.py @@ -0,0 +1,264 @@ +# --- +# title: "Coming from Pandas" +# description: '' +# notebook_export_path: notebooks/tutorials/from_pandas.ipynb +# --- + +# # Pathway Tutorial for pandas Users +# This tutorial gets you up and running with Pathway in 10 minutes. It is specificaly designed for users coming from `pandas`. +# +# In this tutorial, you will learn how to: +# - build a real-time data streaming pipeline +# - process data on-the-fly as it comes into your stream +# - store the processed data on disk +# +# You'll be performing a simple sum aggregation on live data. +# +# In `pandas` you would compute a sum aggregation like this: +# ```python +# import pandas as pd +# df = pd.read_csv("input_data.csv") +# df.value.sum() +# ``` + +# This is pretty straightforward...but doesn't work with streaming data 😕 Fortunately, Pathway does! 🙌 +# +# This is the Pathway code that will solve our problem: +# +# ```python +# import pathway as pw +# +# class InputSchema(pw.Schema): +# value: int +# +# t = pw.io.csv.read( +# './sum_input_data/', +# schema=InputSchema +# ) +# t = t.reduce(sum=pw.reducers.sum(t.value)) +# pw.io.csv.write(t, "output_stream.csv") +# pw.run() +# ``` +# +# Don't worry if you don't fully understand it yet -- that's exactly what you will learn by completing this tutorial. + +# ## Install Pathway +# To install the latest Pathway release, which is now available on a free-to-use license: +# +# ::pip-install +# :: +# You are ready to roll - let's get to it! + +# ## Get the Data into Your Stream +# +# Let's begin by importing `pathway`: + +import pathway as pw + +# Next, let's define a table. This table will contain the data that is flowing into our stream. +# +# A table in Pathway is comparable to a DataFrame in `pandas`. This table is created by 'listening' to a directory for new events. +# +# The term 'events' may be new to you if you're coming from `pandas`. 
Events are the core of data streaming architectures and it's important to understand their basic principles so that you can build effective data streaming pipelines. The next section explains some of these principles; you should feel free to skip it if you have a basic understanding of how data streaming works. +# +# +# +# > ### Basic Data Streaming Principles +# > When doing 'regular' data-processing in `pandas` you are working with a dataset that is static: all the data you will work with is available to you *before* you start your processing. Sure, it may need some cleaning and wrangling, but your dataset stays more or less the same. +# +# > In streaming workflows, this is very different. Data is not static but dynamic. You don't know the size or content of your dataset before you start processing: the processing happens *as* the data is being generated. +# +# > This means that we can't structure our processing around 'batches' of data the way we did in `pandas`: the whole point of data streaming is to process data in real-time, not sit around waiting until all of the data has been collected. +# +# > So instead, data streaming pipelines are built around 'events': specific moments when something happens to our data. Common events are data being created, removed, or updated. Whenever an event happens, the data streaming pipeline is triggered and data processing starts. +# +# In our case, we will create a Pathway table by listening to a directory for new events (in this case `./sum_input_data/`) and defining the schema of the table which maps the columns to their data types. Be sure to toggle streaming mode on by setting `mode` to `streaming`. + +# + +class InputSchema(pw.Schema): + value: int + +t = pw.io.csv.read( + "./sum_input_data/", + schema=InputSchema, +) +# - + +# Notice that we have just created a new table...without having any data yet! + +t + +# +# +# This is one of the fundamental differences between `pandas` and `pathway`. While pandas is static and requires a batch of data already available to process, a Pathway table can be created simply by pointing it in the right direction, and the table will be populated with data as it arrives. +# +# It's as if we're saying, "Hey Pathway, keep an eye on that corner of the restaurant and whenever new people arrive, give them a seat at that table over there." +# +# Or in more technical terms: every time a new CSV file is added into `./sum_input_data/`, the content of its `value` column will be automatically added to the table `t`. +# +# ## Process the Data +# +# Now that we have a place where we can hold the data we're interested in, it's time to do something with it :) +# +# Remember that our specific problem for this tutorial is to perform a real-time sum on live incoming data. +# +# Let's go ahead and `select` the values we're interested in. +# + +t1 = t.select(value=t.value) + +# The arguments to `select` are dynamic; you can use them to define the desired column names and then pass the content. To create a table with column name `value` containing the contents of the `value` column: +# +# You could also name your new column something else by writing: + +t2 = t.select(new_col_value=t.value) + +# Then let's perform a groupby on the `value` column using the `reduce` method. +# +# Just like with `select` above, the argument to `reduce` is used to define the resulting column name (in this case `sum`). 
We pass it the result of `pw.reducers.sum` over the `t.value` column: + +t = t.reduce(sum=pw.reducers.sum(t.value)) + +# +# +# Again, notice that we are 'performing' `select` and `reduce` actions without actually having any data yet. In fact, it would be more accurate to say that we are *describing* actions to be taken in the future, as soon as data arrives. +# +# What we are doing here is **building the pipeline**, so that when our data starts arriving it is processed correctly and ends up in the place where we want it to be. This is different from `pandas` in which we have all the data already and operations are executed the moment they are described. Read more about the [dataflow](/developers/user-guide/introduction/concepts#dataflow) to understand how Pathway pipelines are built under the hood. +# +# ## Get the Data out of Your Stream +# +# Now that we have built our processing pipeline, let's write the resulting data to disk and store it for future use: + +pw.io.csv.write(t, "sum_output_stream.csv") + +# That's it! Your real-time sum data stream is all set up. +# +# It just needs........data :) +# +# ## Generate a Live Data Stream +# +# We can simulate a live data stream by writing some code that will create a CSV file every second with some data in its `value` column. +# +# Using `pandas` that would look something like this: +# +# ```python +# from time import sleep +# import pandas as pd +# sleep(10) +# df = pd.DataFrame() +# for loop_id in range(0,10): +# df = pd.DataFrame(data=[loop_id], columns=["value"]) +# df.to_csv(f"./sum_input_data/input_table_{loop_id}.csv", ) +# sleep(1) +# ``` + +# We can then use the Pathway code we built above to listen to the `sum_input_data` directory and process data as it comes in: +# +# ```python +# class InputSchema(pw.Schema): +# value: int +# +# t = pw.io.csv.read("./sum_input_data/", schema=InputSchema) +# t = t.reduce(sum=pw.reducers.sum(t.value)) +# pw.io.csv.write(t, "output_stream.csv") +# pw.run() +# ``` + +# +# **IMPORTANT:** Make sure to always include `pw.run()` at the end of your Pathway script. All the steps before this command are simply defining the pipeline. `pw.run()` tells Pathway to actually execute it. +# +# To be able to see the result of your work in action, let's separate the data generation and data processing code into 2 separate scripts that we can then launch simultaneously from our terminal. +# +# While we're at it, let's convert the data generation code into bash to make it easier to clean up the directory and avoid messy overwrites: +# +# Save the code below as `generate_stream.sh`: +# +# ```shellscript +# # !/bin/bash +# src="./sum_input_data/" +# # rm -r $src +# # mkdir -p $src +# sleep 10 +# for LOOP_ID in {1..30} +# do +# filename=$src"input_table_$LOOP_ID.csv" +# printf "value\n$LOOP_ID\n" >> $filename +# sleep 1 +# done +# ``` +# +# And save the code below as `perform_realtime_sum.py`: +# +# ```python +# import pathway as pw +# class InputSchema(pw.Schema): +# value: int +# +# t = pw.io.csv.read("./sum_input_data/", schema=InputSchema) +# t = t.reduce(sum=pw.reducers.sum(t.value)) +# pw.io.csv.write(t, "output_stream.csv") +# pw.run() +# ``` +# +# Now head over to your terminal, open two tabs / kernels and then: +# 1. run `bash generate_stream.sh` in the first kernel +# 2. run `python perform_realtime_sum.py` in the second +# +# +# +# The input connector requires the input CSV directory to exist so the script generating the stream should be launched first. 
The input connector will connect to the file and update the results every time a new CSV file is added, updating automatically and in real-time the output CSV file using the output CSV connector. +# +# ## Interpret the Output +# +# In streaming pipelines, there are generally 3 types of operations you can perform on your data: +# 1. **insert** new data +# 2. **remove** existing data +# 3. **update** existing data, which is actually a combination of operation (2) and (1). + +# In our case, we will mostly be dealing with **updates** to the existing `sum` value. Read our guide on [input and output streams](/developers/user-guide/connecting-to-data/connectors/) to learn more. +# +# The output of this project is a CSV file `output_stream.csv` in which all the operations are recorded: + +import pandas as pd + +output = pd.read_csv("output_stream.csv") +output.head() + +# Let's explain what this output data means in technical terms first, then we will dive deeper with a real-world example. +# +# Each update is represented by up to two rows. Each row contains: +# +# 1. the columns of the output table, here we only have `sum`. +# 2. `time`, which represents the timestamp at which the update has happened. +# 3. `diff`, which represents whether the row represents an addition or a deletion. +# +# An update is represented by two rows: one to remove the old value, one to add the new values. Those two rows will have the same timestamp to ensure the correctness of the update. There is one exception: the first update is represented by a single row because there is no old value to remove. +# +# So the first line represents the insertion (`diff=1`) of a new value (`sum=1`) which has happened at timestamp `time=1675861157539`. +# +# Then a new CSV file is created containing the new value 2, changing the value of `sum` from 1 to 3, resulting in both the deletion (`diff=-1`) of the previous value (`sum=1`) which has happened at `time=1675861157540` AND an addition (`diff=1`) of a new value (`sum=3`) at the same time. +# +# The output file is continuously updated until the process is killed. +# +# ### Real-World Example +# Let's use a real-world example to illustrate the importance and value of this output. +# +# Imagine you work for a bank. The bank stores customers' money in accounts that customers can access whenever they like. Every time a customer of the bank goes to an ATM to withdraw money from their account, the bank wants to make sure that this transaction happens correctly. They only want to give them money that's actually in their account, and then adjust the account balance to reflect the change. This means the ATM withdrawal needs to be syncronised exactly with the adjustment in the customer's account balance. In order to do this we need to know the `time` at which the event happened and the amount that was changed (or the `diff`). +# +# In streaming settings, all the intermediate states of the data are stored during the process. This means you can access not just the final result (of `sum` in our case) but also each value along the way. If you are only interested in the final value, it is up to you to select it, by ordering by the `time` column and then selecting the latest value: + +output = output.sort_values(by=["time"], ascending=False) +output.head(1) + +# In our case, the total sum of the data is `406`. Well done! 
+# +# > You might have noticed that in this particular example, the `sort_values` call was a little overkill since the data is being processed in a single, sequential process and so the resulting output was already neatly ordered. In real-world streaming use cases, however, you may be dealing with multiple data sources, latency and other factors that will disturb the order of your data. In these cases, the timestamp will enable you to order the data correctly. +# +# ## What's next? +# Congratulations! You've just built your first realtime streaming application with Pathway. +# +# +# +# Now that you're up and running, consider taking a look at our [showcases](/developers/showcases). +# +# We would love to hear your feedback about our product. Come say hi on our [Discord](https://discord.com/invite/pathway)! diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/.from_pandas/output_stream.csv b/docs/2.developers/4.user-guide/020.exploring-pathway/.from_pandas/output_stream.csv new file mode 100644 index 00000000..eb26bdf8 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/.from_pandas/output_stream.csv @@ -0,0 +1,22 @@ +sum,time,diff +1,1675861157539,1 +1,1675861157540,-1 +3,1675861157540,1 +3,1675861157541,-1 +10,1675861157541,1 +10,1675861167544,-1 +45,1675861167544,1 +45,1675861167545,-1 +91,1675861167545,1 +91,1675861177551,-1 +120,1675861177551,1 +120,1675861177552,-1 +190,1675861177552,1 +190,1675861177553,-1 +253,1675861177553,1 +253,1675861177554,-1 +276,1675861177554,1 +276,1675861187558,-1 +300,1675861187558,1 +300,1675861187559,-1 +406,1675861187559,1 diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/20.from-jupyter-to-deploy.md b/docs/2.developers/4.user-guide/020.exploring-pathway/20.from-jupyter-to-deploy.md new file mode 100644 index 00000000..45416391 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/20.from-jupyter-to-deploy.md @@ -0,0 +1,673 @@ +--- +title: 'Tutorial: From Jupyter to Deploy' +description: '' +date: '2023-11-29' +github_link: "https://github.com/pathwaycom/pathway/tree/main/examples/projects/from_jupyter_to_deploy" +--- + +# Tutorial: From interactive data exploration to deployment + +## Introduction + +In this tutorial you will create a realtime operational stock trading dashboard with alerting capabilities by following a typical data science project in four parts: + +| | Part 1: Static data Exploration | Part 2: Realtime dashboard | Part 3: Kafka data source | Part 4: Production | +| ------------------------------------------------ | ------------------------------- | -------------------------- | ------------------------- | ------------------ | +| Interactive development in Jupyter with Pathway | ✅ | | | | +| Experiments with static data samples | ✅ | | | | +| Streaming data simulation | | ✅ | | | +| Live visualization in Jupyter | | ✅ | | | +| Alerting functionality / sending data to outputs | | | ✅ | | +| Connecting Kafka data sources | | | ✅ | | +| Deployment of dockerized container | | | | ✅ | +| Embedding Jupyter live dashboards in the web | | | | ✅ | + + +This tutorial demonstrates the key design choice we made while building Pathway: the same code works with both bounded and unbounded datasets, which we shortly call _batch and streaming parity_. In Part 1, this parity allows you to consider only a simple scenario in which you work with static dataset. 
Then, you can use the same code in Part 2 to run a computation that processes a _data stream_, without having to think about updates, triggers, operator states, and everything else that comes with dynamic datasets. However, Pathway still allows you to extend the logic to benefit from the temporal nature of the data, which includes the possibility of buffering early data, filtering out late data, and freeing memory by discarding old data. Part 3 shows you how to switch to a Kafka data source and add alerting, the final step towards a deployable version of the code. Finally, Part 4 takes the code from Part 3 and sets up everything you need to deploy a production-ready dashboard.
+
+The following video highlights the dashboard, still running in a notebook:
+![image](https://github.com/pathwaycom/pathway-examples/blob/c20cd69a6c9c87fc70a9082de57666c50f2ab3c2/documentation/from_jupyter_to_deploy/jupyter-demo-final-smallest-compressed.gif?raw=true)
+
+
+These four parts build on top of each other, but can be read independently.
+
+For more explanation on Bollinger Bands, please check out the [showcase](/developers/showcases/live_data_jupyter/).
+
+### Project description
+
+Your market finance team wants to assess the probable evolution of some prices in a high-frequency trading setup, and you are to analyze immediate price movements. To this end, proper price monitoring is required, and alerts should be raised when an asset is to be bought (low price) or sold (high price). [Bollinger Bands](https://en.wikipedia.org/wiki/Bollinger_Bands) are a well-known method for performing such technical analysis and provide a widely used statistical chart for visualizing price action. The dashboard you will build is a decision-making tool based on this chart and a sequence of alerts informing about buying or selling opportunities.
+
+
+### Tips for following the tutorial
+
+Parts 1, 2 and 3 of this tutorial happen entirely inside Jupyter notebooks. You can execute them locally in a code environment with Pathway installed (please follow the instructions in [Getting Started](/developers/user-guide/introduction/welcome/) to set up the environment). Alternatively, you can run them in the cloud, using [Google Colab](https://colab.research.google.com/).
+
+Part 4 transforms the project from notebooks into a deployable Python package. To execute it, you need a Linux or Mac system with [Docker](https://www.docker.com/) installed.
+
+## Part 1: Static data exploration in Jupyter
+
+You have obtained some historical data in CSV format. Now you will analyze it statically with Pathway inside a Jupyter notebook. The best part: the Pathway code developed during exploration on static data is ready to be used with streaming data (Part 2)!
+
+The instructions below implement a data exploration notebook. You can follow them starting with an empty [Jupyter](https://jupyter.org/) or [Colab](https://colab.research.google.com/) notebook. Alternatively, you can skip ahead and download the final Part 1 [notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part1_jupyter_exploration.ipynb) or open it directly in your browser using [Colab](https://colab.research.google.com/github/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part1_jupyter_exploration.ipynb).
+
+### Installing dependencies
+
+Make sure you have `pathway` installed. If working in Colab, you can install it into the runtime by creating a code cell with the command `!pip install pathway`.
+
+Next, download the data by using a code cell with the command `!wget -nc https://gist.githubusercontent.com/janchorowski/e351af72ecd8d206a34763a428826ab7/raw/ticker.csv`.
+
+If you plan to run the tutorial notebooks under VSCode, note that a specific version of `jupyterlab` is required due to [a known compatibility issue](https://github.com/bokeh/jupyter_bokeh/pull/196) with `jupyter_bokeh`. Please refer to [this discussion](https://github.com/holoviz/hvplot/issues/1111) for workarounds.
+
+### Loading data
+
+You can now start data wrangling. The first thing to do is to load the data from the CSV file into a Pathway table using `pw.io.csv.read`. In general, at the time you define the logic of your Pathway script, the data is not available yet. For this reason, [the graph representing the Pathway computation](/glossary/computation-graph/#computation-graph) is typed, to make sure it will work properly on the data streamed in the future, and the `pw.io.csv.read` function needs a schema, which is a blueprint for the data structure.
+
+For convenience, Pathway provides the `schema_from_csv` function that can generate the schema from a static CSV file for you; manual schema definition is discussed later in this tutorial. For now, you can generate the schema and load the CSV file using the code below:
+
+
+```python
+import datetime
+
+import pathway as pw
+
+fname = "ticker.csv"
+schema = pw.schema_from_csv(fname)
+data = pw.io.csv.read(fname, schema=schema, mode="static")
+data
+```
+
+The last line (the bare `data` statement) prints the contents obtained from the CSV file. To do this, Pathway temporarily builds and executes a dataflow that reads the data and captures it in a table. It can do so because the file reader is told to read the file and exit by setting the `mode="static"` flag. Thus, when data is known in advance, Pathway allows previewing results on demand. The second part of this tutorial describes what happens if the data source is `streaming` future data.
+
+After you run this cell, you should get the following table:
+
+
+
+Inspecting the contents of the `data` table, you can notice that the timestamps are integers that need to be parsed. You can use `utc_from_timestamp`, located in the `dt` namespace, for that. It returns a Universal (`pw.DateTimeUtc`) time.
+
+The date parsing code is given below, along with the call to print the contents of the table. As you can see, the timestamps are now properly parsed and pretty-printed.
+
+```python
+data = data.with_columns(t=data.t.dt.utc_from_timestamp(unit="ms"))
+data
+```
+
+
+
+To make a plot from a Pathway table, an external visualization library is required. With streaming data in mind, such a library should make it possible to update a chart as Pathway processes the data. The [Bokeh library](https://bokeh.org/) has been chosen for that in this tutorial. The plotting function only takes a Bokeh ColumnDataSource `src` argument, which will be automatically populated by Pathway with the table contents.
Here is a simple example to plot the history of the volume-weighted average price `vwap` available in the `data` table: + +```python +import bokeh.plotting + +def vwap_history(src): + fig = bokeh.plotting.figure( + height=400, width=600, + title="Volume-weighted average price", + x_axis_type="datetime" + ) + fig.line("t", "vwap", source=src) + return fig +``` + +To use this function on the `data` table, simply call the method `plot` indicating that values should be sorted using the `t` column before plotting. + +```python +data.plot(vwap_history, sorting_col="t") +``` + +You should get the following plot: + + + +### Designing the algorithm + +Once the data is properly loaded and verified, you can start designing our trading algorithm. The Bollinger Bands method is based on upper and lower bounds to trigger sell or buy alerts respectively. These bounds are centered around a moving average whose standard deviation gives the radius of the band up to a multiplicative factor. This particular example uses a 20 minutes period for the moving average and a radius of 2 times the standard deviation. + +To compute these statistics, the data must be grouped according to a sliding 20 minutes window and then reduced to obtain the moving average `vwap`, standard deviation `vwstd` and Bollinger bounds `bollinger_upper` and `bollinger_lower`. Such grouping operation is straightforward with the Pathway `windowby` function followed by a `reduce` call as explained in the [Windowby - Reduce user guide](/developers/user-guide/temporal-data/windows-manual#temporal-sliding-windowing). Thereafter, the sliding windows move through the data in 1 minute steps `hop`, but this parameter can be adjusted. + +```python +minute_20_stats = ( + data + .windowby( + pw.this.t, + window=pw.temporal.sliding( + hop=datetime.timedelta(minutes=1), + duration=datetime.timedelta(minutes=20) + ), + instance=pw.this.ticker + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + transact_total2=pw.reducers.sum(pw.this.volume * pw.this.vwap**2) + ) + .with_columns( + vwap=pw.this.transact_total / pw.this.volume + ) + .with_columns( + vwstd=(pw.this.transact_total2 / pw.this.volume - pw.this.vwap**2)**0.5 + ).with_columns( + bollinger_upper=pw.this.vwap + 2 * pw.this.vwstd, + bollinger_lower=pw.this.vwap - 2 * pw.this.vwstd + ) +) +minute_20_stats +``` + +Pathway uses the `instance` parameter to specify independent instances of the analysis - in this case, different stock tickers. Obviously, you do not want to mix trades of different companies' stock. Note that the content of the `_pw_instance` column is given by this `instance` parameter of the `windowby` call whose results also provide the end point `_pw_window_end` of the window. + +In order to raise alerts and help with the decision to sell or buy, instantaneous price evolution is also necessary. To this end, the sample code computes the average value over one minute [tumbling windows](/developers/user-guide/temporal-data/windows-manual#temporal-tumbling-windowing), a special case of sliding windows, in a similar way. 
+ +```python +minute_1_stats = ( + data + .windowby( + pw.this.t, + window=pw.temporal.tumbling(datetime.timedelta(minutes=1)), + instance=pw.this.ticker + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap) + ) + .with_columns( + vwap=pw.this.transact_total / pw.this.volume + ) +) +minute_1_stats +``` + +The two statistics tables can now be joined to automatically detect when an alert should be triggered if there is sufficient transaction volume and if the price falls outside the Bollinger bounds. With the help of Pathway `if_else` function, the `is_alert` column is instantly translated into a usable decision making tool to advice [buy, hodl or sell](https://www.investopedia.com/terms/h/hodl.asp). + +```python +joint_stats = ( + minute_1_stats.join( + minute_20_stats, pw.left.t == pw.right.t, pw.left.ticker == pw.right.ticker + ) + .select( + *pw.left, + bollinger_lower=pw.right.bollinger_lower, + bollinger_upper=pw.right.bollinger_upper + ) + .with_columns( + is_alert=( + (pw.this.volume > 10000) + & ( + (pw.this.vwap > pw.this.bollinger_upper) + | (pw.this.vwap < pw.this.bollinger_lower) + ) + ) + ) + .with_columns( + action=pw.if_else( + pw.this.is_alert, + pw.if_else(pw.this.vwap > pw.this.bollinger_upper, "sell", "buy"), + "hodl" + ) + ) +) +joint_stats +``` + +Filtering on alerts can easily be transformed into a table with all relevant information to be integrated into the final dashboard. + +```python +alerts = ( + joint_stats + .filter(pw.this.is_alert) + .select(pw.this.ticker, pw.this.t, pw.this.vwap, pw.this.action) +) +alerts +``` + +### Plotting Bollinger Bands + +To get the Bollinger Bands chart, the plotting function to pass to the `plot` method is slightly longer but `joint_stats` table contains all the needed information. The moving average `vwap` is used to plot the instant price and `bollinger_lower` and `bollinger_upper` for the Bollinger band. The alerts triggered by our Pathway code are also plotted to get a visual decision-making tool. This produces a ready to use widget without additional effort. + +```python +import bokeh.models + +def stats_plotter(src): + actions=["buy", "sell", "hodl"] + color_map = bokeh.models.CategoricalColorMapper( + factors=actions, + palette=("#00ff00", "#ff0000", "#00000000") + ) + + fig = bokeh.plotting.figure( + height=400, width=600, + title="20 minutes Bollinger bands with last 1 minute average", + x_axis_type="datetime" + ) + + fig.line("t", "vwap", source=src) + + fig.line("t", "bollinger_lower", source=src, line_alpha=0.3) + fig.line("t", "bollinger_upper", source=src, line_alpha=0.3) + fig.varea( + x="t", + y1="bollinger_lower", + y2="bollinger_upper", + fill_alpha=0.3, + fill_color="gray", + source=src, + ) + + fig.scatter( + "t", "vwap", + size=10, marker="circle", + color={"field": "action", "transform": color_map}, + source=src + ) + + return fig + +joint_stats.plot(stats_plotter, sorting_col="t") +``` + + + +It looks like the Bollinger Band strategy is promising - it usually decides to buy at a lower value than it is selling. This concludes the data exploration part of the tutorial. You have successfully loaded a static data file into Pathway and used Pathway data transformation tools to detect interesting patterns in data and to make compelling visualizations. + +This concludes Part 1. 
You can view the final notebook [in Github](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part1_jupyter_exploration.ipynb) or in [Colab](https://colab.research.google.com/github/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part1_jupyter_exploration.ipynb).
+
+Please proceed to Part 2 to turn this static data exploration into a live dashboard.
+
+## Part 2: From static data exploration to interactive dashboard prototyping
+
+You finished data exploration and now you are ready to prototype a live-updating dashboard inside Jupyter. To this end, you will replace the static data source with a streaming one and reuse the code from Part 1 to make a dashboard of interactively updating widgets.
+
+We start with the notebook created in Part 1 - please copy it into a new file or [download it](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part1_jupyter_exploration.ipynb).
+
+Follow the instructions below to prototype a dashboard, or preview the results of Part 2 directly as a [notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part2_interactive_dashboard.ipynb) or in [Colab](https://colab.research.google.com/github/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part2_interactive_dashboard.ipynb).
+
+### Switching to streaming data
+
+The Pathway `pw.demo.replay_csv` function is the easiest way to go from a static data source provided by a CSV file to a streaming one based on the same data. This function uses the same schema as `pw.io.csv.read` and has an additional `input_rate` parameter to indicate the number of rows to read from the CSV file per second. Thus, just replace the `pw.io.csv.read` call with a `pw.demo.replay_csv` statement to start using a streaming data source.
+
+```python
+# data = pw.io.csv.read(fname, schema=schema, mode="static")
+data = pw.demo.replay_csv(fname, schema=schema, input_rate=1000)
+
+# data
+```
+
+Since your task here is to create a dashboard, you no longer need to display intermediate tables. Therefore, you can remove the bare `data` statement at the end of the data loading cell. Similarly, you can comment out or remove the data printing statements at the end of every other cell.
+
+As promised, you can use the code designed for static data with streaming data.
+
+### Running the dashboard
+
+You are now ready to prototype a dashboard. You can use the [Panel library](https://panel.holoviz.org/) for this and simply place the Bollinger Bands chart widget and the `alerts` table in a row to obtain the announced result. The table `show` method offers some nice features like hiding the Pathway `id` column and sorting by timestamp to get the latest alerts first.
+
+```python
+import panel as pn
+
+pn.Row(
+    joint_stats.plot(stats_plotter, sorting_col='t'),
+    alerts.show(include_id=False, sorters=[{"field": "t", "dir": "desc"}])
+)
+```
+
+Note that both widgets now indicate that they are in `Streaming mode` instead of displaying a `Static preview` as in Part 1.
+
+
+
+The final step to get a live-updating dashboard is to start streaming the data with `pw.run()`. There you go!
+
+```python
+pw.run()
+```
+This ends the dashboard prototyping part of the tutorial; at this point, your code should generate a plot as shown below:
+
+
+A few remarks about the current behavior of the code are in order. First, the data points arrive out of order, and as such, Pathway updates the old results.
One of the effects of this behavior is that you can see some temporary alerts that disappear after more data arrives. In principle this is the desired behavior, but perhaps you would like to adjust it so that Pathway waits a little before reporting the output, in order to produce a result that does not change too often.

Then, since Pathway guarantees that the output can be updated at any time in the future, it actually needs to store some (possibly aggregated) information about all past events. As such, in some cases long streams of data may require unbounded space to perform the computation. However, you can tell Pathway that some datapoints won't be changing anymore, and as such the system is allowed to free the memory from data related to such datapoints. More precisely, you can promise Pathway that some entries won't be updated, and whenever this promise does not hold, the system is allowed to ignore such late updates.

These changes deviate a little bit from batch and stream parity: in static mode all data arrive at the same time, and as such no data is delayed or discarded. Still, the deviation is easily configurable via `behavior` and allows you to benefit from the temporal aspect of data streams.

### Defining behaviors for streaming windows

This part of the article explains one of the most intriguing topics in batch-to-streaming conversion: the temporal behavior of the system. During static data exploration you were mostly concerned with _what_ is going to be computed. Now you can also specify _when_ the results of the computation are visible to the rest of the system, and _when_ the system is allowed to free memory from old data.

Consider the windowing code that computes `minute_20_stats`. Every window groups together samples from 20 minutes. However, when does it produce its outputs? Does the rest of the system see temporary results computed on incomplete windows? Or does it only see the aggregations of complete windows? How long does the system wait for late data? A window `behavior` controls these aspects of execution.

In the code below, each window first gathers all data, and only then sends the result for further processing. You can instruct the system to act this way by setting the `behavior` argument to `exactly_once_behavior()`, which makes Pathway produce the first result when the window is complete and ignore all subsequent (late) updates that would fall within this window. Additionally, Pathway uses the information that some windows won't be changed anymore to reclaim the memory used to store information about those windows.

Please add the `behavior` argument to the window definitions as in the code snippet below.
+ +```python +minute_20_stats = ( + data + .windowby( + pw.this.t, + window=pw.temporal.sliding( + hop=datetime.timedelta(minutes=1), + duration=datetime.timedelta(minutes=20) + ), + # Wait until the window collected all data before producing a result + behavior=pw.temporal.exactly_once_behavior(), + instance=pw.this.ticker + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + transact_total2=pw.reducers.sum(pw.this.volume * pw.this.vwap**2) + ) + .with_columns( + vwap=pw.this.transact_total / pw.this.volume + ) + .with_columns( + vwstd=(pw.this.transact_total2 / pw.this.volume - pw.this.vwap**2)**0.5 + ).with_columns( + bollinger_upper=pw.this.vwap + 2 * pw.this.vwstd, + bollinger_lower=pw.this.vwap - 2 * pw.this.vwstd + ) +) + +minute_1_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.tumbling(datetime.timedelta(minutes=1)), + behavior=pw.temporal.exactly_once_behavior(), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) +) + +``` + + +With those changes you should see a plot behaving as in the video below. + + + +This concludes Part 2 of the tutorial. You managed to move from a static data source to a streaming one and to reuse the code from the data exploration part. You also integrated widgets built from Pathway tables into a live-updating dashboard. Now, you are ready to prepare the result for production. + +You can see the results in [Github](https://github.com/pathwaycom/pathway/blob/examples/main/projects/from_jupyter_to_deploy/part2_interactive_dashboard.ipynb) or in [Colab](https://colab.research.google.com/github/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part2_interactive_dashboard.ipynb). + +The next part of this article explains how to change the data source to a production grade one, like Kafka. + +## Part 3: Kafka integration and alerts forwarding + +You successfully prototyped the dashboard in Jupyter with data streamed from a CSV file. To scale to production, you will now use streaming data from a Kafka server and push alerts to Slack. + +Please start with the notebook created in Part 2 - either copy its content into a new file or [download it](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part2_interactive_dashboard.ipynb). + +Follow along to integrate Kafka and alerts forwarding, or preview directly the results of Part 3 as a [notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part3_kafka_and_alerts.ipynb) or in [Colab](https://colab.research.google.com/github/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part3_kafka_and_alerts.ipynb). + +### Kafka integration + +This part of the tutorial requires accessing a running Kafka connection. For simplicity, this example uses the hosted service at [upstash.com](https://upstash.com/) but all descriptions should also work with a self-hosted Kafka. Please register and create a new Kafka cluster with a topic named `ticker`. + +#### Writing messages to Kafka + +First, you need to create a helper data streaming notebook. 
You can populate the `ticker` topic by reading the data from the CSV file as in Part 2, then writing it to the Kafka topic using the Pathway `pw.io.kafka.write` function. Please start a new notebook, then fetch the CSV file:

```python
!wget -nc https://gist.githubusercontent.com/janchorowski/e351af72ecd8d206a34763a428826ab7/raw/ticker.csv
```

We will generate a schema definition for our data. The previous notebooks simply inferred the schema from the CSV file:

```python
fname = "ticker.csv"
schema = pw.schema_from_csv(fname)
```

You can generate a schema class by running:

```python
print(schema.generate_class(class_name="DataSchema"))
```

In this way you can enforce a schema for messages sent via Kafka. You are now ready to write the data streaming code:

```python
# The schema definition is autogenerated
class DataSchema(pw.Schema):
    ticker: str
    open: float
    high: float
    low: float
    close: float
    volume: float
    vwap: float
    t: int
    transactions: int
    otc: str

data = pw.demo.replay_csv(fname, schema=DataSchema, input_rate=1000)

rdkafka_producer_settings = {
    "bootstrap.servers": "KAFKA_ENDPOINT:9092",
    "security.protocol": "sasl_ssl",
    "sasl.mechanism": "SCRAM-SHA-256",
    "sasl.username": "KAFKA_USERNAME",
    "sasl.password": "KAFKA_PASSWORD"
}

pw.io.kafka.write(data, rdkafka_producer_settings, topic_name="ticker")
```

Finally, start the data production with `pw.run()` as usual. You can verify in the Upstash console that messages have reached the topic.

#### Reading messages from Kafka

To consume the topic messages as a streaming data source, Pathway provides the `pw.io.kafka.read` function. Thus, the `data` table can now be populated in the following way.

```python
# Please fill in KAFKA_ENDPOINT, KAFKA_USERNAME, and KAFKA_PASSWORD from your
# cluster configuration.
# Message read status is tracked by consumer group - resetting to a new name
# will cause the program to read messages from the start of the topic.
rdkafka_consumer_settings = {
    "bootstrap.servers": "KAFKA_ENDPOINT:9092",
    "security.protocol": "sasl_ssl",
    "sasl.mechanism": "SCRAM-SHA-256",
    "sasl.username": "KAFKA_USERNAME",
    "sasl.password": "KAFKA_PASSWORD",
    "group.id": "kafka-group-0",
    "auto.offset.reset": "earliest"
}

# The schema definition is autogenerated
class DataSchema(pw.Schema):
    ticker: str
    open: float
    high: float
    low: float
    close: float
    volume: float
    vwap: float
    t: int
    transactions: int
    otc: str

data = pw.io.kafka.read(
    rdkafka_consumer_settings,
    topic="ticker",
    format="json",
    schema=DataSchema
)
```

The rest of the code does not change and can be used with this new streaming data source, but your algorithm is now based on data from the Kafka topic, as in production. Being able to seamlessly reuse the code developed in the previous parts is one of Pathway's many strengths.

### Alerts forwarding to Slack

Below, you can find a piece of Python code that defines callbacks on the `alerts` table - it sends notifications to [Slack](https://slack.com/). To start, you have to [get a token](https://api.slack.com/tutorials/tracks/getting-a-token). Please create a new app, install it in your workspace and fetch the token. Then follow the tutorial to fetch the ID of the channel to which you want to send the notifications.

To send the alerts you should attach a callback on the `alerts` table using the `pw.io.subscribe` function.
The example below only reacts to row additions and for each addition it uses `requests` to call the Slack API: + +```python +import requests + +slack_alert_channel_id = "SLACK_CHANNEL_ID" +slack_alert_token = "SLACK_TOKEN" + +def send_slack_alert(key, row, time, is_addition): + if not is_addition: + return + alert_message = f'Please {row["action"]} {row["ticker"]}' + print(f'Sending alert "{alert_message}"') + requests.post( + "https://slack.com/api/chat.postMessage", + data="text={}&channel={}".format(alert_message, slack_alert_channel_id), + headers={ + "Authorization": "Bearer {}".format(slack_alert_token), + "Content-Type": "application/x-www-form-urlencoded", + }, + ).raise_for_status() + +pw.io.subscribe(alerts, send_slack_alert) +``` + +### Testing the application in a notebook + +Finally, to test the live dashboards and alerts on data incoming over a live Kafka connection you will reset the Kafka topic and run both notebooks simultaneously: +1. delete the `ticker` Kafka topic, +2. recreate the `ticker` topic, such that it is empty, +3. start the [live dashboarding notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part3_kafka_and_alerts.ipynb), +4. start the [data streaming notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part3_kafka_data_streamer.ipynb). + +This concludes Part 3 of this tutorial. As usual, you can access the final [live dashboarding notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part3_kafka_and_alerts.ipynb) and [data streaming notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part3_kafka_data_streamer.ipynb). + +## Part 4: From Jupyter to standalone deployment + +In Part 4 you will extract the Pathway data processing code along with the dashboard out of the Jupyter notebook into a standalone Python application. +As usual, you can browse through the [completed project](https://github.com/pathwaycom/pathway/tree/main/examples/projects/from_jupyter_to_deploy/part4_deployment) + +### Exporting the data streaming notebook +Start by exporting the [data streaming notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part3_kafka_data_streamer.ipynb) from Step 3 and call it `kafka_data_streamer.py`. You can do it in Jupyter lab by `File -> Save and Export Notebook as... -> Executable Script`. You need to comment out the shell commands in the beginning of the file responsible for installing Pathway and downloading dataset (after exporting these start with `get_ipython()`). + +You also need the CSV file with data, the same as in previous steps. You can get it by running `wget -nc https://gist.githubusercontent.com/janchorowski/e351af72ecd8d206a34763a428826ab7/raw/ticker.csv`. + +### Exporting live dashoarding notebook +Now export the [live dashboarding notebook](https://github.com/pathwaycom/pathway/blob/main/examples/projects/from_jupyter_to_deploy/part3_kafka_and_alerts.ipynb) from Step 3 and call it `dashboard.py`, the same way as for the data streaming notebook. Again, comment out any shell commands if you have any. 
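For reference, notebook shell commands are exported as `get_ipython()` calls. In the exported scripts they look roughly like the hypothetical lines below (the exact commands depend on your notebooks) and can simply be commented out:

```python
# Shell commands exported from the notebooks look roughly like this - comment them out:
# get_ipython().system('pip install pathway')
# get_ipython().system('wget -nc https://gist.githubusercontent.com/janchorowski/e351af72ecd8d206a34763a428826ab7/raw/ticker.csv')
```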
+ +As you are no longer relying on Jupyter to display visualizations, you need to replace `pw.run()` at the end of `dashboard.py` with the following lines: +```python +viz_thread = viz.show(threaded=True, port=8080) + +try: + pw.run(monitoring_level=pw.MonitoringLevel.ALL) +finally: + viz_thread.stop() +``` +This piece of code is responsible for running a webserver at `localhost:8080` on a separate thread. You can read more about configuring the webserver on [Panel website](https://panel.holoviz.org/). + +### Prepare Docker +Finally, you need to create the `docker-compose.yml` and `Dockerfile` to configure Docker to run Kafka and both your Pathway applications - Dashboard and Data Streamer. +The following `Dockerfile` for running Pathway apps will be responsible for installing Pathway and copying files: +``` +FROM python:3.11 + +RUN pip install pathway +COPY . . +``` + +The deployment is defined in `docker-compose.yml`. It begins with the configuration of Kafka: +``` +version: '3' + +services: + zookeeper: + image: confluentinc/cp-zookeeper:5.5.3 + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + kafka: + image: confluentinc/cp-kafka:5.5.3 + depends_on: + - zookeeper + environment: + KAFKA_AUTO_CREATE_TOPICS: true + KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_HOST_NAME: kafka + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT + KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT + CONFLUENT_SUPPORT_METRICS_ENABLE: false + command: sh -c "((sleep 15 && kafka-topics --create --zookeeper zookeeper:2181 --replication-factor 1 --partitions 1 --topic tickers)&) && /etc/confluent/docker/run " +``` + +Then you add two containers for `dashboard.py` and `kafka-data-streamer.py`. Container for the dashboard requires port forwarding, so that the web server can be accessed. The `sleep` lines make sure that the processes start after Kafka is initialized. +``` + dashboard: + build: + context: ./ + depends_on: + - kafka + ports: + - 8080:8080 + command: sh -c "sleep 10 && python dashboard.py" + + data-streamer: + build: + context: ./ + depends_on: + - kafka + command: sh -c "sleep 10 && python kafka-data-streamer.py" +``` + +Once the Kafka configuration is prepared you need to update Kafka settings in `dashboard.py` and `data-streamer.py`. In `dashboard.py` change the lines +defining `rdkafkasettings` to: +```python +rdkafka_consumer_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "kafka-group-0", + "auto.offset.reset": "earliest", +} +``` +and in `kafka-data-streamer.py` to: +``` +rdkafka_producer_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", +} +``` + +### Running the dashboard + +You are ready to run your application with Docker. +``` +docker compose -f "docker-compose.yml" build +docker compose -f "docker-compose.yml" up +``` + +Once the containers are running you will get logs in the terminal, including ones from Pathway applications. + + + +To see the dashboard go to `localhost:8080` in your web browser. + + + +Now you have a web dashboard and events pushed to Slack! + +## Conclusions + +Welcome to the Manul pack: starting with static data exploration, you have built and deployed a dashboarding data application updating in realtime using streaming data! 
You have integrated Kafka, a production grade streaming data broker, and have implemented an alerting functionality. You now know how to run a full Data Science project in Pathway, from idea to deployment. diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/30.linear_regression_with_kafka.md b/docs/2.developers/4.user-guide/020.exploring-pathway/30.linear_regression_with_kafka.md new file mode 100644 index 00000000..a854eb84 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/30.linear_regression_with_kafka.md @@ -0,0 +1,316 @@ +--- +title: 'Linear regression on a Kafka stream' +description: '' +date: '2022-12-23' +thumbnail: '' +tags: ['tutorial', 'data-pipeline'] +keywords: ['linear regression', 'Kafka', 'upstash'] +--- + + +# Doing a linear regression on data from Kafka +Start doing ML on a streaming data source with Pathway. + +In this article, we are going to see how to do a simple linear regression on streaming data from Kafka. +This article can be seen as an extension of our [realtime sum using CSV connectors](/developers/user-guide/introduction/first_realtime_app_with_pathway/). + + + +We have a data stream of data points $(x_i,y_i)$, and we want to compute a [simple linear regression](https://en.wikipedia.org/wiki/Simple_linear_regression) on those points: we want to compute the two parameters $(a,b)$ so that, for each point $(x_i,y_i)$, $y_i$ can be approximated by $y_i \approx a + b \times x_i$. + +We are not going to explain the mathematical details here, but you can find all the details in the [Wikipedia article](https://en.wikipedia.org/wiki/Simple_linear_regression). + +## Connectors + +First, we need a connector to connect to our input data streams and receive values on which the regression will be computed. +In this article, we will set up a Kafka connector. + +To be able to reproduce this example, you may want to use [upstash](https://upstash.com/) which provides a free Kafka instance. + +To use a Kafka connector, we need to set all the Kafka parameters by using a dictionary, following the format of [librdkafka](https://github.com/confluentinc/librdkafka/blob/master/CONFIGURATION.md). +We also need to define a Kafka topic on which to connect onto: we will go with `"linear-regression"`. +Here is an example of settings to connect to Kafka using SASL-SSL authentication over SCRAM-SHA-256 mechanism: + +```python +rdkafka_settings = { + "bootstrap.servers": "server-address:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "group.id": "$GROUP_NAME", + "session.timeout.ms": "6000", + "sasl.username": "username", + "sasl.password": "********", +} +``` + +You need, of course, to replace the server address and the associated credentials. + +With this, setting the connector is straightforward, you just need to specify the topic and the schema for your table: + +```python +class InputSchema(pw.Schema): + x: float + y: float + + +t = pw.io.kafka.read( + rdkafka_settings, + topic="linear-regression", + schema=InputSchema, + format="csv", + autocommit_duration_ms=1000 +) +``` + +We used the `csv` format, but there are two other ways to read from Kafka: `raw` which reads a table with only one column `data` in which the whole message is dumped and `json` which reads JSON messages. You can see more about this connector in its dedicated [tutorial](/developers/user-guide/connectors/kafka_connectors/). In our case we expect CSV messages. 
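For comparison, if the producer sent JSON messages such as `{"x": 1.0, "y": 1.1}` instead of CSV rows, the read could hypothetically look like the sketch below (the rest of this article sticks to the CSV format):

```python
t = pw.io.kafka.read(
    rdkafka_settings,
    topic="linear-regression",
    schema=InputSchema,
    format="json",  # each message is a JSON object whose keys match the schema columns
    autocommit_duration_ms=1000,
)
```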
+ +💡 If you only want to test the linear regression, without generating a data stream through Kafka, you can directly use our stream generator: +```python +t = pw.demo.noisy_linear_stream() +``` + +For the output, we use a CSV connector, which is set up as follows: + +```python +pw.io.csv.write(t, "regression_output_stream.csv") +``` + +For more details on how this connector works, checkout the [tutorial](/developers/user-guide/connectors/csv_connectors/) about it. + +## Doing a linear regression +To do the regression, we need to compute the sum of the $x_i$, of the $x_i^2$, of the $y_i$ and of the $x_i\times y_i$ and the total number of data points received until then. +This is done as follows: + +```python +t = t.select( + *pw.this, + x_square=t.x * t.x, + x_y=t.x * t.y +) +statistics_table = t.reduce( + count=pw.reducers.count(), + sum_x=pw.reducers.sum(t.x), + sum_y=pw.reducers.sum(t.y), + sum_x_y=pw.reducers.sum(t.x_y), + sum_x_square=pw.reducers.sum(t.x_square), +) +``` + +Then we can compute the estimation of $a$ and $b$: + +```python +def compute_a(sum_x, sum_y, sum_x_square, sum_x_y, count): + d = count * sum_x_square - sum_x * sum_x + if d == 0: + return 0 + else: + return (sum_y * sum_x_square - sum_x * sum_x_y) / d + +def compute_b(sum_x, sum_y, sum_x_square, sum_x_y, count): + d = count * sum_x_square - sum_x * sum_x + if d == 0: + return 0 + else: + return (count * sum_x_y - sum_x * sum_y) / d + +results_table = statistics_table.select( + a=pw.apply(compute_a, **statistics_table), + b=pw.apply(compute_b, **statistics_table), +) +``` + +## Creating the input stream + +> You can skip this section if you use our stream generator `pw.demo.noisy_linear_stream()` + +To use the Kafka connector, we have to follow a few rules. +First, the Kafka connector expects the first message to contain the names of the columns. +The connector will not properly work without this message, however it must be sent only once: if sent twice, the second message will be treated like a normal row. + +We can use the KafkaProducer API provided by Kafka to send message using Python: + +```python + +producer = KafkaProducer( + bootstrap_servers=["server-address:9092"], + sasl_mechanism="SCRAM-SHA-256", + security_protocol="SASL_SSL", + sasl_plain_username="username", + sasl_plain_password="********", +) +producer.send(topic, ("x,y").encode("utf-8"), partition=0) +producer.send( + "linear-regression", ("0,0").encode("utf-8"), partition=0 +) +producer.send( + "linear-regression", ("1,1").encode("utf-8"), partition=0 +) +producer.close() +``` + +This code sample sends $(0,0)$ and $(1,1)$ and then closes the Kafka Producer. +For our example, we are going to send more messages containing different pairs $(x,y)$ which are samples from the line $y=x$. +However, for the example not to be too simple, we are going to add a small random error to each $y$. + +> Note that, depending on your version of Kafka, you may need to specify the API version to make this code work: `api_version=(0,10,2)`. + +## Gathering everything into one piece +The final version of our project contains two files: `realtime_regression.py` which processes the stream using Pathway and `generating_kafka_stream.py` which generates the streams. 
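For reference, the resulting project layout is simply:

```
.
├── realtime_regression.py
└── generating_kafka_stream.py
```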
+ +Here is `realtime_regression.py`: + +```python [realtime_regression.py] +import pathway as pw + +rdkafka_settings = { + "bootstrap.servers": "server-address:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "group.id": "$GROUP_NAME", + "session.timeout.ms": "6000", + "sasl.username": "username", + "sasl.password": "********", +} + +class InputSchema(pw.Schema): + x: float + y: float + + +t = pw.io.kafka.read( + rdkafka_settings, + topic="linear-regression", + schema=InputSchema, + format="csv", + autocommit_duration_ms=1000, +) +pw.io.csv.write(t, "regression_input.csv") + +t += t.select( + x_square=t.x * t.x, + x_y=t.x * t.y, +) +statistics_table = t.reduce( + count=pw.reducers.count(), + sum_x=pw.reducers.sum(t.x), + sum_y=pw.reducers.sum(t.y), + sum_x_y=pw.reducers.sum(t.x_y), + sum_x_square=pw.reducers.sum(t.x_square), +) + +def compute_a(sum_x, sum_y, sum_x_square, sum_x_y, count): + d = count * sum_x_square - sum_x * sum_x + if d == 0: + return 0 + else: + return (sum_y * sum_x_square - sum_x * sum_x_y) / d + +def compute_b(sum_x, sum_y, sum_x_square, sum_x_y, count): + d = count * sum_x_square - sum_x * sum_x + if d == 0: + return 0 + else: + return (count * sum_x_y - sum_x * sum_y) / d + +results_table = statistics_table.select( + a=pw.apply(compute_a, **statistics_table), + b=pw.apply(compute_b, **statistics_table), +) + +pw.io.csv.write(results_table, "regression_output_stream.csv") +pw.run() +``` + +Don't forget the `pw.run()` otherwise no computation will be done! +Once `pw.run()` is called, the computation will be run forever until it gets killed. + +And the `generating_kafka_stream.py`: + + +```python [generating_kafka_stream.py] +from kafka import KafkaProducer +import time +import random + +topic = "linear-regression" + +random.seed(0) +def get_value(i): + return i + (2 * random.random() - 1)/10 + +producer = KafkaProducer( + bootstrap_servers=["server-address:9092"], + sasl_mechanism="SCRAM-SHA-256", + security_protocol="SASL_SSL", + sasl_plain_username="username", + sasl_plain_password="********", +) +producer.send(topic, ("x,y").encode("utf-8"), partition=0) + +time.sleep(5) +for i in range(10): + time.sleep(1) + producer.send( + topic, (str(i) + "," + str(get_value(i))).encode("utf-8"), partition=0 + ) + +producer.close() +``` + +## Output + +There are two outputs in this project: the CSV file `regression_input.csv` which keeps all the updates received from Kafka and the CSV file `output_stream.csv` in which all the successive updates of the sum values are displayed. + +As in our [previous example](/developers/user-guide/introduction/first_realtime_app_with_pathway/), the outputs are tables of changes. +Each new message of Kafka triggers a new computation and the new values are output in the CSV files! + +First, we can check that the generated values are correct: + +```shellscript +x,y,time,diff +"0","0.06888437030500963",0,1 +"1","1.0515908805880605",1,1 +"2","1.984114316166169",2,1 +"3","2.9517833500585926",3,1 +"4","4.002254944273722",4,1 +"5","4.980986827490083",5,1 +"6","6.056759717806955",6,1 +"7","6.9606625452157855",7,1 +"8","7.995319390830471",8,1 +"9","9.016676407891007",9,1 +``` + +We obtain ten values which are sampled around the $y=x$ line. 
Let's check the regression we obtain: + +```shellscript +a,b,time,diff +0,0,0,1 +0,0,1,-1 +0.06888437030500971,0.9827065102830508,1,1 +0.06888437030500971,0.9827065102830508,2,-1 +0.07724821608916699,0.9576149729305795,2,1 +0.0769101730536299,0.9581220374838857,3,1 +0.07724821608916699,0.9576149729305795,3,-1 +0.05833884879671927,0.9766933617407955,4,1 +0.0769101730536299,0.9581220374838857,4,-1 +0.05087576879874134,0.9822906717392795,5,1 +0.05833884879671927,0.9766933617407955,5,-1 +0.03085078333935821,0.9943056630149089,6,1 +0.05087576879874134,0.9822906717392795,6,-1 +0.03085078333935821,0.9943056630149089,7,-1 +0.03590542987734715,0.9917783397459139,7,1 +0.03198741430177742,0.9934574892783012,8,1 +0.03590542987734715,0.9917783397459139,8,-1 +0.025649728471303895,0.9958341214647295,9,1 +0.03198741430177742,0.9934574892783012,9,-1 +``` + +We obtain close values to what we expect ($a=0$ and $b=1$). +You can play the values (number of samples, error, linear function to approximate etc.) to see how the algorithm reacts. + + +## To go further +Congrats, you are now able to use Pathway with Kafka and do some non-trivial computation! + +Why not try to do some more advanced computation such as linear regression with several explanatory variables? +Or you may want to do some [classification](/developers/showcases/lsh/lsh_chapter1)? diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/40.migrate-from-pandas.md b/docs/2.developers/4.user-guide/020.exploring-pathway/40.migrate-from-pandas.md new file mode 120000 index 00000000..9d61af3f --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/40.migrate-from-pandas.md @@ -0,0 +1 @@ +.from_pandas/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/50.realtime-analytics-with-CDC.md b/docs/2.developers/4.user-guide/020.exploring-pathway/50.realtime-analytics-with-CDC.md new file mode 100644 index 00000000..cd3e8af1 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/50.realtime-analytics-with-CDC.md @@ -0,0 +1,476 @@ +--- +title: 'Database connectors' +description: 'Tutorial on Database connectors' +date: '2023-01-19' +thumbnail: '/assets/content/blog/th-database-connectors.png' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'database', 'PostgreSQL', 'CDC', 'Debezium', 'Kafka', 'Zookeeper', 'Docker'] +--- + + +# Using database connectors +Connect Pathway on top of your PostgreSQL/Debezium database. + + +In this tutorial, you will learn how to set up an architecture using Pathway on top of PostgreSQL/Debezium. +The aim is to show you how to use Pathway to perform real time processing of the changes of your PostgreSQL database. +Traditional databases, such as PostgreSQL, are not designed for streaming scenarios: we need a change data capture ([CDC](https://en.wikipedia.org/wiki/Change_data_capture)) mechanism to monitor a database and stream the changes. +To create our input stream from PostgreSQL, we will use [Debezium](https://debezium.io/) and its associated Pathway's connector. +The output stream will be sent back to the PostgreSQL database using Pathway's PostgreSQL connector. + +This tutorial is a bit longer than the other ones, as you will need to deploy several containers using `docker-compose` to have a working example. +If you have never used `docker-compose`, don't be afraid! 
+This tutorial does not require any knowledge about `docker-compose`: we will provide all the required settings in order to make it as pleasant as possible 😉, you will see that it makes the deployment of the database quite easy. + + +If you have your own PostgreSQL and Debezium instances already deployed, you may skip the architecture part to focus on the input and output connectors. + + + +## Short version +Let's consider a simple scenario: you have a SQL table `values` with only one column `value` which has new entries, and you want to compute the sum of the values to be stored in another table `sum_table`. The changes in the table `values` are captured and streamed by Debezium. + +You can do it as follows in Pathway: + +```python [sum.py] +import pathway as pw + +# Debezium settings +input_rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", +} +# PostgreSQL settings +output_postgres_settings = { + "host": "postgres", + "port": "5432", + "dbname": "values_db", + "user": "user", + "password": "password", +} + +# We define a schema for the table +# It set all the columns and their types +class InputSchema(pw.Schema): + value: int + + +# We use the Debezium connector to listen to the "postgres.public.values" topic +t = pw.io.debezium.read( + input_rdkafka_settings, + topic_name="postgres.public.values", + schema=InputSchema, +) + +# # We compute the sum (this part is independent of the connectors). +t = t.reduce(sum=pw.reducers.sum(t.value)) + +# We use the PostgreSQL connector to send the resulting output stream containing the sum +pw.io.postgres.write(t, output_postgres_settings, "sum_table") + +# We launch the computation. +pw.run() +``` + +## Architecture + +First, let's take a moment to study the architecture of our project. + +You need a PostgreSQL database with two tables: the tables `values` on which regular updates will be performed, creating the input data stream, and the `sum_table` output in which the sum will be stored and periodically updated by Pathway. + +Then, you need to set up our CDC to capture the changes in the PostgreSQL database and create a stream of it. +This part is done by a Debezium instance. + +The updates from Debezium are sent to Kafka, which relies on ZooKeeper, which then propagates them in Pathway. + +Finally, Pathway receives the updates from Kafka, processes the data, and sends the sum back to PostgreSQL. + +![Architecture](/assets/content/documentation/debezium-architecture.svg) + +### Docker containers + +To summarize, our puzzle has five pieces: + - PostgreSQL + - Debezium + - Kafka + - ZooKeeper + - Pathway + +No need to say that installing those from scratch can be a hurdle... + +Fortunately, we can use Docker containers and Docker Compose to ease the deployment of our project. +In a nutshell, a container is a virtual environment in which we can run standalone applications. +As an example, the PostgreSQL container you will use in this project contains a minimal distribution to run a PostgreSQL database: it is lightweight and PostgreSQL is already installed, but it may not work for other purposes. +In our case, the main advantages of using docker containers is the simplicity of the installation and the setting. +If you are interested in Docker containers, or if you want to adapt this example to your project, checkout the [doc](https://docs.docker.com/). + +You will use a `docker-compose.yml` in the root of this project to set it up. 
+Such a file is organized as follows: + +```docker-compose +version: "3.7" +services: + postgres: + build: + environment: + volumes: + kafka: + build: + ... +``` + +Each application, called a service here, is defined there with all its relevant parameters, for example parameters in `build` define how to build the container (which image and so on). Note that `postgres` here is the name given to the service, whether a PostgreSQL image is used is determined by the `build` parameter. + +### Postgres + +You first need to configure our database: + +```docker-compose + postgres: + container_name: db_tuto_postgres + image: debezium/postgres:13 + ports: + - 5432:5432 + environment: + - POSTGRES_USER=user + - POSTGRES_PASSWORD=password + - POSTGRES_DB=values_db + - PGPASSWORD=password + volumes: + - ./sql/init-db.sql:/docker-entrypoint-initdb.d/init-db.sql + - ./sql/update_db.sh:/update_db.sh +``` + +Most of the variables are self-explanatory. +The only subtlety here is how the database is created and updated. +The container will use the script in `/docker-entrypoint-initdb.d/` to initialize the database. +Docker-compose copies files into the container using volumes. +We create a file `init-db.sql` in which we create our two tables `values` and `sum_table`: + +```sql [./sql/init-db.sql] +CREATE TABLE IF NOT EXISTS values ( + value integer NOT NULL +); + +CREATE TABLE IF NOT EXISTS sum_table ( + sum REAL NOT NULL, + time BIGINT NOT NULL, + diff INTEGER NOT NULL +); +``` + +You also need a script `update_db.sh` to insert data into the table, to create the stream: + +```shellscript [./sql/update_db.sh] +#!/bin/bash +export PGPASSWORD='password' + +for LOOP_ID in {1..1000} +do + psql -d values_db -U user -c "INSERT INTO values VALUES ($LOOP_ID);" + sleep 0.5 +done +``` + +⚠️ This file should have executable rights. + +We will copy this file at the root, not at `/docker-entrypoint-initdb.d/` as we want to launch it manually. + +### Debezium + +For Debezium, we need to configure ZooKeeper and Kafka. +For all of those, we are going to use very specific images which considerably limit the number of settings to do. + +#### ZooKeeper + +```docker-compose + zookeeper: + container_name: db_tuto_zookeeper + image: confluentinc/cp-zookeeper:5.5.3 + environment: + ZOOKEEPER_CLIENT_PORT: 2181 +``` + +#### Kafka + +Kafka will be connected to ZooKeeper. +Thanks to docker compose, all the containers share the same network. +To connect to a given service, we just need to use its name: +to connect to our ZooKeeper container we only need to write `"zookeeper:2181"`. 
+ +```docker-compose + kafka: + container_name: db_tuto_kafka + image: confluentinc/cp-enterprise-kafka:5.5.3 + depends_on: [zookeeper] + environment: + KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 +``` + +#### Debezium + +```docker-compose + debezium: + container_name: db_tuto_debezium + image: debezium/connect:1.4 + environment: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: connect_configs + OFFSET_STORAGE_TOPIC: connect_offsets + volumes: + - ./debezium/connector.sh:/kafka/connector.sh + depends_on: [kafka] + ports: + - 8083:8083 +``` + +We need to connect the Debezium connector to our PostgreSQL database: + +```shellscript [./debezium/connector.sh] +#!/bin/bash + +curl -H 'Content-Type: application/json' debezium:8083/connectors --data ' +{ + "name": "values-connector", + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "plugin.name": "pgoutput", + "database.hostname": "postgres", + "database.port": "5432", + "database.user": "user", + "database.password": "password", + "database.dbname" : "values_db", + "database.server.name": "postgres", + "table.include.list": "public.values" + + } +}' +``` + +⚠️ This file should have executable rights. + +Copy this script at the root of the container to execute it manually. + + +### Pathway + +Now you need to set up the container for Pathway. +Pathway does not have its own docker image, so you are going to use a Dockerfile to configure our container. + +```docker-compose + pathway: + container_name: db_tuto_pathway + build: + context: . + dockerfile: ./pathway-src/Dockerfile + depends_on: [kafka, postgres] +``` + +In our Dockerfile, we use a Python image. +You just need to use the pip install command to install Pathway. + +```DockerFile [./pathway-src/Dockerfile] +FROM --platform=linux/x86_64 python:3.10 + +RUN pip install -U pathway +RUN pip install kafka-python +COPY ./pathway-src/sum.py sum.py + +CMD ["python", "-u", "sum.py"] +``` + +⚠️ For compatibility reasons, we use a x86_64 Linux container. + +The file `sum.py` is the entry of our container: it will be automatically launched when the container has started. +Once it is terminated, the container is stopped. +Our pipeline will be defined in this file. + + +That's it! +That's a large docker-compose file, but when you think that this is enough to launch the 5 containers and make them work together, it is quite impressive! + +### Makefile + +To launch the containers, we only need to run `docker-compose up` in the same directly as `docker-compose.yaml`. +Nevertheless, it may be more simple to have a dedicated Makefile: + +```Makefile [./Makefile] +build: + chmod +x ./debezium/connector.sh + chmod +x ./sql/update_db.sh + docker-compose up -d + sleep 5 + docker-compose exec debezium ./connector.sh + docker-compose exec postgres ./update_db.sh + +stop: + docker-compose down -v + docker rmi tutorial-example-pathway:latest +``` + +You can launch the experiment with `make` and stop it with `make stop`. + +Now you only need to do our pipeline in `sum.py`. + + +## Debezium input connector + +**Data stream**: +For the input connector, the stream should be in the form of Debezium messages received on a given topic. +Each received update is atomic, and triggers the update of the pipeline created in Pathway. 
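For intuition, a (heavily simplified) Debezium change event for an insertion into the `values` table looks roughly like the message below; the exact envelope depends on the Debezium version and connector configuration:

```json
{
  "payload": {
    "before": null,
    "after": { "value": 42 },
    "source": { "db": "values_db", "table": "values" },
    "op": "c"
  }
}
```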
+

Note that Debezium connectors only work in [streaming mode](/developers/user-guide/connecting-to-data/streaming-and-static-modes/).

⚠️ We talk about Debezium messages but this is a simplification.
Debezium works with Kafka: in practice, the connector should be connected to Kafka.
The main difference with the regular Kafka connector is the expected formatting of the messages.

**Usage**:
The Debezium input connector `pw.io.debezium.read` takes several arguments:
- `rdkafka_settings`: the settings used to connect to the Kafka instance receiving the Debezium messages; they follow the format of [librdkafka](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md),
- `topic_name`: the topic which is listened to,
- `schema`: the schema of the table. It defines the column names and their types. It also defines the primary keys.
- `autocommit_duration_ms`: the maximum time between two commits. Every `autocommit_duration_ms` milliseconds, the updates received by the connector are committed and pushed into Pathway's dataflow.


⚠️ Note that a Debezium connector listens to *only one topic*.

**Example**:
```python
class InputSchema(pw.Schema):
    value: int
t = pw.io.debezium.read(
    input_rdkafka_settings,
    topic_name="postgres.public.values",
    schema=InputSchema,
    autocommit_duration_ms=100
)
```

The optional parameter `types`, used to cast the input values, relies on specific `pw.Type` values: `types={"value": pw.Type.INT}`.

## PostgreSQL output connector

The output connector adds the updates made to a table `t` to a given PostgreSQL table.


**Usage**:
The output connector takes the following arguments:
- `table`: the Pathway table to send to PostgreSQL,
- `postgres_settings`: the settings used to connect to the PostgreSQL database (host, port, database name, and credentials, as in the example above),
- `table_name`: the PostgreSQL table to which the updates are sent.

```python
pw.io.postgres.write(t, output_postgres_settings, "sum_table")
```
Every time the table `t` is updated, the changes will be automatically appended to the PostgreSQL table `sum_table`.

⚠️ The table should already be created in PostgreSQL: the creation is not done by Pathway.
**The table must include all the columns of the table `t`, as well as two extra columns: `time` and `diff`.**
The columns `time` and `diff` express the timestamp of the update and whether the update is an insertion or a deletion (an update is simply the simultaneous suppression of the old value along with the insertion of the new one).

In our example, we must create the `sum_table` using this SQL command:
```sql
CREATE TABLE IF NOT EXISTS sum_table (
    sum REAL NOT NULL,
    time BIGINT NOT NULL,
    diff INTEGER NOT NULL
);
```


## Complete example

To summarize, we have a project with the following structure:

```
.
├── debezium/
│   └── connector.sh
├── pathway-src/
│   ├── Dockerfile
│   └── sum.py
├── sql/
│   ├── init-db.sql
│   └── update_db.sh
├── docker-compose.yml
└── Makefile
```

Except for `sum.py`, all the files have been explained previously.
The full example is accessible in our [public repository](https://github.com/pathwaycom/pathway/tree/main/examples/projects/debezium-postgres-example).
+ +Let's see in more details how the pipeline is done in Pathway in `sum.py`: + +```python [./pathway-src/sum.py] +import pathway as pw + +input_rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", +} +output_postgres_settings = { + "host": "postgres", + "port": "5432", + "dbname": "values_db", + "user": "user", + "password": "password", +} + +class InputSchema(pw.Schema): + value: int + + +t = pw.io.debezium.read( + input_rdkafka_settings, + topic_name="postgres.public.values", + schema=InputSchema, + autocommit_duration_ms=100 +) + +t = t.reduce(sum=pw.reducers.sum(t.value)) + +pw.io.postgres.write(t, output_postgres_settings, "sum_table") + +pw.run() +``` + +Don't forget to run the computation with `pw.run()`, otherwise nothing will happen. +Once `pw.run()` is called, the computation will be run forever until it gets killed. +If you need some reminder on Pathway operations, don't hesitate to take a look at our [First steps guide](/developers/user-guide/data-transformation/table-operations/). + +To launch the computation you just need to run `make` in the root of the project, it will launch all the containers, initialize the database and start adding new values. +Every addition in the table `values` in PostgreSQL will trigger an update, through Debezium, and Pathway will send an update to the table `sum_table`. + +To monitor the changes, you can log in the PostgreSQL container: + +`docker-compose exec postgres psql values_db -U user -W` + +After typing your password, you should be able to see all the updates to the table `sum_table`. + +To see the latest value of the sum, we just need to type: + +```sql +SELECT sum FROM sum_table ORDER BY time DESC, diff DESC LIMIT 1; +``` + +This value is updated in **realtime** by Pathway. + +In you want more details, you can see the 10 latest updates by typing: + +```sql +SELECT * FROM sum_table ORDER BY time DESC, diff DESC LIMIT 10; +``` + +To learn more about what those updates mean, don't hesitate to read the [explanation provided in our first example](/developers/user-guide/introduction/first_realtime_app_with_pathway#understanding-the-output). 
diff --git a/docs/2.developers/4.user-guide/020.exploring-pathway/_dir.yml b/docs/2.developers/4.user-guide/020.exploring-pathway/_dir.yml new file mode 100644 index 00000000..9a8aa341 --- /dev/null +++ b/docs/2.developers/4.user-guide/020.exploring-pathway/_dir.yml @@ -0,0 +1 @@ +title: 'Pathway In Action' diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.datatypes/__init__.py b/docs/2.developers/4.user-guide/025.types-in-pathway/.datatypes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.datatypes/article.py b/docs/2.developers/4.user-guide/025.types-in-pathway/.datatypes/article.py new file mode 100644 index 00000000..8c8fcede --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/.datatypes/article.py @@ -0,0 +1,432 @@ +# --- +# title: Basic Data Types +# description: An article explaining how to use basic data types +# date: '2024-01-26' +# tags: ['tutorial', 'table'] +# keywords: ['type', 'schema'] +# notebook_export_path: notebooks/tutorials/basic_datatypes.ipynb +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Basic Data Types +# This guide is about basic data types in Pathway: it covers the list of basic data types that can be used in Pathway, explores several available conversion methods, and wraps up with examples of operators that require a column of specific data type as input. + +# Currently, Pathway allows using the following basic Python types: `bool`, `str`, `int`, `float`, and `bytes`. Additionally, there is support for types `datetime` and `duration` from `datetime` module, distinguishing between `utc datetimes` and `naive datetimes`. Finally, Pathway also introduces an additional type for columns representing pointers, called `Pointer`. Below, you can find an example table with six columns, one example column for each of the basic Python types, and one column of type Pointer. The complex types (as datetime) need some conversion, and they are covered in [the later part of the article](/developers/user-guide/types-in-pathway/datatypes#temporal-data-types). +# The standard way to define a type in Pathway is to use a [`schema`](/developers/api-docs/pathway/#pathway.Schema) (you can learn more about schemas in this [article](/developers/user-guide/types-in-pathway/schema/)): +# %% +import datetime + +import pathway as pw + + +class SimpleTypesInputSchema(pw.Schema): + bool_column: bool + str_column: str + bytes_column: bytes + int_column: int + float_column: float + +example_table = pw.debug.table_from_markdown( + ''' + | bool_column | str_column | bytes_column | int_column | float_column + 1 | True | example | example | 42 | 42.16 + 2 | False | text | text | 16 | -16.42 + ''', schema = SimpleTypesInputSchema +).with_columns(id_in_column = pw.this.id) + +pw.debug.compute_and_print(example_table, include_id = False) +# %% [markdown] + +# %% +print(example_table.schema) + +# %% [markdown] +# ## Implicit Typing +# By default, you don't need to worry about they types of columns created with `select` or `with_columns`. The expressions used in those operators have defined output type, Pathway knows it, and assigns the types of new columns automatically. 
+# In the example below, the new column is of type `float`, as it is a result of multiplication of +# `int` column with a `float` constant. + +# %% +example_table = pw.debug.table_from_markdown( + ''' + | int_number + 1 | 16 + 2 | 42 + ''' +) +example_table += example_table.select(should_be_float_number = example_table.int_number*0.5) +pw.debug.compute_and_print(example_table, include_id=False) +# %% [markdown] +# As you can see, the type of int_number is `int`, and the new column is of type `float`. +# %% +print(example_table.schema) + +# %% [markdown] +# Similarly, the special columns produced by some of the Pathway operators (examples in the later part of the article) have fixed types and as such, you don't need to bother with the types of those columns. + +# ## Apply With Type +# Sometimes you may want to compute a value of a column, using e.g. a function from an external library, that does not define the output type explicitly. In this case, you can use either [`pw.apply`](/developers/api-docs/pathway/#pathway.apply) or [`pw.apply_with_type`](/developers/api-docs/pathway/#pathway.apply_with_type). The first creates a new column of type `any` and the other requires you to specify the type of the output of function that is applied. + +# ## Data types for columns storing text and unstructured data +# In Pathway you can store unstructured data either as `str` or as `bytes`. Both can be converted to other data types, either by built in methods (some examples in this article) or by user defined functions (i.e. via [`pw.apply`](/developers/api-docs/pathway/#pathway.apply) or [`pw.apply_with_type`](/developers/api-docs/pathway/#pathway.apply_with_type)). + +# ### Type `str` +# %% +class StrExampleInputSchema(pw.Schema): + text: str + +str_table = pw.debug.table_from_markdown( + ''' + | text + 1 | cd + 2 | dd + ''', schema = StrExampleInputSchema +) +# %% [markdown] +# Below is an example of conversion from `str` to `bytes`. Currently, there is no built-in conversion method. The recommended way is to use `apply_with_type`. +# %% +str_table = str_table.with_columns(text_as_bytes = pw.apply_with_type(lambda x: x.encode("utf8"), bytes, str_table.text)) + +pw.debug.compute_and_print(str_table, include_id=False) +# %% [markdown] + +# %% +print(str_table.schema) +# %% [markdown] +# ### Module `str` +# Furthermore, Pathway provides a [string module](/developers/api-docs/pathway/#pathway.internals.expressions.StringNamespace) containing string operations. Among other things, it provides several methods that allow parsing converting `str` to other simple types, accessible via the `str` namespace of column (e.g. `table_name.column_name.str.parse_*`). You can find examples of usage of those methods in the [remaining part of this article](/developers/user-guide/types-in-pathway/datatypes#parse-numbers-from-str). + +# ### Type `bytes` +# %% +class BytesExampleInputSchema(pw.Schema): + bytes_from_markdown: bytes + +bytes_table = pw.debug.table_from_markdown( + ''' + | bytes_from_markdown + 1 | cd + 2 | dd + ''', schema = BytesExampleInputSchema +) + +# %% [markdown] +# Below is an example of conversion from `bytes` to `str`. Currently, there is no built-in conversion method. The recommended way is to use `apply_with_type`. Remark: the `to_string` function does not decode the bytes, but shows a string representation of byte numbers. 
+# %% +bytes_table = bytes_table.with_columns( + text_from_bytes = pw.apply_with_type(lambda x: x.decode("utf8"), str, bytes_table.bytes_from_markdown), + text_representation_of_bytes = bytes_table.bytes_from_markdown.to_string() +) + +pw.debug.compute_and_print(bytes_table, include_id=False) +# %% [markdown] +# +# %% +print(bytes_table.schema) + +# %% [markdown] +# ## Numerical Data Types +# Pathway supports operations on Python `int` and `float` types, and on their `numpy` counterparts. Below, you can find a few short examples that read and convert numbers in Pathway. +# +# ### Type `int` +# %% +class IntExampleInputSchema(pw.Schema): + int_number: int + +int_table = pw.debug.table_from_markdown( + ''' + | int_number + 1 | 2 + 2 | 3 + ''', schema = IntExampleInputSchema +) + +# %% [markdown] +# Similarly, as in the conversion between `str` and `bytes`, you can use `apply_with_type` to convert a column of type `int` into a column of type `float`. Furthermore, it can be expressed in a more concise way, with `apply`. Moreover, in this case you can also use the built-in `cast` function. All mentioned examples can be found in the code snippet below: +# %% +int_table = int_table.with_columns( + int_as_float = pw.apply_with_type(lambda x: float(x), float, int_table.int_number), + int_as_float_via_constructor = pw.apply(float, int_table.int_number), + int_as_float_casted = pw.cast(float, int_table.int_number) +) + + +pw.debug.compute_and_print(int_table, include_id=False) +# %% [markdown] + +# %% +print(int_table.schema) + +# %% [markdown] +# ### Type `float` +# %% +class FloatExampleInputSchema(pw.Schema): + float_number: float + another_float_number: float + +float_table = pw.debug.table_from_markdown( + ''' + | float_number | another_float_number + 1 | 2 | -5.7 + 2 | 3 | 6.6 + ''', schema = FloatExampleInputSchema +) + +# %% [markdown] +# As in the case of conversion from `int` to `float`, you can use `pw.cast` to convert data from type `float` to `int`. +# %% +float_table = float_table.with_columns(another_number_as_int = pw.cast(int, float_table.another_float_number)) +print(float_table.schema) + +# %% [markdown] +# ### Parse numbers from `str` +# Below, you can find an application of the parsing methods from the `str` namespace ([`parse_int`](/developers/api-docs/pathway/#pathway.internals.expressions.StringNamespace.parse_int) and [`parse_float`](/developers/api-docs/pathway/#pathway.internals.expressions.StringNamespace.parse_float)) to parse ints and floats for columns of type `str`. + +# %% +class StrNumberExampleInputSchema(pw.Schema): + number: str + +str_number_table = pw.debug.table_from_markdown( + ''' + | number + 1 | 2 + 2 | 3 + ''', schema = StrNumberExampleInputSchema +) + +str_number_table = str_number_table.with_columns( + number_as_int = str_number_table.number.str.parse_int(), + number_as_float = str_number_table.number.str.parse_float(), + number_with_extra_text = str_number_table.number + "a" +) + +pw.debug.compute_and_print(str_number_table) + +# %% [markdown] +# As you can see, the schema shows that the original column was of type `str`, and each new column has a different type, as expected. 
+ +# %% +print(str_number_table.schema) + +# %% [markdown] +# ### Numerical Module +# In case you need to use some basic operations on columns of numerical type, Pathway provides a [module](/developers/api-docs/pathway/#pathway.internals.expressions.NumericalNamespace) containing functions over numerical data types such as [`abs`](/developers/api-docs/pathway/#pathway.internals.expressions.NumericalNamespace.abs) or [`round`](/developers/api-docs/pathway/#pathway.internals.expressions.NumericalNamespace.round). + +# %% [markdown] +# ## Temporal Data Types +# In Pathway, temporal data types (`datetime.datetime`) are complex data types with some representation as some simple type (as `int` or `str`). As such, you first need to load the input as simple type, and only then convert it to temporal type. +# Similarly to Python, Pathway distinguishes between [naive datetime](/developers/api-docs/pathway/#pathway.DateTimeNaive) (not aware of timezones) and [UTC datetime](/developers/api-docs/pathway/#pathway.DateTimeUtc) (aware of time zones). +# Below, you can find examples of reading both kinds of datetime, initially provided as `str` and `int`, using methods from the Pathway [`dt` module](/developers/api-docs/pathway/#pathway.internals.expressions.DateTimeNamespace): + +# %% +class DatetimeNaiveExampleInputSchema(pw.Schema): + t1: str + t2: int + +naive_datetime = pw.debug.table_from_markdown( + """ + | t1 | t2 + 0 | 2023-05-15T10:13:00 | 1684138380000 + """, schema = DatetimeNaiveExampleInputSchema +) +fmt = "%Y-%m-%dT%H:%M:%S" +naive_datetime = naive_datetime.with_columns( + dt1 = naive_datetime.t1.dt.strptime(fmt=fmt), + dt2 = naive_datetime.t2.dt.from_timestamp("ms") +) + +naive_datetime = naive_datetime.with_columns( + difference = naive_datetime.dt1 - naive_datetime.dt2 +) + +pw.debug.compute_and_print(naive_datetime) + +print(naive_datetime.schema) +# %% [markdown] + +# %% +utc_datetime = pw.debug.table_from_markdown( + """ + | t1 | t2 + 0 | 2023-05-15T10:13:00+01:00 | 1684138380000 + """, schema = DatetimeNaiveExampleInputSchema +) + +fmt = "%Y-%m-%dT%H:%M:%S%z" +utc_datetime = utc_datetime.with_columns( + dt1 = utc_datetime.t1.dt.strptime(fmt=fmt), + dt2 = utc_datetime.t2.dt.utc_from_timestamp("ms") +) + +utc_datetime = utc_datetime.with_columns( + difference = utc_datetime.dt1 - utc_datetime.dt2 +) + +pw.debug.compute_and_print(utc_datetime) + +print(utc_datetime.schema) + +# %% [markdown] +# ## Type `bool` +# Below, you can find a piece of code reading and converting boolean data. +# %% +class BoolExampleInputSchema(pw.Schema): + boolean_column: bool + +bool_table = pw.debug.table_from_markdown( + ''' + | boolean_column + 1 | True + 2 | False + ''', schema = BoolExampleInputSchema +) + + +bool_table = bool_table.with_columns(bool_as_str = bool_table.boolean_column.to_string()) +bool_table = bool_table.with_columns(bool_as_str_as_bool_parse = bool_table.bool_as_str.str.parse_bool()) +pw.debug.compute_and_print(bool_table, include_id=False) +print(bool_table.schema) + +# %% [markdown] +# Warning: please do not use cast to convert boolean data type. While it is possible to call it, its behavior is counterintuitive and will be deprecated. Below, we demonstrate the odd behavior. +# %% +bool_table = bool_table.with_columns(bool_as_str_as_bool_cast = pw.cast(bool, bool_table.bool_as_str)) +pw.debug.compute_and_print(bool_table, include_id=False) +print(bool_table.schema) + +# %% [markdown] +# ## Optional Data Types +# Sometimes, you don't have a guarantee that the data is always present. 
To accommodate such columns, Pathway provides support for the `Optional` data type. More precisely, whenever you expect a column to have values of type `T` that are not necessarily always present, the type of the Pathway column storing this data should be `Optional[T]`, which can also be denoted as `T | None`. Below, you can find a short example of a column with optional floats and two conversion methods.
# %%
class OptInputSchema(pw.Schema):
    opt_float_num: float | None

t = pw.debug.table_from_markdown(
    """
    | opt_float_num
1   | 1
2   | 2
3   | None
""",
    schema=OptInputSchema,
)

pw.debug.compute_and_print(t, include_id=False)
print(t.schema)

# %% [markdown]
# To obtain a column with a non-optional type, you can filter out the missing values using `filter` and `is_not_none`:
# %%
t1 = t.filter(t.opt_float_num.is_not_none()).rename_columns(float_num = t.opt_float_num)
pw.debug.compute_and_print(t1, include_id=False)
print(t1.schema)
# %% [markdown]
# A more general way of making the type non-optional is via `unwrap`. The code below is equivalent to the application of `filter` and `is_not_none()` above.
# %%

t2 = t.filter(t.opt_float_num != None)
t2 = t2.with_columns(float_num = pw.unwrap(t2.opt_float_num)).without(t2.opt_float_num)
pw.debug.compute_and_print(t2, include_id=False)
print(t2.schema)

# %% [markdown]
# ## Operators with Type Constraints
# Pathway provides several operators that require their input columns to have specific types. The input types are constrained because the operations are not defined for all types: e.g., temporal operators require time-like input columns, the sort operator requires the data to be sortable, and `diff` requires that two elements of the considered type can be subtracted.

# ### Temporal operators
# An example of a temporal operator is the `windowby` operator. Its first argument is `time_expr`: the operator interprets this column as the time associated with each row and then uses it according to the window type and temporal behavior defined in the other parameters. Since this column is supposed to represent time, the accepted types are `int`, `float`, and `datetime`, as they can all reasonably be used to do so. In the example below, the `windowby` operator uses a column with naive `datetime`.
# %%
import datetime  # needed for datetime.timedelta below

fmt = "%Y-%m-%dT%H:%M:%S"

table = pw.debug.table_from_markdown(
    """
    | time                | number
 0  | 2023-06-22T09:12:34 | 2
 1  | 2023-06-22T09:23:56 | 2
 2  | 2023-06-22T09:45:20 | 1
 3  | 2023-06-22T09:06:30 | 1
 4  | 2023-06-22T10:11:42 | 2
"""
).with_columns(time=pw.this.time.dt.strptime(fmt))

result = table.windowby(
    table.time,
    window=pw.temporal.tumbling(duration=datetime.timedelta(minutes=30)),
).reduce(
    window_start = pw.this._pw_window_start,
    chocolate_bars=pw.reducers.sum(pw.this.number),
)

pw.debug.compute_and_print(result, include_id=False)

# %% [markdown]
# ### Sorting Operator
# Another example of an operator that accepts type-constrained columns is `sort`. It requires that the values in the column can be sorted (i.e., the column has a type with a total order). Currently, it can be used with all simple types; however, keep in mind that comparing elements of type `str` or `bytes` may be slow, so it's generally not recommended.
+
# %%
table_to_sort = pw.debug.table_from_markdown('''
    value | value_str
    1     | de
    2     | fg
    3     | cd
    4     | ab
    5     | ef
    6     | bc
''')

sorted_by_value = table_to_sort.sort(table_to_sort.value) + table_to_sort
print(sorted_by_value.schema)

# %% [markdown]

# %%
sorted_by_value_str = table_to_sort.sort(table_to_sort.value_str) + table_to_sort
print(sorted_by_value_str.schema)

# %% [markdown]
# ### Diff
# Below are a few examples demonstrating the `diff` operator. Essentially, it sorts the table with respect to one column and then, for a chosen value column, subtracts the previous row's value from the current one. As such, it constrains two kinds of columns: the ordering column has the same requirements as for the `sort` operator, while the value column must contain elements that can be subtracted. Currently, among the simple types, subtraction can be done on elements of type `int`, `float`, and `datetime`.

# %%
table = pw.debug.table_from_markdown('''
    timestamp | values | values_str
    1         | 1      | fg
    2         | 2      | ef
    3         | 4      | de
    4         | 7      | cd
    5         | 11     | bc
    6         | 16     | ab
    ''')
table1 = table + table.diff(pw.this.timestamp, pw.this.values)
print(table1.schema)
pw.debug.compute_and_print(table1, include_id=False)


table = table.with_columns(date = table.values.dt.from_timestamp("ms"))

table2 = table + table.diff(pw.this.timestamp, pw.this.date)
print(table2.schema)
pw.debug.compute_and_print(table2, include_id=False)

table3 = table + table.diff(pw.this.values_str, pw.this.values)
print(table3.schema)
pw.debug.compute_and_print(table3, include_id=False)

# %% [markdown]
# In particular, trying to compute differences of the `values_str` column, whose elements cannot be subtracted, causes the following error:
# ```
# TypeError: Pathway does not support using binary operator sub on columns of types <class 'str'>, <class 'str'>.
# ```



 diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.json_type/__init__.py b/docs/2.developers/4.user-guide/025.types-in-pathway/.json_type/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.json_type/article.py b/docs/2.developers/4.user-guide/025.types-in-pathway/.json_type/article.py new file mode 100644 index 00000000..3b43c9a0 --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/.json_type/article.py @@ -0,0 +1,172 @@ +# ---
+# title: Dealing with JSON data type
+# description: An article explaining how to use JSON in Pathway
+# date: '2023-12-22'
+# thumbnail: '/assets/content/blog/th-json.png'
+# tags: ['tutorial', 'table']
+# keywords: ['JSON', 'type', 'schema']
+# notebook_export_path: notebooks/tutorials/json_type.ipynb
+# jupyter:
+#   jupytext:
+#     formats: py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.16.0
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---

+# %% [markdown]
+# # Handling JSON in Pathway
+#
+# JSON is a widely used format for data interchange due to its simplicity and readability. Upon finishing this article, managing JSON in Pathway should become effortlessly intuitive.
+#
+# As an example, we'll use JSON objects loaded directly from a Python list. However, JSON data can come from various sources that support this format, such as [Kafka](/developers/user-guide/connectors/kafka_connectors) or an [HTTP connector](/developers/api-docs/pathway-io/http/#pathway.io.http.rest_connector).
+# %% +rows = [ + ( + 1, + { + "author": {"id": 1, "name": "Haruki Murakami"}, + "books": [ + {"title": "Norwegian Wood", "year": 1987}, + { + "title": "Kafka on the Shore", + "year": 2002, + "category": "Literary Fiction", + }, + ], + }, + ), + ( + 2, + { + "author": {"id": 2, "name": "Stanisław Lem"}, + "books": [ + {"title": "Solaris", "year": 1961, "category": "Science Fiction"}, + {"title": "The Cyberiad", "year": 1967, "category": "Science Fiction"}, + ], + }, + ), + ( + 3, + { + "author": {"id": 3, "name": "William Shakespeare"}, + "books": [ + {"title": "Hamlet", "year": 1603, "category": "Tragedy"}, + {"title": "Macbeth", "year": 1623, "category": "Tragedy"}, + ], + }, + ), +] + +# _MD_COMMENT_START_ +import logging + +# %% [markdown] +# Each JSON object carries information about an author and their associated books. To load it, let's establish a [schema](/developers/user-guide/types-in-pathway/schema/#understanding-data-types-and-schemas) reflecting the data's structure and then proceed to load this data into a table. +# %% +import pathway as pw + +logging.basicConfig(level=logging.CRITICAL) +# _MD_COMMENT_END_ + + +class InputSchema(pw.Schema): + key: int + data: pw.Json + + +table = pw.debug.table_from_rows(schema=InputSchema, rows=rows) + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(table) +# _MD_COMMENT_END_ +# _MD_SHOW_table + + +# %% [markdown] +# Pathway enables manipulation of JSON from two perspectives: expressions and [user-defined functions](/developers/api-docs/pathway#pathway.udf). Let's examine each one separately. + +# %% [markdown] +# ## Working with JSONs using expressions +# +# ### Accessing JSON fields +# +# A column of type [`pw.Json`](/developers/api-docs/pathway/#pathway.Json) enables access to its attributes using the index operator (`[]`). This operator accepts an index in the form of a string for JSON objects, an integer for JSON arrays, or an expression evaluating to one of these types. If there's no element at the index or if the value is `pw.Json.NULL`, it returns `pw.Json.NULL`, making this operator convenient for chaining. + +# %% +books = table.select(author=pw.this.data["author"]["name"], books=pw.this.data["books"]) +# _MD_COMMENT_START_ +pw.debug.compute_and_print(books) +# _MD_COMMENT_END_ +# _MD_SHOW_books + + +# %% [markdown] +# Alternatively, a `get()` method can be used to access `JSON` attributes. This method allows defining a custom default value. + +# %% +sample = table.select( + author=pw.this.data["author"]["name"], + title=pw.this.data["books"][0]["title"], + category=pw.this.data["books"][0].get("category", default=pw.Json("Uncategorized")), +) +# _MD_COMMENT_START_ +pw.debug.compute_and_print(sample) +# _MD_COMMENT_END_ +# _MD_SHOW_sample + + +# %% [markdown] +# ### Converting to simple types +# +# `JSON` column can be converted into `Optional[T]` where `T` is one of the simple types, using methods: [`as_int()`](/developers/api-docs/pathway/#pathway.ColumnExpression.as_int), [`as_str()`](/developers/api-docs/pathway/#pathway.ColumnExpression.as_str), [`as_float()`](/developers/api-docs/pathway/#pathway.ColumnExpression.as_float), [`as_bool()`](/developers/api-docs/pathway/#pathway.ColumnExpression.as_bool). 
+ +# %% +books.select(author=pw.unwrap(pw.this.author.as_str()).str.upper()) +# _MD_COMMENT_START_ +pw.debug.compute_and_print( + books.select(author=pw.unwrap(pw.this.author.as_str()).str.upper()) +) +# _MD_COMMENT_END_ + +# %% [markdown] +# ### Flatten +# +# You can utilize the [`flatten()`](/developers/api-docs/pathway-table/#pathway.internals.table.Table.flatten) operator specifically on columns that contain JSON arrays. It's a useful tool when working with complex JSON structures. + +# %% +flat_list = books.flatten(pw.this.books) +# _MD_COMMENT_START_ +pw.debug.compute_and_print(flat_list) +# _MD_COMMENT_END_ +# _MD_SHOW_flat_list + +# %% [markdown] +# ## JSON in UDFs + + +# %% [markdown] +# Pathway enables manipulation of JSON using [user-defined functions](/developers/api-docs/pathway/#pathway.udf). Just like with expressions, the index operator (`[]`) and methods allowing conversion into specific types are available. It's crucial to note that this conversion is strict— attempting to convert incompatible data will result in an exception. + + +# %% +@pw.udf +def transform(data: pw.Json) -> pw.Json: + return {"century": (data["year"].as_int()) // 100 + 1, **data.as_dict()} + + +# _MD_COMMENT_START_ +pw.debug.compute_and_print( + flat_list.select(title=pw.this.books["title"], metadata=transform(pw.this.books)) +) +# _MD_COMMENT_END_ +# _MD_SHOW_flat_list.select(title=pw.this.books["title"], metadata=transform(pw.this.books)) + +# %% [markdown] +# Further details about `pw.Json` functionality are available in the dedicated [API documentation](/developers/api-docs/pathway/#pathway.Json). diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/.gitignore b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/.gitignore new file mode 100644 index 00000000..287859db --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/.gitignore @@ -0,0 +1 @@ +myschema.py diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/__init__.py b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/article.py b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/article.py new file mode 100644 index 00000000..f17decb9 --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/article.py @@ -0,0 +1,131 @@ +# --- +# title: 'Automatic generation of Schema class' +# description: 'Tutorial on generating Schema from sample CSV file or definition in JSON' +# date: '2023-11-08' +# thumbnail: '/assets/content/blog/th-custom-connectors.png' +# tags: ['tutorial', 'table'] +# keywords: ['schema', 'type', 'JSON', 'connectors'] +# notebook_export_path: notebooks/tutorials/schema-generation.ipynb +# --- + +# # Automatic Generation of Schema in Pathway +# +# In this article, you will learn how to easily generate Schemas, which are used to define the +# structure of a Pathway Table. + + +# In Pathway, Schemas are used to define the structure of a Table, that is, names and types +# of columns. To learn the basics of Schemas in Pathway, and how you can create them by writing +# a class definition in Python, read our [user guide](/developers/user-guide/types-in-pathway/schema/). 
+# In this tutorial, you will learn alternative ways to create Schemas, either by providing a dictionary of column definitions or sample input data in CSV format. + +# To run the examples in this tutorial, you can download the needed files by uncommenting and executing the following code: + +# %%capture --no-display +# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/schema-generation-sample-data.csv -O data.csv +# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/schema-generation-schema.json -O schema.json + + +# ## Schema generation from JSON +# +# In Pathway, you can build a Schema from a dictionary using the +# [schema_builder function](/developers/user-guide/types-in-pathway/schema/#schema-as-a-dictionary), described in our user guide. +# The values in the dictionary given as an input to `pw.schema_builder`, however, are instances of `pw.column_definition`, which has some limitations, e.g. they can't be easily serialized in a JSON file. For this purpose, in Pathway there is a `pw.schema_from_dict` function, which takes +# as an input a dictionary whose keys are names of the columns, and values are either: +# - type of a column +# - a dictionary with keys "dtype", "primary_key", "default_value", which define respectively the type of the column, whether it is a primary key of the Table, and what is its default value. + +# To see a working example of `pw.schema_from_dict`, start by creating a JSON file with a definition of the Schema you want to generate. + +# ```JSON +# { +# "key": { +# "dtype": "int", +# "primary_key": true +# }, +# "name": { +# "dtype": "str", +# "default_value": "" +# }, +# "value": "int" +# } +# ``` + + +# Now, you need to load it into Python, where it will be represented as a `dict` type. + +# + +import json + +with open("schema.json") as f: + schema_definition = json.load(f) +# - + +# Then all you need to do is pass it as an argument to `pw.schema_from_dict` which will return a Schema object. + +# + +import pathway as pw + +schema = pw.schema_from_dict(schema_definition) + +# Check the resulting Schema +schema, schema.primary_key_columns(), schema.default_values() +# - + +# You can now use the created Schema as an argument to the connector you want to use. + +# ## Schema generation from data in a CSV file +# +# If you have a CSV file with sample data, you can use it to generate a schema in Pathway. The name of columns will be taken from the header of the CSV file, whereas types of columns are inferred by checking if values in the given column can be parsed to int or float. This method of Schema generation does not support choosing primary keys or setting default values. Still, once you generate a Schema, you can generate a class definition using the method described later in this tutorial and then edit it to suit your needs. +# +# To generate a Schema based on a CSV file, use a `pw.schema_from_csv` function. 
To see how it works, you will use the following example data in CSV format:
# ```csv
# age,owner,pet
# 10,Alice,dog
# 9,Bob,dog
# 8,Alice,cat
# 7,Bob,dog
# ```

# To use `pw.schema_from_csv`, you only need to provide the path of the CSV file, but it also has a number of optional arguments:
# - name - the name of the resulting Schema class
# - properties - an instance of `SchemaProperties`
# - delimiter - the delimiter used in the CSV file
# - comment_character - the character used to denote that a row is a comment
# - escape - the escape character used in the CSV file
# - quote - the character used to quote fields
# - enable_double_quote_escapes - enables escaping quotes by using double quotes
# - num_parsed_rows - how many rows should be parsed. If None, all rows will be parsed.

# For the example, you can use the default values of the optional arguments, so the Python code is:

# +
schema = pw.schema_from_csv("data.csv")

# Check the resulting Schema
schema
# -

# ## Persisting generated Schema as a Python class definition
#
# If you have a Schema object, you may choose to generate a class definition, either
# to make the codebase independent of the files needed to generate the Schema, or to
# modify it, for example by adding default values to a Schema generated based on a CSV file.
#
# To do that, you can use a method in the Schema class called `generate_class` to generate a string with a class definition, or `generate_class_to_file` to generate a class definition and save it to a file.

# Let's go through an example of using `generate_class` and `generate_class_to_file`. In the example, you will work on the schema generated in the section on `schema_from_dict`.

with open("schema.json") as f:
    schema_definition = json.load(f)
schema = pw.schema_from_dict(schema_definition)

# The first method - `generate_class` - has no required arguments and two optional arguments: `class_name`, the name of the class with the generated Schema, and `generate_imports`, which specifies whether imports of the modules used in the class definition should be included at the beginning of the string. If `class_name` is not provided, the schema's name will be used; if that is not a correct identifier, the default name `CustomSchema` will be used.

print(schema.generate_class(class_name="MySchema"))

# The second method - `generate_class_to_file` - has one required argument, which is the path where the class definition is to be saved. Its optional arguments `class_name` and `generate_imports` are the same as for the `generate_class` method.
+ +schema.generate_class_to_file("myschema.py", class_name="MySchema", generate_imports=True) +with open("myschema.py") as f: + print(f.read()) diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/data.csv b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/data.csv new file mode 100644 index 00000000..b582cf32 --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/data.csv @@ -0,0 +1,5 @@ +age,owner,pet +10,Alice,dog +9,Bob,dog +8,Alice,cat +7,Bob,dog \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/schema.json b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/schema.json new file mode 100644 index 00000000..976ebd76 --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/.schema_generation/schema.json @@ -0,0 +1,11 @@ +{ + "key": { + "dtype": "int", + "primary_key": true + }, + "name": { + "dtype": "str", + "default_value": "" + }, + "value": "int" +} \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/10.schema.md b/docs/2.developers/4.user-guide/025.types-in-pathway/10.schema.md new file mode 100644 index 00000000..9880de76 --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/10.schema.md @@ -0,0 +1,235 @@ +--- +title: Defining Schema +description: 'Defining schema in Pathway' +--- + +# Data Types and Schemas +In this guide, you will explore how to effectively utilize data types and schemas. + + +## Understanding Data Types and Schemas + +In Pathway, data is represented in the form of tables. +The structure of each table is defined by a schema, which serves as a blueprint for the data. +The schema ensures that the column types are correctly preserved, regardless of variations in the incoming data. + +Typically, Pathway automatically infers the schema, but there are cases where enforcing a specific schema for input proves useful. + +Here is a simple example on how to define a schema in Pathway: + +```python +import pathway as pw + +class InputSchema(pw.Schema): + colA: int + colB: float + colC: str +``` + +## Schema Usage in Pathway +Schemas play an important role in Pathway by allowing you to declare constraints on tables via input connectors. +You can declare the following attributes within a schema: + - **Columns**: Select the desired columns for your table; any undeclared columns will be ignored by the connectors. + - **Columns Types**: Define the data type for each column, Pathway will automatically convert the input data accordingly. + - **Primary Keys**: Set primary keys to determine the indexes. If no primary keys are defined, indexes will be generated automatically. + - **Default values**: Specify default values for columns, making it easier to handle missing data. + + +## How to Define and Use a Schema + +To create a schema, you need to define a class that inherits from `pathway.Schema` or `pw.Schema`, depending on your import. +In the following we will use `pw.Schema`. +Each column is declared as an attribute of the class. The schema is then passed as a parameter to the input connector: + +```python +class InputSchema(pw.Schema): + value: int + + +table = pw.io.csv.read("./input/", schema=InputSchema) +``` +The above example defines a table with only one column named `value` of type `int`. 
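
As noted above, any column present in the input but not declared in the schema is ignored by the connector.
For instance, assuming the CSV files in `./input/` also contain an extra `comment` column (a hypothetical input used only for illustration), the resulting table would still only have the `value` column:

```python
import pathway as pw

# Hypothetical content of a file in ./input/:
# value,comment
# 1,first entry
# 2,second entry

class InputSchema(pw.Schema):
    value: int


table = pw.io.csv.read("./input/", schema=InputSchema)
# `table` only contains the declared `value` column;
# the undeclared `comment` column is dropped by the connector.
```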
+

## Defining Multiple Columns
You can declare multiple columns in a schema by simply adding them as attributes to your class:

```python
class MyFirstTwoColumnsSchema(pw.Schema):
    colA: int
    colB: int
```

## Typing the Columns
To assign data types to columns, simply specify the desired types for the associated attributes in your class:

```python
class TypedSchema(pw.Schema):
    colA: int
    colB: float
```

With `pw.Schema`, you have to type the columns.
If you don't know the type of the input column, you can type the column as `typing.Any`:

```python
class TypedSchema(pw.Schema):
    colA: typing.Any
```

⚠️ While tempting, `any` is not a Python type but a function.
Be careful to use `typing.Any`, as using `any` will raise a `ValueError`.

## Defining Primary Keys
To designate primary keys, use the `column_definition` function and the `primary_key` parameter:

```python
class PrimarySchema(pw.Schema):
    colA: int = pw.column_definition(primary_key=True)
    colB: float
```

In this example, the index will be based on the `colA` column.

You can select multiple columns to be part of the primary key:

```python
class MultiplePrimarySchema(pw.Schema):
    colA: int = pw.column_definition(primary_key=True)
    colB: float
    colC: str = pw.column_definition(primary_key=True)
```

## Defining Default Values
Similarly to primary keys, you can set default values using the `column_definition` function and the `default_value` parameter:

```python
class DefaultValueSchema(pw.Schema):
    colA: int = pw.column_definition(default_value=0)
    colB: float
    colC: str = pw.column_definition(default_value="Empty")
```



## Inline Schema Definitions
When defining a class is not practical, for example when automating schema definitions,
Pathway offers alternative approaches using inline schema definitions.

### Schema from Dictionary
You can define a schema using a dictionary through the [`schema_builder`](/developers/api-docs/pathway/#pathway.schema_builder) function.
`schema_builder` takes a `columns` argument, which is a dictionary that maps column names to column definitions created using the `column_definition` function.
The `dtype` parameter of `column_definition` is used to specify the type; if not provided, it defaults to `typing.Any`.
Additionally, if desired, you can assign a name to the schema using the optional `name` parameter.


```python
schema = pw.schema_builder(columns={
  'key': pw.column_definition(dtype=int, primary_key=True),
  'data': pw.column_definition(dtype=int, default_value=0)
  }, name="my_schema")
```

The resulting schema is equivalent to the following class-based schema:

```python
class InputSchema(pw.Schema):
    key: int = pw.column_definition(primary_key=True)
    data: int = pw.column_definition(default_value=0)
```

With `schema_builder`, defining the type is optional. The default type is `typing.Any`:

```python
schema = pw.schema_builder(columns={
  'key': pw.column_definition(dtype=int, primary_key=True),
  'data': pw.column_definition()
  }, name="my_schema")
table = pw.io.csv.read("./input/", schema=schema)
print(table.typehints())
```

```
{'key': <class 'int'>, 'data': <class 'object'>}
```

### Schema from Types
For simple cases where you only need to define types, and neither default values nor primary keys, you can use [`schema_from_types`](/developers/api-docs/pathway/#pathway.schema_from_types).
+

`schema_from_types` simply takes the types as `field=type` kwargs:

```python
schema = pw.schema_from_types(key=int, data=int)
```

The resulting schema is equivalent to the following class-based schema:

```python
class InputSchema(pw.Schema):
    key: int
    data: int
```

## Accessing Table Types
During debugging, you may need to inspect the schema of a table.
You can do this by printing its `typehints`:

```python
print(table.typehints())
```

This will display the data type of each column in the table.
For example:

```
{'age': <class 'int'>, 'owner': <class 'str'>, 'pet': <class 'str'>}
```

You can also print the type of a single column by using `schema`:

```python
print(table.schema['age'])
```

```
<class 'int'>
```

⚠️ Please note that these functions are executed during the creation
of the pipeline, *before* any computation is launched by `pw.run()`.

## Type Casting an Existing Table
You may also want to cast the data of an existing table.
This can be done using `cast`:

```python
table = table.select(value = pw.cast(int, pw.this.value))
```
This will cast the values of the column `value` to `int`.

### Typing a Column Created with `apply`
If Pathway fails to infer the correct type of a column created with `apply`,
you can enforce the resulting type with `apply_with_type`:

```python
table = table.select(
    value = pw.apply_with_type(lambda x: int(x)+1, int, pw.this.value)
)
```
This will convert the values to integers and increment each value by one,
resulting in the `value` column being of type `int`.

This is only a workaround since Pathway should be able to correctly infer your data type.


## Conclusions
Mastering data types and schemas is essential for effectively managing Tables.
By leveraging schemas, you can define the structure of your tables, improving the efficiency of your data pipeline.
If you encounter typing issues, please contact us on [discord](https://discord.com/invite/pathway), so we can help you.
\ No newline at end of file diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/20.datatypes.md b/docs/2.developers/4.user-guide/025.types-in-pathway/20.datatypes.md new file mode 120000 index 00000000..e88c16d1 --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/20.datatypes.md @@ -0,0 +1 @@ +.datatypes/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/30.schema-generation.md b/docs/2.developers/4.user-guide/025.types-in-pathway/30.schema-generation.md new file mode 120000 index 00000000..53b6bdff --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/30.schema-generation.md @@ -0,0 +1 @@ +.schema_generation/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/025.types-in-pathway/40.json_type.md b/docs/2.developers/4.user-guide/025.types-in-pathway/40.json_type.md new file mode 120000 index 00000000..8cfc2ef8 --- /dev/null +++ b/docs/2.developers/4.user-guide/025.types-in-pathway/40.json_type.md @@ -0,0 +1 @@ +.json_type/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/.connectors/__init__.py b/docs/2.developers/4.user-guide/030.connecting-to-data/.connectors/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/.connectors/article.py b/docs/2.developers/4.user-guide/030.connecting-to-data/.connectors/article.py new file mode 100644 index 00000000..3228567f --- /dev/null +++ b/docs/2.developers/4.user-guide/030.connecting-to-data/.connectors/article.py @@ -0,0 +1,166 @@ +# ---
+# title: 'Connectors in Pathway'
+# description: 'Presentation of Pathway connectors.'
+# ---

+# # Connectors in Pathway
+#
+# In order to use Pathway, one of the first things you need to do is to access the data you want to manipulate.
+# In Pathway, accessing the data is done using **connectors**.
+#
+# Pathway comes with connectors which connect to external data sources at input, as well as connectors which output the data outside of Pathway.
+# Before reading any further, make sure you are already familiar with the [streaming and static modes](/developers/user-guide/connecting-to-data/streaming-and-static-modes/) of Pathway as the connectors are different depending on the chosen mode.
+#
+# Before going into more details about the different connectors and how they work, here is a summary of available connectors, grouped by mode and whether they are input or output connectors:
#
# |  | Streaming mode | Static mode |
# | --- | --- | --- |
# | **Input connectors** | Airbyte, Kafka, CSV, Debezium, Amazon S3, http, JSON Lines, Redpanda, Python, Plain text, File System, SQLite | Markdown, Pandas, CSV, Amazon S3, JSON Lines, File System |
# | **Output connectors** | Kafka, BigQuery, PostgreSQL, Elastic Search, http, JSON Lines, Redpanda, Logstash, Slack, File System | `pw.debug.compute_and_print`, `pw.debug.compute_and_print_update_stream`, CSV |
+

# The one you need is not in the table? Don't worry, more are coming and you can always reach out to let us know what you would find helpful!

# ## Formatting
#
# Connectors support different formats, such as CSV or JSON.
# The supported formats depend on the connectors. However, **all the connectors support the binary format**.

# ## Streaming mode connectors
#
# In the streaming mode, input connectors wait for incoming updates.
# Whenever an update is received, it is propagated in the dataflow (more details [here](/developers/user-guide/connecting-to-data/streaming-and-static-modes/)) until it reaches the output connectors, which output the resulting changes.
# This is where it becomes interesting: the table created by the input connector, and all the computations based on it, will be automatically updated whenever an update is received (e.g. a new CSV file has been created).
# All the computations and outputs are automatically updated by Pathway to take into account the updates from the stream, without requiring any operation on your part: this is the magic of Pathway!
# In practice, the updates are triggered by commits, which ensure the atomicity of each update.
#
# As Pathway deals with never-ending streaming data, the computation runs forever until the process gets killed.
# **This is the normal behavior of Pathway.**
#
# Output connectors are the only way to access the results obtained in Pathway in the streaming mode.
# However, the outputs are not static but are updated with every received update: Pathway does not output the full table but only the changes.
# Every change is represented by a row containing the following columns:
# * the columns of the table, representing the values which have been modified.
# * `time`, representing the logical time of the update: this time is incremented at each new commit.
# * `diff`, representing whether this update is an addition or a removal. Only two values are possible: 1 for an addition and -1 for a removal.
# An update is represented by two rows: one deleting the previous value and one adding the new one.
#
# To see how to set up a realtime streaming application with Pathway, you can try our examples with [csv](/developers/user-guide/introduction/first_realtime_app_with_pathway/) and [Kafka](/developers/showcases/linear_regression_with_kafka) input connectors.

# ## Connectors in static mode
#
# In the static mode, the computation is done in batch mode: all the data is read, processed, and output at once.
# There is no notion of update in this mode.
# **This mode is mainly used for debugging and testing.**
#
# In addition to a CSV connector which dumps the output table into a CSV file, Pathway provides a function which builds the graph, ingests all the data, and prints a given table contained in the graph: `pw.debug.compute_and_print`.

# ## Persistence in connectors
#
# Regardless of the mode, a connector can persist the data it reads along with the results of some intermediate computations. This feature is enabled by specifying a persistence config in the `pw.run` method. If the connector is persistent, the Pathway program will preserve its auxiliary data and will be able to restart from the place where it was terminated last time. This feature may be useful when you need to re-run a program with some added data, or when you want to be able to survive code crashes.
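#
# For instance, a minimal sketch of enabling persistence could look as follows. The `pw.persistence` helpers and the `./PersistenceStorage/` path used here are assumptions made for illustration; check the persistence API documentation for the exact setup matching your Pathway version:
#
# ```python
# # Hypothetical sketch: keep the computation state on the local filesystem,
# # so that a restarted program can resume from where it previously stopped.
# backend = pw.persistence.Backend.filesystem("./PersistenceStorage/")  # assumed path
# pw.run(persistence_config=pw.persistence.Config(backend))
# ```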
+ +# ### Example +# +# Let's have a quick example on how to manipulate hand-written table in the static mode: + +# + +import pathway as pw + +t = pw.debug.table_from_markdown( + """ + | name | age + 1 | Alice | 15 + 2 | Bob | 32 + 3 | Carole| 28 + 4 | David | 35 """ +) + +pw.debug.compute_and_print(t) +# - + +# ## Compatibility issues +# +# Both modes are incompatible: **you cannot mix connectors from streaming and static modes**. +# Indeed, the nature of the data manipulated by both types of connectors is very different: data streams versus static data. +# +# For instance, one might want to print the value of their table `table` in their pipeline to check if the values are correct. +# Between two `select`, a `pw.debug.compute_and_print(table)` is inserted and the computation is run with streaming input connectors. +# +# What do you think would happen? +# +# The program loops. Indeed, `pw.debug.compute_and_print` waits for the whole data to be ingested entirely in the dataflow before printing the table. +# This makes sense with finite static data but not in the streaming mode where updates are continuously coming! +# +# Be careful when you want to debug your pipeline with static data! + +# ## Tutorials +# +# To learn how to use the different connectors, you can see our tutorials: +# * [CSV connectors](/developers/user-guide/connectors/csv_connectors/) +# * [Database connectors](/developers/user-guide/connectors/database-connectors/) +# * [Kafka connectors](/developers/user-guide/connectors/kafka_connectors/) +# * [Switch from Kafka to Redpanda](/developers/user-guide/connectors/switching-to-redpanda/) +# * [Python input connector](/developers/user-guide/connectors/custom-python-connectors) +# * [Python output connector](/developers/user-guide/connectors/python-output-connectors) + +# ## Conclusion +# +# Connectors are a vital part of Pathway as they define how your data is accessed to and from Pathway. +# It is important to make the distinction between input/output and streaming/static connectors as they have very different purposes and cannot be mixed. +# +# Pathway provides several connectors, allowing you to connect to your data in different settings in a simple and efficient way. +# We will regularly update this section and provide more connectors. +# +# You can see one of our recipes to see how a [full data processing pipeline](/developers/showcases/suspicious_activity_tumbling_window) works with connectors. diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/.connectors/example-stream.csv b/docs/2.developers/4.user-guide/030.connecting-to-data/.connectors/example-stream.csv new file mode 100644 index 00000000..4148166f --- /dev/null +++ b/docs/2.developers/4.user-guide/030.connecting-to-data/.connectors/example-stream.csv @@ -0,0 +1,6 @@ +,date,amount,recipient,sender,recipient_acc_no,sender_acc_no +0,2020-06-04,8946,M. Perez,Jessie Roberts,HU30186000000000000008280573,ES2314520000000006226902 +1,2014-08-06,8529,C. Barnard,Mario Miller,ES8300590000000002968016,PL59879710390000000009681693 +2,2017-01-22,5048,S. Card,James Paletta,PL65889200090000000009197250,PL46193013890000000009427616 +3,2020-09-15,7541,C. Baxter,Hector Haley,PL40881800090000000005784046,DE84733500000003419377 +4,2019-05-25,3580,L. 
Prouse,Ronald Adams,PL44124061590000000008986827,SI54028570008259759 diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/.streaming-and-static-modes/__init__.py b/docs/2.developers/4.user-guide/030.connecting-to-data/.streaming-and-static-modes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/.streaming-and-static-modes/article.py b/docs/2.developers/4.user-guide/030.connecting-to-data/.streaming-and-static-modes/article.py new file mode 100644 index 00000000..5987f966 --- /dev/null +++ b/docs/2.developers/4.user-guide/030.connecting-to-data/.streaming-and-static-modes/article.py @@ -0,0 +1,223 @@ +# --- +# title: "Streaming and static modes" +# description: '' +# --- + +# # Streaming and static modes +# While Pathway is made for processing bounded and unbounded streaming data, entirely static data can also be used for testing and debugging purposes. This article explains what are those two modes -streaming and static- and their differences. +# +# Pathway is purposely designed to work with streaming data. +# However, working in the **streaming mode** may not be the most convenient way to test and debug your application. +# To ease testing and debugging, Pathway provides a **static mode** which allows you to manipulate static and finite data. +# In the following, you are going to see what lies behind those terms, their differences, and how to use both modes. +# +# ## Dataflow +# +# Let's start with a brief explanation of Pathway's [dataflow](/developers/user-guide/introduction/concepts#dataflow). +# In Pathway, the processing pipeline that you define is modeled using a graph. +# This graph, called the dataflow, models the different transformation steps performed on the data. +# Each node is either an operation performed on one or more tables (e.g. a transformer), or a connector. +# +# +# For example, consider the following scenario: you have two tables containing the ages (in T1) and the countries (in T2) of different people and you want to compute the list of people from a given country (let's say the US) and the list of people in this country and whether they are adults or not. +# In Pathway, you can build a function which works on two tables T1 and T2 as follows: + +# ```python +# T1bis = T1.select(*pw.this, adult=pw.apply(lambda x: x>18, pw.this.age)) +# T2bis = T2.filter(pw.this.country == "US") +# T3 = T1bis.join(T2bis, pw.left.name == pw.right.name).select( +# pw.left.name, pw.left.adult +# ) +# ``` + +# In practice, you would need two input connectors to create T1 and T2, you can use the previous sample of code to build T2bis and T3, and then use output connectors to output the resulting tables outside of Pathway. +# The dataflow of such a pipeline would be looking like this: +# +# ![Universe](/assets/content/documentation/graph.svg) +# +# The corresponding notations would be: T1' $\rightarrow$ T1bis, T2' $\rightarrow$ T2bis, t1 $\rightarrow$ `select` with `apply`, t2 $\rightarrow$ `filter`, and t3 $\rightarrow$ `join` and `select`. +# +# The main take-away is that Pathway builds this graph based on the pipeline defined by the user, before any computation is actually done. +# This graph models the relations between the different connectors, tables, and transformers. +# Its main purpose is to provide fast updates in streaming mode. +# +# +# ## Streaming mode +# In the **streaming mode**, Pathway assumes unbounded input data updates. 
+
# This mode requires input connectors listening to streaming data sources.
# Pathway starts to ingest the data into the dataflow only when `pw.run()` is called, and the data is maintained in the graph during the computation: results cannot be printed but should be accessed using output connectors.
# The computation runs until the process is killed: everything after `pw.run()` is unreachable code.
#
#
# In this example, T1 and T2 could be obtained using Kafka connectors, and T2bis and T3 could be output using PostgreSQL connectors.
#
# Pathway uses the graph to provide fast updates: instead of ingesting the data from scratch in the graph every time an update is received, the data is maintained in the graph and locally updated.
# For instance, the reception of an update from Input Data 1 would only modify T1, T1', and T3, without any impact on T2 and T2'.
# The updates are processed as they are received, without any notion of batches, providing realtime streaming processing.
#
# Pathway is designed to be run in this streaming mode, and it is the standard way to run Pathway.
#
# ## Static mode
# As briefly mentioned, the streaming mode may not be the most convenient when testing or debugging.
# For that purpose, Pathway provides a **static mode** in which static data may be attached to the connectors.
# In that mode, finite and static data can be loaded, e.g. from a table written in a static CSV file or from a markdown table.
#
# In our example, Input Data 1 and Input Data 2 can be small static tables written by hand in a markdown file.
# Pathway provides static input connectors to load this static data:
#
# ```python
# T1 = pw.debug.table_from_markdown(
#     """
#     | name  | age
#  1  | Alice | 15
#  2  | Bob   | 32
#  3  | Carole| 28
#  4  | David | 35
#  """)
# T2 = pw.debug.table_from_markdown(
#     """
#     | name  | country
#  1  | Alice | US
#  2  | Bob   | France
#  3  | Carole| Germany
#  4  | David | US
#  """)
# ```
#
# When the computation is run in the static mode, all the data is loaded and processed at once.
# While the static mode does not fully benefit from the dataflow, it allows you to check that the graph is correctly built.
#
# To ease the debugging, Pathway provides a function called `compute_and_print`.
# When calling `pw.debug.compute_and_print(t)`, Pathway builds the whole graph, ingests all the available static data, prints the obtained table `t`, and then discards the data.
# Calling `compute_and_print` twice will result in ingesting the data twice.

# ## Summary of differences
#
# Here is a summary of the main differences between the two modes:
#
# | | **Streaming mode** | **Static mode** |
# | ------------------------------ | :-----------------------------------------------: | :-------------: |
# | Data type | unbounded, streaming data | finite, static data |
# | Computation type | streaming | batch |
# | Can be used in production | yes | no |
# | Starting the computation | `pw.run` | `pw.debug.compute_and_print` or `pw.run` |
# | Builds a dataflow | yes | yes |
# | How the data is ingested in the graph | maintained at each update | ingested from scratch every time |
# | Termination of the computation | runs forever | automatically terminated once all the data has been processed |
# | Printing data | no, data should be accessed using output connectors | yes, via `pw.debug.compute_and_print` |

# ## Comparative example
#
# In the following, we implement our full example in Pathway using both modes.
+
#
# ### Common pipeline
#
# The processing pipeline should be designed in the same way no matter what mode is used in the end. The only difference should be in how the input and output data is manipulated.
# To highlight the fact that the rest of the implementation remains the same, you can implement the pipeline in a function taking T1 and T2 as parameters and returning T2bis and T3:

# +
import pathway as pw


class SchemaT1(pw.Schema):
    name: str
    age: int

class SchemaT2(pw.Schema):
    name: str
    country: str

def pipeline(T1, T2):
    T1bis = T1.select(*pw.this, adult=pw.apply(lambda x: x > 18, pw.this.age))
    T2bis = T2.filter(pw.this.country == "US")
    T3 = T1bis.join(T2bis, pw.left.name == pw.right.name).select(
        pw.left.name, pw.left.adult
    )
    return (T2bis, T3)


# -

# ### Streaming mode
#
# In the streaming mode, we must connect to external data sources, such as CSV files, for the input and for the output.
# Our implementation would look like this:
#
# ```python
# T1=pw.io.csv.read(inputDir1, schema=SchemaT1, mode="streaming")
# T2=pw.io.csv.read(inputDir2, schema=SchemaT2, mode="streaming")
# T2bis,T3=pipeline(T1,T2)
# pw.io.csv.write(T2bis, outputDir1)
# pw.io.csv.write(T3, outputDir2)
# pw.run()
# ```
#
# The computation is started with `pw.run()` and will not finish until the process is killed.
# The results of the computations are written to the output CSV files via the output connectors.

# ### Static mode
#
# In the static mode, if the connectors are compatible, we only need to change the mode from "streaming" to "static":
#
# ```python
# T1=pw.io.csv.read(inputDir1, schema=SchemaT1, mode="static")
# T2=pw.io.csv.read(inputDir2, schema=SchemaT2, mode="static")
# T2bis,T3=pipeline(T1,T2)
# pw.io.csv.write(T2bis, outputDir1)
# pw.io.csv.write(T3, outputDir2)
# pw.run()
# ```
#
# And that's it!
# The computation will be launched, all the available data will be ingested and processed,
# and the output will be written to the same CSV files.
# The only difference is that the output files will not be updated whenever new CSV files are added to the input directories:
# only the data available at launch time will be processed.
#
# But the static mode has more to offer: we can enter the input data by hand and check step by step what is happening to this data.
# With manually entered data, our example becomes:

T1 = pw.debug.table_from_markdown(
    """
    | name  | age
 1  | Alice | 15
 2  | Bob   | 32
 3  | Carole| 28
 4  | David | 35
    """
)
T2 = pw.debug.table_from_markdown(
    """
    | name  | country
 1  | Alice | US
 2  | Bob   | France
 3  | Carole| Germany
 4  | David | US
    """
)
T2bis, T3 = pipeline(T1, T2)

# Without output connectors, this implementation does not do any computation: it builds the graph but does not ingest any data.
# We need to trigger the data insertion into the graph by printing some data.
# In the static mode, we can check that our tables have been correctly defined and loaded by printing them:

pw.debug.compute_and_print(T1)

# The extra column contains the indexes of the rows (the `...` at the end means that they are truncated for display – the full IDs are 128 bit and are a bit long in a text format 😉).
# Every table has such a column, and `id` is a reserved name which cannot be used as a column name.
# Indexes are pointers and can be generated based on a given input column, otherwise they are generated automatically.
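#
# For instance, to derive the index from the `name` column instead of relying on the automatically generated one, you could re-index the table; the `with_id_from` helper used below is an assumption based on the current Table API rather than something introduced in this article, so treat it as an illustrative sketch:
#
# ```python
# # Re-index T1 so that its ids are derived from the name column.
# T1_reindexed = T1.with_id_from(T1.name)
# pw.debug.compute_and_print(T1_reindexed)
# ```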
+ +pw.debug.compute_and_print(T2) + +# You can also check that the pipeline returns the expected tables: + +pw.debug.compute_and_print(T2bis) + +pw.debug.compute_and_print(T3) + +# ## Conclusion +# +# While Pathway is made for the streaming mode, the static mode can be used to test and debug your pipeline. +# The implementation should be the same in both modes. Only the way data is input and output differs. +# +# The different ways to access the data both in streaming and static mode is explained in more details in our [guide to connectors](/developers/user-guide/connecting-to-data/connectors/). diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/20.streaming-and-static-modes.md b/docs/2.developers/4.user-guide/030.connecting-to-data/20.streaming-and-static-modes.md new file mode 120000 index 00000000..3fc70838 --- /dev/null +++ b/docs/2.developers/4.user-guide/030.connecting-to-data/20.streaming-and-static-modes.md @@ -0,0 +1 @@ +.streaming-and-static-modes/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/30.connectors.md b/docs/2.developers/4.user-guide/030.connecting-to-data/30.connectors.md new file mode 120000 index 00000000..731f4c9a --- /dev/null +++ b/docs/2.developers/4.user-guide/030.connecting-to-data/30.connectors.md @@ -0,0 +1 @@ +.connectors/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/50.artificial-streams.md b/docs/2.developers/4.user-guide/030.connecting-to-data/50.artificial-streams.md new file mode 100644 index 00000000..f6bf5de9 --- /dev/null +++ b/docs/2.developers/4.user-guide/030.connecting-to-data/50.artificial-streams.md @@ -0,0 +1,210 @@ +--- +title: 'Generate Data Streams with the demo Module' +description: 'How to generate artificial data streams using the `demo` module' +--- + + +# Artificial Data Streams with the `demo` Module +With Pathway's [`demo`](/developers/api-docs/pathway-demo/) module, you can create custom data streams from scratch or by utilizing a CSV file. +This feature empowers you to effectively test and debug your Pathway implementation using realtime data. + +Pathway offers a seamless transition from static to streaming data. +However, obtaining a real data stream for testing your application can be challenging in specific scenarios. +This is where the `demo` module comes in handy. +With the `demo` module, you can generate your own artificial data streams. +With a collection of functions that simplify creating custom data streams, this module allows you to test and experiment with realtime data streams. +This article will teach you how to generate data streams with the `demo` module. + +## Using real data streams is challenging + +While it is not advisable to test directly in a production environment, accessing the same data as in production can be troublesome. +Let's consider an example where you are developing an IoT health monitoring system ⛑️. +You need to analyze data from various health sensors a patient wears, such as a glucose monitor 📈 or a pulse sensor 🫀. +Sharing this sensitive data poses privacy concerns, and merely having a snapshot of the data wouldn't suffice. +You would need to test your system in a live environment to truly evaluate it. +However, conducting a full-scale test with real volunteers wearing all the sensors and sharing their data and time for several hours would be costly and impractical. 
+

In general, accessing realtime data streams during the development phase can be challenging due to the following reasons:
 - **Data Availability**: Data sources might require special permissions, API integrations, or data-sharing agreements, making it challenging to obtain the necessary data for debugging.
 - **Data Privacy and Security**: Data might contain sensitive or private information that needs to be handled carefully. Data privacy regulations and security concerns may restrict the availability of realtime data for debugging purposes.
 - **Production Data Constraints**: Streaming applications often process large volumes of realtime data in production environments. Directly accessing and replicating such data for local debugging purposes can be resource-intensive and impractical. The scale and complexity of realtime data streams may require specialized infrastructure and tools that are not feasible to replicate in a local debugging setup.
 - **Data Consistency**: Realtime data streams continuously evolve, making it challenging to reproduce specific debugging scenarios. In a debugging environment, it is crucial to have consistent and reproducible data to identify and fix issues effectively. Realtime data streams can introduce variability, making it difficult to isolate specific events or situations for debugging purposes.
 - **Testing Environment Constraints**: Debugging streaming applications often requires a controlled testing environment. Multiple components and dependencies interact in a production setting to generate realtime data. Isolating and replicating these dependencies in a testing environment while maintaining the fidelity of the data can be complex and time-consuming.
 - **Realtime Dependencies**: Streaming applications rely on external systems and services for data ingestion, processing, and storage. Debugging such applications may involve interactions with these external dependencies, further complicating the debugging process. Coordinating and synchronizing the availability of these dependencies for debugging purposes can be challenging.


Given these challenges, having the ability to generate artificial data streams using the `demo` module becomes invaluable.
By generating your own data streams, you can define a controlled and reproducible environment for testing and debugging.
This allows you to iterate quickly, identify issues, and refine your code without relying on realtime external data sources.

## The `demo` module

The `demo` module offers several functions that facilitate the generation of artificial data streams.
These functions are valuable tools when real data streams are not readily available. Let's take a closer look at the functions provided by the `demo` module:

 - `range_stream`: The hello world of data streams. This function generates a simple data stream with a single column, 'value', where the values range from a specified offset to the number of rows plus the offset. This function is useful when you need a straightforward data stream to verify that your application is running.

 - `noisy_linear_stream`: This function generates a data stream with two columns, 'x' and 'y', where the 'x' values range from 0 to a specified number of rows, and the 'y' values are calculated based on 'x' with added random noise. This function was designed for experimenting with linear regression algorithms.

 - `generate_custom_stream`: This function allows you to create a custom data stream by specifying value generators for each column.
You can define the schema of the resulting table and choose the number of rows to generate. Additionally, you can control the input rate of the stream. This function is beneficial when you want to simulate a specific data stream for testing or demonstration purposes. + + - `replay_csv`: This function allows you to replay a static CSV file as a data stream. You simply have to specify the path to the file and the schema of the resulting table. This function is beneficial when you have a static CSV file and want to treat it as a data stream for testing or analysis purposes. + + - `replay_csv_with_time`: Similarly to `replay_csv`, this function allows you to replay a static CSV file as a data stream. The time of the updates is based on the timestamps contained in a given column of the CSV file. The timestamps should be sorted. + +## Examples + +### Generating a single-column data stream with `range_stream` +This function generates a simple data stream with a single column, 'value', whose values range from 0 to `nb_rows`: + +```python +import pathway as pw +table = pw.demo.range_stream(nb_rows=50) +``` + +``` +value +0 +1 +2 +3 +... +``` + +It can be used to verify that your application is responsive. +You can write the table with a CSV output connector to check the stream is generated correctly. + +This function is named after the sum example of the [first realtime app guide](/developers/user-guide/introduction/first_realtime_app_with_pathway/): + +```python +import pathway as pw +table = pw.demo.range_stream(nb_rows=50) +table = table.reduce(sum=pw.reducers.sum(pw.this.value)) +``` + +``` +sum +0 +1 +3 +6 +... +``` + +You can specify offset to change the starting value: + + +```python +import pathway as pw +table = pw.demo.range_stream(nb_rows=50, offset=10) +``` + +``` +value +10 +11 +12 +13 +... +``` + +If you set `nb_rows` to `None`, the stream will be generated indefinitely. +You can also set the input rate with the `input_rate` parameter, which defines the number of insertions per second. It defaults to one. + + +### Linear regression with `noisy_linear_stream` +This function generates an artificial data stream ideal for linear regression tutorials. This function generates a data stream with two columns, 'x' and 'y', where the 'x' values range from 0 to a specified number of rows, and the 'y' values are calculated based on 'x' with added random noise: + +```python +import pathway as pw +table = pw.demo.noisy_linear_stream(nb_rows=100) +``` + +``` +x,y +0,0.06888437030500963 +1,1.0515908805880605 +2,1.984114316166169 +3,2.9517833500585926 +4,4.002254944273722 +5,4.980986827490083 +6,6.056759717806955 +7,6.9606625452157855 +8,7.995319390830471 +9,9.016676407891007 +... +``` + +This function generates the data stream for our [linear regression from Kafka example](/developers/showcases/linear_regression_with_kafka). +As with `range_stream`, you can also set the input rate with the `input_rate` parameter, which defines the number of insertions per second. It defaults to one. + +### Custom data streams with `generate_custom_stream` +`generate_custom_stream` is a generalization of `range_stream` and `noisy_linear_stream`. +It generates rows with an index ranging from 0 to `nb_rows`. +The content of the table is determined by a dictionary `value_functions` mapping column names to a value generator: for each row, and its associated index $i$, the value of the column `col` is `value_functions[col](i)`. +The schema should also be provided. 
+
+```python
+import pathway as pw
+
+value_functions = {
+    'number': lambda x: x + 1,
+    'name': lambda x: f'Person {x}',
+    'age': lambda x: 20 + x,
+}
+
+class InputSchema(pw.Schema):
+    number: int
+    name: str
+    age: int
+
+table = pw.demo.generate_custom_stream(value_functions, schema=InputSchema, nb_rows=10)
+```
+
+The `generate_custom_stream` function creates a data stream with 10 rows in this example.
+The stream has three columns: 'number', 'name', and 'age'.
+The 'number' column values are the index of the row incremented by 1, the 'name' column contains formatted names with the corresponding row indices, and the 'age' column contains values starting from 20 incremented by the row index:
+
+```
+number,name,age
+1,"Person 0",20
+2,"Person 1",21
+3,"Person 2",22
+...
+```
+
+If you set `nb_rows` to `None`, the stream will be generated indefinitely.
+You can also set `autocommit_duration_ms`, which defines the maximum time between two commits.
+Finally, you can select the input rate with the `input_rate` parameter, specifying the number of insertions per second. It defaults to one.
+
+### Replaying static CSV files with `replay_csv` and `replay_csv_with_time`
+These functions let you replay a static CSV file as a data stream. You can specify the path to the file, choose which columns to extract, and define the schema of the resulting table. They are beneficial when you have a static CSV file and want to treat it as a data stream for testing or analysis purposes.
+
+```python
+import pathway as pw
+
+class InputSchema(pw.Schema):
+    column1: str
+    column2: int
+
+table = pw.demo.replay_csv(path='data.csv', schema=InputSchema, input_rate=1.5)
+```
+
+In this example, we use the `replay_csv` function to replay a static CSV file `data.csv` as a data stream.
+We specify the path to the CSV file and the schema for the resulting table.
+The data stream is generated with a specified input rate of 1.5 rows per second.
+
+If you already have timestamps in your file, you can replay the file while respecting the time between the updates by using `replay_csv_with_time`.
+You simply need to specify the column in which the timestamps are stored with `time_column` and the unit.
+Only seconds, milliseconds, microseconds, and nanoseconds are supported.
+The file is replayed using the first row as the starting point; the delay between two successive updates is derived from the timestamps in `time_column`.
+Note that the timestamps should be ordered.
+
+```python
+table = pw.demo.replay_csv_with_time(path='data.csv', schema=InputSchema, time_column='column2', unit='ms')
+```
+
+## Conclusion
+Obtaining real data streams can be challenging, particularly for debugging purposes.
+Fortunately, Pathway's `demo` module comes to the rescue, offering a seamless solution to create custom data streams from scratch or from existing CSV files.
+You can now test and debug your Pathway implementation with real-time data, providing you with invaluable insights and boosting your application's performance.
diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/60.switch-from-batch-to-streaming.md b/docs/2.developers/4.user-guide/030.connecting-to-data/60.switch-from-batch-to-streaming.md
new file mode 100644
index 00000000..4b36c40e
--- /dev/null
+++ b/docs/2.developers/4.user-guide/030.connecting-to-data/60.switch-from-batch-to-streaming.md
@@ -0,0 +1,129 @@
+---
+title: 'Switching from Batch to Streaming'
+description: 'How to switch from batch to streaming with Pathway'
+---
+
+
+# Switching from Batch to Streaming
+Easily switching from batch to streaming is a core feature of Pathway.
+In this article, you will see how easy it is to change your static pipeline to make it run with streaming data.
+
+
+You made your pipeline on static data.
+Maybe it was because it was simpler to develop or to test.
+Or maybe it was because you came [from Pandas](/developers/user-guide/exploring-pathway/migrate-from-pandas/).
+
+Anyway, your pipeline is ready, tested, and validated.
+Now you want to go one step further and **make it run in realtime**.
+
+Good news for you, Pathway makes it extremely easy as it is a **unified batch and streaming processing framework**.
+
+In a nutshell, the only thing you need to do is to change your data sources to connect to your streaming data sources, and Pathway will take care of the rest!
+
+![Batch to Streaming Code in Pathway](https://user-images.githubusercontent.com/68642378/278069996-79f4250d-0641-4b97-87f8-0820d9399c6b.gif)
+
+
+## Starting with static code
+Let's start by writing some code working on static, batch data.
+It's easier as it allows you to focus on your code logic without worrying about time and consistency.
+You can also test on static data to make sure the pipeline works as intended.
+
+Let's write a simple example that reads CSV files, sums all the values, and then outputs the results in a new CSV file.
+
+```python
+import pathway as pw
+
+# WRITE SOME STATIC CODE
+
+# read data
+class InputSchema(pw.Schema):
+    value: int
+
+t = pw.io.csv.read(
+    './sum_input_data/',
+    schema=InputSchema,
+    mode="static",
+)
+
+# process data
+t = t.reduce(pw.reducers.sum(t.value))
+
+# write data
+pw.io.csv.write(t, "output.csv")
+
+# run
+pw.run()
+```
+
+You can try this code sample; you only need CSV files with a single column `value`.
+
+## Simply change the connectors
+Now, you would like to be able to add CSV files and have the sum automatically updated: **you want to go from a static data source to a streaming one**.
+
+In Pathway, this change is very easy: **you simply need to change the connectors to connect to your streaming data sources**.
+
+Not all connectors work in both static and streaming modes.
+You can take a look at [the list of connectors](/developers/user-guide/connecting-to-data/connectors/) and pick the best-suited ones for your project.
+You can learn how to use them in the [Input/Output API](/developers/api-docs/pathway-io/).
+
+The CSV input connector used in our example works in both modes: you only need to change the mode by setting the parameter `mode` to `streaming`:
+
+```python
+t = pw.io.csv.read(
+    './sum_input_data/',
+    schema=InputSchema,
+    mode="streaming",
+)
+```
+
+And that's it!
+ +The rest of the implementation remains unchanged: + +```python +import pathway as pw + +# WRITE SOME STREAMING CODE + +# read data +class InputSchema(pw.Schema): + value: int + +t = pw.io.csv.read( + './sum_input_data/', + schema=InputSchema, + mode="streaming", +) + +# process data +t = t.reduce(pw.reducers.sum(t.value)) + +# write data +pw.io.csv.write(t, "output.csv") + +#run +pw.run() +``` + +## What else? +Your former static project is now a realtime data processing one, congratulations! + +But what does it change for you in practice? + +Not much: Pathway will handle everything for you. +You don't have to worry about the new temporal nature of your project: Pathway will manage late and out-of-order data points for you. + +In a streaming system, inputs are coming as a never-ending data stream, so Pathway computes an output with the available data and then revises the result whenever new data arrives. +This is the only real change when switching to streaming: **now the output is not a static result but a data stream**. +Whenever a new update, a new CSV file in our example, is coming, the new values are ingested by Pathway and the results are automatically updated. +You can take a look at our [first-steps article](/developers/user-guide/introduction/first_realtime_app_with_pathway/#understanding-the-output) to understand the new results. + + +You can run with an input stream generated using [`pw.demo.range_stream`](/developers/user-guide/connecting-to-data/artificial-streams/#generating-a-single-column-data-stream-with-range_stream). + + +## Conclusion + +As you can see, Pathway makes the transition between batch to streaming as easy as possible. +Using Pathway, you can now focus on building your pipeline on static data, without worrying about the temporal nature of your data. +Once ready, simply change your connectors to connect to your live data sources and let Pathway manage everything for you. \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/030.connecting-to-data/_dir.yml b/docs/2.developers/4.user-guide/030.connecting-to-data/_dir.yml new file mode 100644 index 00000000..572de96a --- /dev/null +++ b/docs/2.developers/4.user-guide/030.connecting-to-data/_dir.yml @@ -0,0 +1 @@ +title: 'How To Connect To Data' diff --git a/docs/2.developers/4.user-guide/040.connectors/.fs-connector/.gitignore b/docs/2.developers/4.user-guide/040.connectors/.fs-connector/.gitignore new file mode 100644 index 00000000..97e21319 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/.fs-connector/.gitignore @@ -0,0 +1,4 @@ +csv* +json* +plain* +unstructured* \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/040.connectors/.fs-connector/article.py b/docs/2.developers/4.user-guide/040.connectors/.fs-connector/article.py new file mode 100644 index 00000000..e77a5bfb --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/.fs-connector/article.py @@ -0,0 +1,171 @@ +# --- +# title: File System Connector +# description: An article explaining how to read from files and write to files in a few basic data formats. 
+# date: '2024-01-26'
+# tags: ['tutorial', 'table']
+# keywords: ['connector', 'file system', 'csv', 'json']
+# notebook_export_path: notebooks/tutorials/fs_connector.ipynb
+# jupyter:
+#   jupytext:
+#     formats: py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.16.0
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # File System Connectors
+# This guide explains the [fs connectors](/developers/api-docs/pathway-io/fs) that connect Pathway to your file system to read and write data with the following basic formats: binary, plaintext, CSV, and JSON.
+#
+# The first part of this guide focuses on defining the source of the data for our connector (using the plaintext data format to keep things simple). The second part explains additional configuration that can (or needs to) be defined for all simple data formats.
+# In particular, we show the input connectors ([`pw.io.fs.read`](/developers/api-docs/pathway-io/fs/#pathway.io.fs.read)) reading data in:
+# - [`CSV` format](/developers/user-guide/connectors/fs-connector/#csv),
+# - [`JSON` format](/developers/user-guide/connectors/fs-connector/#json),
+# - [`plaintext`, `plaintext_by_file`, and `binary` formats](/developers/user-guide/connectors/fs-connector/#unstructured-data).
+#
+# The output connectors ([`pw.io.fs.write`](/developers/api-docs/pathway-io/fs/#pathway.io.fs.write)) write data in:
+# - [`CSV` format](/developers/user-guide/connectors/fs-connector/#csv),
+# - [`JSON` format](/developers/user-guide/connectors/fs-connector/#json).
+#
+# File system connectors work both in streaming and static modes. Be careful as the use of connectors differs depending on the chosen mode: see the [differences](/developers/user-guide/connecting-to-data/streaming-and-static-modes).
+# For simplicity, all the examples below are in the "static" mode but can easily be changed to "streaming" mode by changing the `mode` parameter.
+#
+# ## Location of files and filters
+# The code snippets below prepare the basic file structure that is used in the later part of this article. To keep things simple, all examples work with data of type `str`; for more details, see [schemas](/developers/user-guide/types-in-pathway/schema) and [types](/developers/user-guide/types-in-pathway/datatypes).

+# %%
+! mkdir -p plain_input
+! mkdir -p plain_output
+! echo -e "test1\ndata1" > plain_input/in1.txt
+! echo -e "test2\ndata2" > plain_input/in2.txt
+# %% [markdown]
+# ### Specify the input and output with `path` and `filename`
+# Below, you can find the simplest examples of input ([`pw.io.fs.read`](/developers/api-docs/pathway-io/fs/#pathway.io.fs.read)) and output ([`pw.io.fs.write`](/developers/api-docs/pathway-io/fs/#pathway.io.fs.write)) connectors. Both examples use plaintext as the input format (more on that [later](/developers/user-guide/connectors/fs-connector#data-formats)). The `path` parameter can point either to a directory or a particular file. If it points to a directory, it reads all the files that are inside. Otherwise, it reads only the file that is specified (and as such it makes sense only in the static mode).
+
+# %%
+%%capture
+import pathway as pw
+test1 = pw.io.fs.read(path="./plain_input/", format="plaintext", mode="static")
+pw.io.fs.write(test1, filename="./plain_output/out1.txt", format="json")
+
+test2 = pw.io.fs.read(path="./plain_input/in1.txt", format="plaintext", mode="static")
+pw.io.fs.write(test2, filename="./plain_output/out2.txt", format="json")
+
+pw.run()
+
+# %% [markdown]
+# The output can be found in the `plain_output` directory.
+
+# %%
+! echo "out1:"
+! cat plain_output/out1.txt
+! echo "out2:"
+! cat plain_output/out2.txt
+
+# %% [markdown]
+# As you can see, the first example reads the data from both `in1.txt` and `in2.txt`, while the second reads only the data from `in1.txt`.
+
+# ### Filter the files to read with `object_pattern`
+# In case you want to specify a directory as the source of your data, but read only some of its contents, you can specify a filter [pattern](https://www.gnu.org/software/findutils/manual/html_node/find_html/Shell-Pattern-Matching.html) and pass it using the `object_pattern` parameter.
+# %%
+%%capture
+test3 = pw.io.fs.read("./plain_input/", format="plaintext", mode="static", object_pattern="*2*")
+pw.io.fs.write(test3, "./plain_output/output3.txt", "json")
+pw.run()
+
+# %% [markdown]
+# The output can be found in the `plain_output` directory. As you can see, `output3.txt` contains data only from `in2.txt`, as it is the only file in the input directory that matches the `*2*` pattern:
+
+# %%
+! echo "out3:"
+! cat plain_output/output3.txt
+
+# %% [markdown]
+# ## Data formats
+# ### CSV
+# For the CSV format, each input file needs to have defined headers.
+
+# %%
+! mkdir -p csv_input
+! mkdir -p csv_output
+! echo -e "header1;header2\ndata1;data2\n\ndata3;data4" > csv_input/csv_in1.txt
+! echo -e "header1;header2\ndata5;data6\n\ndata7;data8" > csv_input/csv_in2.txt
+! echo -e "csv_in1.txt:"
+! cat csv_input/csv_in1.txt
+! echo -e "csv_in2.txt:"
+! cat csv_input/csv_in2.txt
+# %% [markdown]
+# In most cases, in order to read the data, you need to define its schema and pass it to the connector. Furthermore, for the `csv` format, you can use [CsvParserSettings](/developers/api-docs/pathway-io#pathway.io.CsvParserSettings) to accommodate nonstandard formatting of the input file. In the example below, it is configured to use `;` as a delimiter.
+# %%
+%%capture
+class csv_schema(pw.Schema):
+    header1: str
+    header2: str

+csv_settings = pw.io.CsvParserSettings(delimiter=";")
+csv_data = pw.io.fs.read(path="./csv_input/", format="csv", schema=csv_schema, csv_settings=csv_settings, mode="static")
+pw.io.fs.write(table=csv_data, filename="./csv_output/csv_out1.txt", format="csv")
+pw.run()
+# %%
+! cat ./csv_output/csv_out1.txt

+# %% [markdown]
+# You can also use the dedicated [CSV connector](/developers/user-guide/connectors/csv_connectors).
+# ### JSON
+# You can use the [JSON format](https://json.org) by setting the parameter `format` to `json`.
+# %%
+! mkdir -p json_input
+! mkdir -p json_output
+! echo -e '{"header1":"data1",\n"header2":"data2"}\n{"header1":"data3","header2":"data4"}\n' > json_input/json_in1.txt
+! echo -e '{"header1":"data5","header2":"data6"}\n{"header1":"data7","header2":"data8"}\n' > json_input/json_in2.txt
+! echo -e "json_in1.txt:"
+! cat json_input/json_in1.txt
+! echo -e "json_in2.txt:"
+! cat json_input/json_in2.txt
+# %% [markdown]
+# As in most cases, in order to read the data, you need to define a schema and pass it to the connector.
Each input file needs to be a sequence of properly formatted JSON objects. +# %% +%%capture +class json_schema(pw.Schema): + header1: str + header2: str + +json_data = pw.io.fs.read(path = "./json_input/", format="json", schema=json_schema, mode="static") +pw.io.fs.write(table=json_data, filename="./json_output/json_out1.txt", format="json") +pw.run() +# %% +! cat ./json_output/json_out1.txt +# %% [markdown] +# ### Unstructured data +# Pathway allows you to read unstructured data using three formats: `plaintext`, `plaintext_by_file`, and `binary`. `binary` and `plaintext` considers each line as a separate row that will be stored in the column `data`, and the format `plaintext_by_file` treats each file as a single row. +# %% +! mkdir -p unstructured_output + +# %% +%%capture +plaintext_data = pw.io.fs.read(path = "./plain_input", format = "plaintext", mode="static") +pw.io.fs.write(plaintext_data,"./unstructured_output/output1.txt", "csv") + +plaintext_by_file_data = pw.io.fs.read(path = "./plain_input", format = "plaintext_by_file", mode="static") +pw.io.fs.write(plaintext_by_file_data,"./unstructured_output/output2.txt", "csv") + +binary_data = pw.io.fs.read(path = "./plain_input", format = "binary", mode="static") +pw.io.fs.write(binary_data,"./unstructured_output/output3.txt", "csv") + +pw.run() + +# %% +! echo "plaintext" +! cat ./unstructured_output/output1.txt +! echo "plaintext by file" +! cat ./unstructured_output/output2.txt +! echo "binary" +! cat ./unstructured_output/output3.txt + + diff --git a/docs/2.developers/4.user-guide/040.connectors/.jsonlines/article.py b/docs/2.developers/4.user-guide/040.connectors/.jsonlines/article.py new file mode 100644 index 00000000..b627e1ad --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/.jsonlines/article.py @@ -0,0 +1,143 @@ +# --- +# title: 'JSON Lines connectors' +# description: 'Tutorial on the Pathway JSON Lines connectors.' +# date: '2024-02-20' +# tags: ['tutorial', 'connectors'] +# keywords: ['connector', 'JSON Lines', 'JSON'] +# --- + +# # JSON Lines connectors +# Pathway provides connectors to read and write data streamings using JSON Lines files. +# +# [JSON Lines](https://jsonlines.org/), also called newline-delimited JSON, is a format for structured data, following three requirements: +# 1. UTF-8 Encoding. +# 2. Each Line is a valid JSON Value. +# 3. The line separator is `\n`. +# +# The suggested extension for JSON Lines files is `.jsonl`. +# +# This is a correct JSON Lines entry: +# ``` +# {"key": 1, "recipient": "Bill H.", "sender": "Nancy R."} +# {"key": 2, "recipient": "Harry P.", "sender": "Hermione G."} +# {"key": 3, "recipient": "Julian S.", "sender": "Dick F."} +# ``` +# +# ⚠️ JSON lines connectors work both in streaming and static modes. +# Be careful as the use of connectors differs depending on the chosen mode: see the [differences](/developers/user-guide/connecting-to-data/streaming-and-static-modes/). +# +# ## Input connector +# +# To read a file or a directory, use the [`pw.io.jsonlines.read` function](/developers/api-docs/pathway-io/jsonlines/#pathway.io.jsonlines.read). +# It takes several parameters, including: +# - `path`: the path of the directory or the file to read. +# - `schema` (optional): the schema of the resulting table. +# - `mode` (optional): the mode in which the connector is used, `streaming` or `static`. Defaults to `streaming`. 
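+#
+# The `schema` parameter expects a `pw.Schema` subclass describing the columns of the resulting table. For the example entries shown in the introduction (with `key`, `recipient`, and `sender` fields), it could look like this; the same schema is reused in the complete example at the end of this article:
+# ```python
+# class InputSchema(pw.Schema):
+#     key: int = pw.column_definition(primary_key=True)
+#     recipient: str
+#     sender: str
+# ```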
+# +# The connector can be used to read a directory: +# ```python +# table = pw.io.jsonlines.read("./input_directory/", schema=InputSchema) +# ``` +# In this case, all the files within this directory are ingested into the table. +# +# You can choose to read a single file: +# ```python +# table = pw.io.jsonlines.read("./input_file.jsonl", schema=InputSchema) +# ``` +# +# **Any file modification would be reflected in the table read: if you delete a part of a file, the respective data will be deleted from the table**. +# +# Let's consider the following example, which reads a JSON Lines file and outputs it in a CSV file using the [CSV output connector](/developers/user-guide/connectors/csv_connectors/): + +# + +import pathway as pw + + +class InputSchema(pw.Schema): + key: int = pw.column_definition(primary_key=True) + recipient: str + sender: str + + +# _MD_SHOW_table = pw.io.jsonlines.read("./input_file.jsonl", schema=InputSchema) +# _MD_SHOW_pw.io.csv.write(table, "./output.csv") +# _MD_SHOW_pw.run() +# - + +# With the `input_file` file containing the correct example defined in the introduction, the output is: +# ``` +# key,recipient,sender,time,diff +# 1,"Bill H.","Nancy R.",1707985402732,1 +# 2,"Harry P.","Hermione G.",1707985402732,1 +# 3,"Julian S.","Dick F.",1707985402732,1 +# ``` +# +# Adding this line `{"key": 4, "recipient": "Juliet", "sender": "Romeo"}` will update the output: +# ``` +# key,recipient,sender,time,diff +# 1,"Bill H.","Nancy R.",1707985402732,1 +# 2,"Harry P.","Hermione G.",1707985402732,1 +# 3,"Julian S.","Dick F.",1707985402732,1 +# 4,"Juliet","Romeo",1707985410232,1 +# ``` +# The removal of the line is also passed to the output: +# ``` +# key,recipient,sender,time,diff +# 1,"Bill H.","Nancy R.",1707985402732,1 +# 2,"Harry P.","Hermione G.",1707985402732,1 +# 3,"Julian S.","Dick F.",1707985402732,1 +# 4,"Juliet","Romeo",1707985410232,1 +# 4,"Juliet","Romeo",1707985423732,-1 +# ``` +# The `diff` value is set to `-1`, representing a removal. +# +# +# ### Static case +# +# The JSON Lines connector also supports the static mode: it will read all the data at once and then closes the connection. +# To activate it, you must set the `mode` parameter to `"static"`: + +table = pw.io.jsonlines.read("./input_file.jsonl", schema=InputSchema, mode="static") +pw.debug.compute_and_print(table) + +# ## Output connector +# +# To output a table in the JSON Lines format, you should use [`pw.io.jsonlines.write`](/developers/api-docs/pathway-io/jsonlines/#pathway.io.jsonlines.write). +# It takes two parameters: +# - the table to output +# - the filename. +# +# ```python +# pw.io.jsonlines.write(table, "output_file.jsonl") +# ``` +# +# ⚠️ The JSON Lines output connector only works in streaming mode. 
+# +# ## Complete example +# +# ```python +# import pathway as pw +# +# +# class InputSchema(pw.Schema): +# key: int = pw.column_definition(primary_key=True) +# recipient: str +# sender: str +# +# +# table = pw.io.jsonlines.read("./input_file.jsonl", schema=InputSchema) +# pw.io.jsonlines.write(table, "./output_file.jsonl") +# pw.run() +# ``` +# +# By doing the same operations as before (adding the line and removing it), you obtain the following results: +# ``` +# {"key":1,"recipient":"Bill H.","sender":"Nancy R.","diff":1,"time":1707987230734} +# {"key":2,"recipient":"Harry P.","sender":"Hermione G.","diff":1,"time":1707987230734} +# {"key":3,"recipient":"Julian S.","sender":"Dick F.","diff":1,"time":1707987230734} +# {"key":4,"recipient":"Juliet","sender":"Romeo","diff":1,"time":1707987260732} +# {"key":4,"recipient":"Juliet","sender":"Romeo","diff":-1,"time":1707987269732} +# ``` +# +# This example simply copies the streams of updates. +# To do something more complicated, you can check [our tutorial about the basic table operations](/developers/user-guide/data-transformation/table-operations/) Pathway supports. diff --git a/docs/2.developers/4.user-guide/040.connectors/.jsonlines/input_file.jsonl b/docs/2.developers/4.user-guide/040.connectors/.jsonlines/input_file.jsonl new file mode 100644 index 00000000..0f7ab584 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/.jsonlines/input_file.jsonl @@ -0,0 +1,3 @@ +{"key": 1, "recipient": "Bill H.", "sender": "Nancy R."} +{"key": 2, "recipient": "Harry P.", "sender": "Hermione G."} +{"key": 3, "recipient": "Julian S.", "sender": "Dick F."} diff --git a/docs/2.developers/4.user-guide/040.connectors/05.fs-connector.md b/docs/2.developers/4.user-guide/040.connectors/05.fs-connector.md new file mode 120000 index 00000000..cbb22ec4 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/05.fs-connector.md @@ -0,0 +1 @@ +.fs-connector/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/040.connectors/10.csv_connectors.md b/docs/2.developers/4.user-guide/040.connectors/10.csv_connectors.md new file mode 100644 index 00000000..7a043c4e --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/10.csv_connectors.md @@ -0,0 +1,158 @@ +--- +title: 'CSV connectors' +description: 'Tutorial on CSV connectors' +date: '2023-01-12' +thumbnail: '/assets/content/blog/th-csv-connectors.png' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'CSV'] +--- + + +# Using CSV connectors +Pathway provides a [`pw.io.csv`](/developers/api-docs/pathway-io/csv/) module with connectors to read and write data streams using CSV files. + +[Comma-separated values](https://en.wikipedia.org/wiki/Comma-separated_values) (CSV) is one of the most common formats for tables. +In a CSV file, each line is a data record, represented by the values of the different fields, separated by commas. + +```csv +key,recipient,sender +1,Bill H.,Nancy R. +2,Harry P., Hermione G. +3,Julian S.,Dick F. +``` + +In pathway, you can read and write CSV files using [`pw.io.csv.read`](/developers/api-docs/pathway-io/csv#pathway.io.csv.read) and [`pw.io.csv.write`](/developers/api-docs/pathway-io/csv#pathway.io.csv.write). + +⚠️ CSV connectors work both in streaming and static modes. +Be careful as the use of connectors differs depending on the chosen mode: see the [differences](/developers/user-guide/connecting-to-data/streaming-and-static-modes/). 
+ +## Short version +Consider a simple scenario: new CSV files are added into a directory `./input_stream_dir/`, each file containing a table with a single column `value`. +Each entry in `value` is an integer, and you want to compute the sum of these values and store the resulting output stream in an `output_stream.csv` file. +You can do it as follows in Pathway: + +```python [realtime_sum.py] +import pathway as pw + +# We define a schema for the table +# It set all the columns and their types +class InputSchema(pw.Schema): + value: int + +# We use the CSV input connector to connect to the directory. +t = pw.io.csv.read( + './input_stream_dir/', + schema=InputSchema, + mode="streaming" +) + +# We compute the sum (this part is independent of the connectors). +t = t.reduce(sum=pw.reducers.sum(t.value)) + +# We use a CSV output connector to write the results in an output file. +pw.io.csv.write(t, "output_stream.csv") + +# We launch the computation. +pw.run() +``` + +## Input connector + +**Data stream**: +Consider a stream in the form of CSV files: each new update is contained in its own CSV file. +The CSV connector [`pw.io.csv.read`](/developers/api-docs/pathway-io/csv#pathway.io.csv.read) takes several arguments: +- `path`: the path to the folder in which the new CSV files are expected. +- `schema`: the schema of the table. It defines the columns' names and their types. +- `csv_settings`: settings for the CSV parser. +- `mode`: indicate whether the engine has to wait for new CSV files. Possible values are `streaming` and `static`. The default value is `streaming`. + +⚠️ **Each** CSV file should start with a header containing the column names, in the correct order, and separated by commas. + +Note that the CSV connector does not recursively read the files in the subdirectories of `path`. + +```csv +colA,colB +``` + +**Usage**: +to read a CSV stream in a directory `./dir/` with tables `colA` and `colB`, you have to do the following: +```python +class InputSchemaTwoColumns(pw.Schema): + colA: any + colB: any + +t = pw.io.csv.read('./dir/', schema=InputSchemaTwoColumns, mode="streaming") +``` +The resulting Pathway table `t` will contain all the values contained inside the CSV files, and will be automatically updated whenever a new CSV file is added. + + +### Special case: static mode +CSV connectors can be used for both streaming and static modes. +There are two ways to set up the CSV connector to be static: +- specify a CSV file in the path: `t = pw.io.csv.read('./input_static_file.csv', schema=InputSchema)`, +- set `mode` to `static`: `t = pw.io.csv.read('./input_stream_dir/', schema=InputSchema, mode="static")`. +In both cases, the connector will read all the available data at once when the computation is triggered (by `pw.debug.compute_and_print` for example). + +Be careful when using the input CSV connector to avoid using it in the wrong mode as they are highly similar. + +## Output connector +The CSV output connector [`pw.io.csv.write`](/developers/api-docs/pathway-io/csv#pathway.io.csv.write) has two parameters: the table you want to output and the address of the CSV file in which the output will be written: +```python +pw.io.csv.write(t, "output_stream.csv") +``` +Every time the table `t` is updated, the changes will be automatically appended in `output_stream.csv`. + +### Special case: static mode +Similarly to its input counterpart, the CSV output connector can be used in the static mode. +Nothing changes here, you can still use `pw.io.csv.write(t, "output_static_file.csv")`. 
+However, the static data will be printed at once in the file, without any further update. + + + +## Complete example +Let's go back to our example on how to compute a sum over the values of the columns `value` inside CSV files. +The final version of our project contains two files: `realtime_sum.py` which processes the stream using Pathway and `generating_stream.sh` which generates the streams. + +Here is `realtime_sum.py`: + +```python [realtime_sum.py] +import pathway as pw + +class InputSchema(pw.Schema): + value: int + + +t = pw.io.csv.read( + './input_stream_dir/', + schema=InputSchema, + mode="streaming" + ) +t = t.reduce(sum=pw.reducers.sum(t.value)) +pw.io.csv.write(t, "output_stream.csv") +pw.run() +``` +Don't forget to run the computation with `pw.run()`, otherwise nothing will happen. +Once `pw.run()` is called, the computation will be run forever until it gets killed. +If you need some reminders on Pathway operations, don't hesitate to take a look at our [first-steps guide](/developers/user-guide/data-transformation/table-operations/). + +To have a streaming setup, you need to periodically insert new CSV files into a given directory. +It can be easily done using a bash script `generating_stream.sh` which prints every second a new value: + +```shellscript [generating_stream.sh] +#!/bin/bash +src="./sum_input_data/" +rm -r $src +mkdir -p $src + +sleep 10 + +for LOOP_ID in {1..30} +do + filename=$src"input_table_$LOOP_ID.csv" + printf "value\n$LOOP_ID\n" >> $filename + sleep 1 +done +``` + +The input connector requires the input CSV directory to exist so the script generating the stream should be launched first. +The input connector will connect to the file and update the results every time a new CSV file is added, updating automatically and in real time the output CSV file using the output CSV connector. diff --git a/docs/2.developers/4.user-guide/040.connectors/100.slack_send_alerts.md b/docs/2.developers/4.user-guide/040.connectors/100.slack_send_alerts.md new file mode 100644 index 00000000..c9f53ce8 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/100.slack_send_alerts.md @@ -0,0 +1,62 @@ +--- +title: 'Sending alerts to Slack' +description: 'Tutorial on the connector for sending alerts to Slack' +date: '2024-02-16' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'Slack', 'alerts'] +--- + +# Sending alerts to Slack + +This tutorial will guide you through connecting Pathway to Slack and sending alerts to a specific channel. + +If you need to send some alerts to Slack based on your Pathway pipeline, you should use a [`pw.io.slack.send_alerts`](/developers/api-docs/pathway-io/slack/#pathway.io.slack.send_alerts) function. It allows for sending messages from a Pathway table to a specific channel in Slack. This tutorial will guide you on how to use it. + +## Setting up Slack authentication + +To send messages to Slack, Pathway uses the [Slack web API](https://api.slack.com/web). To use it, you need to have a token for authenticating to Slack. You can read how to get it in the [Slack documentation](https://api.slack.com/tutorials/tracks/getting-a-token). For `pw.io.slack.send_alerts`, your token needs to have a [`chat:write` scope](https://api.slack.com/scopes/chat:write). + +## Connecting to Slack with Pathway + +If you obtained your Slack token, you are ready to connect your Pathway application to Slack with [`pw.io.slack.send_alerts`](/developers/api-docs/pathway-io/slack/#pathway.io.slack.send_alerts). 
+ +`pw.io.slack.send_alerts` takes 3 arguments: +- `alerts`, a [`ColumnReference`](/developers/api-docs/pathway/#pathway.ColumnReference) with the content of the alerts you want to send to Slack. +- `slack_channel_id`, a string with the [ID of the channel](https://api.slack.com/methods/chat.postMessage#channels) you want to send the alerts to. +- `slack_token`, your token for authenticating to Slack. + +To create a simple example of sending Slack alerts, you need a Table with a column with the content to be sent to Slack. This tutorial uses a simple table defined with [`pw.debug.table_from_markdown`](/developers/api-docs/debug#pathway.debug.table_from_markdown). In the following example, `slack_channel_id` and `slack_token` are obtained from environmental variables, respectively `SLACK_CHANNEL_ID` and `SLACK_TOKEN`. Make sure to set them before running the program. + +```python +import os +import pathway as pw + +slack_channel_id = os.environ["SLACK_CHANNEL_ID"] +slack_token = os.environ["SLACK_TOKEN"] + +t = pw.debug.table_from_markdown(""" +messages +Hello_Slack +This_is_Slack_alert +""") +pw.io.slack.send_alerts(t.messages, slack_channel_id, slack_token) +pw.run() +``` + +When you run it, you will get the alerts in your Slack channel. + +::article-img +--- +src: '/assets/content/documentation/slack-alerts-tutorial.png' +alt: 'Alerts sent to Slack channel' +height: '96' +width: '269' +quality: '100' +class: 'mx-auto' +--- +:: + + +Note, that the values of the column `messages` in the above example do not have spaces. It is a restriction of `pw.debug.table_from_markdown` which uses spaces to separate columns. Any regular string works with the other connectors. + +If you want to see more examples with `pw.io.slack.send_alerts` you can check the [`alert`](https://github.com/pathwaycom/llm-app/tree/main/examples/pipelines/alert) or [`drive_alert`](https://github.com/pathwaycom/llm-app/tree/main/examples/pipelines/drive_alert) examples in the llm-app or our [showcase describing the drive alert example](/developers/showcases/llm-alert-pathway/). \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/040.connectors/15.jsonlines-connector.md b/docs/2.developers/4.user-guide/040.connectors/15.jsonlines-connector.md new file mode 120000 index 00000000..c9c30080 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/15.jsonlines-connector.md @@ -0,0 +1 @@ +.jsonlines/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/040.connectors/20.database-connectors.md b/docs/2.developers/4.user-guide/040.connectors/20.database-connectors.md new file mode 100644 index 00000000..4d3823c1 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/20.database-connectors.md @@ -0,0 +1,476 @@ +--- +title: 'Database connectors' +description: 'Tutorial on Database connectors' +date: '2023-01-19' +thumbnail: '/assets/content/blog/th-database-connectors.png' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'database', 'PostgreSQL', 'CDC', 'Debezium', 'Kafka', 'Zookeeper', 'Docker'] +--- + + +# Using database connectors +Connect Pathway on top of your PostgreSQL/Debezium database using [`pw.io.debezium.read`](/developers/api-docs/pathway-io/debezium#pathway.io.debezium.read) and [`pw.io.postgres.write`](/developers/api-docs/pathway-io/postgres#pathway.io.postgres.write). + + +In this tutorial, you will learn how to set up an architecture using Pathway on top of PostgreSQL/Debezium. 
+The aim is to show you how to use Pathway to perform real time processing of the changes of your PostgreSQL database. +Traditional databases, such as PostgreSQL, are not designed for streaming scenarios: we need a change data capture ([CDC](https://en.wikipedia.org/wiki/Change_data_capture)) mechanism to monitor a database and stream the changes. +To create our input stream from PostgreSQL, we will use [Debezium](https://debezium.io/) and its associated Pathway's connector [`pw.io.debezium.read`](/developers/api-docs/pathway-io/debezium#pathway.io.debezium.read). +The output stream will be sent back to the PostgreSQL database using Pathway's PostgreSQL connector [`pw.io.postgres.write`](/developers/api-docs/pathway-io/postgres#pathway.io.postgres.write). + +This tutorial is a bit longer than the other ones, as you will need to deploy several containers using `docker-compose` to have a working example. +If you have never used `docker-compose`, don't be afraid! +This tutorial does not require any knowledge about `docker-compose`: we will provide all the required settings in order to make it as pleasant as possible 😉, you will see that it makes the deployment of the database quite easy. + + +If you have your own PostgreSQL and Debezium instances already deployed, you may skip the architecture part to focus on the input and output connectors. + + + +## Short version +Let's consider a simple scenario: you have a SQL table `values` with only one column `value` which has new entries, and you want to compute the sum of the values to be stored in another table `sum_table`. The changes in the table `values` are captured and streamed by Debezium. + +You can do it as follows in Pathway: + +```python [sum.py] +import pathway as pw + +# Debezium settings +input_rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", +} +# PostgreSQL settings +output_postgres_settings = { + "host": "postgres", + "port": "5432", + "dbname": "values_db", + "user": "user", + "password": "password", +} + +# We define a schema for the table +# It set all the columns and their types +class InputSchema(pw.Schema): + value: int + + +# We use the Debezium connector to listen to the "postgres.public.values" topic +t = pw.io.debezium.read( + input_rdkafka_settings, + topic_name="postgres.public.values", + schema=InputSchema, +) + +# # We compute the sum (this part is independent of the connectors). +t = t.reduce(sum=pw.reducers.sum(t.value)) + +# We use the PostgreSQL connector to send the resulting output stream containing the sum +pw.io.postgres.write(t, output_postgres_settings, "sum_table") + +# We launch the computation. +pw.run() +``` + +## Architecture + +First, let's take a moment to study the architecture of our project. + +You need a PostgreSQL database with two tables: the tables `values` on which regular updates will be performed, creating the input data stream, and the `sum_table` output in which the sum will be stored and periodically updated by Pathway. + +Then, you need to set up our CDC to capture the changes in the PostgreSQL database and create a stream of it. +This part is done by a Debezium instance. + +The updates from Debezium are sent to Kafka, which relies on ZooKeeper, which then propagates them in Pathway. + +Finally, Pathway receives the updates from Kafka, processes the data, and sends the sum back to PostgreSQL. 
+ +![Architecture](/assets/content/documentation/debezium-architecture.svg) + +### Docker containers + +To summarize, our puzzle has five pieces: + - PostgreSQL + - Debezium + - Kafka + - ZooKeeper + - Pathway + +No need to say that installing those from scratch can be a hurdle... + +Fortunately, we can use Docker containers and Docker Compose to ease the deployment of our project. +In a nutshell, a container is a virtual environment in which we can run standalone applications. +As an example, the PostgreSQL container you will use in this project contains a minimal distribution to run a PostgreSQL database: it is lightweight and PostgreSQL is already installed, but it may not work for other purposes. +In our case, the main advantages of using docker containers is the simplicity of the installation and the setting. +If you are interested in Docker containers, or if you want to adapt this example to your project, checkout the [doc](https://docs.docker.com/). + +You will use a `docker-compose.yml` in the root of this project to set it up. +Such a file is organized as follows: + +```docker-compose +version: "3.7" +services: + postgres: + build: + environment: + volumes: + kafka: + build: + ... +``` + +Each application, called a service here, is defined there with all its relevant parameters, for example parameters in `build` define how to build the container (which image and so on). Note that `postgres` here is the name given to the service, whether a PostgreSQL image is used is determined by the `build` parameter. + +### Postgres + +You first need to configure our database: + +```docker-compose + postgres: + container_name: db_tuto_postgres + image: debezium/postgres:13 + ports: + - 5432:5432 + environment: + - POSTGRES_USER=user + - POSTGRES_PASSWORD=password + - POSTGRES_DB=values_db + - PGPASSWORD=password + volumes: + - ./sql/init-db.sql:/docker-entrypoint-initdb.d/init-db.sql + - ./sql/update_db.sh:/update_db.sh +``` + +Most of the variables are self-explanatory. +The only subtlety here is how the database is created and updated. +The container will use the script in `/docker-entrypoint-initdb.d/` to initialize the database. +Docker-compose copies files into the container using volumes. +We create a file `init-db.sql` in which we create our two tables `values` and `sum_table`: + +```sql [./sql/init-db.sql] +CREATE TABLE IF NOT EXISTS values ( + value integer NOT NULL +); + +CREATE TABLE IF NOT EXISTS sum_table ( + sum REAL NOT NULL, + time BIGINT NOT NULL, + diff INTEGER NOT NULL +); +``` + +You also need a script `update_db.sh` to insert data into the table, to create the stream: + +```shellscript [./sql/update_db.sh] +#!/bin/bash +export PGPASSWORD='password' + +for LOOP_ID in {1..1000} +do + psql -d values_db -U user -c "INSERT INTO values VALUES ($LOOP_ID);" + sleep 0.5 +done +``` + +⚠️ This file should have executable rights. + +We will copy this file at the root, not at `/docker-entrypoint-initdb.d/` as we want to launch it manually. + +### Debezium + +For Debezium, we need to configure ZooKeeper and Kafka. +For all of those, we are going to use very specific images which considerably limit the number of settings to do. + +#### ZooKeeper + +```docker-compose + zookeeper: + container_name: db_tuto_zookeeper + image: confluentinc/cp-zookeeper:5.5.3 + environment: + ZOOKEEPER_CLIENT_PORT: 2181 +``` + +#### Kafka + +Kafka will be connected to ZooKeeper. +Thanks to docker compose, all the containers share the same network. 
+To connect to a given service, we just need to use its name: +to connect to our ZooKeeper container we only need to write `"zookeeper:2181"`. + +```docker-compose + kafka: + container_name: db_tuto_kafka + image: confluentinc/cp-enterprise-kafka:5.5.3 + depends_on: [zookeeper] + environment: + KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 +``` + +#### Debezium + +```docker-compose + debezium: + container_name: db_tuto_debezium + image: debezium/connect:1.4 + environment: + BOOTSTRAP_SERVERS: kafka:9092 + GROUP_ID: 1 + CONFIG_STORAGE_TOPIC: connect_configs + OFFSET_STORAGE_TOPIC: connect_offsets + volumes: + - ./debezium/connector.sh:/kafka/connector.sh + depends_on: [kafka] + ports: + - 8083:8083 +``` + +We need to connect the Debezium connector to our PostgreSQL database: + +```shellscript [./debezium/connector.sh] +#!/bin/bash + +curl -H 'Content-Type: application/json' debezium:8083/connectors --data ' +{ + "name": "values-connector", + "config": { + "connector.class": "io.debezium.connector.postgresql.PostgresConnector", + "plugin.name": "pgoutput", + "database.hostname": "postgres", + "database.port": "5432", + "database.user": "user", + "database.password": "password", + "database.dbname" : "values_db", + "database.server.name": "postgres", + "table.include.list": "public.values" + + } +}' +``` + +⚠️ This file should have executable rights. + +Copy this script at the root of the container to execute it manually. + + +### Pathway + +Now you need to set up the container for Pathway. +Pathway does not have its own docker image, so you are going to use a Dockerfile to configure our container. + +```docker-compose + pathway: + container_name: db_tuto_pathway + build: + context: . + dockerfile: ./pathway-src/Dockerfile + depends_on: [kafka, postgres] +``` + +In our Dockerfile, we use a Python image. +You just need to use the pip install command to install Pathway. + +```DockerFile [./pathway-src/Dockerfile] +FROM --platform=linux/x86_64 python:3.10 + +RUN pip install -U pathway +RUN pip install kafka-python +COPY ./pathway-src/sum.py sum.py + +CMD ["python", "-u", "sum.py"] +``` + +⚠️ For compatibility reasons, we use a x86_64 Linux container. + +The file `sum.py` is the entry of our container: it will be automatically launched when the container has started. +Once it is terminated, the container is stopped. +Our pipeline will be defined in this file. + + +That's it! +That's a large docker-compose file, but when you think that this is enough to launch the 5 containers and make them work together, it is quite impressive! + +### Makefile + +To launch the containers, we only need to run `docker-compose up` in the same directly as `docker-compose.yaml`. +Nevertheless, it may be more simple to have a dedicated Makefile: + +```Makefile [./Makefile] +build: + chmod +x ./debezium/connector.sh + chmod +x ./sql/update_db.sh + docker-compose up -d + sleep 5 + docker-compose exec debezium ./connector.sh + docker-compose exec postgres ./update_db.sh + +stop: + docker-compose down -v + docker rmi tutorial-example-pathway:latest +``` + +You can launch the experiment with `make` and stop it with `make stop`. + +Now you only need to do our pipeline in `sum.py`. + + +## Debezium input connector + +**Data stream**: +For the input connector, the stream should be in the form of Debezium messages received on a given topic. 
+Each received update is atomic, and triggers the update of the pipeline created in Pathway. + +Note that Debezium connectors only work in [streaming mode](/developers/user-guide/connecting-to-data/streaming-and-static-modes/). + +⚠️ We talk about Debezium messages but it is a simplification. +Debezium works with Kafka: in practice, the connector should be connected to Kafka. +The main difference with the regular Kafka connector is the expected formatting of the messages. + +**Usage**: +The Debezium input connector [`pw.io.debezium.read`](/developers/api-docs/pathway-io/debezium#pathway.io.debezium.read) takes several arguments: +- `rdkafka_settings`: the settings used to connect to the Kafka instance receiving the Debezium messages; they follow the format of [librdkafka](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md), +- `topic_name`: the topic which is listened to, +- `schema`: the schema of the table. It defines the columns names and their types. It also defines the primary keys. +- `autocommit_duration_ms`: the maximum time between two commits. Every `autocommit_duration_ms` milliseconds, the updates received by the connector are committed and pushed into Pathway's dataflow. + + +⚠️ Note that a Debezium connector listens to *only one topic*. + +**Usage**: +```python +class InputSchema(pw.Schema): + value: int +t = pw.io.debezium.read( + input_rdkafka_settings, + topic_name="postgres.public.values", + schema=InputSchema, + autocommit_duration_ms=100 +) +``` + +The optional parameter `types` used to cast the input values relies on specific `pw.Type`: `types={"value": pw.Type.INT}`. + +## PostgreSQL output connector + +The output connector [`pw.io.postgres.write`](/developers/api-docs/pathway-io/postgres#pathway.io.postgres.write) adds the updates made to a table `t` to a given PostgreSQL table. + + +**Usage**: +the output connector takes the following arguments: +- `table`: the Pathway table to send to PostgreSQL, +- `postgres_settings`: the settings used to connect to the PostgreSQL database; they follow the format of [librdkafka](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md), +- `table_name`: PostgreSQL table on which the messages are sent. + +```python +pw.io.postgres.write(t, output_postgres_settings, "sum_table") +``` +Every time the table `t` is updated, the changes will be automatically appended in `output_stream.csv`. + +⚠️ The table should already be created in PostgreSQL: the creation is not done by Pathway. +**The table must include all the columns of the table `t` has, as well as two extra columns: `time` and `diff`.** +The columns `time` and `diff` are expressing the timestamp of the update and whether the update is an insertion or a deletion (an update is simply the simultaneous suppression of the old value along with the insertion of the new one). + +In our example, we must create a `sum_table` using this SQL command: +```sql +CREATE TABLE IF NOT EXISTS sum_table ( + sum REAL NOT NULL, + time BIGINT NOT NULL, + diff INTEGER NOT NULL +); +``` + + +## Complete example + +To summarize, we have a project with the following structure: + +``` +. +├── debezium/ +│ └── connector.sh +├── pathway-src/ +│ ├── Dockerfile +│ └── sum.py +├── sql/ +│ ├── init-db.sql +│ └── update_db.sh +├── docker-compose.yml +└── Makefile +``` + +Except the `sum.py`, all the files have been explained previously. 
+The full example is accessible in our [public repository](https://github.com/pathwaycom/pathway/tree/main/examples/projects/debezium-postgres-example). + +Let's see in more details how the pipeline is done in Pathway in `sum.py`: + +```python [./pathway-src/sum.py] +import pathway as pw + +input_rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", +} +output_postgres_settings = { + "host": "postgres", + "port": "5432", + "dbname": "values_db", + "user": "user", + "password": "password", +} + +class InputSchema(pw.Schema): + value: int + + +t = pw.io.debezium.read( + input_rdkafka_settings, + topic_name="postgres.public.values", + schema=InputSchema, + autocommit_duration_ms=100 +) + +t = t.reduce(sum=pw.reducers.sum(t.value)) + +pw.io.postgres.write(t, output_postgres_settings, "sum_table") + +pw.run() +``` + +Don't forget to run the computation with `pw.run()`, otherwise nothing will happen. +Once `pw.run()` is called, the computation will be run forever until it gets killed. +If you need some reminder on Pathway operations, don't hesitate to take a look at our [First steps guide](/developers/user-guide/data-transformation/table-operations/). + +To launch the computation you just need to run `make` in the root of the project, it will launch all the containers, initialize the database and start adding new values. +Every addition in the table `values` in PostgreSQL will trigger an update, through Debezium, and Pathway will send an update to the table `sum_table`. + +To monitor the changes, you can log in the PostgreSQL container: + +`docker-compose exec postgres psql values_db -U user -W` + +After typing your password, you should be able to see all the updates to the table `sum_table`. + +To see the latest value of the sum, we just need to type: + +```sql +SELECT sum FROM sum_table ORDER BY time DESC, diff DESC LIMIT 1; +``` + +This value is updated in **realtime** by Pathway. + +In you want more details, you can see the 10 latest updates by typing: + +```sql +SELECT * FROM sum_table ORDER BY time DESC, diff DESC LIMIT 10; +``` + +To learn more about what those updates mean, don't hesitate to read the [explanation provided in our first example](/developers/user-guide/introduction/first_realtime_app_with_pathway/#understanding-the-output). diff --git a/docs/2.developers/4.user-guide/040.connectors/30.custom-python-connectors.md b/docs/2.developers/4.user-guide/040.connectors/30.custom-python-connectors.md new file mode 100644 index 00000000..b26c56c6 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/30.custom-python-connectors.md @@ -0,0 +1,167 @@ +--- +title: 'Custom Python connectors' +description: 'Tutorial on how to create custom Python connector' +date: '2023-04-13' +thumbnail: '/assets/content/blog/th-custom-connectors.png' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'Python'] +--- + +# Creating a custom Python connector + +In this tutorial, you will learn how to create a Python connector that will allow you to connect to your custom data source and feed data directly into Pathway. + +In order to create a custom connector, you need to extend the [`pw.io.python.ConnectorSubject`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject) class provided by Pathway and implement the `run` method, which is responsible for reading the source and feeding the data into the buffer. 
+ +`ConnectorSubject` serves as a bridge between the data source and the Pathway engine. It comes with several methods allowing you to push data into the buffer. +You can find more about those functions in [the dedicated documentation](/developers/api-docs/pathway-io/python). +There is also an extended explanation [at the end](#connectorsubject-reference) of this article. + +## Simple scenario: reading from a static file + +You will create a connector that turns a static file into a stream. Suppose you have a file `cats.jsonl` in which each line contains data in JSON format: + +```json +{ "key": 1, "genus": "otocolobus", "epithet": "manul" } +{ "key": 2, "genus": "felis", "epithet": "catus" } +{ "key": 3, "genus": "lynx", "epithet": "lynx" } +``` + +In the `run` method, let's read the file and send the contents to the buffer line by line, using [`next_str`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject.next_str) method. + +```python +import time +import pathway as pw + +class FileStreamSubject(pw.io.python.ConnectorSubject): + def run(self): + with open("cats.jsonl") as file: + for line in file: + self.next_str(line) + time.sleep(1) +``` + +You can now read from the subject you created and run computations. + +You need to call the method [`pw.io.python.read`](/developers/api-docs/pathway-io/python#pathway.io.python.read), passing your subject to it. In addition, you need to specify the schema of the data, indicating the format and fields that will be extracted as columns. + +```python +class InputSchema(pw.Schema): + key: int = pw.column_definition(primary_key=True) + genus: str + epithet: str + +table = pw.io.python.read( + FileStreamSubject(), + schema=InputSchema, + format="json" +) + +pw.io.csv.write(table, "output.csv") + +pw.run() +``` + +Under the hood, the connector will be started in a dedicated thread and will continue to work as long as the [`run`](/developers/api-docs/pathway/#pathway.run) method is spinning. + +## Advanced scenario: using external Python libraries + +In the second example, let's utilize an external library called [Tweepy](https://docs.tweepy.org/en/latest/) to load a stream of tweets. + +Tweepy is a Python library for accessing the Twitter API. You can install it with a simple pip command: + +```pip install tweepy``` + +As a first step, you need to create a `TwitterClient` class that extends [`tweepy.StreamingClient`](https://docs.tweepy.org/en/latest/streamingclient.html#streamingclient): + +```python +import tweepy + +class TwitterClient(tweepy.StreamingClient): + _subject: TwitterSubject + + def __init__(self, subject: TwitterSubject) -> None: + super().__init__(BEARER_TOKEN) + self._subject = subject + + def on_response(self, response) -> None: + self._subject.next_json( + { + "key": response.data.id, + "text": response.data.text, + } + ) +``` + +The client holds the subject object, which will be described in a moment. The `on_response` method is called when a new response from a stream is received. This is the right place to convert the message to the desired format and send it to a subject's buffer. +This time you can use the [`next_json`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject.next_json) method which accepts a dictionary. 
+ +As previously, you need to define the subject: + +```python +import pathway as pw + +class TwitterSubject(pw.io.python.ConnectorSubject): + _twitter_client: TwitterClient + + def __init__(self) -> None: + super().__init__() + self._twitter_client = TwitterClient(self) + + def run(self) -> None: + self._twitter_client.sample() + + def on_stop(self) -> None: + self._twitter_client.disconnect() +``` + +Three things are happening here: + +1. `TwitterClient` is created during subject initialization. The subject is accessed from within the `TwitterClient`, so you need to pass the subject to its constructor. +2. The method `run` starts the stream of tweets. Once started, the stream will flow indefinitely until it is closed or in case of failure. +3. The method `on_stop` is called when the stream is closed or in case of failure. You have a chance to perform some cleanup. + +Finally, you call `pw.io.python.read` as before: + +```python +class InputSchema(pw.Schema): + key: int = pw.column_definition(primary_key=True) + text: str + +table = pw.io.python.read( + TwitterSubject(), + schema=InputSchema +) + +pw.io.csv.write(table, "output.csv") + +pw.run() +``` + +The complete working example can be found in the [Pathway examples](https://github.com/pathwaycom/pathway/tree/main/examples/projects/custom-python-connector-twitter) repository. + +## ConnectorSubject reference + +In the examples above, you can see two implementations of the [`ConnectorSubject`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject) class. Now let's look in detail at the interface of this class. + +### Methods to implement +- `run`: main function responsible for consuming the data and feeding the buffer with one of the methods described below. +- `on_stop`: called when the stream is closed or in case of failure. A good place for doing all sorts of cleanup. + +### Embedded methods +- [`next_bytes`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject.next_bytes): sends a message in the form of `bytes`. +- [`next_str`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject.next_str): takes a message in the form of a string and sends it after converting it into `bytes`. +- [`next_json`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject.next_json): takes a message in the form of a dict representing JSON and sends it after converting it into `bytes`. +- [`close`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject.close): indicates that there will be no more messages. Will be called automatically when the run method finishes. + + +## Connector method reference + +The [`pw.io.python.read`](/developers/api-docs/pathway-io/python#pathway.io.python.read) connector method takes the following arguments: + +- `subject`: connector subject to consume. +- `format`: in the examples above, we used the `json` format. It is also possible to use the `raw` format. For raw data, there is only one column `data` in the resulting table, where all entries are dumped. +- `value_columns`: if the format is not `raw`, it is the list of the columns which should be expected. +- `primary_key`: in case the table should have a primary key generated according to a subset of its columns, the set of columns should be specified in this field. Otherwise, the primary key will be generated as uuid4. +- `types`: a dictionary containing the mapping between the columns and the data types of the values of those columns. 
This parameter is optional, and if not provided the default type is string. +- `autocommit_duration_ms`: the maximum time between two commits. Every `autocommit_duration_ms` milliseconds, the updates received by the connector are committed and pushed into Pathway's dataflow. diff --git a/docs/2.developers/4.user-guide/040.connectors/30.kafka_connectors.md b/docs/2.developers/4.user-guide/040.connectors/30.kafka_connectors.md new file mode 100644 index 00000000..903f6679 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/30.kafka_connectors.md @@ -0,0 +1,202 @@ +--- +title: 'Kafka connectors' +description: 'Tutorial on how to use Kafka connectors' +date: '2023-01-15' +thumbnail: '/assets/content/blog/th-kafka-connectors.png' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'Kafka', 'Docker'] +--- + +# Using Kafka connectors +Pathway provides a [`pw.io.kafka`](/developers/api-docs/pathway-io/kafka) module with connectors to read and send messages from a Kafka instance. + +In Pathway, you can read and send messages to a Kafka topic using [`pw.io.kafka.read`](/developers/api-docs/pathway-io/kafka#pathway.io.kafka.read) and [`pw.io.kafka.write`](/developers/api-docs/pathway-io/kafka#pathway.io.kafka.write). +Kafka connectors only work in the [streaming mode](/developers/user-guide/connecting-to-data/streaming-and-static-modes/). + +⚠️ Note there also exist connectors for Redpanda. They work the same; you only need to replace `kafka` with `redpanda`: +[`pw.io.redpanda.read`](/developers/api-docs/pathway-io/redpanda#pathway.io.redpanda.read) and [`pw.io.redpanda.write`](/developers/api-docs/pathway-io/redpanda#pathway.io.redpanda.write). + +## Short version +Consider a simple scenario: messages are sent to a Kafka instance on a topic `connector_example`, each message containing a table with a single column `value` in a CSV format, and we want to compute the sum of these values and send the resulting output stream to the same Kafka instance on a `sum` topic. +You can do it as follows in Pathway: + +```python [realtime_sum.py] +import pathway as pw + +# Kafka settings +rdkafka_settings = { + "bootstrap.servers": "server-address:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "group.id": "$GROUP_NAME", + "session.timeout.ms": "6000", + "sasl.username": "username", + "sasl.password": "********", +} + +# We define a schema for the table +# It set all the columns and their types +class InputSchema(pw.Schema): + value: int + + +# We use the Kafka connector to listen to the "connector_example" topic +t = pw.io.kafka.read( + rdkafka_settings, + topic="connector_example", + schema=InputSchema, + format="csv", + autocommit_duration_ms=1000 +) + +# We compute the sum (this part is independent of the connectors). +t = t.reduce(sum=pw.reducers.sum(t.value)) + +# We use the Kafka connector to send the resulting output stream containing the sum +pw.io.kafka.write(t, rdkafka_settings, topic_name="sum", format="json") + +# We launch the computation. +pw.run() +``` + +## Input connector + +**Data stream**: +Consider a stream in the form of Kafka messages received on given topics. +An update is a set of messages: the update is triggered by a commit. +Commits ensure the atomicity of each update and are generated periodically. 
+ +**Usage**: +the Kafka input connector [`pw.io.kafka.read`](/developers/api-docs/pathway-io/kafka#pathway.io.kafka.read) takes several arguments: +- `rdkafka_settings`: the settings used to connect to Kafka; they follow the format of [librdkafka](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md). +- `topic`: the topic which is listened to. +- `format`: format of messages among `raw`, `csv`, and `json`. +- `schema`: if the format is not `raw`, the schema of the table. It defines the columns' names and their types. It also defines the primary keys. +- `autocommit_duration_ms`: the maximum time between two commits. Every `autocommit_duration_ms` milliseconds, the updates received by the connector are committed and pushed into Pathway's dataflow. + +```python +class InputSchema(pw.Schema): + value: int + + +t = pw.io.kafka.read( + rdkafka_settings, + topic="connector_example", + format="csv", + schema=InputSchema, + autocommit_duration_ms=1000 +) +``` + +The way the input connector behaves depends on the format of the input data. +- `raw`: for raw data, there is only one column `data` in which all the entry is dumped. +- `csv` and `json`: the data is formatted according to the pattern. In this case, `value_columns` is expected. + +⚠️ For the `csv` format: the first message should start with a header containing the column names, in the correct order, and separated by commas. +The connector will not properly work without this message, however, it must be sent only once: if sent twice, the second message will be treated like a normal row. + + +## Output connector + +The output connector [`pw.io.kafka.write`](/developers/api-docs/pathway-io/kafka#pathway.io.kafka.write) sends the updates made to a table `t` to a given Kafka instance and *on a single Kafka topic*. +Currently, messages are sent following the JSON format. + + +**Usage**: +the output connector takes the following arguments: +- `table`: the Pathway table to send to Kafka, +- `rdkafka_settings`: the settings used to connect to Kafka; they follow the format of [librdkafka](https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md). +- `topic_name`: topic on which the messages are sent, +- `format`: `binary`, `json`, and `dsv` (a generalization of CSV) are currently supported (more are coming). + +```python +pw.io.kafka.write(t, rdkafka_settings, topic_name="sum", format="json") +``` + +## Complete example + +Let's go back to our example on how to compute a sum over the values of the columns' `value` received on a Kafka topic `connector_example` in a CSV format. +The final version of our project contains two files: `realtime_sum.py` which processes the stream using Pathway and `generating_stream.sh` which generates the streams. + +Here is `realtime_sum.py`: + +```python [realtime_sum.py] +import pathway as pw + +# Kafka settings +rdkafka_settings = { + "bootstrap.servers": "server-address:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "group.id": "$GROUP_NAME", + "session.timeout.ms": "6000", + "sasl.username": "username", + "sasl.password": "********", +} +class InputSchema(pw.Schema): + value: int + + +# We use the Kafka connector to listen to the "connector_example" topic +t = pw.io.kafka.read( + rdkafka_settings, + topic="connector_example", + format="csv", + schema=InputSchema, + autocommit_duration_ms=1000 +) + +# We compute the sum (this part is independent of the connectors). 
+t = t.reduce(sum=pw.reducers.sum(t.value)) + +# We use the Kafka connector to send the resulting output stream containing the sum +pw.io.kafka.write(t, rdkafka_settings, topic_name="sum", format="json") + +# We launch the computation. +pw.run() +``` + +Don't forget the `pw.run()` otherwise no computation will be done! +Once `pw.run()` is called, the computation will be run forever until it gets killed. +If you need some reminders on Pathway operations, don't hesitate to take a look at our [First-steps guide](/developers/user-guide/data-transformation/table-operations/). + + +You can use the KafkaProducer API provided by Kafka to send messages to Kafka using Python in a `generating_kafka_stream.py` script: + +```python [generating_kafka_stream.py] +from kafka import KafkaProducer +import time + +topic = "connector_example" + +producer = KafkaProducer( + bootstrap_servers=["server-address:9092"], + sasl_mechanism="SCRAM-SHA-256", + security_protocol="SASL_SSL", + sasl_plain_username="username", + sasl_plain_password="********", +) +producer.send(topic, ("value").encode("utf-8"), partition=0) + +time.sleep(5) +for i in range(10): + time.sleep(1) + producer.send( + topic, (str(i)).encode("utf-8"), partition=0 + ) + +producer.close() +``` + +> Note that, depending on your version of Kafka, you may need to specify the API version to make this code work: `api_version=(0,10,2)`: + +```python +producer = KafkaProducer( + bootstrap_servers=["server-address:9092"], + sasl_mechanism="SCRAM-SHA-256", + security_protocol="SASL_SSL", + sasl_plain_username="username", + sasl_plain_password="********", + api_version=(0,10,2), +) +``` diff --git a/docs/2.developers/4.user-guide/040.connectors/35.python-output-connectors.md b/docs/2.developers/4.user-guide/040.connectors/35.python-output-connectors.md new file mode 100644 index 00000000..f03f0951 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/35.python-output-connectors.md @@ -0,0 +1,58 @@ +--- +title: 'Python output connectors' +description: 'Tutorial on how to subscribe to changes with Python callback' +date: '2023-08-31' +thumbnail: '/assets/content/blog/th-output-connector.png' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'Python'] +--- + + +# Subscribing to changes with Python function + +In this article, you will learn how to subscribe to changes in a table using a plain Python function. + +Let's say you would like to perform some action on every change in the table. It could be a simple print to the console or a custom API call. After this tutorial, you will know how to use the [`pw.io.subscribe`](/developers/api-docs/pathway-io#pathway.io.subscribe) function. + +Assume you have a streaming input table obtained with [`pw.demo.range_stream`](/developers/api-docs/pathway-demo#pathway.demo.range_stream): + +```python +import pathway as pw + +table = pw.demo.range_stream() +``` + +Now you just need to define a Python callback with your custom logic. + +```python +import logging + +def on_change(key: pw.Pointer, row: dict, time: int, is_addition: bool): + logging.info(f"{key}, {row}, {time}, {is_addition}") +``` + +Four parameters of the callback are expected to be (with names respectively): + +- `key` - id of a row, +- `row`- a dictionary from the column name to the column value, +- `time` - time of a change, +- `is_addition` - whether the change is addition. If False, it means data was removed from the table. + +All that's left is to call [`pw.io.subscribe`](/developers/api-docs/pathway-io#pathway.io.subscribe). 
Do not forget to run the graph. + +```python +pw.io.subscribe(table, on_change) + +pw.run() +``` + +Function `on_change` will be called on every change in the input stream. + +As an option, you can specify one more function that will be called with no arguments when the stream of changes ends. It will be called for each engine worker separately. + +```python +def on_end(): + logging.info("End of stream.") + +pw.io.subscribe(table, on_change, on_end) +``` diff --git a/docs/2.developers/4.user-guide/040.connectors/70.gdrive-connector.md b/docs/2.developers/4.user-guide/040.connectors/70.gdrive-connector.md new file mode 100644 index 00000000..ee7e1826 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/70.gdrive-connector.md @@ -0,0 +1,63 @@ +--- +title: 'Google Drive connector' +description: 'Tutorial on Google Drive connector' +date: '2023-12-14' +thumbnail: '/assets/content/blog/th-gdrive-connector.png' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'Google Drive', 'cloud'] +--- + +# Google Drive connector +This tutorial will guide you through connecting Pathway to your data stored on Google Drive. +For detailed information about Google Drive connector, refer to [the API documentation](/developers/api-docs/pathway-io/gdrive/). + +## Setting up Google Drive + +To enable access to your Google Drive files through the Pathway app, you'll require a Google Cloud project and a service user. + +### Creating a new project in the Google API console: + +- Go to [https://console.cloud.google.com/projectcreate](https://console.cloud.google.com/projectcreate) and create new project +- Enable Google Drive API by going to [https://console.cloud.google.com/apis/library/drive.googleapis.com](https://console.cloud.google.com/apis/library/drive.googleapis.com), make sure the newly created project is selected in the top left corner +- Configure consent screen: + - Go to [https://console.cloud.google.com/apis/credentials/consent](https://console.cloud.google.com/apis/credentials/consent) + - If using a private Gmail, select "External", and go next. + - Fill required parameters: application name, user support, and developer email (your email is fine) + - On the next screen click "Add or remove scopes" search for "drive.readonly" and select this scope + - Save and click through other steps +- Create service user: + - Go to [https://console.cloud.google.com/apis/credentials](https://console.cloud.google.com/apis/credentials) + - Click "+ Create credentials" and create a service account + - Name the service user and click through the next steps +- Generate service user key: + - Once more go to [https://console.cloud.google.com/apis/credentials](https://console.cloud.google.com/apis/credentials) and click on your newly created user (under Service Accounts) + - Note service user email address, it will be needed later + - Go to "Keys", click "Add key" -> "Create new key" -> "JSON" + +A JSON file will be saved to your computer. Move it to the folder where your Pathway script is located and rename it to `credentials.json`. + +### Sharing files with service user + +You can now share desired Google Drive resources with the created user. To do so, visit your Google Drive page, right click on the resource and share it with the email associated with the service user you generated earlier. + +Once you've done it, you'll require the ID of the shared file or directory. You can obtain this by right-clicking on the file, selecting "Share" and then copying the link. The ID will be part of the URL. 
+
+For files: [https://drive.google.com/file/d/[OBJECT_ID]/view?usp=drive_link](https://drive.google.com/file/d/%5BOBJECT_ID%5D/view?usp=drive_link)
+For folders: [https://drive.google.com/drive/folders/[OBJECT_ID]?usp=drive_link](https://drive.google.com/drive/folders/%5BOBJECT_ID%5D?usp=drive_link)
+
+Copy the OBJECT_ID from the URL.
+
+## Connecting to Google Drive with Pathway
+
+Once you've finished setting up your Google Drive, you can start reading from it using [`pw.io.gdrive.read`](/developers/api-docs/pathway-io/gdrive#pathway.io.gdrive.read) (be sure to replace `object_id` with the one you copied in the previous step):
+
+```python
+table = pw.io.gdrive.read(
+    object_id="0BzDTMZY18pgfcGg4ZXFRTDFBX0j",
+    service_user_credentials_file="credentials.json"
+)
+
+pw.run()
+```
+
+By default, the connector operates in streaming mode, observing all changes occurring in a particular file or folder. The refresh rate can be adjusted by passing the `refresh_interval` parameter. Static mode can be enabled by specifying the argument `mode='static'`. Further information regarding available parameters can be found in the API docs under [this link](/developers/api-docs/pathway-io/gdrive/).
diff --git a/docs/2.developers/4.user-guide/040.connectors/80.switching-to-redpanda.md b/docs/2.developers/4.user-guide/040.connectors/80.switching-to-redpanda.md
new file mode 100644
index 00000000..b58c63c7
--- /dev/null
+++ b/docs/2.developers/4.user-guide/040.connectors/80.switching-to-redpanda.md
@@ -0,0 +1,640 @@
+---
+title: 'Switching from Kafka to Redpanda'
+description: 'Tutorial on how to use Redpanda instead of Kafka'
+date: '2023-03-24'
+thumbnail: '/assets/content/tutorials/redpanda/th-Kafka_vs_Redpanda.png'
+tags: ['tutorial', 'connectors']
+keywords: ['connector', 'Redpanda', 'Kafka', 'Zookeeper', 'Docker']
+---
+
+
+# Switching from Kafka to Redpanda
+Not a fan of the JVM and ZooKeeper?
+In this article, you will learn how to switch from Kafka to Redpanda and how to adapt your Pathway project to Redpanda.
+The change is easier than you might think: your Pathway code remains the same!
+
+![Thumbnail](/assets/content/tutorials/redpanda/Kafka_vs_Redpanda.png)
+
+When Kafka was first released in 2011, it was a game-changer for realtime data processing. Its innovative distributed streaming platform brought powerful features that allowed organizations to build scalable and fault-tolerant data pipelines, process streams of data in real time, and integrate with various data sources and sinks.
+
+However, times have changed, and the world of data processing and streaming has evolved. Some of Kafka's once cutting-edge features are now seen as hindrances by some developers. For instance, Kafka can be complex to set up and manage, relies on ZooKeeper, and uses a JVM.
+
+This is where Redpanda comes in.
+Redpanda is "a Kafka" rebuilt in C++: a high-performance, Kafka-compatible streaming platform designed for modern applications.
+It offers improved performance, lower latency, and much more efficient resource utilization than Kafka.
+One of the significant benefits of Redpanda is its simplicity, making it easier to deploy and manage, saving time and resources.
+
+Redpanda claims to be fully Kafka-compatible, but how easy is it really to switch from Kafka to Redpanda?
+Specifically, we want to know how much of a lift it is to modify our Pathway Kafka integration.
+ +::quote +--- +--- +Spoiler Alert 🚨 Pathway works with both Kafka and Redpanda fully transparently: you don't have to change anything in your Pathway code. +--- +:: + +If you already have your Pathway project deployed and want to switch to an existing Redpanda instance, you can jump directly to the [Redpanda section](/developers/user-guide/connectors/switching-to-redpanda#switching-to-redpanda). +For both Kafka and Redpanda, we provide [the sources of the respective projects](https://github.com/pathwaycom/pathway/tree/main/examples/projects/best-movies-example). + +> You can also create a new project from our [project template](https://github.com/pathwaycom/cookiecutter-pathway). The creator will ask you to choose between Kafka and Redpanda and will set everything up for you. + +## Best-rated movies problem + +You just have been hired by a trendy VOD online platform. +As a new team member, your first task is to identify the most popular movies in the catalog; +Specifically, you want to find the *K* movies with the highest scores and how many ratings those movies have received. + +For example, this is what the expected table for K=3 could be: + +| | MovieID | Average | RatingNumber +| --- | :----------------: | :-------------: | :-------------: | +| | 218 | 4.9 | 7510 +| | 45 | 4.8 | 9123 +| | 7456 | 4.8 | 1240 + + +The ratings are received as a data stream through a Kafka instance, and you output the table to a CSV file. + +## Solution with Kafka + +With Kafka, we need the following four components: +- ZooKeeper +- Kafka +- Pathway +- a stream producer + +Each component will be hosted in a different docker container. + +Ratings will be sent by the stream producer to Kafka on the topic `ratings`. +Pathway listens to the topic, processes the stream, and outputs the ranking in a CSV file `best_rating.csv`. + +![Kafka architecture](/assets/content/tutorials/redpanda/kafka_architecture.svg) + +Pathway and the stream generator will have their own Dockerfile to install all the required dependencies. +The stream will be created by streaming the lines of a static data `dataset.csv`. + +Our project will have the following structure: + +``` +. +├── pathway-src/ +│ ├── Dockerfile +│ └── process-stream.py +├── producer-src/ +│ ├── create-stream.py +│ ├── dataset.csv +│ └── Dockerfile +├── docker-compose.yml +└── Makefile +``` + +### Kafka and ZooKeeper + +Kafka and ZooKeeper are configurable in the `docker-compose.yml` file. +To keep things simple, no security mechanisms are used. 
+ +```yaml [docker-compose.yml] +version: "3.7" +name: tuto-switch-to-redpanda +networks: + tutorial_network: + driver: bridge +services: + zookeeper: + image: confluentinc/cp-zookeeper:5.5.3 + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + networks: + - tutorial_network + kafka: + image: confluentinc/cp-enterprise-kafka:5.5.3 + depends_on: [zookeeper] + environment: + KAFKA_AUTO_CREATE_TOPICS: true + KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_HOST_NAME: kafka + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT + KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT + CONFLUENT_SUPPORT_METRICS_ENABLE: false + ports: + - 9092:9092 + command: sh -c "((sleep 15 && kafka-topics --create --zookeeper zookeeper:2181 --replication-factor 1 --partitions 1 --topic ratings)&) && /etc/confluent/docker/run " + networks: + - tutorial_network +``` + +Here we are sending the messages to a topic called `ratings`, created in the `command` setting. + +### Generating the stream + +To generate the stream, we start with a CSV dataset with the following columns: `userId` (int), `movieId` (int), `rating` (float), and `timestamp` (int). +This is the schema chosen by [GroupLens](https://grouplens.org/about/what-is-grouplens/) for their [MovieLens25M dataset](https://grouplens.org/datasets/movielens/25m/): +we provide a toy dataset as an example, but the project will work with the whole MovieLens25M dataset. + +To generate the stream, you can use a simple Python script to read the CSV file line by line, and each rating will be sent to Kafka using the `kafka-python` package. + +```python [./producer-src/create-stream.py] +import csv +import json +import time + +from kafka import KafkaProducer + +topic = "ratings" + +#We wait for Kafka and ZooKeeper to be ready +time.sleep(30) + +producer = KafkaProducer( + bootstrap_servers=["kafka:9092"], + security_protocol="PLAINTEXT", + api_version=(0, 10, 2), +) + +with open("./dataset.csv", newline="") as csvfile: + dataset_reader = csv.reader(csvfile, delimiter=",") + first_line = True + for row in dataset_reader: + # We skip the header + if first_line: + first_line = False + continue + message_json = { + "userId": int(row[0]), + "movieId": int(row[1]), + "rating": float(row[2]), + "timestamp": int(row[3]), + } + producer.send(topic, (json.dumps(message_json)).encode("utf-8")) + time.sleep(0.1) + +producer.send(topic, "*COMMIT*".encode("utf-8")) +time.sleep(2) +producer.close() +``` + +Note that we connect to `kafka:9092` and not localhost. + +This script will have its own container: + +```yaml [docker-compose.yml] + stream-producer: + build: + context: . + dockerfile: ./producer-src/Dockerfile + depends_on: [kafka] + networks: + - tutorial_network +``` + +You only need to use a Python image and install the associated package: + +```yaml [./producer-src/Dockerfile] +FROM python:3.10 + +RUN pip install kafka-python +COPY ./producer-src/create-stream.py create-stream.py +COPY ./producer-src/dataset.csv dataset.csv + +CMD ["python", "-u", "create-stream.py"] +``` + +### Pathway + +Now you have a stream generated from the dataset and sent to Kafka, so at this point you simply need to connect Pathway to Kafka and process the data. 
+To connect to Kafka, configure the connection: + +```python [./pathway-src/process-stream.py] +rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", +} +``` + +If you want to establish a more secure connection using a SASL-SSL authentication over a SCRAM-SHA-256 mechanism, you can do it as follows: + +```python [./pathway-src/process-stream.py] +rdkafka_settings = { + "bootstrap.servers": "server:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "group.id": "$GROUP_NAME", + "session.timeout.ms": "6000", + "sasl.username": "username", + "sasl.password": "********" +} +``` + +Let's connect to the `ratings` topic using the [`pw.io.kafka.read`](/developers/api-docs/pathway-io/kafka#pathway.io.kafka.read) connector: + +```python [./pathway-src/process-stream.py] +class InputSchema(pw.Schema): + movieId: int + rating: float + +t_ratings = pw.io.kafka.read( + rdkafka_settings, + topic="ratings", + format="json", + schema=InputSchema, + autocommit_duration_ms=100, +) +``` + +You are only interested in the `movieId` and `rating` columns, so there is no need to include the others. + +You can now define a function to find the best-rated movies: + +```python +def compute_best(t_ratings, K): + t_best_ratings = t_ratings.groupby(pw.this.movieId).reduce( + pw.this.movieId, + sum_ratings=pw.reducers.sum(pw.this.rating), + number_ratings=pw.reducers.count(pw.this.rating), + ) + t_best_ratings = t_best_ratings.select( + pw.this.movieId, + pw.this.number_ratings, + average_rating=pw.apply( + lambda x, y: (x / y) if y != 0 else 0, + pw.this.sum_ratings, + pw.this.number_ratings, + ), + ) + t_best_ratings = t_best_ratings.select( + movie_tuple=pw.apply( + lambda x, y, z: (x, y, z), + pw.this.average_rating, + pw.this.number_ratings, + pw.this.movieId, + ) + ) + t_best_ratings = t_best_ratings.reduce( + total_tuple=pw.reducers.sorted_tuple(pw.this.movie_tuple) + ) + t_best_ratings = t_best_ratings.select( + K_best=pw.apply(lambda my_tuple: (list(my_tuple))[-K:], pw.this.total_tuple) + ) + t_best_ratings = t_best_ratings.flatten(pw.this.K_best).select( + pw.this.K_best + ) + t_best_ratings = t_best_ratings.select( + movieId=pw.apply(lambda rating_tuple: rating_tuple[2], pw.this.K_best), + average_rating=pw.apply(lambda rating_tuple: rating_tuple[0], pw.this.K_best), + views=pw.apply(lambda rating_tuple: rating_tuple[1], pw.this.K_best), + ) + return t_best_ratings +``` + +Using the function, your final file will look like this: + +```python [./pathway-src/process-stream.py] +import pathway as pw +import time + +rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", +} + +class InputSchema(pw.Schema): + movieId: int + rating: float + + +t_ratings = pw.io.kafka.read( + rdkafka_settings, + topic="ratings", + format="json", + schema=InputSchema, + autocommit_duration_ms=100, +) + +t_best_ratings = compute_best(t_ratings, 3) + +# We output the results in a dedicated CSV file +pw.io.csv.write(t_best_ratings, "./best_ratings.csv") + +# We wait for Kafka and ZooKeeper to be ready +time.sleep(20) +# We launch the computation +pw.run() +``` + +You can set up a dedicated container: + +```yaml [docker-compose.yml] + pathway: + build: + context: . 
+ dockerfile: ./pathway-src/Dockerfile + depends_on: [kafka] + networks: + - tutorial_network +``` + +```Dockerfile [./pathway-src/Dockerfile] +FROM python:3.10 + +RUN pip install -U pathway +COPY ./pathway-src/process-stream.py process-stream.py + +CMD ["python", "-u", "process-stream.py"] +``` + +### Results + +We provide the sources of the [Kafka project](https://github.com/pathwaycom/pathway/tree/main/examples/projects/best-movies-example/kafka-version). + +Let's use the following toy dataset: + +```csv [./producer-src/dataset.csv] +userId,movieId,rating,timestamp +1,296,5.0,1147880044 +1,306,3.5,1147868817 +1,307,5.0,1147868828 +1,665,5.0,1147878820 +1,899,3.5,1147868510 +1,1088,4.0,1147868495 +2,296,4.0,1147880044 +2,306,2.5,1147868817 +2,307,3.0,1147868828 +2,665,2.0,1147878820 +2,899,4.5,1147868510 +2,1088,2.0,1147868495 +3,296,1.0,1147880044 +3,306,2.5,1147868817 +3,307,4.0,1147868828 +3,665,2.0,1147878820 +3,899,1.5,1147868510 +3,1088,5.0,1147868495 +``` + +You obtain the following results: + +```csv +movieId,average_rating,views,time,diff +296,5,1,1680008702067,1 +306,3.5,1,1680008702167,1 +[...] +296,3.3333333333333335,3,1680008703767,-1 +1088,3.6666666666666665,3,1680008703767,1 +296,3.3333333333333335,3,1680008703767,1 +899,3.1666666666666665,3,1680008703767,-1 +``` + +As expected, the top 3 get updated whenever the ranking changes due to a new rating. + +## Switching to Redpanda + +Congratulations! You can now find the *K* best-rated movies on your VOD platform; +However, your team has discovered a new alternative to Kafka: Redpanda. +It is totally Kafka-compatible, does not rely on ZooKeeper, and much easier to manage, is more durable (meaning no data loss, as there is no page cache) and according to their published benchmarks is faster than Kafka! +Suppose your team is excited about this and proposes your next task: switch from Kafka to Redpanda. + +With Redpanda, the project is simpler, all you have is: +- Redpanda +- Pathway +- the stream producer + +![Kafka architecture](/assets/content/tutorials/redpanda/redpanda_architecture.svg) + +Let's see how to deploy Redpanda in docker and how it impacts your project. + +### Docker + +(You can skip this section if you already have an existing Redpanda instance) + +First, remove the two services `kafka` and `zookeeper`, and replace them with a `redpanda` service: + +```yaml [./docker-compose.yml] +services: + redpanda: + command: + - redpanda + - start + - --kafka-addr internal://0.0.0.0:9092,external://0.0.0.0:19092 + - --advertise-kafka-addr internal://redpanda:9092,external://localhost:19092 + - --pandaproxy-addr internal://0.0.0.0:8082,external://0.0.0.0:18082 + - --advertise-pandaproxy-addr internal://redpanda:8082,external://localhost:18082 + - --schema-registry-addr internal://0.0.0.0:8081,external://0.0.0.0:18081 + - --rpc-addr redpanda:33145 + - --advertise-rpc-addr redpanda:33145 + - --smp 1 + - --memory 1G + - --mode dev-container + - --default-log-level=debug + - --set redpanda.enable_transactions=true + - --set redpanda.enable_idempotence=true + - --set redpanda.auto_create_topics_enabled=true + image: docker.redpanda.com/redpandadata/redpanda:v23.1.2 + container_name: redpanda + volumes: + - redpanda:/var/lib/redpanda/data + networks: + - tutorial_network +``` + +You must now connect Pathway and the stream producer to `redpanda` instead of `kafka`. +This could have been avoided by naming the Kafka container differently or by naming the Redpanda container `kafka`. 
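+
+Note that the `redpanda` service above mounts a named volume (`redpanda:/var/lib/redpanda/data`). If your `docker-compose.yml` does not already declare it, you will also need a top-level `volumes` entry, for instance:
+
+```yaml [./docker-compose.yml]
+volumes:
+  redpanda: null
+```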
+
+### Pathway
+
+As previously mentioned, we need to update the server's address in the settings:
+
+```python [./pathway-src/process-stream.py]
+rdkafka_settings = {
+    "bootstrap.servers": "redpanda:9092",
+    "security.protocol": "plaintext",
+    "group.id": "0",
+    "session.timeout.ms": "6000",
+}
+```
+
+This is it! The settings will work exactly the same as with Kafka.
+However, for consistency, Pathway also provides dedicated Redpanda connectors.
+There is no difference between the Kafka and Redpanda connectors, as the same connector is used under the hood.
+
+With the Redpanda connector [`pw.io.redpanda.read`](/developers/api-docs/pathway-io/redpanda#pathway.io.redpanda.read), here is what your `./pathway-src/process-stream.py` file looks like:
+
+
+```python [./pathway-src/process-stream.py]
+import pathway as pw
+import time
+
+rdkafka_settings = {
+    "bootstrap.servers": "redpanda:9092",
+    "security.protocol": "plaintext",
+    "group.id": "0",
+    "session.timeout.ms": "6000",
+}
+
+class InputSchema(pw.Schema):
+    movieId: int
+    rating: float
+
+
+t_ratings = pw.io.redpanda.read(
+    rdkafka_settings,
+    topic="ratings",
+    format="json",
+    schema=InputSchema,
+    autocommit_duration_ms=100,
+)
+
+t_best_ratings = compute_best(t_ratings, 3)
+
+# We output the results in a dedicated CSV file
+pw.io.csv.write(t_best_ratings, "./best_ratings.csv")
+
+# We wait for Redpanda to be ready
+time.sleep(20)
+# We launch the computation
+pw.run()
+```
+
+If you don't care about the names of the connector and server, you don't have to change the file at all.
+
+### The stream generator
+
+As with Pathway, you need to update the server name (if required) for the stream generator:
+
+```python [./producer-src/create-stream.py]
+producer = KafkaProducer(
+    bootstrap_servers=["redpanda:9092"],
+    security_protocol="PLAINTEXT",
+    api_version=(0, 10, 2),
+)
+```
+
+Small problem: if you look at the results now, `best_ratings.csv` is empty.
+
+This comes from the creation of the `ratings` topic.
+While the topic was already ready with Kafka, with Redpanda it is created upon reception of the first message.
+Redpanda discards incoming messages until the topic is ready.
+Sending a dummy message at the beginning of the computation should solve this:
+
+```python [./producer-src/create-stream.py]
+producer.send(topic, "*COMMIT*".encode("utf-8"))
+time.sleep(2)
+```
+
+Note that this difference comes from Redpanda and not Pathway.
+Pathway connects to Redpanda and Kafka totally transparently.
+Pathway will receive and process the data the same way whether Kafka or Redpanda is used.
+Kafka and Redpanda are responsible for handling messages: Redpanda discards the incoming messages while the topic is being created.
+
+Despite the fix, the final file is very similar to the one for the Kafka version:
+
+```python [./producer-src/create-stream.py]
+from kafka import KafkaProducer
+import csv
+import time
+import json
+
+topic = "ratings"
+
+# We wait for Redpanda to be ready
+time.sleep(30)
+
+producer = KafkaProducer(
+    bootstrap_servers=["redpanda:9092"],
+    security_protocol="PLAINTEXT",
+    api_version=(0, 10, 2),
+)
+producer.send(topic, "*COMMIT*".encode("utf-8"))
+time.sleep(2)
+
+with open("./dataset.csv", newline="") as csvfile:
+    dataset_reader = csv.reader(csvfile, delimiter=",")
+    first_line = True
+    for row in dataset_reader:
+        # We skip the header
+        if first_line:
+            first_line = False
+            continue
+        message_json = {
+            "userId": int(row[0]),
+            "movieId": int(row[1]),
+            "rating": float(row[2]),
+            "timestamp": int(row[3]),
+        }
+        producer.send(topic, (json.dumps(message_json)).encode("utf-8"))
+        time.sleep(0.1)
+
+producer.send(topic, "*COMMIT*".encode("utf-8"))
+time.sleep(2)
+producer.close()
+```
+
+You can also take a look at the sources of the [Redpanda project](https://github.com/pathwaycom/pathway/tree/main/examples/projects/best-movies-example/redpanda-version).
+
+With this, the results are the same as with Kafka:
+```csv
+movieId,average_rating,views,time,diff
+296,5,1,1680008702067,1
+306,3.5,1,1680008702167,1
+[...]
+296,3.3333333333333335,3,1680008703767,-1
+1088,3.6666666666666665,3,1680008703767,1
+296,3.3333333333333335,3,1680008703767,1
+899,3.1666666666666665,3,1680008703767,-1
+```
+
+## Bonus: sending your results to Redpanda
+
+You've successfully computed the *K* best-rated movies using Redpanda, and your ranking is automatically updated thanks to Pathway.
+However, your team still isn't satisfied with the outcome.
+After taking a closer look, you realize the issue: you've been sending your results to a CSV file, which isn't the most suitable format for handling a data stream.
+Not only that, but the file stays on your local computer, preventing others in the organization from accessing the data in real time.
+
+Your team suggests sending the results back to Redpanda into a `best_ratings` topic instead.
+Redpanda is optimized for handling data streams, making it a more efficient and effective solution than sending data in a CSV file.
+By doing this, you can ensure the data is accessible to everyone who needs it in real time.
+
+Connecting to Redpanda with Pathway is as easy as connecting to Kafka.
+You need to use the Redpanda connector [`pw.io.redpanda.write`](/developers/api-docs/pathway-io/redpanda#pathway.io.redpanda.write), which is exactly the same as the Kafka connector:
+
+```python
+rdkafka_settings = {
+    "bootstrap.servers": "redpanda:9092",
+    "security.protocol": "plaintext",
+    "group.id": "$GROUP_NAME",
+    "session.timeout.ms": "6000",
+}
+pw.io.redpanda.write(
+    t_best_ratings,
+    rdkafka_settings,
+    topic_name="best_ratings",
+    format="json"
+)
+```
+
+As previously mentioned, you can also establish a more secure connection using SASL-SSL authentication over a SCRAM-SHA-256 mechanism (e.g. when connecting to Redpanda Cloud) as follows:
+
+```python
+rdkafka_settings = {
+    "bootstrap.servers": "redpanda:9092",
+    "security.protocol": "sasl_ssl",
+    "sasl.mechanism": "SCRAM-SHA-256",
+    "group.id": "$GROUP_NAME",
+    "session.timeout.ms": "6000",
+    "sasl.username": "username",
+    "sasl.password": "********",
+}
+```
+
+
+## Conclusions
+
+Congratulations! 🎉 You have built a K-best-rated application in Pathway and made it work with Redpanda.
+Being fully Kafka API-compatible, Redpanda just works: if the server name remains the same, you have nothing to do with your Pathway code! +For consistency, you can use Pathway's Redpanda connectors which work exactly the same as Kafka connectors. diff --git a/docs/2.developers/4.user-guide/040.connectors/90.websockets-connectors.md b/docs/2.developers/4.user-guide/040.connectors/90.websockets-connectors.md new file mode 100644 index 00000000..5def6df6 --- /dev/null +++ b/docs/2.developers/4.user-guide/040.connectors/90.websockets-connectors.md @@ -0,0 +1,157 @@ +--- +title: 'Custom WebSockets connectors' +description: 'Creating a custom WebSockets connector in Pathway' +date: '2023-01-24' +tags: ['tutorial', 'connectors'] +keywords: ['connector', 'WebSockets', 'Python'] +--- + +# Consuming WebSockets streams + +In this tutorial, you will be guided through creating a custom WebSocket connector. It will allow you to interact with WebSocket data streams and process them as needed. + +You will learn how to implement a custom [Python connector](/developers/user-guide/connectors/custom-python-connectors/) that utilizes the [`aiohttp` library](https://docs.aiohttp.org/en/stable/) to consume WebSockets data streams. + +The WebSockets protocol's nature means that each case's communication process can be unique. +This tutorial will focus on an API that requires a multi-step message exchange to connect to the relevant data streams. +By properly modifying this example, you can connect to any WebSocket API. + +## Generic connector +Let's start by abstracting the logic of the WebSockets connection. For this purpose, you need to implement a class inheriting from [`pw.io.python.ConnectorSubject`](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject): + + +```python +import pathway as pw +import asyncio +import aiohttp +from aiohttp.client_ws import ClientWebSocketResponse + + +class AIOHttpWebsocketSubject(pw.io.python.ConnectorSubject): + _url: str + + def __init__(self, url: str): + super().__init__() + self._url = url + + def run(self): + async def consume(): + async with aiohttp.ClientSession() as session: + async with session.ws_connect(self._url) as ws: + async for msg in ws: + if msg.type == aiohttp.WSMsgType.CLOSE: + break + else: + result = await self.on_ws_message(msg, ws) + for row in result: + self.next_json(row) + + asyncio.new_event_loop().run_until_complete(consume()) + + async def on_ws_message(self, msg, ws: ClientWebSocketResponse) -> list[dict]: + ... + +``` + +The `run` method contains the base message consumption logic. Please note that the `consume` function will be executed within a dedicated `asyncio` loop. The message processing logic has been delegated to an abstract method. The result of the processing is then sent to the buffer using the [self.next_json()](/developers/api-docs/pathway-io/python#pathway.io.python.ConnectorSubject.next_json) method. This method is provided by the `pw.io.python.ConnectorSubject` superclass. + +## Real-world example + +As an example, you will consume the [Polygon.io Stocks API](https://polygon.io/docs/stocks/ws_getting-started) with a connector that subscribes to a one-second aggregations of selected stocks. 
+ +```python +import json + + +class PolygonSubject(AIOHttpWebsocketSubject): + _api_key: str + _symbols: str + + def __init__(self, url: str, api_key: str, symbols: str): + super().__init__(url) + self._api_key = api_key + self._symbols = symbols + + async def on_ws_message( + self, msg: aiohttp.WSMessage, ws: ClientWebSocketResponse + ) -> list[dict]: + if msg.type == aiohttp.WSMsgType.TEXT: + result = [] + payload = json.loads(msg.data) + for object in payload: + match object: + case {"ev": "status", "status": "connected"}: + # make authorization request if connected successfully + await self._authorize(ws) + case {"ev": "status", "status": "auth_success"}: + # request a stream, once authenticated + await self._subscribe(ws) + case {"ev": "A"}: + # append data object to results list + result.append(object) + case {"ev": "status", "status": "error"}: + raise RuntimeError(object["message"]) + case _: + raise RuntimeError(f"Unhandled payload: {object}") + return result + else: + return [] + + async def _authorize(self, ws: ClientWebSocketResponse): + await ws.send_json({"action": "auth", "params": self._api_key}) + + async def _subscribe(self, ws: ClientWebSocketResponse): + await ws.send_json({"action": "subscribe", "params": self._symbols}) +``` + +Each message is a serialized JSON, which contains a list of objects. However, the API requires authentication before we start receiving messages with the data of interest. After a successful connection, indicated by a message with a certain status, we send an authorization message with the API key. Once the authorization is successful, we can send another message to subscribe to the selected data streams. From this point onward, we expect to receive messages containing aggregations of stock market data. + +Let's now define a [schema](/developers/user-guide/types-in-pathway/schema) that describes the structure of the resulting Pathway Table. It will correspond to the incoming data from the API since the incoming payload is not modified. + + +```python +class StockAggregates(pw.Schema): + sym: str # stock symbol + o: float # opening price + v: int # tick volume + s: int # starting tick timestamp + e: int # ending tick timestamp + ... 
+``` + +You can now utilize the previously defined subject to create an input table using [`pw.io.python.read`](/developers/api-docs/pathway-io/python#pathway.io.python.read): + + +```python +URL = "wss://delayed.polygon.io/stocks" +API_KEY = "your-api-key" + +subject = PolygonSubject(url=URL, api_key=API_KEY, symbols=".*") + +table = pw.io.python.read(subject, schema=StockAggregates) +``` + +For this tutorial, you can use [`pw.io.subscribe`](/developers/api-docs/pathway-io#pathway.io.subscribe) to subscribe to the changes occurring within the table: + + +```python +import logging + +def on_change( + key: pw.Pointer, + row: dict, + time: int, + is_addition: bool, +): + logging.info(f"{time}: {row}") + + +pw.io.subscribe(table, on_change) +``` + +Now, all that's left is to run Pathway pipeline with [`pw.run`](/developers/api-docs/pathway/#pathway.run): + + +```python +pw.run() +``` diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.async-transformer/__init__.py b/docs/2.developers/4.user-guide/050.data-transformation/.async-transformer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.async-transformer/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.async-transformer/article.py new file mode 100644 index 00000000..171193ff --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/.async-transformer/article.py @@ -0,0 +1,312 @@ +# --- +# title: Asynchronous transformations +# description: An article explaining how to perform asynchronous data transformations in Pathway +# date: '2024-02-20' +# thumbnail: '' +# tags: ['tutorial', 'engineering'] +# keywords: ['python', 'function', 'asynchronous', 'transformation', 'query', 'AsyncTransformer'] +# notebook_export_path: notebooks/tutorials/asynctransformer.ipynb +# --- + +# %% [markdown] +# # AsyncTransformer +# One way of transforming data in Pathway, when simple transformations are not enough, is using [UDFs](/developers/user-guide/data-transformation/user-defined-functions). +# However, if the flexibility of the UDFs is still not enough, you can use even more general and flexible `AsyncTransformer`, useful especially for asynchronous computations. +# +# `AsyncTransformer` is a different mechanism than UDFs. +# It acts on the whole Pathway Table and returns a new Table. +# In contrast to UDFs, it is fully asynchronous. +# It starts the `invoke` method for every row that arrives, +# without waiting for the previous batches to finish. +# When the call is finished, its result is returned to the engine with a new processing time. +# +# To write an `AsyncTransformer` you need to inherit from [`pw.AsyncTransformer`](/developers/api-docs/pathway/#pathway.AsyncTransformer) and implement the [`invoke` method](/developers/api-docs/pathway/#pathway.AsyncTransformer.invoke) (it is a coroutine). The names of the arguments of the method have to be the same as the columns in the input table. You can use additional arguments but then you have to specify their default value (it might be useful if you want to use the same `AsyncTransformer` on multiple Pathway tables with different sets of columns). You have to use all columns from the input table. The order of columns/arguments doesn't matter as they are passed to the method as keyword arguments. +# +# You also need to define the schema of a table that is produced. The `invoke` method has to return a dictionary containing values to put in all columns of the output table. 
The keys in the dictionary has to match fields from the output schema. +# Let's create a simple `AsyncTransformer` that produces a Table with two output columns - `value` and `ret`. + + +# %% +import pathway as pw +import asyncio + + +class OutputSchema(pw.Schema): + value: int + ret: int + + +class SimpleAsyncTransformer(pw.AsyncTransformer, output_schema=OutputSchema): + async def invoke(self, value: int) -> dict: + await asyncio.sleep(value / 10) + return dict(value=value, ret=value + 1) + + +# %% [markdown] +# Let's use the transformer on the example input table. +# The result table containing only successful calls can be retrieved from the [`successful`](/developers/api-docs/pathway/#pathway.AsyncTransformer.successful) property of the transformer. + +# %% +table = pw.debug.table_from_markdown( + """ + value + 12 + 6 + 2 + 2 + 6 + +""" +) + +result = SimpleAsyncTransformer(input_table=table).successful +pw.debug.compute_and_print(result) + +# %% [markdown] +# The result is correct. Now let's take a look at the output times: + +# %% +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# Even though all values have equal processing times initially, +# the output times are different between the rows. +# It is the effect of `AsyncTransformer` not waiting for other rows to finish. +# Thanks to that some rows can be processed downstream quicker. +# If you want some rows to wait for some other rows to finish, +# take a look at the [`instance`](#asynctransformer-consistency) parameter. + +# %% [markdown] +# ## Failing calls +# The `invoke` method is usually written by an external user (like you) and it can contain bugs (unless you write bug-free code). +# When the `invoke` call raises an exception or times out (see the [next section](#controlling-asynctransformer-behavior) for that), its output won't be included in the `successful` table. +# The failed rows are put in the table accessible by the [`failed`](/developers/api-docs/pathway/#pathway.AsyncTransformer.failed) property. +# Let's define a new `AsyncTransformer` to check that. +# Maybe we don't like the value $12$ and we fail our function whenever we get it as an argument. + + +# %% +class SometimesFailingAsyncTransformer(pw.AsyncTransformer, output_schema=OutputSchema): + async def invoke(self, value: int) -> dict: + if value == 12: + raise ValueError("incorrect value") + return dict(value=value, ret=value + 1) + + +t = SometimesFailingAsyncTransformer(input_table=table) +pw.debug.compute_and_print(t.successful) +pw.debug.compute_and_print(t.failed) + +# %% [markdown] +# In the failed table you only get the ids of failed rows (other columns contain `None`). +# Because the `invoke` call failed it was impossible to return any values. +# You can check which values have failed by joining with the input table: + +# %% +failed = t.failed.join(table, pw.left.id == pw.right.id, id=pw.left.id).select( + pw.right.value +) +pw.debug.compute_and_print(failed) + +# %% [markdown] +# Now, you can see that the failed row actually has $12$ in the `value` column. + +# %% [markdown] +# ## Controlling AsyncTransformer behavior +# It is possible to control the behavior of `AsyncTransformer` using parameters similar to those in UDFs. +# They can be passed to [`with_options`](/developers/api-docs/pathway#pathway.AsyncTransformer.with_options) method. 
+# The available options are: +# - `capacity` - the maximum number of concurrent operations, +# - `timeout` - the maximum time (in seconds) to wait for the function result, +# - `retry_strategy` - the strategy for handling retries in case of failures. +# The same strategies as for asynchronous UDFs can be used. +# Examples: [`ExponentialBackoffRetryStrategy`](/developers/api-docs/udfs#pathway.udfs.ExponentialBackoffRetryStrategy), [`FixedDelayRetryStrategy`](/developers/api-docs/udfs#pathway.udfs.FixedDelayRetryStrategy), +# - `cache_strategy` - the caching strategy. The same strategies as for UDFs can be used. Examples: [`DiskCache`](/developers/api-docs/udfs#pathway.udfs.DiskCache), [`InMemoryCache`](/developers/api-docs/udfs#pathway.udfs.InMemoryCache). + +# In the following example, you add a timeout to the `SimpleAsyncTransformer` defined above. +# It is set to $0.9$ seconds. +# %% +t = SimpleAsyncTransformer(input_table=table).with_options(timeout=0.9) + +pw.debug.compute_and_print(t.successful) +failed = t.failed.join(table, pw.left.id == pw.right.id).select(pw.right.value) +pw.debug.compute_and_print(failed) + +# %% [markdown] +# Recall that the transformer sleeps the `invoke` method for a time passed as the method argument divided by $10$. +# That's why calls with `value` less than $9$ were successful, and calls with `value` greater than $9$ failed. + +# %% [markdown] +# ## AsyncTransformer consistency +# By default, `AsyncTransformer` preserves order for a given key. +# It means that if some row is still executed by `AsyncTransformer` and its update starts being executed and finishes earlier than the original row, +# it'll wait for the completion of the original row processing before being returned to the engine. +# The update cannot have an earlier time assigned than the original row as it would break the correctness of the computations. +# +# Let's analyze this case by computing the sums of entries from the stream. +# You want to compute the sum for each `group` independently. + +# %% +table = pw.debug.table_from_markdown( + """ + group | value | __time__ + 1 | 2 | 2 + 1 | 3 | 2 + 2 | 1 | 2 + 1 | -3 | 4 + 2 | 2 | 4 +""" +) +sums = table.groupby(pw.this.group).reduce( + pw.this.group, value=pw.reducers.sum(pw.this.value) +) + +pw.debug.compute_and_print_update_stream(sums) + +# %% [markdown] +# The sums computed in time $2$ are $5$ and $1$. +# They are deleted in time $4$ and replaced with sums $2$ and $3$. +# Let's modify `SimpleAsyncTransformer` to propagate the `group` column as well and apply it to the `sums` table. + + +# %% +class OutputWithGroupSchema(pw.Schema): + group: int + value: int + ret: int + + +class GroupAsyncTransformer(pw.AsyncTransformer, output_schema=OutputWithGroupSchema): + async def invoke(self, value: int, group: int) -> dict: + await asyncio.sleep(value / 10) + return dict(group=group, value=value, ret=value + 1) + + +result = GroupAsyncTransformer(input_table=sums).successful +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# All rows reach `GroupAsyncTransformer` at approximately the same time. +# In group $2$, the value at time $2$ is $1$, and at time $4$ is $3$. +# The first value is processed faster and returned to the engine. +# When a call for the next value finishes, the old value is removed and a new value is returned to the engine. +# +# The situation for group $1$ is different. +# The value at time $2$ is greater than the value at time $4$ ($5 > 2$). 
+# Because of that, the second call to `invoke` finishes earlier and has to wait for the first call to finish. +# When the first call finishes, they are both returned to the engine. +# The value from the second call is newer and immediately replaces the old value. + +# %% [markdown] +# ### Partial consistency +# Sometimes, the consistency for rows with a single key might not be enough for you. +# If you want to guarantee an order within a group of records, you can use the `instance` parameter of the `AsyncTransformer`. +# Rows within a single `instance` are ordered. +# It means that the results for rows with higher initial processing times can't overtake the results for rows with lower initial processing times. +# All results within a single instance with equal processing times wait for all rows with this time to finish. +# Using the `instance` parameter does not block new calls from starting. Only the results of the calls get synchronized. +# To demonstrate the synchronization, we create a new table with more data: + +# %% +table = pw.debug.table_from_markdown( + """ + group | value | __time__ + 1 | 2 | 2 + 1 | 3 | 2 + 2 | 1 | 2 + 3 | 1 | 2 + 4 | 3 | 2 + 1 | -3 | 4 + 2 | 3 | 4 + 3 | 1 | 4 + 4 | -1 | 4 +""" +) +sums = table.groupby(pw.this.group).reduce( + pw.this.group, value=pw.reducers.sum(pw.this.value) +) + +pw.debug.compute_and_print_update_stream(sums) + +# %% [markdown] +# Now, you have four groups, with one row for each group. +# You want to guarantee consistency separately for even and odd groups. +# To do that, you need to set the `instance` of `GroupAsyncTransformer` appropriately. + +# %% +result = GroupAsyncTransformer(input_table=sums, instance=pw.this.group % 2).successful +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# The updates for groups $2,4$ are bundled together. +# Group $2$ at time $2$ could finish earlier, but it waits for group $4$. +# Groups $1,3$ are also dependent on each other. +# Group $3$ could finish quicker, but it waits for group $1$ to finish. +# +# You can have a look at how the updates would proceed if no `instance` was specified: + +# %% +result = GroupAsyncTransformer(input_table=sums).successful +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# As you can see, only ordering within a group is preserved. + +# %% [markdown] +# ### Full consistency +# By using the `instance` parameter, it is possible to make the output preserve the temporal ordering of the input. +# It is enough to set `instance` to the same value for all rows, for example by using a constant. +# Then results for rows with a given time will wait for all previous times to finish before being returned to the engine. +# Rows with a given time are returned all at once and have the same time assigned. +# The new calls are not blocked from starting. Only the results get synchronized. +# +# Let's use constant `instance` in the example from the previous section. + +# %% +result = GroupAsyncTransformer(input_table=sums, instance=0).successful +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# All rows are returned at the same time. +# There are also no updates because calls for time $2$ are finished later than calls for time $4$. +# You can play with the data to make time $2$ finish before time $4$ and see that the update happens once. + +# %% [markdown] +# ### Failing calls consistency +# If the `instance` parameter is used and the call for a given instance fails, the instance is in the failed state from this time. 
+# `AsyncTransformer` requires all calls with a given `(instance, processing time)` pair to finish successfully. +# If at least one call fails, returning other rows could leave the instance in an inconsistent state. +# Let's take a look at what happens if `group` $4$ fails at time $4$. + + +# %% +class SuspiciousGroupAsyncTransformer( + pw.AsyncTransformer, output_schema=OutputWithGroupSchema +): + async def invoke(self, value: int, group: int) -> dict: + if group == 4 and value == 2: + raise ValueError("err") + await asyncio.sleep(value / 10) + return dict(group=group, value=value, ret=value + 1) + + +result = SuspiciousGroupAsyncTransformer( + input_table=sums, instance=pw.this.group % 2 +).successful +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# New values for the even instance (groups $2,4$) coming from the entries at time $4$ are not inserted because group $4$ fails and hence the whole instance fails. None of the entries in the odd instance (groups $1,3$) fail so it is updated normally. + +# %% [markdown] +# ## Conclusions +# In this guide, you've learned how to create your own `AsyncTransformer` +# when you need to process the data asynchronously in Pathway. +# You know how to control its behavior by setting parameters like `timeout`, `cache_strategy` and `retry_strategy`. +# You can control the tradeoff between the speed and the consistency of the results. +# +# Now, you also understand the difference between asynchronous UDFs and AsyncTransformer. +# The former is asynchronous only within a single batch of data +# and can return values only to a single column, +# while the latter is fully asynchronous and can return multiple columns. +# It also allows for specifying the consistency level by using the `instance` parameter. diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.custom_reducers/__init__.py b/docs/2.developers/4.user-guide/050.data-transformation/.custom_reducers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.custom_reducers/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.custom_reducers/article.py new file mode 100644 index 00000000..304af33b --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/.custom_reducers/article.py @@ -0,0 +1,151 @@ +# --- +# title: Writing custom reducers +# description: An article explaining how to write custom reducers in Pathway +# date: '2024-01-17' +# thumbnail: '/assets/content/blog/th-json.png' +# tags: ['tutorial'] +# keywords: ['reducers', 'aggregate', 'sumofsquares', 'median'] +# notebook_export_path: notebooks/tutorials/custom_reducers.ipynb +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Writing simple custom reducer in Pathway +# +# Pathway supports natively aggregation using a wide range of [reducers](/developers/api-docs/reducers/), e.g., [`sum`](/developers/api-docs/reducers/#pathway.reducers.sum), [`count`](/developers/api-docs/reducers/#pathway.reducers.count), or [`max`](/developers/api-docs/reducers/#pathway.reducers.max). However, those might not cover all the necessary ways of aggregating values. In this tutorial, you learn how to write reducers implementing custom logic. 
+#
+# For example, let's implement a custom `stdev` reducer that computes the standard deviation.
+# %%
+import pathway as pw
+
+SHOW_DEBUG = False
+
+
+class StdDevAccumulator(pw.BaseCustomAccumulator):
+    def __init__(self, cnt, sum, sum_sq):
+        self.cnt = cnt
+        self.sum = sum
+        self.sum_sq = sum_sq
+
+    @classmethod
+    def from_row(cls, row):
+        [val] = row
+        if SHOW_DEBUG:
+            print("from_row()")
+        return cls(1, val, val**2)
+
+    def update(self, other):
+        self.cnt += other.cnt
+        self.sum += other.sum
+        self.sum_sq += other.sum_sq
+        if SHOW_DEBUG:
+            print("update()")
+
+    def compute_result(self) -> float:
+        mean = self.sum / self.cnt
+        mean_sq = self.sum_sq / self.cnt
+        if SHOW_DEBUG:
+            print("compute_result()")
+        # standard deviation = sqrt(mean of squares - square of the mean)
+        return (mean_sq - mean**2) ** 0.5
+
+
+stddev = pw.reducers.udf_reducer(StdDevAccumulator)
+# %% [markdown]
+# Above, the [`pw.BaseCustomAccumulator`](/developers/api-docs/pathway/#pathway.BaseCustomAccumulator) class is used as a base for the `StdDevAccumulator`, which describes the logic of the underlying accumulator. The accumulator class requires a few methods:
+# * [`from_row`](/developers/api-docs/pathway/#pathway.BaseCustomAccumulator.from_row), which constructs an accumulator from the values of a single row of a table (here, a single value since our reducer applies to a single column),
+# * [`update`](/developers/api-docs/pathway/#pathway.BaseCustomAccumulator.update), which updates one accumulator by another accumulator,
+# * [`compute_result`](/developers/api-docs/pathway/#pathway.BaseCustomAccumulator.compute_result), which produces the output based on the accumulator state,
+# * [`retract`](/developers/api-docs/pathway/#pathway.BaseCustomAccumulator.retract), an optional method, which processes negative updates,
+# * [`neutral`](/developers/api-docs/pathway/#pathway.BaseCustomAccumulator.neutral), an optional method, which returns the state corresponding to consuming 0 rows.
+#
+# Now, let's see the reducer in action.
+# %%
+temperature_data = pw.debug.table_from_markdown(
+    """
+date       | temperature
+2023-06-06 | 28.0
+2023-06-07 | 23.1
+2023-06-08 | 24.5
+2023-06-09 | 26.0
+2023-06-10 | 28.3
+2023-06-11 | 25.7
+"""
+)
+
+temperature_statistics = temperature_data.reduce(
+    avg=pw.reducers.avg(pw.this.temperature), stddev=stddev(pw.this.temperature)
+)
+
+pw.debug.compute_and_print(temperature_statistics)
+# %% [markdown]
+# However, with this logic, our reducer does not process negative updates smartly: it starts the computation from scratch whenever a negative update is encountered.
+# You can see this in action by enabling debug information and processing a table where a row removal happens. Let's insert several values at time 0 and then remove one already inserted value and add another at time 2.
+# %%
+SHOW_DEBUG = True
+temperature_data_with_updates = pw.debug.table_from_markdown(
+    """
+date       | temperature | __time__ | __diff__
+2023-06-06 | 28.0        | 0        | 1
+2023-06-07 | 23.1        | 0        | 1
+2023-06-08 | 24.5        | 0        | 1
+2023-06-09 | 26.0        | 0        | 1
+2023-06-10 | 28.3        | 0        | 1
+2023-06-11 | 25.7        | 0        | 1
+2023-06-11 | 25.7        | 2        | -1
+2023-06-11 | 25.9        | 2        | 1
+"""
+)
+
+temperature_statistics_with_updates = temperature_data_with_updates.reduce(
+    avg=pw.reducers.avg(pw.this.temperature), stddev=stddev(pw.this.temperature)
+)
+
+pw.debug.compute_and_print(temperature_statistics_with_updates)
+
+
+# %% [markdown]
+# It can be alleviated by extending our reducer and providing a method for processing negative updates.
+# %% +class ImprovedStdDevAccumulator(StdDevAccumulator): + def retract(self, other): + self.cnt -= other.cnt + self.sum -= other.sum + self.sum_sq -= other.sum_sq + if SHOW_DEBUG: + print("retract()") + + +improved_stddev = pw.reducers.udf_reducer(ImprovedStdDevAccumulator) +# %% [markdown] +# And now you can test the improved reducer in action. +# %% + +temperature_statistics_improved = temperature_data_with_updates.reduce( + avg=pw.reducers.avg(pw.this.temperature), + stddev=improved_stddev(pw.this.temperature), +) + +pw.debug.compute_and_print(temperature_statistics_improved) + +# %% [markdown] +# In the example above, 10x calls to `update()` and 12x calls to `from_row()` are replaced with 6x calls to `update()`, 1x call to `retract()` and 8x calls to `from_row()`. +# +# This comes from the fact that former reducer: +# * had to call `from_row()` for each row of the table, wrapping each single value into separate `StdDevAccumulator` object, +# * had to call `update()` for each row of the table except the first consumed, +# * had to restart from scratch after the update to the table, thus it had to pay the cost twice. +# +# While the latter reducer aggregated the table at time 0 in the same way as former one, but processed the update differently: +# * had to wrap both delete and insert updates with `from_row()` calls +# * called once `retract()` and once `update()`. \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.groupby-reduce-manual/__init__.py b/docs/2.developers/4.user-guide/050.data-transformation/.groupby-reduce-manual/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.groupby-reduce-manual/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.groupby-reduce-manual/article.py new file mode 100644 index 00000000..2895aab6 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/.groupby-reduce-manual/article.py @@ -0,0 +1,297 @@ +# --- +# title: "Groupby Reduce manu[a]l" +# description: 'Groupby Reduce manu[a]l' +# notebook_export_path: notebooks/tutorials/groupby_reduce_manual.ipynb +# --- + +# # Groupby - Reduce +# In this manu\[a\]l, you will learn how to aggregate data with the groupby-reduce scheme. +# +# Together, the [`groupby`](/developers/api-docs/pathway-table#pathway.Table.groupby) and [`reduce`](/developers/api-docs/pathway-table#pathway.Table.reduce) operations can be used +# to **aggregate data across the rows of the table**. In this guide, +# we expand upon a simple demonstration from the +# [First-steps Guide](/developers/user-guide/data-transformation/table-operations/) +# and: +# * explain syntax of [groupby](#groupby-syntax) and [reduce](#reduce-syntax) +# * explain [two kinds of columns we get from groupby](#groupby-reduce-column-constrains) +# * explain [automatic id generation](#auto-generated-id) +# * show [a few simple applications](#more-examples) + +# ## Prerequisites +# The assumption is that we are familiar with some basic operations +# explained in the [First-steps Guide](/developers/user-guide/data-transformation/table-operations/). +# As usual, we begin with importing Pathway. + +import pathway as pw + +# To demonstrate the capabilities of groupby and reduce operations, +# let us consider a made up scenario. +# +# **Storyline:** let's assume that you made a poll, asking whether a particular food item is a fruit or a vegetable. 
+#
+# An answer to such a question is a tuple `(food_item, label, vote, fractional_vote)`.
+# That is, if someone could tell that tomato is a fruit, but they are not really sure,
+# it could be registered as two tuples:
+#
+# * (tomato, fruit, 1, 0.5),
+#
+# * (tomato, vegetable, 0, 0.5)
+#
+# Below, we have the results of the poll, stored in the table *poll*.
+
+
+poll = pw.debug.table_from_markdown(
+    """
+    | food_item | label     | vote  | fractional_vote   | time
+0   | tomato    | fruit     | 1     | 0.5               | 1669882728
+1   | tomato    | vegetable | 0     | 0.5               | 1669882728
+2   | apple     | fruit     | 1     | 1                 | 1669883612
+3   | apple     | vegetable | 0     | 0                 | 1669883612
+4   | pepper    | vegetable | 1     | 0.5               | 1669883059
+5   | pepper    | fruit     | 0     | 0.5               | 1669883059
+6   | tomato    | fruit     | 0     | 0.3               | 1669880159
+7   | tomato    | vegetable | 1     | 0.7               | 1669880159
+8   | corn      | fruit     | 0     | 0.3               | 1669876829
+9   | corn      | vegetable | 1     | 0.7               | 1669876829
+10  | tomato    | fruit     | 0     | 0.4               | 1669874325
+11  | tomato    | vegetable | 1     | 0.6               | 1669874325
+12  | pepper    | fruit     | 0     | 0.45              | 1669887207
+13  | pepper    | vegetable | 1     | 0.55              | 1669887207
+14  | apple     | fruit     | 1     | 1                 | 1669874325
+15  | apple     | vegetable | 0     | 0                 | 1669874325
+
+"""
+)
+
+# To demonstrate a simple groupby-reduce application, let's ask about
+# the total `fractional_vote` that was assigned to any combination of `food_item`, `label`.
+#
+#
+# First, we explain the syntax of both [`groupby`](#groupby-syntax) and [`reduce`](#reduce-syntax).
+# Then, we show groupby-reduce code in [action](#groupby-reduce-simple-example).
+
+# ## Groupby Syntax
+# The syntax of the [`groupby`](/developers/api-docs/pathway-table#pathway.Table.groupby) operation is fairly simple:
+
+# +
+# _MD_SHOW_table.groupby(*C)
+# -
+
+#
+# It takes a list of columns `*C` as arguments and groups the rows according to their values in those columns.
+# In other words, all the rows with the same values, column-wise, in each column of `*C` are put into the same group.
+#
+# As a result, it returns a [`GroupedTable`](/developers/api-docs/pathway/#pathway.GroupedTable) object, which stores
+# a single row for each unique tuple from columns in `*C` and a collection
+# of grouped items corresponding to each column that is not in `*C`.
+#
+# In the example above, if we groupby over a pair of columns `food_item`, `label`,
+# the groupby computes a collection of `votes` and a collection of `fractional_votes`
+# for each unique combination `food_item`, `label`.
+#
+# In order to use this object, we need to process those collections
+# with the `reduce` operation.
+#
+# ## Reduce Syntax
+# The [`reduce`](/developers/api-docs/pathway/#pathway.GroupedTable.reduce) function behaves a little bit like [`select`](/developers/api-docs/pathway-table#pathway.Table.select), and it also takes
+# two kinds of arguments:
+
+# +
+# _MD_SHOW_grouped_table.reduce(*SC, *NC)
+# -
+
+# * *SC is simply a list of columns that were present in the table before the groupby
+# operation,
+# * *NC is a list of new columns (i.e. columns with new names), each defined as
+# some function of cells in the row of grouped_table.
+#
+# It can be used together with groupby, as follows:
+
+# +
+# _MD_SHOW_table.groupby(*C).reduce(*SC, *NC)
+# -
+
+# The main difference between `reduce` and `select` is that each row of `grouped_table`
+# has two kinds of entries, simple cells and groups.
+#
+# The `reduce` operation allows us to apply a reducer to transform a group into a value.
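+#
+# As a quick warm-up, here is a minimal sketch (reusing the `poll` table defined above, with illustrative names)
+# that counts how many answers were registered for each `food_item`; the full vote-counting example follows in the next section.

+# +
+# A minimal sketch: group the poll by `food_item` and count the rows in each group.
+answers_per_item = poll.groupby(poll.food_item).reduce(
+    poll.food_item,
+    answers=pw.reducers.count(),
+)
+
+pw.debug.compute_and_print(answers_per_item)
+# -
+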
+# +# ## Counting Votes With Groupby-Reduce +# Below, you can see an example that uses the [`sum`](/developers/api-docs/reducers/#pathway.reducers.sum) reducer to compute the sum of all +# votes and the sum of all fractional votes. + +# + +aggregated_results = poll.groupby(poll.food_item, poll.label).reduce( + poll.food_item, + poll.label, + total_votes=pw.reducers.sum(poll.vote), + total_fractional_vote=pw.reducers.sum(poll.fractional_vote), +) + +pw.debug.compute_and_print(aggregated_results) + +# - + +# ## Groupby-Reduce Column Constraints +# To briefly summarize, if $T$ is the set of all columns of a table +# * the columns from $C$ (used as comparison key) can be used as regular columns in the reduce function +# * for all the remaining columns (in $T \setminus C$), we need to apply a reducer, before we use them in expressions +# +# In particular, we can mix columns from $C$ and reduced columns from $T \setminus C$ in column expressions. + +# + +def make_a_note(label: str, tot_votes: int, tot_fractional_vote: float): + return f"{label} got {tot_votes} votes, with total fractional_vote of {round(tot_fractional_vote, 2)}" + + +aggregated_results_note = poll.groupby(poll.food_item, poll.label).reduce( + poll.food_item, + note=pw.apply( + make_a_note, + poll.label, + pw.reducers.sum(poll.vote), + pw.reducers.sum(poll.fractional_vote), + ), +) + +pw.debug.compute_and_print(aggregated_results_note) +# - + +# ## Auto generated id +# The `groupby(*C).reduce(...)` operation guarantees that each row of the +# output corresponds to a unique tuple of values from *C. Therefore *C +# can be used to generate a unique id for the resulting table. +# +# In fact that is the default behavior of Pathway' groupby operation and can be used e.g. to +# join the results of this table with some other table that has this id as a foreign key. + +# + +queries = pw.debug.table_from_markdown( + """ + | food_item | label +1 | tomato | fruit +2 | pepper | vegetable +3 | corn | vegetable + +""" +).with_id_from(pw.this.food_item, pw.this.label) + +pw.debug.compute_and_print( + queries.join( + aggregated_results_note, queries.id == aggregated_results_note.id, id=queries.id + ).select(queries.food_item, aggregated_results_note.note) +) +# - + +# More examples of joins (including another example of a join over a foreign key) +# can be found in the join manual ([full article](/developers/user-guide/data-transformation/join-manual/), [foreign key example](/developers/user-guide/data-transformation/join-manual/#joins-on-a-foreign-key)). +# + +# ## More Examples +# ### Recent activity with max reducer +# Below, you can see a piece of code that finds the latest votes that were submitted to the poll. +# It is done with `groupby`-`reduce` operations chained with [`join`](/developers/api-docs/pathway-table#pathway.Table.join) and [`filter`](/developers/api-docs/pathway-table#pathway.Table.filter), using [`pw.this`](/developers/api-docs/pathway/#pathway.this). +# +# + +hour = 3600 +pw.debug.compute_and_print( + poll.groupby() + .reduce(time=pw.reducers.max(poll.time)) + .join(poll, id=poll.id) + .select( + poll.food_item, + poll.label, + poll.vote, + poll.fractional_vote, + poll.time, + latest=pw.left.time, + ) + .filter(pw.this.time >= pw.this.latest - 2 * hour) + .select( + pw.this.food_item, + pw.this.label, + pw.this.vote, + pw.this.fractional_vote, + ) +) + +# ### Removing duplicates +# Below, duplicates are removed from the table with groupby-reduce. 
+# On its own, selecting `food_item` and `label` from poll returns duplicate rows: + +pw.debug.compute_and_print(poll.select(poll.food_item, poll.label)) + +# However, we can apply groupby-reduce to select a set of unique rows: + +pw.debug.compute_and_print( + poll.groupby(poll.food_item, poll.label).reduce(poll.food_item, poll.label) +) + + +# ### Chained groupby-reduce-join-select +# Below, you can find an example of groupby - reduce chained with join -select, +# using `pw.this`. +# +# To demonstrate that, we can ask our poll about total fractional vote for each pair +# `food_item`, `label` and total fractional vote assigned to rows for each `food_item`. + +# + +relative_score = ( + poll.groupby(poll.food_item) + .reduce( + poll.food_item, + total_fractional_vote=pw.reducers.sum(poll.fractional_vote), + ) + .join(aggregated_results, pw.left.food_item == pw.right.food_item) + .select( + pw.left.food_item, + pw.right.label, + label_fractional_vote=pw.right.total_fractional_vote, + total_fractional_vote=pw.left.total_fractional_vote, + ) +) + +pw.debug.compute_and_print(relative_score) +# - + +# ### Election using argmax reducer +# Below, we present a snippet of code, that in the context of a poll, +# finds the most obvious information: which label got the most votes. + +# Let's take a look on what exactly is the result of [`argmax`](/developers/api-docs/reducers/#pathway.reducers.argmax) reducer: + +pw.debug.compute_and_print( + relative_score.groupby(relative_score.food_item).reduce( + argmax_id=pw.reducers.argmax(relative_score.label_fractional_vote) + ) +) + +# As you can see, it returns an ID of the row that maximizes `label_fractional_vote` +# for a fixed `food_item`. +# You can filter interesting rows using those ID-s as follows: + +pw.debug.compute_and_print( + relative_score.groupby(relative_score.food_item) + .reduce(argmax_id=pw.reducers.argmax(relative_score.label_fractional_vote)) + .join(relative_score, pw.left.argmax_id == relative_score.id) + .select( + relative_score.food_item, + relative_score.label, + relative_score.label_fractional_vote, + ) +) + +# *Remark:* the code snippet above is equivalent to: + +pw.debug.compute_and_print( + relative_score.groupby(relative_score.food_item) + .reduce(argmax_id=pw.reducers.argmax(relative_score.label_fractional_vote)) + .select( + relative_score.ix(pw.this.argmax_id).food_item, + relative_score.ix(pw.this.argmax_id).label, + relative_score.ix(pw.this.argmax_id).label_fractional_vote, + ) +) + +# You can read more about [joins](/developers/user-guide/data-transformation/join-manual/), [*.ix](/developers/api-docs/pathway/#property-ix) and [ID-s](/developers/user-guide/data-transformation/table-operations/#manipulating-the-table) in other places. diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.indexes-in-pathway/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.indexes-in-pathway/article.py new file mode 100644 index 00000000..451a5538 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/.indexes-in-pathway/article.py @@ -0,0 +1,438 @@ +# --- +# title: Indexes in Pathway +# description: An article explaining ways of indexing data in Pathway. 
+# date: '2023-11-15' +# thumbnail: '/assets/content/blog/th-computing-pagerank.png' +# tags: ['tutorial', 'engineering'] +# related: ['/developers/showcases/lsh/lsh_chapter1', '/developers/showcases/llm-alert-pathway'] +# keywords: ['index', 'indexing', 'join', 'asof join', 'asof_now', 'KNN'] +# notebook_export_path: notebooks/tutorials/indexes.ipynb +# --- + +# %% [markdown] +# # Indexes in Pathway +# In this article, you'll learn about reactive indexes in Pathway and how they differ from conventional indexes used in databases. You'll also see how to use them to respond to a stream of queries in real time. +# +# Indexes are data structures that improve the speed of queries. They are often used in databases. They are helpful if you want to retrieve records with a specific value in a given column (then you need an index based on this column). An example of this is answering a stream of queries using the contents of a database. +# +# ![LSM index context drawing](assets/content/tutorials/indexes/index.svg) +# +# Indexes can also speed up joins - an existing index can be used if it is built on appropriate columns but also an index can be built ad-hoc, during query execution. +# Pathway offers indexes, but because it operates on streams, there are some differences as compared to database indexes. To learn about them, continue reading the article. + +# %% [markdown] +# ## Joins +# Pathway operates on streams. Unless it is informed otherwise, it assumes that new data can arrive from any stream. Thus, when joining two streams, Pathway has to keep these streams in memory. It builds [LSM trees](https://en.wikipedia.org/wiki/Log-structured_merge-tree) on both sides of a join. Thanks to that, new records arriving in any of the two streams can be joined quickly - it is enough to look them up in the index of the other table and no costly scans are needed. +# In contrast, normal databases only use an index on one side of a join because once the query is processed the join results are not updated. +# +# Let's consider a simple example in which you join two tables in Pathway. Here, a table is built from a simulated stream of changes to its rows. The value in the `__time__` column represents the arrival time of the record to the engine. Rows with the same value in the `__time__` column belong to a single batch. +# +# To use an example with a real streaming source it is enough to replace [`pw.debug.table_from_markdown`](/developers/api-docs/debug/#pathway.debug.table_from_markdown) with an appropriate [connector](/developers/user-guide/connecting-to-data/connectors/) (like Redpanda or Kafka connector). +# +# The tables are joined on the `instance` column. + +# %% +import pathway as pw + +table_a = pw.debug.table_from_markdown( + """ + value | instance | __time__ + 1 | 1 | 2 + 2 | 1 | 6 + 3 | 2 | 8 + 4 | 2 | 12 + """ +) +table_b = pw.debug.table_from_markdown( + """ + value | instance | __time__ + 11 | 1 | 4 + 12 | 2 | 6 + 13 | 1 | 8 + """ +) + +result = table_a.join(table_b, pw.left.instance == pw.right.instance).select( + left_value=pw.left.value, right_value=pw.right.value, instance=pw.this.instance +) + +pw.debug.compute_and_print(result) + +# %% [markdown] +# As you can see, the records from both sides get joined with the future records. It is expected, as Pathway incrementally updates all results to match the input data changes. 
However, if `table_a` would be `queries` on a `table_b` representing the `data` you want to query, you'd be surprised to see that answers to your queries are updated in the future when `data` changes. Let's say, you want to query the number of your website visits by location: + +# %% +import pathway as pw + +queries = pw.debug.table_from_markdown( + """ + query_id | country | __time__ + 1 | France | 4 + 2 | Poland | 6 + 3 | Germany | 8 + 4 | USA | 14 + """ +) +visits = pw.debug.table_from_markdown( + """ + country | __time__ + Poland | 2 + France | 2 + Spain | 2 + Poland | 2 + France | 4 + USA | 4 + USA | 4 + Germany | 6 + USA | 6 + USA | 8 + Poland | 8 + France | 8 + France | 12 + Germany | 14 + """ +) +total_visits_by_country = visits.groupby(pw.this.country).reduce( + pw.this.country, visits=pw.reducers.count() +) + +answers = queries.join( + total_visits_by_country, pw.left.country == pw.right.country +).select(pw.left.query_id, pw.this.country, pw.right.visits) + +pw.debug.compute_and_print(answers) + +# %% [markdown] +# Please note how the answer to your query with `query_no=1` is updated a few times. At first, it is equal to `2`. At time `8`, it changes to `3` and finally is equal to `4` (starting from time `12`). It may be a bit surprising if you're new to Pathway. It turns out, the `join` allows you to keep track of the updates! And it has many cool uses, for instance alerting. You can use it to set up a real-time alerting system. However, if that is not what you want and you'd like to get an answer to your query once, at its processing time, Pathway supports it as well! + +# %% [markdown] +# ## Asof now join +# Monitoring changes of answers to your queries might not be what you want. Especially if you have **a lot of** queries. If you want to get an answer for a query once, and then forget it, you can use [`asof_now_join`](/developers/api-docs/temporal/#pathway.stdlib.temporal.asof_now_join). Its left side is a queries table and the right side is the data you want to query. Note that the right side is still a table dynamically streaming row changes. You can update it but the updates will only affect future queries - no old answers will be updated. +# +# Let's see what `asof_now_join` would return in the example above: + +# %% +import pathway as pw + +queries = pw.debug.table_from_markdown( + """ + query_id | country | __time__ + 1 | France | 4 + 2 | Poland | 6 + 3 | Germany | 8 + 4 | USA | 14 + """ +) +visits = pw.debug.table_from_markdown( + """ + country | __time__ + Poland | 2 + France | 2 + Spain | 2 + Poland | 2 + France | 4 + USA | 4 + USA | 4 + Germany | 6 + USA | 6 + USA | 8 + Poland | 8 + France | 8 + France | 12 + Germany | 14 + """ +) +total_visits_by_country = visits.groupby(pw.this.country).reduce( + pw.this.country, visits=pw.reducers.count() +) + +answers = queries.asof_now_join( + total_visits_by_country, pw.left.country == pw.right.country +).select(pw.left.query_id, pw.this.country, pw.right.visits) + +pw.debug.compute_and_print(answers) + +# %% [markdown] +# This time the answers to queries are not updated. In particular, the answer to the query with `query_no=1` is set to `2` at time `4` and remains equal to `2` until the end of the program. +# +# In contrast to an ordinary [`join`](/developers/user-guide/data-transformation/join-manual), `asof_now_join` is not symmetric. New rows on the left side of the join will produce a result under the condition they can be joined with at least one row from the right side. 
If you want to produce at least one result from every query, you can use `asof_now_join_left` - then all columns from the right side in the output row will be set to `None`. On the other hand, new rows on the right side of the join won't immediately produce any new rows in the output but will update the index and if they're matched with new records from the left side later, they will appear in the output. +# +# Please note that for a correct operation, the left table of the `asof_now_join` (`queries`) can only be extended with new queries. Pathway verifies it for you. You can't delete or update the queries. It is quite reasonable. Instead of updating the query, you can just send a new query because your previous query has been already forgotten anyway. +# +# Another important thing is that `asof_now_join` depends on the processing time and because of that is non-deterministic. When sending data to a distributed system like pathway, you don't have a guarantee which data will enter the system first (network latencies, etc.). If both queries and data streams are updated at the same time, the amount of data that is already present in the system when a given query is answered is non-deterministic. Thus if you repeat the same experiment with real connectors (like Redpanda or Kafka), you may get different answers in different runs. If you used an ordinary `join`, you would always get the same answers in the end because the answers would be updated with the arrival of the new data. +# +# The differences between an ordinary `join` and `asof_now_join` used to answer the queries are summarized in the table below. Let's denote the left side of the join as "queries" and the right side as "data". +# +# | `join` | `asof_now_join` | +# |--------|-----------------| +# | updates the answers to old queries | doesn't update the answers to old queries | +# | keeps the queries | forgets the queries | +# | consumes memory (approx.) proportional to the sum of sizes of the queries and data tables | consumes memory (approx.) proportional to the size of the data table | +# | final output deterministic | final output non-deterministic | + + +# %% [markdown] +# ## KNN Index +# An approximate [K Nearest Neighbors (KNN) Index](/developers/showcases/lsh/lsh_chapter1) behaves similarly to a join. The default method [`get_nearest_items`](/developers/api-docs/ml/#pathway.stdlib.ml.index.KNNIndex.get_nearest_items) maintains always up-to-date answers to all queries when the set of indexed documents changes. In fact, it uses a join under the hood. +# +# If you don't want answers to your queries to be updated, you can use [`get_nearest_items_asof_now`](/developers/api-docs/ml/#pathway.stdlib.ml.index.KNNIndex.get_nearest_items_asof_now) (experimental). It'll return the closest points once and will forget the query. However, it'll monitor the stream containing index data and update the index if new data arrives (but won't update old queries). As a result, if you ask the same query again and the index has changed in the meantime, you can get a different answer. This behavior is used in our [llm-app](/developers/user-guide/llm-xpack/llm-app-pathway/) to answer queries using an always up-to-date index of documents. +# +# To understand better the differences between the methods, you can analyze the examples below. In the first example, `index.get_nearest_items` is used which leads to updating the answers. In the second example, `index.get_nearest_items_asof_now` is used and thus the answers are not updated. 
+# Streams of changes to tables are simulated using `pw.debug.table_from_markdown` which, as earlier, uses `__time__` column to split the data into batches. + +# %% +import pathway as pw +from pathway.stdlib.ml.index import KNNIndex + +queries = pw.debug.table_from_markdown( + """ + query_id | x | y | __time__ + 1 | 0 | 0 | 4 + 2 | 2 | -2 | 6 + 3 | -1 | 1 | 8 + 4 | -2 | -3 | 10 + """ +).select(pw.this.query_id, coords=pw.make_tuple(pw.this.x, pw.this.y)) + +data = pw.debug.table_from_markdown( + """ + point_id | x | y | __time__ + A | 2 | 2 | 2 + B | 3 | -2 | 2 + C | -1 | 0 | 6 + D | 1 | 2 | 8 + E | -3 | 1 | 10 + F | 1 | -4 | 12 + """ +).select(pw.this.point_id, coords=pw.make_tuple(pw.this.x, pw.this.y)) + +index = KNNIndex(data.coords, data, n_dimensions=2, n_and=5) +result_with_updates = queries + index.get_nearest_items(queries.coords, k=2).select( + nn_ids=pw.this.point_id, nns=pw.this.coords +) +pw.debug.compute_and_print(result_with_updates) + +# %% [markdown] +# The example below uses the same streams but does not update the answers to the queries. Note the difference between the methods used - `get_nearest_items` vs `get_nearest_items_asof_now`. +# %% +index = KNNIndex(data.coords, data, n_dimensions=2, n_and=5) +result_asof_now = queries + index.get_nearest_items_asof_now( + queries.coords, k=2 +).select(nn_ids=pw.this.point_id, nns=pw.this.coords) +pw.debug.compute_and_print(result_asof_now) + +# %% [markdown] +# ![Points and queries in knn example](assets/content/tutorials/indexes/knn.svg) +# Note the differences between two cases. For instance, for query with `query_id=2`, points returned by `get_nearest_items_asof_now` were B and C because other closer points were not available at the query time. On the other hand, `get_nearest_items` updated the results and they have changed to B, F. +# +# In the example above, 2-dimensional vectors were used to make the analysis simpler. The **llm-app** uses n-dimensional vectors but the general principle doesn't change. + +# %% [markdown] +# ## Applications of `asof_now` indexes to data read using HTTP REST connector +# +# If you want a more practical example, we can set up a webserver that answers queries about `k` nearest neighbors. The architecture of the app is presented on the diagram below: +# +# ![KNN app architecture](assets/content/tutorials/indexes/rest_knn.svg) +# +# First you have to make necessary imports and define a schema of the data (we will use the same schema for data na queries). +# %% +import pathway as pw +from pathway.stdlib.ml.index import KNNIndex + + +class PointSchema(pw.Schema): + x: int + y: int + + +# To receive the queries, you can use the [`rest_connector`](/developers/api-docs/pathway-io/http/#pathway.io.http.rest_connector). +# %% +host = "0.0.0.0" # set as needed +port = 8080 # set as needed +queries, response_writer = pw.io.http.rest_connector( + host=host, + port=port, + schema=PointSchema, + autocommit_duration_ms=50, + delete_completed_queries=False, +) +# %% [markdown] +# The database is represented as a directory containing CSV files with points from which you want to find closest ones. You can use a [CSV connector](/developers/user-guide/connectors/csv_connectors) to read a directory of CSV files. The `mode` parameter is set to `streaming`. If new files appear in the directory, their entries will be appended to the stream and hence will be added to the KNN index. 
+# %% +data_dir = "points" +data = pw.io.csv.read( + data_dir, + schema=PointSchema, + mode="streaming", + autocommit_duration_ms=500, +) +# %% [markdown] +# To build the index you need to turn point coordinates into a vector: +# %% +data = data.select(coords=pw.make_tuple(pw.this.x, pw.this.y)) +index = KNNIndex(data.coords, data, n_dimensions=2, n_and=5) +# %% [markdown] +# To answer the queries, you can use the `get_nearest_items_asof_now` method of the index. It'll find the nearest points but won't update the queries in the future. +# %% +queries = queries.select(coords=pw.make_tuple(pw.this.x, pw.this.y)) +result = queries + index.get_nearest_items_asof_now(queries.coords, k=2).select( + result=pw.this.coords +) +# %% [markdown] +# To send back query response to the user, you should use `response_writer` returned by the [`rest_connector`](/developers/api-docs/pathway-io/http#pathway.io.http.rest_connector) +# %% +response_writer(result) +# %% [markdown] +# Firstly, let's populate the directory with a file containing some points. To do that you can run in your terminal: +# %% [markdown] +# ```shell script +# mkdir points && echo -e "x,y\n2,2\n-2,2\n0,6\n2,8" > points/a.csv +# ``` +# %% [markdown] +# Now you can start the computation: +# %% [markdown] +# ```python +# pw.run() +# ``` +# %% [markdown] +# It is most convenient to copy the whole code and run it as a standalone script: +# %% [markdown] +# ```python +# import pathway as pw +# from pathway.stdlib.ml.index import KNNIndex +# import argparse +# +# +# class PointSchema(pw.Schema): +# x: int +# y: int +# +# +# def run(data_dir: str, host: str, port: int): +# queries, response_writer = pw.io.http.rest_connector( +# host=host, +# port=port, +# schema=PointSchema, +# autocommit_duration_ms=50, +# delete_completed_queries=False, +# ) +# data = pw.io.csv.read( +# data_dir, +# schema=PointSchema, +# mode="streaming", +# autocommit_duration_ms=500, +# ) +# data = data.select(coords=pw.make_tuple(pw.this.x, pw.this.y)) +# +# index = KNNIndex(data.coords, data, n_dimensions=2, n_and=5) +# +# queries = queries.select(coords=pw.make_tuple(pw.this.x, pw.this.y)) +# +# result = queries + index.get_nearest_items_asof_now(queries.coords, k=2).select( +# result=pw.this.coords +# ) +# +# response_writer(result) +# +# pw.run() +# +# +# if __name__ == "__main__": +# parser = argparse.ArgumentParser() +# parser.add_argument("data_dir") +# parser.add_argument("--host", default="0.0.0.0") +# parser.add_argument("--port", type=int, default=8080) +# args = parser.parse_args() +# run(data_dir=args.data_dir, host=args.host, port=args.port) +# ``` +# %% [markdown] +# Now you can send a request to the webserver. You can play with it and try different points. You can use +# %% [markdown] +# ```shell script +# curl --data '{"x": 2, "y": -2}' http://localhost:8080/ +# ``` + +# %% [markdown] +# You can also add new points to the index by adding a new file to a directory: +# %% [markdown] +# ```shell script +# echo -e "x,y\n-3,1\n1,-4" > points/b.csv +# ```` +# %% [markdown] +# Let's repeat the first query. +# %% [markdown] +# ```shell script +# curl --data '{"x": 2, "y": -2}' http://localhost:8080/ +# ``` +# %% [markdown] +# The result has changed which indicates that the index has been updated. +# +# In this case, you didn't need to keep the queries. 
However, if you wanted to alert your users (for example via email) that answers to their queries have changed, you could use the normal `get_nearest_items` method instead of `get_nearest_items_asof_now` and keep watching the changes in the output stream. To not complicate the example, instead of sending an email, we print a message to the console. +# To intercept messages in a stream, you can use the [`pw.io.subscribe`](/developers/api-docs/pathway-io#pathway.io.subscribe) method. The function you provide, is called on every row of the stream. It'll also include deletion rows and that's why we check if `is_addition` argument is true before printing (an update in a stream consists of two records - a deletion record with the content of the old row and an addition record with a content of the new row). +# The code below is a slight modification of the code answering queries. You can run and experiment with it in the same way as you did with the code above. +# %% [markdown] +# ```python +# import pathway as pw +# from pathway.stdlib.ml.index import KNNIndex +# import argparse +# import logging +# +# +# class PointSchema(pw.Schema): +# x: int +# y: int +# +# +# def run(data_dir: str, host: str, port: int): +# queries, response_writer = pw.io.http.rest_connector( +# host=host, +# port=port, +# schema=PointSchema, +# autocommit_duration_ms=50, +# delete_completed_queries=False, +# ) +# data = pw.io.csv.read( +# data_dir, +# schema=PointSchema, +# mode="streaming", +# autocommit_duration_ms=500, +# ) +# data = data.select(coords=pw.make_tuple(pw.this.x, pw.this.y)) +# +# index = KNNIndex(data.coords, data, n_dimensions=2, n_and=5) +# +# queries = queries.select(coords=pw.make_tuple(pw.this.x, pw.this.y)) +# +# result = queries + index.get_nearest_items(queries.coords, k=2).select( +# result=pw.this.coords +# ) +# +# response_writer(result) +# +# def on_change(key, row, time, is_addition): +# if is_addition: +# query_coords = row["coords"] +# result_coords = row["result"] +# logging.info( +# f"The current answer to query {query_coords} is {result_coords}." +# ) +# +# pw.io.subscribe(result, on_change) +# +# pw.run() +# +# +# if __name__ == "__main__": +# parser = argparse.ArgumentParser() +# parser.add_argument("data_dir") +# parser.add_argument("--host", default="0.0.0.0") +# parser.add_argument("--port", type=int, default=8080) +# args = parser.parse_args() +# run(data_dir=args.data_dir, host=args.host, port=args.port) +# ``` +# %% [markdown] +# A similar approach was taken in our [alerting example](/developers/showcases/llm-alert-pathway/). +# It is an LLM app that can send you alerts on slack when the response to your query has changed significantly. +# %% [markdown] +# ## Summary +# In this article you learned about the differences in indexing between databases and Pathway. You can see that both approaches - keeping the queries to update them in the future or forgetting queries immediately after answering, are useful. It depends on your objective which approach should be used. Pathway provides methods to handle both variants. 
diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.indexing-grouped-tables/__init__.py b/docs/2.developers/4.user-guide/050.data-transformation/.indexing-grouped-tables/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.indexing-grouped-tables/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.indexing-grouped-tables/article.py
new file mode 100644
index 00000000..9cc3cb55
--- /dev/null
+++ b/docs/2.developers/4.user-guide/050.data-transformation/.indexing-grouped-tables/article.py
@@ -0,0 +1,254 @@
+# ---
+# title: "Indexing from 'groupby' and single-row tables"
+# description:
+# date: '2023-03-31'
+# thumbnail: '/assets/content/blog/th-computing-pagerank.png'
+# tags: ['tutorial', 'table']
+# keywords: ['groupby', 'ix_ref', 'single-row table']
+# jupyter:
+#   jupytext:
+#     formats: py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#     jupytext_version: 1.15.2
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Indexing from a `groupby` and single-row tables using `ix_ref`
+# In this tutorial, you will learn how to use `ix_ref` in a grouped table to access a row from the grouped table and how to manipulate single-row tables.
+#
+# ## Accessing a grouped row by value using `ix_ref`
+#
+# Imagine you've just been assigned a new project: analyzing salary statistics in your company.
+# In particular, your goal is to determine the number of employees who earn more than the average salary in each department.
+#
+# Let's consider the following table:
+
+# %%
+import pathway as pw
+
+salaries = pw.debug.table_from_markdown(
+    """
+    salary | department
+    1800   | Sales
+    2000   | Finance
+    2300   | Sales
+    2700   | Finance
+    1900   | Finance
+    """
+)
+pw.debug.compute_and_print(salaries)
+
+# %% [markdown]
+# You can easily compute the average salary of each department using a `groupby` on the `department` column:
+
+# %%
+statistics = salaries.groupby(pw.this.department).reduce(
+    pw.this.department, average_salary=pw.reducers.avg(pw.this.salary)
+)
+pw.debug.compute_and_print(statistics)
+
+# %% [markdown]
+# Now, you could do a join to add the average salary column to each entry in the `salaries` table:
+
+# %%
+salaries_join = salaries.join(
+    statistics, pw.left.department == pw.right.department
+).select(*pw.left, pw.right.average_salary)
+pw.debug.compute_and_print(salaries_join)
+
+# %% [markdown]
+# And then, you could filter the rows where the salary is higher than the `average_salary` column.
+#
+# However, this is very tedious.
+#
+# **In Pathway, you can directly access the row of a grouped table from the corresponding value using `ix_ref`.**
+#
+# `ix_ref` is a value-based indexer: you can access the row corresponding to the value `Sales` with `statistics.ix_ref("Sales")`.
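+#
+# For instance, here is a minimal sketch (reusing the `salaries` and `statistics` tables above; the `sales_average` name is only for illustration) that copies the average salary of the Sales department into every row:
+
+# %%
+# A minimal sketch: broadcast the average salary of the "Sales" group to every row of `salaries`.
+pw.debug.compute_and_print(
+    salaries.select(
+        pw.this.department,
+        sales_average=statistics.ix_ref("Sales").average_salary,
+    )
+)
+
+# %% [markdown]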
+# +# ![ix_ref](/assets/content/blog/ix_ref.svg) +# +# With `ix_ref(value)`, you access the row associated with `value` by the `groupby`, and you can directly access the wanted column with the usual dot notation: +# to access the average salary of a given department, you can do `statistics.ix_ref(pw.this.department).average_salary` + +# %% +statistics = statistics.select( + average_salary=pw.cast(int, pw.this.average_salary), + department=pw.this.department, +) +higher_than_average_salaries = salaries.filter( + pw.this.salary >= statistics.ix_ref(pw.this.department).average_salary +) +pw.debug.compute_and_print(higher_than_average_salaries) + +# %% [markdown] +# Now you can count the number of entries in each department with a simple `groupby`: + +# %% +number_employees = higher_than_average_salaries.groupby(pw.this.department).reduce( + count=pw.reducers.count() +) +pw.debug.compute_and_print(number_employees) + +# %% [markdown] +# ## Multi-values indexing +# +# We can also use `ix_ref` to index using tuples of values by using `ix_ref(*args)`. +# +# For example, let's say we have an extra column `position`: + +# %% +salaries_with_position = pw.debug.table_from_markdown( + """ + salary | department | position + 1800 | Sales | junior + 2000 | Finance | junior + 2300 | Sales | senior + 2700 | Finance | senior + 1900 | Finance | junior + """ +) + +# %% [markdown] +# We can now make a groupby using both the `department` and `position` columns: + +# %% +grouped_table_with_position = salaries_with_position.groupby( + pw.this.department, pw.this.position +).reduce( + pw.this.department, pw.this.position, average_salary=pw.reducers.avg(pw.this.salary) +) +pw.debug.compute_and_print(grouped_table_with_position) + +# %% [markdown] +# Now we can access the row using both `department` and `position` columns: + +# %% +pw.debug.compute_and_print( + salaries_with_position.select( + *pw.this, + average_salary=grouped_table_with_position.ix_ref( + pw.this.department, pw.this.position + ).average_salary, + ) +) + +# %% [markdown] +# You can also use the values directly: + +# %% +pw.debug.compute_and_print( + salaries_with_position.select( + *pw.this, + average_salary=grouped_table_with_position.ix_ref( + "Sales", "junior" + ).average_salary, + ) +) + +# %% [markdown] +# ## Tables with primary keys +# +# `ix_ref` is not limited to tables obtained by a groupby/reduce scheme: it works with any table with **primary keys**. +# Primary keys are the columns chosen to index the table. +# +# By default, Pathway indexes the table with uuid indexes, except when doing a groupby/reduce where the columns used for the groupby are used to index the table. +# +# You can reindex the table by manually choosing primary keys, using `.with_id_from`: + +# %% +indexed_table = pw.debug.table_from_markdown( + """ +colA | colB +10 | A +20 | B +""" +) +pw.debug.compute_and_print(indexed_table) +reindexed_table = indexed_table.with_id_from(pw.this.colB) +pw.debug.compute_and_print(reindexed_table) + +# %% [markdown] +# You can see that indexes have been updated. +# +# With primary keys, the rows can now be accessed using `ix_ref`: + +# %% +pw.debug.compute_and_print( + indexed_table.select(new_val=reindexed_table.ix_ref("A").colA) +) + +# %% [markdown] +# ## Single-Row Tables +# +# A special case is an empty `groupby`: all the entries are associated to the same group. +# +# It is the case when you consider global statistics such as the average salary on the entire company. +# In Pathway, it can be computed with a simple `reduce()`. 
+# +# Let's compute the sum of all the salaries in the company: + +# %% +pw.debug.compute_and_print(salaries.reduce(sum_salary=pw.reducers.sum(pw.this.salary))) + +# %% [markdown] +# As you can see, **Pathway returns a single-row table** and not the single value. +# +# As tempting as it is, in Pathway, you cannot use the value directly and do: +# ```python +# nb_employees = employee_salary.reduce(pw.reducers.avg(pw.this.salary)) +# ``` +# +# In Pathway, you cannot obtain the value as an int or a float as you could in SQL: you need to use `.ix_ref()` +# +# **In Pathway, we access the value of a single-row table using `singlerowtable.ix_ref()`.** +# +# As previously, `.ix_ref()` access the entire (single) row so you still need to specify the column. +# You access the average with `average_table.ix_ref().average_salary`. +# +# Let's see how it goes: + +# %% +global_statistics = salaries.reduce(average_salary=pw.reducers.avg(pw.this.salary)) +pw.debug.compute_and_print(global_statistics) +global_statistics = global_statistics.cast_to_types(average_salary=int) +results = ( + salaries.filter(pw.this.salary >= global_statistics.ix_ref().average_salary) +).reduce(count=pw.reducers.count()) +pw.debug.compute_and_print(results) + +# %% [markdown] +# That's it! +# +# You now have the number of employees with a higher salary than average, also contained in a single-row table. +# +# `ix_ref()` can be used to copy the value in all the rows of the table: + +# %% +salaries_with_average = salaries.select( + *pw.this, global_statistics.ix_ref().average_salary +) +pw.debug.compute_and_print(salaries_with_average) + +# %% [markdown] +# ## Bonus: SQL version +# With Pathway's SQL API, you can directly query tables using SQL queries: + +# %% +sql_result = pw.sql( + "SELECT COUNT(*) AS count FROM salaries WHERE salary > (SELECT AVG(salary) FROM salaries)", + salaries=salaries, +) + +# %% +pw.debug.compute_and_print(sql_result) + +# %% [markdown] +# Be careful when using the SQL API: it only supports subqueries on single-row tables. +# This subquery usage is an exception, and we strongly encourage you to use the Python syntax as much as possible! diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.iterate/__init__.py b/docs/2.developers/4.user-guide/050.data-transformation/.iterate/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.iterate/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.iterate/article.py new file mode 100644 index 00000000..2cb5ce70 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/.iterate/article.py @@ -0,0 +1,328 @@ +# --- +# title: Iterative computations +# description: An article exploring concepts related to iterative computation in Pathway. +# date: '2023-12-28' +# thumbnail: '' +# tags: ['tutorial', 'engineering'] +# keywords: ['iterate', 'iteration', 'iterative', 'updates', 'connected components', 'collatz conjecture', 'newton method'] +# notebook_export_path: notebooks/tutorials/iterate.ipynb +# --- + +# %% [markdown] +# # Iterative computations +# Pathway allows you to perform iterative computations on tables. The tables can be updated over time but the data is still split into batches properly after performing an iteration. +# +# The iteration is performed until convergence or until the limit of steps is reached (by default there is no limit but you can specify it). 
The iteration is incremental - the next iteration step only gets elements that have been updated in the previous step. It allows Pathway to reduce the amount of computations performed and effectively have a different number of iteration steps for different elements. +# +# You could replace the iteration by unrolling the computations using, e.g. a `for` loop, but then you would get the same amount of steps for each element and possibly a lot more computations performed. + +# In the sections below you can find the example applications of iteration in Pathway with a more detailed explanation of how this mechanism works. + +# %% [markdown] +# ## Collatz conjecture +# There is a hypothesis in mathematics that the function presented below applied iteratively to any positive integer eventually transforms it to 1. The function is +# +# $f(n) = \begin{cases} n/2 & n \text{ is even} \\ 3n+1 & n \text{ is odd} \end{cases}$ +# +# The sequence is $a_0=n$, $a_i = f(a_{i-1})$ for $i > 0$ and the conjecture says that for every positive integer $n$ there exists such $i$ that $a_i=1$. You can read more about it on [Wikipedia](https://en.wikipedia.org/wiki/Collatz_conjecture). +# +# You will now learn how to use Pathway to test the Collatz conjecture. Let's import Pathway first and define a function that performs a single step of the iterative computation. It can return a single table, a tuple or a dictionary of tables. +# %% +import pathway as pw + + +def collatz_step(t: pw.Table) -> pw.Table: + t = t.with_columns( + a=pw.if_else( + pw.this.a == 1, + 1, + pw.if_else(pw.this.a % 2 == 0, pw.this.a // 2, 3 * pw.this.a + 1), + ) + ) + # t.debug("collatz") + return t + + +# %% [markdown] +# Let's define a table with initial numbers to be checked and apply `collatz_step` iteratively on them. + +# %% +table = pw.debug.table_from_markdown( + """ +n +3 +13 +42 +""" +).with_columns(a=pw.this.n) + +res = pw.iterate(collatz_step, t=table) +pw.debug.compute_and_print(res) + +# %% [markdown] +# The main part of the computation is the [`pw.iterate`](/developers/api-docs/pathway/#pathway.iterate) function. It takes a function as an argument - the `collatz_step` in this case. You need to pass all tables required by `collatz_step` as keyword arguments of `pw.iterate`. In general, `pw.iterate` can return a single table, a tuple or a named tuple of tables. It depends on what is returned from the user-defined function passed to `pw.iterate`. In this case, the `collatz_step` returns a single table and that's why `pw.iterate` also returns a single table. +# +# As for the mechanism, inside the iteration records of the returned table `t` that have changed their values are passed to the beginning of the function and flow through the dataflow in `collatz_step` again as new records of `t`. The `iteration_limit` argument of `pw.iterate` was not set. That's why the computation continues until records stop updating. If the Collatz conjecture was not true for these numbers, the computation would not finish. Fortunately, it has finished, and this way we checked that it is true at least for these three numbers. You can check more numbers if you wish but don't expect anything fancy as all numbers up to $2^{69}$ have been [checked already](http://www.ericr.nl/wondrous/). +# +# If you uncomment the `t.debug("collatz")` line, you can analyze the flow of the data. You can see that the maximal number of steps was $10$ (for $n=13$) and the minimal number of steps was $8$ (for $n=3$). 
While counting steps, also a final step that transforms 1 to 1 was included. It is a necessary step that allows Pathway to say that nothing has changed. The `pw.iterate` can only stop if there is no change in some iteration. If there was no change in one iteration, there won't be changes in any further iterations (functions passed to `iterate` should be deterministic to avoid problems). + +# %% [markdown] +# ## Newton's method +# In this example, you are going to use a [Newton method](https://en.wikipedia.org/wiki/Newton%27s_method) to find roots of a cubic polynomial. Let's consider polynomials of the form $f(x) = x^3 + ax^2 + bx + c$. Note that you can convert any third-degree polynomial for the purpose of root finding to this form by dividing by the coefficient of $x^3$. The derivative of the polynomial is $f'(x) = 3x^2 + 2ax + b$. Now you can write the formula for the solution update: $x_i = x_{i-1} - \frac{x^3 + ax^2 + bx + c}{3x^2 + 2ax + b}$ for $i > 0$ where $x_0$ is an initial guess (in the example you'll provide it by yourself). With this knowledge, let's write a code for the iteration step. In the code, a Horner's scheme is used for better speed and numerical stability. + + +# %% +def newton(table: pw.Table) -> pw.Table: + f = ((pw.this.x + pw.this.a) * pw.this.x + pw.this.b) * pw.this.x + pw.this.c + f_prime = (3 * pw.this.x + 2 * pw.this.a) * pw.this.x + pw.this.b + table = table.with_columns(x=pw.this.x - f / f_prime) + # table.debug("newton") + return table + + +# %% [markdown] +# Let's define a table with polynomials' coefficients and starting points. There are three different polynomials in total. +# %% +table = pw.debug.table_from_markdown( + """ + a | b | c | x0 +-2 | 1 | 1 | 0 +-2 | 1 | 1 | 2 +-1 | -2 | 1 | 0 +-1 | -2 | 1 | 1 + 1 | -3 | -1 | -1 + 1 | -3 | -1 | 0 + 1 | -3 | -1 | 1 +""" +).with_columns(x=pw.cast(float, pw.this.x0)) + +# %% [markdown] +# ![Graphs](assets/content/tutorials/iterate/poly.svg) +# + +# %% [markdown] +# The table and iteration step are defined, so you can set up a computation and print the results. + +# %% +result = pw.iterate(newton, table=table) +pw.debug.compute_and_print(result) + +# %% [markdown] +# By using the fact that the floating point numbers have finite precision, you don't need to specify the `iteration_limit`. If you were running some computations with slower convergence, specifying it could have been useful. Note that also a Newton method may sometimes diverge and hence never finish if you choose a wrong starting point. +# +# You can analyze the example by uncommenting the `table.debug("newton")` line. You will then see that the maximal number of steps was $17$. This number of steps was required by $x^3 - 2x^2 + x + 1 = 0$ for $x_0 = 2$. It was not a good starting point. As a result, the root estimate was changing a lot. All other computations have finished in at most $8$ steps. + +# %% [markdown] +# ## Connected components in a graph +# In all cases you've seen so far, the rows were independent from each other. Those cases can be solved easily without `pw.iterate` - for example by writing a [Python UDF](/developers/api-docs/pathway#pathway.udf) that would get values from a single row and process them until convergence. For longer computations, a UDF would be slower as `pw.iterate` is implemented natively in Rust. Nevertheless, it'd be possible to do it that way. +# +# Now you'll see how to find [connected components](https://en.wikipedia.org/wiki/Component_(graph_theory)) in a graph using `pw.iterate`. 
# %% [markdown]
# For connected components, such a per-row approach is not possible: each row represents only a vertex or an edge of the graph, while the result for a vertex depends on many rows. One way of solving the problem is to find, for each vertex, the lowest vertex label among all vertices it has a path to. Let's call the vertex with this lowest label the representative. Since all vertex labels are distinct, every vertex in a single connected component gets the same representative, and vertices from different connected components get different representatives. It is then easy to check whether two vertices belong to the same connected component.
#
# Let's create an example graph: a chain of six vertices plus a single isolated vertex. A `repr` column is also created. It contains the representative of a connected component, initially set to the vertex itself. The labels of the vertices are all different and hence can be used to create the IDs of the rows by using the `id_from` parameter. This is needed later to have a deterministic way of assigning IDs to records.

# %%
vertices = pw.debug.table_from_markdown(
    """
    a
    1
    2
    3
    4
    5
    6
    7
""",
    id_from=["a"],
).with_columns(repr=pw.this.a)

edges = pw.debug.table_from_markdown(
    """
    a | b
    2 | 3
    3 | 4
    4 | 5
    5 | 6
    6 | 1
"""
)

# %% [markdown]
# The edges are undirected, but it is easier to work with them in a directed fashion. For this purpose, reversed edges are added to the graph. Self-loops are also useful.

# %%
edges = pw.Table.concat_reindex(
    edges,
    edges.select(a=pw.this.b, b=pw.this.a),
    vertices.select(a=pw.this.a, b=pw.this.a),
)

# %% [markdown]
# Now you can define an iteration step. In each step, for each vertex, the representatives of its neighbors are checked and the lowest label among them is chosen as the new representative. Self-loops are needed to include the current representative in the computation. It'd be possible to preserve it differently, but this way it is cleaner. The process ends when there are no new updates, which means that all neighbors already share the same representative.


# %%
def cc(vertices: pw.Table, edges: pw.Table) -> pw.Table:
    edges_with_repr = edges.join(vertices, edges.b == vertices.a).select(
        edges.a, vertices.repr
    )
    vertices_updated = edges_with_repr.groupby(pw.this.a).reduce(
        pw.this.a, repr=pw.reducers.min(pw.this.repr)
    )
    # vertices_updated.debug("vertices")
    vertices_updated = vertices_updated.with_universe_of(vertices)
    return vertices_updated


# %% [markdown]

# In an iteration step, the `edges` table is joined with the `vertices` table to get the representatives of the neighbors in the graph. Then `groupby` is performed on `edges_with_repr` to get the minimal representative for each vertex. A new ID is assigned based on column `a` (the vertex label), in exactly the same way as when the table was created above. This lets the `vertices_updated` table have the same set of keys as the `vertices` table. However, Pathway cannot deduce on its own that the keys in these two tables are exactly the same. That's why it has to be told explicitly, by using [`with_universe_of`](/developers/api-docs/pathway-table/#pathway.Table.with_universe_of).

# Preserving the set of keys is important in `iterate`. The iteration can only stop if there are no updates in any of the records. The correspondence between records across iterations is determined using their IDs.
If a record with one ID disappears and a record with a new ID appears, Pathway decides that something is still changing and the computation has to continue (even if the contents of the two rows are the same). It is possible to change the set of keys used in `iterate` but in the end the set of keys has to stop changing anyway. You can see that in the next example on computing shortest distances in a graph. + +# %% [markdown] +# You can now start the computations. Note that you pass two tables to `pw.iterate` but only one is updated (and hence returned from `pw.iterate`). The `edges` table is an auxiliary table that only helps to perform the computations and `pw.iterate` doesn't change the table's content. However, the `edges` table can also change its content (when it is created by a streaming source). Then the updates to `edges` result in updates inside iteration (you can see an example later in this tutorial). + +# %% +result = pw.iterate(cc, vertices=vertices, edges=edges) +pw.debug.compute_and_print(result) + +# %% [markdown] +# Vertices from $1$ to $6$ got $1$ as their representative and hence they belong to the same connected component. Vertex $7$ has $7$ as a representative and is a one-vertex connected component. +# +# As in the previous examples, you can uncomment the `vertices_updated.debug("vertices")` line to see how the computations were performed. You can also see the updates on a diagram below. Note that not all vertices update at once and that the vertex that was not updated for a few iterations may start being updated (it applies for example to vertex $3$). +# ![Connected components representatives updates](assets/content/tutorials/iterate/iterate_cc_1.svg) +# + +# %% [markdown] +# Now let's consider a graph that is updated over time and compute its connected components. The graph is presented below with deletions marked using red color and insertions using green color. Each event has a time associated with it. The initial vertices (at time $2$) are black-colored. +# ![Graph with online updates](assets/content/tutorials/iterate/iterate_cc_2.svg) +# +# You can simulate a stream by adding `__time__` and `__diff__` columns to [`table_from_markdown`](/developers/api-docs/debug/#pathway.debug.table_from_markdown). The `__time__` column simulates the time the record arrives to Pathway and the `__diff__` determines whether the record is inserted (`+1`, the default) or deleted (`-1`). + +# %% +vertices = pw.debug.table_from_markdown( + """ + a | __time__ + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 + 6 | 2 + 7 | 2 + 8 | 2 + 9 | 6 + 10 | 8 +""", + id_from=["a"], +).with_columns(repr=pw.this.a) + +edges = pw.debug.table_from_markdown( + """ + a | b | __time__ | __diff__ + 1 | 2 | 2 | 1 + 2 | 3 | 2 | 1 + 2 | 8 | 2 | 1 + 8 | 4 | 2 | 1 + 8 | 5 | 2 | 1 + 8 | 6 | 2 | 1 + 4 | 7 | 2 | 1 + 2 | 8 | 4 | -1 + 9 | 10 | 8 | 1 + 3 | 9 | 10 | 1 + 2 | 3 | 12 | -1 +""", + id_from=["a", "b"], +) +edges = pw.Table.concat_reindex( + edges, + edges.select(a=pw.this.b, b=pw.this.a), + vertices.select(a=pw.this.a, b=pw.this.a), +) + +# %% [markdown] +# You can now run the computations. To see the updates over time [`pw.debug.compute_and_print_update_stream`](/developers/api-docs/debug/#pathway.debug.compute_and_print_update_stream) was used. Apart from ordinary columns, it returns `__time__` and `__diff__` columns which say respectively, when the record was produced and whether it is an insertion or a deletion. 
+ +# %% +result = pw.iterate(cc, vertices=pw.iterate_universe(vertices), edges=edges) +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# Initially (at time $2$), all vertices have the same representative ($1$) and hence belong to a single connected component. At time $4$ the edge $2-8$ is deleted and vertices from $4$ to $8$ get a new representative. At time $6$ a new node $9$ is added. It is a new connected component. At time $8$ a new node $10$ is added along with an edge $9-10$. As a result, $10$ gets $9$ as a representative. At time $10$ a new edge $3-9$ is added. In effect, $1$ is propagated as a representative of $9$ and $10$. They are in the same connected component as $1, 2, 3$. However, at time $12$ the edge $2-3$ is removed. The connected component represented by $1$ is split into two components - one with $1$ as a representative and one with $3$ as a representative. + +# %% [markdown] +# ## Shortest paths in a graph +# Another interesting use of `pw.iterate` is computing shortest paths in a graph. In this example, you are going to find the shortest paths from one vertex to all accessible vertices in a graph. To demonstrate the ability of `pw.iterate` to work on tables with changing sets of rows, the initial answer only contains one vertex - a starting point. It grows when paths to new vertices are found. The graph is finite so the set of vertices finally stops growing and the distances finally stop updating so the iteration can finish. + +# To find the shortest paths, the [Bellman-Ford algorithm](https://en.wikipedia.org/wiki/Bellman%E2%80%93Ford_algorithm) is used. The example graph does not contain negative cycles. If it did, the iteration would not stop. To solve the problem, you could limit the number of iterations to `#vertices-1` and check if one more step does any updates to the distances. If it does, it means that there is a negative cycle in the graph. + +# Let's define the graph. In contrast to the connected components example, this time the edges are directed. They are directed from `a` to `b`. +# ![A graph with weighted edges](assets/content/tutorials/iterate/iterate_bellman.svg) +# + +# %% + +vertices = pw.debug.table_from_markdown( + """ + a | dist + 1 | 0 +""", + id_from=["a"], +) + +edges = pw.debug.table_from_markdown( + """ + a | b | weight + 1 | 2 | 2 + 2 | 3 | 3 + 3 | 2 | 2 + 1 | 3 | 10 + 2 | 5 | 5 + 3 | 4 | 3 + 4 | 5 | -2 + 5 | 3 | 1 + 4 | 8 | 4 + 4 | 1 | -4 + 6 | 7 | 2 + 7 | 6 | 3 +""" +) + + +edges = pw.Table.concat_reindex( + edges, vertices.select(a=pw.this.a, b=pw.this.a, weight=0) +) + +# %% [markdown] +# To make the updates easier, self-loops with weight zero are added. Thanks to them, there is no need to compare distances computed in the previous and the current iteration (the only vertex that would need them is the starting vertex). +# +# In a single iteration step, for each vertex, the length of paths via each of its neighbors is computed and the smallest one is chosen. Computing path length from the starting point via each neighbor is done using a [`join`](/developers/api-docs/pathway-table#pathway.Table.join) and taking the minimal length is done by a [`groupby`](/developers/api-docs/pathway-table#pathway.Table.groupby), [`reduce`](/developers/api-docs/pathway/#pathway.GroupedTable.reduce) pair. 
+ + +# %% +def bellman_ford(vertices: pw.Table, edges: pw.Table) -> pw.Table: + edges_with_dist = edges.join(vertices, edges.a == vertices.a).select( + a=edges.b, dist=edges.weight + vertices.dist + ) + vertices_updated = edges_with_dist.groupby(pw.this.a).reduce( + pw.this.a, dist=pw.reducers.min(pw.this.dist) + ) + # vertices_updated.debug("vertices") + return vertices_updated + + +# %% [markdown] +# Now you can use the `bellman_ford` in iterate. Note the [`pw.iterate_universe`](/developers/api-docs/pathway/#pathway.iterate_universe) wrapper for the `vertices` table. It is needed to tell `pw.iterate` that the vertices table changes its set of IDs. + +# %% +res = pw.iterate(bellman_ford, vertices=pw.iterate_universe(vertices), edges=edges) +pw.debug.compute_and_print(res) + +# %% [markdown] +# As in the previous examples, you can uncomment `vertices_updated.debug("vertices")` to see the flow of data inside iteration. Also, note that the final result does not contain distances to vertices $6$ and $7$. It is caused by the fact that the initial set only contains the starting vertex and only the accessible (in the same connected component) vertices are added to it. Vertices $6$ and $7$ form a separate connected component. +# +# If you wish, you can try simulating changes in the graph over time by adding `__time__` and `__diff__` columns to `pw.debug.table_from_markdown`. Then you can see the changes of the result over time by using `pw.debug.compute_and_print_update_stream(res)` instead of `pw.debug.compute_and_print(res)`. If you have problems, see how it was implemented for the connected components above. + +# %% [markdown] +# ## Summary +# In this tutorial, you have learned how the iteration works in Pathway and what are its applications. It can be useful in problems you encounter as well! diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.join-manual/__init__.py b/docs/2.developers/4.user-guide/050.data-transformation/.join-manual/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.join-manual/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.join-manual/article.py new file mode 100644 index 00000000..d53026b2 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/.join-manual/article.py @@ -0,0 +1,587 @@ +# --- +# title: Simple join manu[a]l +# description: Simple join manu[a]l +# notebook_export_path: notebooks/tutorials/join_manual.ipynb +# --- + +# # Playing with joins. +# A brief explanation on how to perform joins with Pathway. +# +# Join is one of the basic table operations provided in Pathway. +# A join operation combines columns from two different tables by associating rows from both tables wich are matching on some given values. +# +# This guide presents several samples of code using the joins: +# * [join (inner join)](#simple-inner-join) +# * [join_left, join_right, join_outer (outer_joins)](#outer-joins) +# +# +# The examples demonstrate usual use-cases, explain the behavior of +# outer joins, and point out some peculiarities you may encounter +# while using Pathway. In particular, they show how to: +# * [inherit id from left or right table](#id-inheritance-in-join) +# * [join tables using foreign keys](#joins-on-a-foreign-key) +# * [use joins in a chain](#chaining-joins) + +# ## Prerequisites +# +# Be sure to import Pathway. + +import pathway as pw + +# +# Also, you need some tables to play with. 
For the sake of presentation, +# let us consider the following made up scenario: a group of four friends +# (table friends) goes to grab some breakfast. +# + +friends = pw.debug.table_from_markdown( + """ + | name |budget + 1 | Alice | 13 + 2 | Bob | 10 + 3 | Carol | 11 + 4 | Dan | 12 + """ +) +menu = pw.debug.table_from_markdown( + """ + | dish | price + 11 | pancakes_with_jam | 11 + 12 | pb_jam_sandwich | 9 + 13 | jam_pb_pancakes | 12 + 14 | scrambled_egg | 11 + """ +) + + +# # +# However, some of them have allergies (table allergies), +# and cannot eat everything. Luckily, the restaurant has a list +# of allergens contained in each dish (table allergens_in_menu). +# + +# + +allergies = pw.debug.table_from_markdown( + """ + | person| allergy + 21 | Alice | milk + 22 | Bob | eggs + 23 | Carol | peanuts + 24 | Carol | milk + """ +) + +allergens_in_menu = pw.debug.table_from_markdown( + """ + | dish | allergen + 31 | pancakes_with_jam | milk + 32 | pancakes_with_jam | eggs + 33 | pb_jam_sandwich | peanuts + 34 | jam_pb_pancakes | eggs + 35 | jam_pb_pancakes | peanuts + 36 | jam_pb_pancakes | milk + 37 | scrambled_egg | eggs + """ +) +# - + +# ## Simple inner join +# ### Syntax +# Putting it simply, the syntax of [`join`](/developers/api-docs/pathway-table#pathway.Table.join) is: + +# + +# _MD_SHOW_table.join(other, *on) +# - + +# where +# * `table` and `other` are tables to be joined, +# * `*on` is a list of conditions of form: + +# + +# _MD_SHOW_table.some_column == other.some_other_column +# - + +# **Remark:** the order of tables in the condition matters. That is, +# a condition of the following form won't be accepted: + +# + +# _MD_SHOW_other.some_other_column == table.some_column +# - + +# ### Examples +# As a warm-up, let us see how to make simple join-queries. +# Let's begin with a simple query that finds all pairs person-dish +# that can put someone in a hospital. + +bad_match = allergies.join( + allergens_in_menu, allergies.allergy == allergens_in_menu.allergen +).select(allergens_in_menu.dish, allergies.person) +pw.debug.compute_and_print(bad_match) + +# # +# As you can see, the result is a table of life-threatening pairs. +# +# The [`select`](/developers/api-docs/pathway/#pathway.JoinResult.select) function works here similarly as [`select`](/developers/api-docs/pathway-table#pathway.Table.select) on a table. The difference is +# that here, you can use columns of both tables as arguments, e.g. + +# + +def make_food_constraint_note(name, dish): + return f"{name} can't eat {dish}." + + +bad_match_note = allergies.join( + allergens_in_menu, allergies.allergy == allergens_in_menu.allergen +).select( + note=pw.apply(make_food_constraint_note, allergies.person, allergens_in_menu.dish) +) +pw.debug.compute_and_print(bad_match_note) +# - + +# *Remark:* note that id is now some auto generated number, which is the usual behavior of join. + +# ### On self joins +# In order to perform a self join (a join of table with itself), +# you need to create a copy with [`.copy`](/developers/api-docs/pathway-table/#pathway.Table.copy). 
+ +# + +same_allergies = ( + allergies.join( + allergies_copy := allergies.copy(), allergies.allergy == allergies_copy.allergy + ) + .select( + l_name=allergies.person, + r_name=allergies_copy.person, + ) + .filter(pw.this.l_name != pw.this.r_name) +) + +pw.debug.compute_and_print(same_allergies) +# - + +# ## Outer joins +# The difference between outer joins and joins is that the outer join adds to the result +# table also entries that didn't match: + +# + +# _MD_SHOW_left.join_left(right, conditions) +# - + +# includes entries from the *left* table and + +# + +# _MD_SHOW_left.join_right(right, conditions) +# - + +# includes entries from the *right* table. + +# ### Simple example +# To demonstrate outer-joins, you can consider a question about forbidden +# breakfast configurations (i.e. pairs of person-dish, such that a person is +# allergic to some ingredients). + +# + +people_allergies = friends.join_left( + allergies, friends.name == allergies.person +).select(friends.name, allergies.allergy) +pw.debug.compute_and_print(people_allergies) + +forbidden_breakfast = people_allergies.join_left( + allergens_in_menu, people_allergies.allergy == allergens_in_menu.allergen +).select(people_allergies.name, allergens_in_menu.dish) +pw.debug.compute_and_print(forbidden_breakfast) +# - + +# Now, as you can see, the table has duplicate rows. This is the intended behavior, as +# a particular person can be allergic to more than one ingredient. In other words, +# when you add a reason column to your result table, you can see that each row was +# included in the table above for a different reason. + +forbidden_breakfast = people_allergies.join_left( + allergens_in_menu, people_allergies.allergy == allergens_in_menu.allergen +).select( + people_allergies.name, allergens_in_menu.dish, reason=allergens_in_menu.allergen +) +pw.debug.compute_and_print(forbidden_breakfast) + + +# ### Removing duplicates +# If you really want to have a table without the 'reason' column and without +# duplicates, you can achieve that with extra [`groupby`](/developers/api-docs/pathway-table#pathway.Table.groupby) and [`reduce`](/developers/api-docs/pathway/#pathway.GroupedTable.reduce) : + +# + +forbidden_breakfast = people_allergies.join_left( + allergens_in_menu, people_allergies.allergy == allergens_in_menu.allergen +).select(people_allergies.name, allergens_in_menu.dish) + +forbidden_breakfast_no_duplicates = forbidden_breakfast.groupby( + forbidden_breakfast.name, forbidden_breakfast.dish +).reduce(forbidden_breakfast.name, forbidden_breakfast.dish) + +pw.debug.compute_and_print(forbidden_breakfast_no_duplicates) +# - + +# That simple application of groupby-reduce combination essentially selects a +# unique set of pairs (name, dish) from our table. More on the reduce-groupby operations +# can be found in the reduce-groupby [manual](/developers/user-guide/data-transformation/groupby-reduce-manual/). + +# ### Expressions for unmatched rows +# Because [`join_left`](/developers/api-docs/pathway-table/#pathway.Table.join_left) operation returns rows from the left table, even if there is no +# match in the right column, some input columns for functions might have no defined value. +# +# **The left join sets the undefined input cells to `None` and the function needs to +# explicitly define how to handle such cases.** +# +# As an example, you can consider a modified variant of the `bad_match_note` table. +# The modification is that you want to include all the people, and additionally you want to display their budget. 
+# +# To that end, it is enough that you join the `friends` table with the `bad_match` table, as +# together they include all necessary information. + +basic_customer_info = friends.join_left( + bad_match, friends.name == bad_match.person +).select( + friends.name, + friends.budget, + note=pw.apply(make_food_constraint_note, friends.name, bad_match.dish), +) +pw.debug.compute_and_print(basic_customer_info) + +# As you can see, the behavior of the original `make_food_constraint_note` generates a +# little bit of an odd entry for Dan. To fix that problem, you can redefine the `make_food_constraint_note`. + +# + +def make_food_constraint_note_none_tolerant(name, dish): + if dish is None: + return f"{name} has no food restrictions." + else: + return make_food_constraint_note(name, dish) + + +basic_customer_info = friends.join_left( + bad_match, friends.name == bad_match.person +).select( + friends.name, + friends.budget, + note=pw.apply( + make_food_constraint_note_none_tolerant, + friends.name, + bad_match.dish, + ), +) +pw.debug.compute_and_print(basic_customer_info) +# - + +# To once again demonstrate the fact that the arguments for expressions are replaced with +# `None`, let us change `friends.name` to `bad_match.person` in the select part of our code. + +basic_customer_info = friends.join_left( + bad_match, friends.name == bad_match.person +).select( + bad_match.person, + friends.budget, + note=pw.apply( + make_food_constraint_note_none_tolerant, friends.name, bad_match.dish + ), +) +pw.debug.compute_and_print(basic_customer_info) + +# Note that, the cell that earlier included `Dan` is empty, even though `Dan` is the +# only value that could be included in this column (if there would be a match). +# The reason is that the expression for this column simply shows the value +# from `bad match.person`; since this row was not matched, this value is undefined and +# replaced by `None`. +# + +# ### On right joins +# In the examples above, you only relied on `join_left`. The [`join_right`](/developers/api-docs/pathway-table/#pathway.Table.join_right) operation is quite +# similar in its behavior. Namely, + +# + +people_allergies = friends.join_left( + allergies, friends.name == allergies.person +).select(friends.name, allergies.allergy) +pw.debug.compute_and_print(people_allergies) + +# is almost (except for auto-generated IDs) equivalent to + +people_allergies = allergies.join_right( + friends, allergies.person == friends.name +).select(friends.name, allergies.allergy) +pw.debug.compute_and_print(people_allergies) +# - + +# When you join two tables, the only difference is in syntax - since allergies is +# the table on which you call `join_right`, it must be first argument +# in the join condition, i.e. it is + +# + +# _MD_SHOW_allergies.person == friends.name +# - + +# as opposed to + +# + +# _MD_SHOW_friends.name == allergies.person +# - + +# used in the `join_left`. + +# ### On full outer joins +# The [`join_outer`](/developers/api-docs/pathway-table/#pathway.Table.join_outer) operation is a full outer join, which means that + +# + +# _MD_SHOW_left.join_outer(right, *on) +# - + +# not only show the pairs of rows from left and right that meet the condition +# in `*on`, but also rows that didn't get matched with any other row, +# from both the `left` and `right` tables. To demonstrate this operation, let +# us introduce another group of friends and find out, for each person in a group, +# whether the other group has any people with the same name. 
other_group = pw.debug.table_from_markdown(
    """
    | name  |budget
  5 | Bob   | 12
  6 | Carol | 14
  7 | Eve   | 12
    """
)

pw.debug.compute_and_print(
    friends.join_outer(other_group, friends.name == other_group.name).select(
        l_name=friends.name,
        l_id=friends.id,
        r_name=other_group.name,
        r_id=other_group.id,
    )
)


# ## ID inheritance in join
# The full (yet still informal) syntax of join is:

# +
# _MD_SHOW_table.join(other, *on, id = None)
# -

# where
# * `table` and `other` are tables to be joined,
# * `*on` is a list of conditions of the form:

# +
# _MD_SHOW_table.some_column == other.some_other_column
# -

# * the optional `id` can be set to either `table.id` or `other.id`.
#
# Whenever the `id` argument is not None, join will try to use the column passed in
# the id argument as the new id in the result of the join.
# This operation will succeed only when there is a guarantee that the resulting joined table
# does not contain multiple rows with the same id-to-inherit.
#
# Below you can find three examples: one successful and two failed id inheritances.
# First, let us see what the ids of the original tables are.

pw.debug.compute_and_print(friends)
pw.debug.compute_and_print(allergies)

# ### Successful id inheritance
# Let us try a join that inherits ids from the `allergies` table.

pw.debug.compute_and_print(
    allergies.join(friends, allergies.person == friends.name, id=allergies.id).select(
        friends.name, allergies.allergy, friends.budget
    )
)

# As you can see, the id column is the same as in the `allergies` table,
# which is not the case when the id parameter is not set.

pw.debug.compute_and_print(
    allergies.join(friends, allergies.person == friends.name).select(
        friends.name, allergies.allergy, friends.budget
    )
)

# ### Failed id inheritance: duplicate ids
# The first possible problem with inheriting the id is that one row of the source table
# could be matched with several entries of the other table. The code below will
# cause such a problem, and will raise a `KeyError`.

# +
# _MD_SHOW_pw.debug.compute_and_print(
# _MD_SHOW_    allergies.join(friends, allergies.person == friends.name, id=friends.id).select(
# _MD_SHOW_        friends.name, allergies.allergy
# _MD_SHOW_    )
# _MD_SHOW_)
# _MD_SHOW_[stacktrace...]
# _MD_SHOW_KeyError: 'duplicate key: ^SERVYWW6KDGEQ2WVZ3ZZB86VSR'
# -

# ### Failed id inheritance: empty ids
# Finally, if you consider outer joins between those two tables, you
# may encounter a situation in which you need to assign an id that is empty in the join result:


pw.debug.compute_and_print(
    allergies.join_right(friends, allergies.person == friends.name).select(
        friends.name, allergies.allergy, allergies_id=allergies.id
    )
)

# As you can see, the `allergies_id` field is not set for Dan's entry. If you try to use
# `allergies.id` as the `id` parameter, you will encounter a `TypeError`.

# +
# _MD_SHOW_pw.debug.compute_and_print(
# _MD_SHOW_    allergies.join_right(
# _MD_SHOW_        friends, allergies.person == friends.name, id=allergies.id
# _MD_SHOW_    ).select(friends.name, allergies.allergy, allergies_id=allergies.id)
# _MD_SHOW_)
# _MD_SHOW_[stacktrace...]
# _MD_SHOW_TypeError: type mismatch: expected a pointer, got None
# -

# ## Joins on a foreign key
# In Pathway, the id column is auto-generated, and as such joining over a foreign key kept in
# some other table requires extra care.
Let's assume that you have another table `likes` +# that indicates that a friend (row in `friends`) likes some particular dish (row in `menu`). + +likes = pw.debug.table_from_markdown( + """ + | f_id | m_id +100 | 1 | 11 +101 | 1 | 13 +102 | 2 | 12 +103 | 2 | 13 +104 | 3 | 11 +105 | 3 | 14 +106 | 3 | 13 +107 | 4 | 12 +108 | 4 | 14 +""" +) + +# Without further specification, Pathway treats columns `f_id` and `m_id` as numbers: + +pw.debug.compute_and_print(likes) + +# while the id of tables `friends` and `menu` was converted to Pointer. +# + +pw.debug.compute_and_print(friends) +pw.debug.compute_and_print(menu) + +# +# To handle joins using those columns, you can use [`pointer_from`](/developers/api-docs/pathway-table/#pathway.Table.pointer_from) function + +likes += likes.select( + f_id_ptr=friends.pointer_from(likes.f_id), + m_id_ptr=menu.pointer_from(likes.m_id), +) +pw.debug.compute_and_print(likes) + +# An [example of code](#long-chain-example) joining `friends` with `menu` using `likes` +# is presented the next section on chaining joins. + +# ## Chaining joins: +# Pathway provides two ways of chaining joins. The first relies on usage of [`pw.this`](/developers/api-docs/pathway/#pathway.this), +# the second allows for slightly more compact code. Below, let's focus on chaining joins +# using `pw.left`. +# +# ### Simple join chaining +# Below, you will do chain joins using [`pw.left`](/developers/api-docs/pathway/#pathway.left) and [`pw.right`](/developers/api-docs/pathway/#pathway.right). To show how it can be used, let's revisit +# the first example of `join_left`, in which you computed a `join_left` on a table +# that was obtained by another `join_left`. Instead of storing the result of the first +# `join_left` in `people_allergies`, you can use the following: + +pw.debug.compute_and_print( + friends.join_left(allergies, friends.name == allergies.person) + .select(friends.name, allergies.allergy, friends.budget) + .join_left(allergens_in_menu, pw.left.allergy == pw.right.allergen) + .select(pw.this.name, allergens_in_menu.dish) +) + +# Essentially, for a join `left.join(right, *on)`, `pw.left` allows us to address the `left` table +# and `pw.right` allows us to address the `right` table. In this particular example, `pw.left` allows us to address the table computed by +# +# friends.join_left(allergies, friends.name == allergies.person +# ).select(friends.name, allergies.allergy) +# +# without breaking the chain and storing intermediate results in a temporary +# variable. +# +# More generally, given a chain: +# +# _MD_SHOW_table.join(...).select(...).join(...).select(...)... +# +# `pw.left` can be used to address the result of the latest select. +# +# While in the example above `pw.right` is essentially a replacement for `allergens_in_menu`, +# it can be also used to address a table that is passed as an argument of a join, but is +# not assigned to any variable. +# +# To show `pw.right` in action, you can go back to our [example](#expressions-using-unmatched-rows) showing basic consumer information, +# and compute it directly from tables `friends`, `allergies`, and `allergens_in_menu`. 
+ +basic_customer_info = friends.join_left( + allergies.join( + allergens_in_menu, allergies.allergy == allergens_in_menu.allergen + ).select( + allergens_in_menu.dish, + allergies.person, + ), + friends.name == pw.right.person, +).select( + friends.name, + friends.budget, + note=pw.apply(make_food_constraint_note_none_tolerant, friends.name, pw.right.dish), +) +pw.debug.compute_and_print(basic_customer_info) + +# ### Long chain example +# To demonstrate longer chains in action, let's go back to the table `likes` you used to +# show how to handle (generate) foreign keys. + +feasible_choice = ( + friends.join(likes, friends.id == likes.f_id_ptr) + .select(friends.name, friends.budget, likes.m_id_ptr) + .join(menu, pw.left.m_id_ptr == menu.id) + .select(pw.left.name, pw.left.budget, menu.dish, menu.price) + .join_left(allergies, pw.left.name == pw.right.person) + .select(*pw.left[["name", "budget", "dish", "price"]], allergies.allergy) + .join(allergens_in_menu, pw.left.dish == pw.right.dish) + .select( + *pw.left[["name", "budget", "dish", "price", "allergy"]], + allergens_in_menu.allergen, + ) + .filter((pw.this.price <= pw.this.budget) & (pw.this.allergy != pw.this.allergen)) +) +pw.debug.compute_and_print(feasible_choice) + +# As you can see, this table contains all choices of person and dish, such that a person likes +# a particular dish, is not allergic to it, and can afford it. You can further simplify +# the result by adding another groupby-reduce at the end of the chain. + +pw.debug.compute_and_print( + feasible_choice.groupby( + *pw.this[["name", "dish", "budget", "price"]], + ).reduce(*pw.this[["name", "dish", "budget", "price"]]) +) + +# Furthermore, one can make this piece of code more compact, using the [* notation](/developers/user-guide/data-transformation/table-operations/#select-and-notations). + +pw.debug.compute_and_print( + friends.join(likes, friends.id == likes.f_id_ptr) + .select(*friends, likes.m_id_ptr) + .join(menu, pw.left.m_id_ptr == menu.id) + .select(*pw.left.without("m_id_ptr"), *menu) + .join_left(allergies, pw.left.name == pw.right.person) + .select(*pw.left, allergies.allergy) + .join(allergens_in_menu, pw.left.dish == pw.right.dish) + .select(*pw.left, allergens_in_menu.allergen) + .filter((pw.this.price <= pw.this.budget) & (pw.this.allergy != pw.this.allergen)) + .groupby(*pw.this) + .reduce(*pw.this) +) diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.table-operations/__init__.py b/docs/2.developers/4.user-guide/050.data-transformation/.table-operations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.table-operations/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.table-operations/article.py new file mode 100644 index 00000000..72fb9270 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/.table-operations/article.py @@ -0,0 +1,358 @@ +# --- +# title: "Basic operations" +# description: '' +# notebook_export_path: notebooks/tutorials/survival_guide.ipynb +# --- + +# # Basic operations +# Must-read for first-timers and veterans alike, this guide gathers the most commonly used basic elements of Pathway. +# +# +# While the Pathway programming framework comes with advanced functionalities such as [classifiers](/developers/showcases/lsh/lsh_chapter1) or [fuzzy-joins](/developers/showcases/fuzzy_join/fuzzy_join_chapter1), it is essential to master the basic operations at the core of the framework. 
+# As part of this basic operations guide, we are going to walk through the following topics: +# * [Streaming and static modes](#streaming-and-static-modes) +# * [Starting with data](#starting-with-data) +# * [Select and notations](#select-and-notations) +# * [Manipulating the table](#manipulating-the-table) +# * [Working with multiples tables: union, concatenation, join](#working-with-multiple-tables-union-concatenation-join) +# * [Updating](#updating) +# * [Computing](#operations) +# +# If you want more information you can review our complete [API docs](/developers/api-docs/pathway) or some of our [tutorials](/developers/showcases/suspicious_activity_tumbling_window). + +# ## Streaming and static modes +# +# The first thing to keep in mind is that Pathway is made for streaming data. +# In this **streaming mode**, Pathway assumes unbounded incoming updates. +# To process the incoming data, Pathway maintains a [dataflow](/developers/user-guide/introduction/concepts#dataflow). +# This mode requires input connectors listening to streaming data sources. +# The computation runs indefinitely until the process is killed: the computation starts when [`pw.run`](/developers/api-docs/pathway/#pathway.run) is called and everything afterwards is unreachable code. +# +# However, the streaming mode may not be the most convenient when testing or debugging. +# For that purpose, Pathway provides a **static mode** in which static data may be attached to the connectors. +# In that mode, finite and static data can be loaded, e.g. from a table written in a static CSV file or from a markdown table. +# +# When the computation is run in the static mode, all the data is loaded and processed at once. +# While the static mode does not fully benefit from the dataflow, it allows checking if the graph is correctly built. +# To ease the debugging, Pathway provides a function called [`compute_and_print`](/developers/api-docs/debug#pathway.debug.compute_and_print). +# When calling `pw.debug.compute_and_print(t)`, Pathway builds the whole graph, ingests all the available static data, prints the obtained table `t`, and then discards the data. +# Calling twice `compute_and_print` will result in ingesting the whole data twice. +# In the streaming mode, the building of the graph and the ingestion of the data is done only once when `pw.run()` is called and the data is maintained in the graph until the computation is over: results cannot be printed but should be accessed using output connectors. +# +# The processing pipeline should be designed in the same way no matter what mode is used in the end. The only difference is how the data is considered. +# +# To learn more about both modes, you can read our [article](/developers/user-guide/connecting-to-data/streaming-and-static-modes/) about it. +# +# In most of our examples, we use the static mode since it avoids setting external streaming data sources such as Kafka. + +# ## Starting with data +# +# Be sure to always import Pathway. + +import pathway as pw + +# Now you need tables to manipulate. +# The way tables are obtained depends on whether you are on streaming or static mode. +# In this article, you will be using the static mode to be able to show the behavior of each processing step, but keep in mind that Pathway is made for the streaming mode. +# +# ### Static mode +# In the static mode, you can manually enter a table using a markdown connector. 
Here are the (static) tables you will be using: + +t_name = pw.debug.table_from_markdown( + """ + | name + 1 | Alice + 2 | Bob + 3 | Carole + """ +) +t_age = pw.debug.table_from_markdown( + """ + | age + 1 | 25 + 2 | 32 + 3 | 28 + """ +) +t_david = pw.debug.table_from_markdown( + """ + | name | age + 4 | David | 25 + """ +) + +# You can display a snapshot of our table (for debugging purposes) using [`pw.debug.compute_and_print`](/developers/api-docs/debug#pathway.debug.compute_and_print): + +pw.debug.compute_and_print(t_name) + +# In the following, let's omit the `pw.debug.compute_and_print()` for clarity reasons but keep in mind that it is required to print the actual *static* data at a given time. +# +# +# ### Streaming mode +# In the streaming mode, those tables would be obtained using one of the [connectors](/developers/user-guide/connecting-to-data/connectors/) provided by Pathway. +# For example, we could use [`pw.io.kafka.read`](/developers/api-docs/pathway-io/kafka/#pathway.io.kafka.read) to obtain the table from Kafka. +# The results should be taken out of Pathway using an output connector: we could send the data to postgresql using [`pw.io.progresql.write`](/developers/api-docs/pathway-io/postgres/#pathway.io.postgres.write) for instance. +# Otherwise, the way the data is manipulated is exactly the same in both modes. + +# ## Select and notations +# +# The main way to manipulate a table in Pathway is by using the [`select`](/developers/api-docs/pathway-table#pathway.Table.select) operation. +# +# * **The dot notation**: you can use `select` to select a particular column and you can use the dot notation to specify the name of the column. For example, you can access the column "name" of your `t_david` table: + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_david.select(t_david.name)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_david.select(t_david.name) + +# * **The bracket notation**: you can also use string to access the column **the bracket notation**. The previous example is equivalent to ```t_david.select(t_david["name"])```. + +# * The **this notation**: to refer to the table currently manipulated you can use [`pw.this`](/developers/api-docs/pathway/#pathway.this). Our example becomes `t_david.select(pw.this.name)`. +# This notation works for all standard transformers. +# > It can be used to refer to the table, even if it has not been given a name, for example in successive operations: + +t_new_age = t_david.select(new_age=pw.this.age).select( + new_age_plus_7=pw.this.new_age + 7 +) +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_new_age) +# _MD_COMMENT_END_ +# _MD_SHOW_t_new_age + + +# In this example, it would be impossible to refer to the table obtained after the first select (with the column `new_age`) without using `pw.this` as `t_david` still refers to the initial and unmodified table. + +# * **left and right notations**: similarly to the this notation, [`pw.left`](/developers/api-docs/pathway/#pathway.left) and [`pw.right`](/developers/api-docs/pathway/#pathway.right) can be used to manipulate the different tables used in a [join](#working-with-multiples-tables-union-concatenation-join) ([`.join`](/developers/api-docs/pathway-table#pathway.Table.join)). 
+# > `left_table.join(right_table, pw.left.C1==pw.right.C2).select(pw.left.C3, pw.right.C4)` +# +# For more information about the join and the use of `pw.left` and `pw.right`, you can see the dedicated [section](#working-with-multiples-tables-union-concatenation-join) and [manual](/developers/user-guide/data-transformation/join-manual/). + +# * The **star * notation**: this notation is used to select all the columns of the manipulated table. `table.select(*pw.this)` will return the full table. +# It can be combined with [`.without`](/developers/api-docs/pathway-table#pathway.Table.without) to remove the unwanted columns: +# +# > In our example, instead of selecting the "name" column, you could want to select all the columns except the "age" one. This is obtained as follows: + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_david.without("age")) +# _MD_COMMENT_END_ +# _MD_SHOW_t_david.without("age") + +# ⚠️ Note on **column names**: column names are **case sensitive** and *any* string can be used as column name using the bracket notation. +# However it is not the case for the dot notation which is restricted by standard Python syntax. +# For instance, the dot notation does not allow names with space: using `t.select(t["my name"])` works while `t.select(t.my name)` will produce an error. +# By convention, we advocate to follow the Python variable naming convention which is to use only the special character "\_" in addition to alphanumeric characters, and not to start with a number. + +# ## Manipulating the table +# +# In addition to `select`, Pathway provides more operators to manipulate and index the tables. + +# * **Filtering**: we can use [`filter`](/developers/api-docs/pathway-table#pathway.Table.filter) to keep rows following a given property. + +t_adult = pw.debug.table_from_markdown( + """ + | is_adult + 5 | True + 6 | False + 7 | True + """ +) +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_adult.filter(pw.this.is_adult)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_adult.filter(pw.this.is_adult) + +# You can use standard python operators, including arithmetic operators such as `+`, `-`, `*`, `/`, `//`, `<`, `>`, or `~`: + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_age.filter(pw.this.age > 30)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_age.filter(pw.this.age>30) + +# You can also use the logical operations AND (`&`), OR (`|`) and NOT (`~`): + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_adult.filter(~pw.this.is_adult)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_adult.filter(~pw.this.is_adult) + +# * **Reindexing**: you can change the ids (accessible by `table.id`) by using [`.with_id_from()`](/developers/api-docs/pathway-table#pathway.Table.with_id_from). +# You need a table with new ids: + +t_new_ids = pw.debug.table_from_markdown( + """ + | new_id_source + 1 | 4 + 2 | 5 + 3 | 6 + """ +) + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_name.with_id_from(t_new_ids.new_id_source)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_name.with_id_from(t_new_ids.new_id_source) + +# * **ix_ref**: uses a column's values as indexes. 
# As an example, if you have a table containing indexes pointing to another table, you can use [`ix_ref`](/developers/api-docs/pathway-table#pathway.Table.ix_ref) to obtain those rows:

t_selected_ids = pw.debug.table_from_markdown(
    """
      | selected_id
  100 | 1
  200 | 3
    """
)
# _MD_COMMENT_START_
pw.debug.compute_and_print(
    t_selected_ids.select(selected=t_name.ix_ref(t_selected_ids.selected_id).name)
)
# _MD_COMMENT_END_
# _MD_SHOW_t_selected_ids.select(selected=t_name.ix_ref(t_selected_ids.selected_id).name)

# * **Group-by**: we can use [`groupby`](/developers/api-docs/pathway-table#pathway.Table.groupby) to aggregate data sharing a common property and then use a reducer to compute an aggregated value.

t_spending = pw.debug.table_from_markdown(
    """
      | name  | amount
  1   | Bob   | 100
  2   | Alice | 50
  3   | Alice | 125
  4   | Bob   | 200
    """
)
# _MD_COMMENT_START_
pw.debug.compute_and_print(
    t_spending.groupby(pw.this.name).reduce(
        pw.this.name, sum=pw.reducers.sum(pw.this.amount)
    )
)
# _MD_COMMENT_END_
# _MD_SHOW_t_spending.groupby(pw.this.name).reduce(pw.this.name, sum=pw.reducers.sum(pw.this.amount))

# You can do groupbys on multiple columns at once (e.g. `.groupby(t.colA, t.colB)`).
# The list of all the available reducers can be found [here](/developers/api-docs/reducers).
#
# If you want to find out more about the [`groupby`](/developers/api-docs/pathway-table#pathway.Table.groupby) and [`reduce`](/developers/api-docs/pathway/#pathway.GroupedTable.reduce) operations, you can read our [article](/developers/user-guide/data-transformation/groupby-reduce-manual/) about them.
#
# ## Working with multiples tables: union, concatenation, join
#
# * **Union**: we can use the operator `+` or `+=` to compute the union of two tables sharing the same ids.

t_union = t_name + t_age
# _MD_COMMENT_START_
pw.debug.compute_and_print(t_union)
# _MD_COMMENT_END_
# _MD_SHOW_

# * **Concatenation**: you can use [`concat`](/developers/api-docs/pathway-table#pathway.Table.concat) to concatenate two tables, but they need to have disjoint ids.

pw.universes.promise_are_pairwise_disjoint(t_union, t_david)
# _MD_COMMENT_START_
pw.debug.compute_and_print(pw.Table.concat(t_union, t_david))
# _MD_COMMENT_END_
# _MD_SHOW_pw.Table.concat(t_union,t_david)

# As you can see, Pathway keeps the index.
# If you are concatenating tables with overlapping indexes, you can use [`concat_reindex`](/developers/api-docs/pathway-table#pathway.Table.concat_reindex) which will concatenate the tables and produce a new index.
#
# > **Info for Databricks Delta users**: Concatenation is highly similar to the SQL [`MERGE INTO`](https://docs.databricks.com/sql/language-manual/delta-merge-into.html).
#
# * **Join**: you can do all the usual types of joins in Pathway (inner, outer, left, right). The example below presents an inner join:

# _MD_COMMENT_START_
pw.debug.compute_and_print(
    t_age.join(t_name, t_age.id == t_name.id).select(t_age.age, t_name.name)
)
# _MD_COMMENT_END_
# _MD_SHOW_t_age.join(t_name, t_age.id==t_name.id).select(t_age.age, t_name.name)

# Note that in the equality `t_age.id==t_name.id` the left part must be a column of the table on which the join is done, namely `t_age` in our example. Doing `t_name.id==t_age.id` would throw an error.
+# +# For more visibility, the `pw.left` and `pw.right` notations should be used: + +# _MD_COMMENT_START_ +pw.debug.compute_and_print( + t_age.join(t_name, pw.left.id == pw.right.id).select(pw.left.age, pw.right.name) +) +# _MD_COMMENT_END_ +# _MD_SHOW_t_age.join(t_name, pw.left.id == pw.right.id).select(pw.left.age, pw.right.name) + +# If you want more info about joins, we have an entire [manu\[a\]l](/developers/user-guide/data-transformation/join-manual/) about it! +# +# ## Updating +# +# * **Adding a new column with a default value** with `select`: + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_age.select(*pw.this, adult=True)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_age.select(*pw.this, adult=True) + +# The value can be a basic operation on the columns: + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_age.select(*pw.this, thirties=pw.this.age >= 30)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_age.select(*pw.this, thirties=pw.this.age>=30) + +# * **Renaming** with `select`: + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_name.select(surname=pw.this.name)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_name.select(surname=pw.this.name) + +# * **Renaming** with [`rename`](/developers/api-docs/pathway-table#pathway.Table.rename): + + +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_name.rename(surname=pw.this.name)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_name.rename(surname=pw.this.name) + +# * **Updating cells**: you can update the values of cells using [`update_cells`](/developers/api-docs/pathway-table#pathway.Table.update_cells) which can be also done using the binary operator `<<`. The ids and column names should be the same. + +t_updated_names = pw.debug.table_from_markdown( + """ + | name + 1 | Alicia + 2 | Bobby + 3 | Caro + """ +) +# _MD_COMMENT_START_ +pw.debug.compute_and_print(t_name.update_cells(t_updated_names)) +# _MD_COMMENT_END_ +# _MD_SHOW_t_name.update_cells(t_updated_names) + +# ## Operations +# +# * **Row-centered operations** with [`pw.apply`](/developers/api-docs/pathway/#pathway.apply): you can apply a function to each value of a column (or more) by using `pw.apply` in a `select`. + +# _MD_COMMENT_START_ +pw.debug.compute_and_print( + t_age.select(thirties=pw.apply(lambda x: x > 30, pw.this.age)) +) +# _MD_COMMENT_END_ +# _MD_SHOW_t_age.select(thirties=pw.apply(lambda x: x>30, pw.this.age)) + +# Operations on multiples values of a single row can be easily done this way: + +t_multiples_values = pw.debug.table_from_markdown( + """ + | valA | valB + 1 | 1 | 10 + 2 | 100 | 1000 + """ +) +# _MD_COMMENT_START_ +pw.debug.compute_and_print( + t_multiples_values.select( + sum=pw.apply(lambda x, y: x + y, pw.this.valA, pw.this.valB) + ) +) +# _MD_COMMENT_END_ +# _MD_SHOW_t_multiples_values.select(sum=pw.apply(lambda x,y: x+y, pw.this.valA, pw.this.valB)) + +# * Other operations with **transformer classes**: Pathway enables complex computation on data streams by using transformer classes. +# It is a bit advanced for this first-steps guide but you can find all the information about transformer classes in [our tutorial](/developers/user-guide/diving-deeper/transformer-introduction/). 
diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.user-defined-functions/__init__.py b/docs/2.developers/4.user-guide/050.data-transformation/.user-defined-functions/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/050.data-transformation/.user-defined-functions/article.py b/docs/2.developers/4.user-guide/050.data-transformation/.user-defined-functions/article.py new file mode 100644 index 00000000..7b0890bf --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/.user-defined-functions/article.py @@ -0,0 +1,531 @@ +# --- +# title: User-defined functions +# description: An article exploring concepts related to user defined functions in Pathway. +# date: '2024-02-21' +# thumbnail: '' +# tags: ['tutorial', 'engineering'] +# keywords: ['python', 'udf', 'function', 'apply', 'transformation', 'cache', 'timeout'] +# notebook_export_path: notebooks/tutorials/udf.ipynb +# --- + +# %% [markdown] +# # User-defined functions (UDFs) in Pathway +# Pathway supports a wide range of expressions that allow you to operate on individual rows. +# However, not all operations can be expressed that way. +# To address this problem, Pathway allows you to write a user-defined function (UDF) in Python. +# Such function is then applied to each row of your data individually in the same way as the predefined expressions. +# UDFs can be customized in various ways and all of them are explored in this guide. + + +# ## Simple UDFs +# In the beginning, let's consider a simple case. You want to write a function that increments a number by 1. +# Just write a regular function and decorate it with [`pw.udf`](/developers/api-docs/pathway#pathway.udf). + +# %% +import pathway as pw + + +@pw.udf +def inc(x: int) -> int: + return x + 1 + + +# %% [markdown] +# and that's everything you need. +# Now you can use it as an ordinary Pathway expression, as in the example shown below. +# %% +table = pw.debug.table_from_markdown( + """ + value + 1 + 2 + 13 +""" +) + +result = table.with_columns(value_inc=inc(table.value)) +pw.debug.compute_and_print(result) + +# %% [markdown] +# It works! The printed table contains two columns - `value` and `value_inc` with the result of `inc` function. +# +# Note that we annotated the return type of `inc` (`int` in this case). +# It is important information for pathway as it is used to infer the type of `value_inc` column. +# Let's make sure that the return type is correct by printing the schema of the `result` table. + +# %% +print(result.schema) + + +# %% [markdown] +# It is correct. The return type can also be set in a decorator: +# %% +@pw.udf(return_type=int) +def inc_2(x: int): + return x + 1 + + +result_2 = table.with_columns(value_inc=inc_2(table.value)) +pw.debug.compute_and_print(result_2) +print(result_2.schema) + + +# %% [markdown] +# In this case, it is also set correctly. +# If a UDF is not annotated and the `return_type` is not set, Pathway can't determine the return type of the column and sets it as `Any`. +# It is an undesirable situation as many expressions can't be called on columns of type `Any`. +# For example, you can't add a column with type `Any` to a column of type `int` (you also can't add `Any` to `Any`), but you can add a column of type `int` to a column of type `int`. 
+# %% +@pw.udf +def inc_3(x: int): + return x + 1 + + +result_3 = table.with_columns(value_inc=inc_3(table.value)) +pw.debug.compute_and_print(result_3) +print(result_3.schema) + +# %% [markdown] +# As you can see, this time the type of `value_inc` column is `Any`. + +# %% [markdown] +# Python functions can also be called on data by using [`pw.apply`](/developers/api-docs/pathway#pathway.apply)/[`pw.apply_with_type`](/developers/api-docs/pathway#pathway.apply_with_type) functions. + +# %% +result_4 = table.with_columns( + value_inc=pw.apply_with_type(lambda x: x + 1, int, table.value) +) +pw.debug.compute_and_print(result_4) +print(result_4.schema) + +# %% [markdown] +# *Remark:* to keep the examples as simple as possible, the code pieces in this guide use `table_from_markdown` to define the example tables and `compute_and_print` to run the computations. +# Those functions use Pathway in the static mode. +# However, Pathway is a streaming data processing system and can work on dynamically changing data. +# See [Pathway modes](/developers/user-guide/connecting-to-data/streaming-and-static-modes/) for more info on this topic. +# +# Also note that the `inc` function is only present in this guide for demonstration purposes. +# It is possible to get the same result using Pathway native operations and this is the recommended way as then the computations are performed in Rust, not Python. +# +# The UDFs are useful for more complicated solutions that cannot be fully expressed in Pathway but the functions in the guide are kept simple to focus on UDFs usage and configuration options. + +# %% +result_5 = table.with_columns(value_inc=table.value + 1) +pw.debug.compute_and_print(result_5) + +# %% [markdown] +# ## Calling library functions +# The UDF mechanism allows you to also call external functions. You can, for example, use [`scipy`](https://scipy.org/) to compute quantiles of the normal distribution. + +# %% +from scipy.stats import norm + +table = pw.debug.table_from_markdown( + """ + q + 0.02 + 0.5 + 0.84 + 0.99 +""" +) +quantiles = table.with_columns(value=pw.apply_with_type(norm.ppf, float, table.q)) +pw.debug.compute_and_print(quantiles) + +# %% [markdown] +# Using [`norm.ppf`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.norm.html) is convenient because it is a function, not a method. +# To use an external method it is better to create a wrapper. +# Below you can find an example of such a wrapper that extracts the [`pandas.Timestamp.is_leap_year`](https://pandas.pydata.org/docs/reference/api/pandas.Timestamp.is_leap_year.html) property and puts its value to the column `leap_year`. + +# %% +import pandas as pd + +table = pw.debug.table_from_markdown( + """ + date_str + 2023-01-01T12:00:00 + 2024-01-01T12:00:00 + 2025-01-01T12:00:00 +""" +).select(date_time=pw.this.date_str.dt.strptime("%Y-%m-%dT%H:%M:%S")) + + +@pw.udf +def is_leap_year(date_time: pd.Timestamp) -> bool: + return date_time.is_leap_year + + +result = table.with_columns(leap_year=is_leap_year(pw.this.date_time)) +pw.debug.compute_and_print(result) + +# %% [markdown] +# ## Propagating Nones +# By default, Pathway UDFs are called on all rows, however it may not always be desired. +# In particular, if you have a function that requires values to be non-optional but your data has some missing entries, you may want to return `None` immediately instead of calling a function. +# In Pathway, you can enable such mechanism with the `propagate_none` parameter of `pw.udf`. +# By default, it is set to `False`. 
Setting it to `True` makes Pathway look at the inputs of the UDF, and if at least one of the arguments is `None`, then the function is not called, and `None` is returned instead.
+
+# %%
+table = pw.debug.table_from_markdown(
+    """
+    a  | b
+    12 | 3
+    3  |
+    23 | 42
+       | 12
+"""
+)
+
+
+@pw.udf(propagate_none=True)
+def mul(a: int, b: int) -> int:
+    return a * b
+
+
+result = table.with_columns(c=mul(pw.this.a, pw.this.b))
+pw.debug.compute_and_print(result)
+
+# %% [markdown]
+# If `propagate_none` was not set, the `mul` function would fail on the rows with missing values.
+
+# %% [markdown]
+# ## Determinism
+# Pathway assumes that a UDF is not deterministic unless told otherwise.
+# In this context, being deterministic means that the function always returns the same value for the same arguments.
+# Pathway requires this information for consistency reasons.
+# If you're sure that your function is deterministic, you can set `deterministic=True`, as it usually improves the speed and memory requirements of the computation.
+# However, sometimes the function may be nondeterministic in a non-obvious way. For example, some linear algebra operations on floating point numbers that use multithreading under the hood can return slightly different results across runs. Such functions are not deterministic.
+# If this explanation is enough for you, feel free to skip to the next section.
+# If you want to learn more about how Pathway handles non-deterministic functions, dive in.
+#
+# To maintain consistency, Pathway memoizes the result of a UDF call until the corresponding input row is deleted.
+# The need to produce matching deletion entries is the reason why Pathway has to store the results of UDFs.
+# The values in the inserted and deleted entries have to be the same so that they can cancel out.
+# If a UDF is non-deterministic, it can produce a different value and the entries can't cancel out as they are not equal.
+# To get the same values at row deletion as at insertion, the results have to be remembered.
+# When the row is deleted, there is no more need to remember the result of the call and it can be discarded.
+#
+# When you're sure that the function is deterministic, you can avoid storing the results by setting `deterministic=True` in the `pw.udf` decorator.
+# It'll usually improve the speed and memory requirements of the computation (especially for fast functions).
+# It is recommended to always set it when the function is deterministic.
+#
+# If the function is slow, leaving `deterministic=False` might result in faster execution (the stored result is reused on deletion), but it is not the recommended approach if the function is deterministic. It's better to use [caching](#caching).
+# Caching can help with slow functions even if you call the function with each argument only once.
+# That's because Pathway has to evaluate the function on deletion as well, and when caching is enabled, the value can be taken from the cache instead of being recomputed.
+#
+# Let's see the effects of the `deterministic` parameter in practice. To do that, let's simulate a stream.
+# It contains special columns: `id` that sets the id of the row (a deletion has to have the same `id` as the insertion it removes),
+# `__time__` that simulates the arrival time at the engine, and `__diff__` that tells whether the entry is an insertion ($1$) or a deletion ($-1$).
+# At time $2$ two rows are inserted.
+# At time $4$ one row is upserted (with an upsert represented as a deletion and an insertion with the same time and id).
+ +# %% +table = pw.debug.table_from_markdown( + """ + id | a | b | __time__ | __diff__ + 1 | 2 | 3 | 2 | 1 + 2 | 4 | 1 | 2 | 1 + 1 | 2 | 3 | 4 | -1 + 1 | 3 | 3 | 4 | 1 +""" +) +# %% [markdown] +# You apply a UDF with default parameters (`deterministic=False`) first. + + +# %% +@pw.udf +def add(a: int, b: int) -> int: + print(f"add({a}, {b})") + return a + b + + +result_default = table.select(pw.this.a, pw.this.b, c=add(pw.this.a, pw.this.b)) +pw.debug.compute_and_print(result_default) + +# %% [markdown] +# As you can see from the printed messages, the function is called three times. +# It is because the function was not called on deletion. +# +# This time, let's tell Pathway that the function is deterministic. + + +# %% +@pw.udf(deterministic=True) +def add_deterministic(a: int, b: int) -> int: + print(f"add_deterministic({a}, {b})") + return a + b + + +result_default = table.select( + pw.this.a, pw.this.b, c=add_deterministic(pw.this.a, pw.this.b) +) +pw.debug.compute_and_print(result_default) + +# %% [markdown] +# This time, the function was called four times (once for each entry) and there was no need to remember anything! +# The function is truly deterministic and the result is consistent. + +# %% [markdown] +# ## UDFs should not be used for side effects +# UDFs are Python functions so you can capture non-local variables and modify them inside the functions. +# From the UDF it is also possible to call external services and modify their state. +# This is, however, strongly discouraged. +# There's no guarantee that Pathway will run a UDF exactly once for each row. +# If the function is non-deterministic it might not always be called (see above). +# Also if caching is set, the functions will be called less frequently. +# +# Note that we sometimes produce side effects in this tutorial by using the `print` function. +# However, we use it to show the behavior of the system, not to use the printed messages in some other computation. +# +# If you want to produce side effects, [`pw.io.subscribe`](/developers/api-docs/pathway-io#pathway.io.subscribe) should be used instead. + +# %% [markdown] +# ## Caching +# If the function you call is expensive and you call it frequently, you may want to cache its results. +# To do this, you can set `cache_strategy` in `pw.udf` decorator. Currently, the supported caching strategies are [`DiskCache`](/developers/api-docs/udfs#pathway.udfs.DiskCache) and [`InMemoryCache`](/developers/api-docs/udfs#pathway.udfs.InMemoryCache). +# The `DiskCache` requires the persistence to be enabled. It caches the results in the persistent storage. +# As a consequence, the results can be reused after the program restart. +# The `InMemoryStorage` caches results in memory. +# As a result, it does not need persistence config but the results are not available after the computations restart. +# +# Let's first run the example without caching: +# %% + +table = pw.debug.table_from_markdown( + """ + value | __time__ + 1 | 2 + 2 | 2 + 13 | 2 + 1 | 2 + 2 | 2 + 1 | 2 +""" +) + + +@pw.udf(deterministic=True) +def inc_no_cache(x: int) -> int: + print(f"inc({x})") + return x + 1 + + +result_no_cache = table.with_columns(value_inc=inc_no_cache(table.value)) +pw.debug.compute_and_print(result_no_cache) + +# %% [markdown] +# As you can see from printed messages, the UDF was called 6 times. 
+# +# Let's use `InMemoryCache` this time: + + +# %% +@pw.udf(deterministic=True, cache_strategy=pw.udfs.InMemoryCache()) +def inc_in_memory_cache(x: int) -> int: + print(f"inc({x})") + return x + 1 + + +result_in_memory_cache = table.with_columns(value_inc=inc_in_memory_cache(table.value)) +pw.debug.compute_and_print(result_in_memory_cache) + +# %% [markdown] +# This time, the function was called only three times. +# Other results were extracted from the cache. +# If you run that piece of code as a separate program, it'd compute the results from scratch at every restart (because the results are stored in memory). +# In a notebook you have to restart the runtime to see the effect. +# +# This behavior might be problematic if you want to keep the results between restarts. +# This is where `DiskCache` comes in. +# It stores the results of the calls in persistent storage. +# In the example, it is located in the `./Cache` directory. +# To read more about setting up persistence see the [persistence guide](/developers/user-guide/persistence-consistency-scaling/persistence/). + + +# %% +@pw.udf(deterministic=True, cache_strategy=pw.udfs.DiskCache()) +def inc_disk_cache(x: int) -> int: + print(f"inc({x})") + return x + 1 + + +persistence_config = pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem("./Cache"), + persistence_mode=pw.PersistenceMode.UDF_CACHING, +) +result_disk_cache = table.with_columns(value_inc=inc_disk_cache(table.value)) +pw.debug.compute_and_print(result_disk_cache, persistence_config=persistence_config) + +# %% [markdown] +# If, instead of printing output on the screen, you want to use one of the [output connectors](/developers/user-guide/connectors/csv_connectors/), you need to put `persistence_config` in `pw.run`, like this: + +# %% +pw.io.csv.write(result_disk_cache, "result_disk_cache.csv") +pw.run(persistence_config=persistence_config) + +# %% [markdown] +# ## Asynchronous UDFs +# By default, Pathway UDFs are synchronous and blocking. +# If one worker is used, only one UDF call is active at a time and it has to finish for the next UDF call to start. +# If more workers are used, the maximal number of UDFs that have started and haven't finished is equal to the number of workers. +# It is a good situation for CPU bound tasks. +# If you want, however, to execute I/O bound tasks, like calling external services, it is better to have more than one task started per worker. Pathway provides asynchronous UDFs for it. +# +# Asynchronous UDFs can be defined in Pathway using [Python coroutines](https://docs.python.org/3/library/asyncio-task.html#id2) with the `async`/`await` keywords. +# The asynchronous UDFs are asynchronous *within a single batch*. +# In this context, we define a batch as all entries with equal processing times assigned. +# The UDFs are started for all entries in the batch and the execution of further batches is blocked until all UDFs for a given batch have finished. +# Thanks to that, the processing time of the entries remains unchanged and the output remains consistent. +# If you require a fully asynchronous non-blocking mechanism take a look at [`AsyncTransformer`](/developers/user-guide/data-transformation/asynchronous-transformations/). +# +# To define an asynchronous UDF it is enough to decorate a coroutine with `pw.udf`. Let's start with a simple example. 
+# %% +import asyncio + + +@pw.udf +async def inc_async(x: float) -> float: + print(f"inc_async({x}) starting") + await asyncio.sleep(x) + print(f"inc_async({x}) finishing") + return x + 1 + + +table = pw.debug.table_from_markdown( + """ + value + 0.2 + 0.6 + 2.0 + 1.2 +""" +) + +result = table.select(value=inc_async(pw.this.value)) +pw.debug.compute_and_print(result) + +# %% [markdown] +# From the printed messages, you can see that the calls are executed asynchronously. +# +# Note that accidentally you created a sleepsort. Values in the `finishing` messages are sorted! As an exercise, you can try sorting also other values. +# +# As a more advanced example, you can create a UDF that queries [REST Countries](https://restcountries.com/) service to get the capital of a country. +# It uses `requests` library that on its own is not asynchronous. However, if you set `executor=pw.udfs.async_executor()` +# even though `requests.get` is not a coroutine, the function `find_capital` will be executed in a [ThreadPoolExecutor](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ThreadPoolExecutor), +# so it'll be possible to have more than one function started at once. + +# %% +import requests + + +@pw.udf(executor=pw.udfs.async_executor()) +def find_capital(country: str) -> str: + result = requests.get( + f"https://restcountries.com/v3.1/name/{country}?fields=capital", + timeout=1, + ) + result.raise_for_status() + return result.json()[0]["capital"][0] + + +countries = pw.debug.table_from_markdown( + """ + country + Poland + Germany + Austria + USA + France +""" +) +countries_with_capitals = countries.with_columns(capital=find_capital(pw.this.country)) +pw.debug.compute_and_print(countries_with_capitals) + +# %% [markdown] +# ### AsyncExecutor +# It is possible to control the behavior of asynchronous UDFs using the parameters of `async_executor`: +# - `capacity` - the maximum number of concurrent operations, +# - `timeout` - the maximum time (in seconds) to wait for the function result, +# - `retry_strategy` - the strategy for handling retries in case of failures. +# The available strategies are [`ExponentialBackoffRetryStrategy`](/developers/api-docs/udfs#pathway.udfs.ExponentialBackoffRetryStrategy) and [`FixedDelayRetryStrategy`](/developers/api-docs/udfs#pathway.udfs.FixedDelayRetryStrategy). +# The exponential backoff strategy increases the waiting time between retries exponentially by multiplying the waiting time by `backoff_factor`. +# The fixed delay strategy does not increase the waiting time between retries. +# Both strategies add a random jitter to the waiting times. +# +# When both `timeout` and `retry_strategy` are used, all retries have to finish within a specified `timeout`. +# You can see the application of a retry strategy in the example below. +# The UDF has a 10% chance of failing. +# It fails two times but the retry strategy executes the function with the arguments that failed again. 
+ +# %% +import random + +random.seed(2) + + +@pw.udf( + executor=pw.udfs.async_executor( + retry_strategy=pw.udfs.FixedDelayRetryStrategy(max_retries=10, delay_ms=100), + ) +) +async def inc_async(x: float) -> float: + print(f"inc_async({x})") + if random.random() < 0.1: + raise ValueError("err") + await asyncio.sleep(x) + return x + 1 + + +table = pw.debug.table_from_markdown( + """ + value + 0.2 + 0.6 + 2.0 + 1.2 +""" +) + +result = table.select(value=inc_async(pw.this.value)) +pw.debug.compute_and_print(result) + +# %% [markdown] +# Of course, the retry strategy does not have to be used only to mitigate the effects of runtime errors. +# It can, for example, be used to query a service multiple times in the case of its temporary unavailability. +# +# The parameters that can be used with regular UDFs can also be used with asynchronous UDFs. +# For instance, you can cache its results or set that it is deterministic. + + +# %% +@pw.udf(deterministic=True, cache_strategy=pw.udfs.InMemoryCache()) +async def inc_async(x: float) -> float: + print(f"inc_async({x})") + await asyncio.sleep(x) + return x + 1 + + +table = pw.debug.table_from_markdown( + """ + value + 0.2 + 0.6 + 2.0 + 1.2 + 0.6 + 1.2 +""" +) + +result = table.select(value=inc_async(pw.this.value)) +pw.debug.compute_and_print(result) + + +# %% [markdown] +# ## Conclusions +# In this guide, you've learned how to define Python functions (UDFs) to process data in Pathway. +# The functions process a single row in a single call. +# It is possible to define the behavior of the functions by using UDF's parameters, +# like `deterministic`, `propagate_none`, `cache_strategy`, `executor`, etc. +# A friendly reminder - if your function is deterministic, set `deterministic=True` as it'll help with performance. 
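+
+# %% [markdown]
+# To wrap up with a concrete sketch, the cell below combines several of the parameters discussed in this
+# guide in a single UDF: `deterministic`, a cache strategy, and an asynchronous executor with a capacity
+# limit, a timeout, and a retry strategy. The function itself (`lookup_score`) and the parameter values
+# are made up for illustration only; in a real application you would call an actual external service and
+# tune the values to your workload.
+
+
+# %%
+import asyncio  # already imported earlier in this guide; repeated so the cell is self-contained
+
+
+@pw.udf(
+    deterministic=True,  # the function always returns the same value for the same argument
+    cache_strategy=pw.udfs.InMemoryCache(),  # reuse results for repeated arguments
+    executor=pw.udfs.async_executor(
+        capacity=4,  # at most 4 concurrent calls
+        timeout=5.0,  # each call, including its retries, has to finish within 5 seconds
+        retry_strategy=pw.udfs.FixedDelayRetryStrategy(max_retries=3, delay_ms=100),
+    ),
+)
+async def lookup_score(key: str) -> float:
+    # A stand-in for a call to an external service.
+    await asyncio.sleep(0.1)
+    return float(len(key))
+
+
+keys = pw.debug.table_from_markdown(
+    """
+    key
+    alpha
+    beta
+    alpha
+"""
+)
+scores = keys.with_columns(score=lookup_score(pw.this.key))
+pw.debug.compute_and_print(scores)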
diff --git a/docs/2.developers/4.user-guide/050.data-transformation/10.table-operations.md b/docs/2.developers/4.user-guide/050.data-transformation/10.table-operations.md new file mode 120000 index 00000000..4c858639 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/10.table-operations.md @@ -0,0 +1 @@ +.table-operations/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/050.data-transformation/20.join-manual.md b/docs/2.developers/4.user-guide/050.data-transformation/20.join-manual.md new file mode 120000 index 00000000..b2397097 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/20.join-manual.md @@ -0,0 +1 @@ +.join-manual/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/050.data-transformation/30.groupby-reduce-manual.md b/docs/2.developers/4.user-guide/050.data-transformation/30.groupby-reduce-manual.md new file mode 120000 index 00000000..69754dce --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/30.groupby-reduce-manual.md @@ -0,0 +1 @@ +.groupby-reduce-manual/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/050.data-transformation/35.custom-reducers.md b/docs/2.developers/4.user-guide/050.data-transformation/35.custom-reducers.md new file mode 120000 index 00000000..c4f2e086 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/35.custom-reducers.md @@ -0,0 +1 @@ +.custom_reducers/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/050.data-transformation/40.iterate.md b/docs/2.developers/4.user-guide/050.data-transformation/40.iterate.md new file mode 120000 index 00000000..0893a8f2 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/40.iterate.md @@ -0,0 +1 @@ +.iterate/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/050.data-transformation/50.indexes-in-pathway.md b/docs/2.developers/4.user-guide/050.data-transformation/50.indexes-in-pathway.md new file mode 120000 index 00000000..23621622 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/50.indexes-in-pathway.md @@ -0,0 +1 @@ +.indexes-in-pathway/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/050.data-transformation/60.indexing-grouped-tables.md b/docs/2.developers/4.user-guide/050.data-transformation/60.indexing-grouped-tables.md new file mode 120000 index 00000000..a3b93926 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/60.indexing-grouped-tables.md @@ -0,0 +1 @@ +.indexing-grouped-tables/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/050.data-transformation/65.sql.md b/docs/2.developers/4.user-guide/050.data-transformation/65.sql.md new file mode 100644 index 00000000..03b0582c --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/65.sql.md @@ -0,0 +1,11 @@ +--- +title: 'SQL API' +description: 'Pathway SQL API' +redirection: true +--- + +# Taking you to the SQL API documentation + +:redirect{url="https://pathway.com/developers/api-docs/sql-api"} + + diff --git a/docs/2.developers/4.user-guide/050.data-transformation/65.user-defined-functions.md b/docs/2.developers/4.user-guide/050.data-transformation/65.user-defined-functions.md new file mode 120000 index 00000000..8b34f981 --- /dev/null +++ b/docs/2.developers/4.user-guide/050.data-transformation/65.user-defined-functions.md @@ -0,0 +1 @@ 
+.user-defined-functions/article.md
\ No newline at end of file
diff --git a/docs/2.developers/4.user-guide/050.data-transformation/70.asynchronous-transformations.md b/docs/2.developers/4.user-guide/050.data-transformation/70.asynchronous-transformations.md
new file mode 120000
index 00000000..43caa6c9
--- /dev/null
+++ b/docs/2.developers/4.user-guide/050.data-transformation/70.asynchronous-transformations.md
@@ -0,0 +1 @@
+.async-transformer/article.md
\ No newline at end of file
diff --git a/docs/2.developers/4.user-guide/050.data-transformation/75.function_calls_caching.md b/docs/2.developers/4.user-guide/050.data-transformation/75.function_calls_caching.md
new file mode 100644
index 00000000..1a4a1588
--- /dev/null
+++ b/docs/2.developers/4.user-guide/050.data-transformation/75.function_calls_caching.md
@@ -0,0 +1,320 @@
+---
+title: 'Setting up cache in asynchronous transformations'
+description: 'Tutorial about caching in Pathway: how to set up and use memoization in AsyncTransformer'
+date: '2023-11-30'
+thumbnail: ''
+tags: ['tutorial', 'engineering', 'dataflow management']
+keywords: ['persistence', 'optimization', 'async_transformer', 'caching']
+---
+
+# Setting up AsyncTransformer cache in Pathway
+
+Some functions are expensive. For example, they may take a long time to compute or may access some APIs where you need to pay per usage. It can be a good idea to cache the calls of these functions.
+
+In Pathway, you can do this with [`AsyncTransformer`](/developers/user-guide/data-transformation/asynchronous-transformations/), which supports caching.
+
+# Getting started
+
+First of all, let's install Pathway:
+
+
+```python
+%%capture --no-display
+!pip install pathway
+```
+
+Then, let's settle on the task that needs to be solved. Suppose that you do some analytics on a user dataset.
+
+In this dataset, you have user emails. You'd like to check if these emails are disposable, meaning they are not reliable for communication. You'll use public tools to do this and create a table with a simple boolean flag for each email.
+
+For the demonstration, let's use the publicly available API of [Kickbox](https://open.kickbox.com/).
+
+## Input preparation
+
+Let's create the dataset. To keep things simple, you will store it on your computer in a file named `users.csv`. In a real production scenario, the source could be different: for instance, the data might be read from a Postgres table using CDC and the [Debezium connector](/developers/api-docs/pathway-io/debezium/).
+
+Also, for simplicity, there will be just two columns: the user's ID and their email, while in real life, the table might contain more columns.
+
+
+```python
+%%writefile users.csv
+user_id,email
+sergey,sergey@pathway.com
+jack,jack@guerillamail.com
+steven,steven@gmail.com
+alice,alice@mailinator.com
+rachel,rachel@yahoo.com
+anna,anna@wordpress.com
+```
+
+    Overwriting users.csv
+
+
+Let's also clean up any cache left over from previous runs. Remove the local directory `./Cache` because you will use it later to store the cached data.
+
+
+```python
+!rm -rf ./Cache
+```
+
+Now, let's write the code that checks the addresses and builds the table with a flag telling whether each email is disposable. You can do it using a class inherited from `pw.AsyncTransformer`, where you can also store the counter for the number of API requests made.
+ + + +```python +import requests +import sys +import pathway as pw + +from typing import Any + + +class VerifiedEmailSchema(pw.Schema): + user_id: str + email: str + is_email_disposable: bool + + +class EmailCheckTransformer(pw.AsyncTransformer, output_schema=VerifiedEmailSchema): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.api_requests_made = 0 + + async def invoke(self, user_id: str, email: str) -> dict[str, Any]: + print(f"An API call has been made for the user {user_id}", file=sys.stderr) + self.api_requests_made += 1 + result = requests.get( + f"https://open.kickbox.com/v1/disposable/{email}", + timeout=1, + ) + + is_email_disposable = None + try: + result.raise_for_status() + is_email_disposable = result.json()["disposable"] + except Exception as e: + print(f"Failed to perform API request: {e}", file=sys.stderr) + + return { + "user_id": user_id, + "email": email, + "is_email_disposable": is_email_disposable, + } +``` + + +Now, let's implement the Pathway program that reads the data, applies the transformation, and outputs it so that you can check the result. + +The cache is configured with the persistence config. It can be enabled by specifying the persistence mode parameter as `pw.PersistenceMode.PERSISTING` or `pw.PersistenceMode.UDF_CACHING`. In the first case, Pathway will also persist in the internal state. In the second, you will only have the cache for the `AsyncTransformer` invocations, which is computationally cheaper. + + +```python +class UsersSchema(pw.Schema): + user_id: str + email: str + + +users = pw.io.csv.read("users.csv", schema=UsersSchema, mode="static") +transformer = EmailCheckTransformer(input_table=users) +emails_verified = transformer.result +pw.io.csv.write(emails_verified, "emails_verified.csv") + +persistence_backend = pw.persistence.Backend.filesystem("./Cache") +persistence_config = pw.persistence.Config.simple_config( + persistence_backend, + persistence_mode=pw.PersistenceMode.UDF_CACHING, +) + +pw.run( + monitoring_level=pw.MonitoringLevel.NONE, + persistence_config=persistence_config, +) +``` + + WARNING:pathway_engine.connectors.monitoring:CsvFilesystemReader-0: Closing the data source + An API call has been made for the user jack + An API call has been made for the user alice + An API call has been made for the user steven + An API call has been made for the user rachel + An API call has been made for the user anna + An API call has been made for the user sergey + WARNING:pathway_engine.connectors.monitoring:PythonReader-1: Closing the data source + + +Let's check the output now: + + +```python +!cat emails_verified.csv +``` + + user_id,email,is_email_disposable,time,diff + "jack","jack@guerillamail.com",True,1701266683974,1 + "alice","alice@mailinator.com",True,1701266684174,1 + "steven","steven@gmail.com",False,1701266684374,1 + "rachel","rachel@yahoo.com",False,1701266684574,1 + "anna","anna@wordpress.com",False,1701266684774,1 + "sergey","sergey@pathway.com",False,1701266684974,1 + + +As you can see now, two emails have been recognized as disposable. Let's check how many API calls have been done. + + +```python +transformer.api_requests_made +``` + + + + + 6 + + + +The number of calls is 6, as expected: six emails, each requiring an API call. Now, let's rerun the program and see how the number of calls changes. + +Let's clear the graph so that the program starts fresh. + + +```python +from pathway.internals.parse_graph import G +G.clear() +``` + +Now, let's run the program. 
To do so, take the same pipeline that reads from the same input and outputs to the same output. + + +```python +users = pw.io.csv.read("users.csv", schema=UsersSchema, mode="static") +transformer = EmailCheckTransformer(input_table=users) +emails_verified = transformer.result +pw.io.csv.write(emails_verified, "emails_verified.csv") + +persistence_backend = pw.persistence.Backend.filesystem("./Cache") +persistence_config = pw.persistence.Config.simple_config( + persistence_backend, + persistence_mode=pw.PersistenceMode.UDF_CACHING, +) + +pw.run( + monitoring_level=pw.MonitoringLevel.NONE, + persistence_config=persistence_config, +) +``` + + WARNING:pathway_engine.connectors.monitoring:CsvFilesystemReader-0: Closing the data source + WARNING:pathway_engine.connectors.monitoring:PythonReader-1: Closing the data source + + +Now, you can check that the output has been produced. To ensure it is indeed the new output, compare the times below and in the first run. + + +```python +!cat emails_verified.csv +``` + + user_id,email,is_email_disposable,time,diff + "jack","jack@guerillamail.com",True,1701266685306,1 + "alice","alice@mailinator.com",True,1701266685306,1 + "steven","steven@gmail.com",False,1701266685306,1 + "rachel","rachel@yahoo.com",False,1701266685306,1 + "anna","anna@wordpress.com",False,1701266685306,1 + "sergey","sergey@pathway.com",False,1701266685306,1 + + +As you can see, the output is correct: there are still two disposable email addresses and four regular ones. However, there are no requests to the API. Let's check the counter to ensure that no requests have been made. + + +```python +transformer.api_requests_made +``` + + + + + 0 + + + +Now, let's change the data and see how the program behaves. Let's create the dataset where the three users are new, and the other three belong to the previous dataset. + + +```python +%%writefile users.csv +user_id,email +sergey,sergey@pathway.com +steven,steven@gmail.com +rachel,rachel@yahoo.com +john,john@fakemail.fr +diana,diana@mail.com +alex,alex@gmail.com +``` + + Overwriting users.csv + + +Now, let's run the pipeline with cache and see what happens: + + +```python +G.clear() +users = pw.io.csv.read("users.csv", schema=UsersSchema, mode="static") +transformer = EmailCheckTransformer(input_table=users) +emails_verified = transformer.result +pw.io.csv.write(emails_verified, "emails_verified.csv") + +persistence_backend = pw.persistence.Backend.filesystem("./Cache") +persistence_config = pw.persistence.Config.simple_config( + persistence_backend, + persistence_mode=pw.PersistenceMode.UDF_CACHING, +) + +pw.run( + monitoring_level=pw.MonitoringLevel.NONE, + persistence_config=persistence_config, +) +``` + + WARNING:pathway_engine.connectors.monitoring:CsvFilesystemReader-0: Closing the data source + An API call has been made for the user john + An API call has been made for the user alex + An API call has been made for the user diana + WARNING:pathway_engine.connectors.monitoring:PythonReader-1: Closing the data source + + +As you can see, three lines correspond to the new rows in this table. Each of these rows adds a single request that needs to be made. 
You can double-check with the counter in the transformer class: + + +```python +transformer.api_requests_made +``` + + + + + 3 + + + +Finally, you can ensure that the output is indeed correct: + + +```python +!cat emails_verified.csv +``` + + user_id,email,is_email_disposable,time,diff + "john","john@fakemail.fr",True,1701266685810,1 + "alex","alex@gmail.com",False,1701266685912,1 + "rachel","rachel@yahoo.com",False,1701266685912,1 + "steven","steven@gmail.com",False,1701266685912,1 + "sergey","sergey@pathway.com",False,1701266685912,1 + "diana","diana@mail.com",False,1701266686110,1 + + +# Conclusion + +You have learned how to use the cache for heavy or costly functions here. Walking step-by-step, you need to perform the following steps: +* Define the logic in AsyncTransformer; +* Define the cache by specifying the persistence mode parameter to `UDF_CACHING`; +* Complete your data pipeline and just run the program! diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.asof-join/__init__.py b/docs/2.developers/4.user-guide/060.temporal-data/.asof-join/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.asof-join/article.py b/docs/2.developers/4.user-guide/060.temporal-data/.asof-join/article.py new file mode 100644 index 00000000..24676400 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/.asof-join/article.py @@ -0,0 +1,211 @@ +# --- +# title: "ASOF Join" +# description: Tutorial about ASOF Joins in Pathway. +# date: '2023-05-24' +# thumbnail: '/assets/content/tutorials/finance_ts_asof_join/asof-join-tutorial-th.png' +# tags: ['tutorial', 'engineering'] +# keywords: ['asof join', 'stock prices', 'financial data analysis'] +# notebook_export_path: notebooks/tutorials/asof_join.ipynb +# --- + +# %% [markdown] tags=[] +# # Bridging Financial Data Streams: A Look at ASOF Join in Pathway +# +# +# This tutorial provides an in-depth exploration of ASOF joins in pathway. You will learn how to employ this feature to align stock prices and corporate events, showcasing the practicality and efficiency of ASOF joins in financial data analysis. +# +# An ASOF join, is a type of join operation that combines data based on the closest preceding or succeeding timestamp. It is commonly used in realtime streaming data processing frameworks to align and correlate data based on temporal relationships. The term "ASOF" stands for "As-Of," indicating that the join operation considers data as of a specific point in time. There are three primary modes of asof join: backward, forward, and nearest. +# +# * `Backward`: is the default mode, it finds the last row in the other table where the time is less than or equal to the time in the current row of the self table. +# +# * `Forward`: It finds the next row in the other table where the time is greater than or equal to the time in the current row of the self table. +# +# * `Nearest`: This mode identifies the closest time match in the other table, whether it's in the past or future. If two times are equally distant, it defaults to the backward method. +# +# +# ::article-img +# --- +# src: '/assets/content/tutorials/finance_ts_asof_join/asof_join_illustration.png' +# alt: 'Illustration of ASOF Join' +# class: 'mx-auto' +# --- +# :: +# +# There are few reasons that emphasize its importance: +# +# 1. **Temporal Alignment**: Realtime data is time-stamped and is often required to be analyzed in conjunction with other time-stamped data. 
However, these different data streams may not always arrive at exactly the same time. An asof join enables you to combine these different data streams in a time-aware manner, aligning data points that are closest in time, even if they are not exactly the same. +# +# 3. **Data from Different Frequencies**: Realtime data streams can come in different frequencies. For instance, one stream might be generating data every second, while another generates data every minute. An asof join can reconcile these different frequencies by joining on the most recent available data from the less frequent stream. +# +# 4. **Realtime Analysis**: Many realtime applications, such as algorithmic trading, IoT analytics, or realtime monitoring, require asof joins to effectively combine and analyze multiple data streams in realtime. For example, in algorithmic trading, traders might need to join realtime trade data with less frequent reference data (like company earnings reports) to make immediate trading decisions. +# +# 5. **Resource Optimization**: By performing asof joins, systems can reduce the volume of data that needs to be kept in memory for joining, because you only need to keep the most recent relevant data points. This is especially important for high-throughput realtime systems, where optimizing memory usage can significantly impact system performance and cost. +# +# +# ## Objective: Unveiling the Impact of Events on Stock Prices +# +# The ultimate goal in this tutorial is to unravel the impact of various corporate events on stock prices. To understand this dynamic, you will be combining two separate data sets: one containing key corporate events and the other encapsulating the corresponding stock prices for five major US companies. +# +# However, a conventional join operation is insufficient in this context. Why? Because corporate events and stock prices don't align perfectly in time. Corporate events are sporadic, whereas stock prices fluctuate continuously throughout trading hours. To correlate these events with stock prices effectively, you'd need a more sophisticated join operation that can accurately pair an event with the closest corresponding stock price, in terms of timing. +# +# In the scope of this tutorial, you will be leveraging data from a public API in a static mode purely for illustrative purposes. However, a more typical use-case would involve the processing of realtime data streams. In such scenarios, Pathway input connectors come into play, seamlessly integrating with the remaining parts of your code to facilitate uninterrupted data flow and processing. The events data comes with dates instead of datetimes. Hence the choice of daily stock prices. +# +# +# ## Prerequisites +# +# For this tutorial, you would need to have Python installed along with the `pandas`, `yahooquery`, and of course, `pathway` libraries. If you haven't installed these yet, you can do so using pip: + +# %% tags=[] +# %%capture --no-display +# # !pip install pandas yahooquery -U pathway + +# %% [markdown] tags=[] +# ## Fetching the Data +# +# kick-start this tutorial by fetching data from Yahoo Finance using the `yahooquery` library. You'll be looking at data for five major US companies: Apple, Google, Microsoft, Amazon, and Meta. 
+ + +# %% tags=[] +import pandas as pd +from yahooquery import Ticker + +import pathway as pw + +# Define the list of ticker symbols for the 5 major US companies +tickers = ["AAPL", "GOOGL", "MSFT", "AMZN", "META"] +start = "2023-01-01" +fmt = "%Y-%m-%d" +period = "1y" + +# %% [markdown] tags=[] +# +# Let's fetch the historical daily prices and corporate events data for each company. + +# %% tags=[] +# Instantiate Ticker object with the list of tickers +t = Ticker(tickers) + +# Fetch the historical daily close prices for each company +prices = t.history(period=period, start=start).reset_index() +prices.date = pd.to_datetime(prices.date, utc=True).dt.tz_localize(None) + + +# Now fetch the corporate events data +events = t.corporate_events.reset_index() + +# %% [markdown] tags=[] +# ## Using Pathway to Process Data +# +# The next phase involves transforming the two dataframes into Pathway Tables and calculating the Daily Return - the percentage change between the opening and closing prices for the chosen companies. This crucial step will provide us with insights into the relationship between corporate events and corresponding price fluctuations. + +# %% tags=[] +from datetime import datetime, timedelta + +events_table = ( + pw.debug.table_from_pandas(events) + .select(pw.this.symbol, pw.this.headline, t=pw.this.date) + .filter(pw.this.t >= datetime.strptime(start, fmt)) +) + +price_table = pw.debug.table_from_pandas(prices).select( + pw.this.symbol, + pw.this.open, + pw.this.close, + daily_return=(pw.this.close - pw.this.open) / pw.this.open * 100.0, + t=pw.this.date, +) + + +# %% [markdown] tags=[] +# The `table_from_pandas` function converts pandas DataFrames into Pathway tables. The `select` function is used to choose the columns you want to keep in the new tables. +# +# ## Previewing Your Data +# Pathway is a powerful tool designed to work with both static and real-time streaming data, though its primary strength lies in handling real-time data processing. +# +# During the development and testing stages of your data processing pipeline, it's essential to validate that the data is processed correctly. To facilitate this, you could define a function named preview_table for data inspection. However, keep in mind that this function should not be utilized in a production setting. It employs pw.debug.compute_and_print and uses static data. For a production environment involving real-time data streams, you should rely on [Pathway's output connectors](/developers/documentation/input-and-output-streams/connectors/) to access the data. +# +# Let's take a look at the first few rows of our newly created tables. + + +# %% tags=[] +def preview_table(table, time_col="t"): + end = datetime.strptime("2023-01-03", fmt) + table = table.filter(table[time_col] <= end) + pw.debug.compute_and_print(table) + + +preview_table(events_table) +preview_table(price_table) + +# %% [markdown] tags=[] +# ## Applying `asof_join` +# +# With your Pathway tables prepared, you can now perform the `asof_join` operation. 
+ +# %% tags=[] +merged = events_table.asof_join( + price_table, + events_table.t, + price_table.t, + events_table.symbol == price_table.symbol, + how=pw.JoinMode.LEFT, + direction=pw.temporal.Direction.FORWARD, +).select( + symbol=events_table.symbol, + headline=events_table.headline, + daily_return=price_table.daily_return, + event_t=events_table.t, + price_t=price_table.t, +) + +preview_table(merged, time_col="event_t") + +# %% [markdown] tags=[] +# In the above `asof_join` function call, you're joining the events table with the prices table. Let's break down its parameters: +# +# * `other`: `Table`, This is the table that will be joined with the main table (events_table in our case). In our context, this is price_table. +# +# * `self_time` and `other_time`: `ColumnExpression`, These are the time-like columns in both the self and other tables used as the basis for the join. In our example, `events_table.t` and `price_table.t` respectively. +# +# * `*on`: `ColumnExpression`: This represents the list of column expressions that specify the conditions for the join. In our case, we use `events_table.symbol == price_table.symbol` to ensure we're matching events and prices for the same company. +# +# * `how`: `JoinMode`: This sets the join mode, which can be `LEFT`, `RIGHT`, or `FULL`. `JoinMode.LEFT` is chosen to retain all rows from `events_table` and attach corresponding data from `price_table` where available. +# +# * `defaults`: `Dict[ColumnReference, Any]`: This dictionary maps columns to default values for cases where the join operation doesn't find a match in the other table. +# +# * `direction`: `Direction`: The direction of the join, either `BACKWARD`, `FORWARD`, or `NEAREST`. We set `direction=Direction.FORWARD` to find the closest subsequent match in `price_table` for each row in `events_table`. This choice aligns with our goal of understanding the future impact of events on stock prices. +# +# +# ## Filtering the Data +# First you will filter out the rows where the time difference between the event and the price is greater than 3 days. + +# %% tags=[] +filtered = merged.filter(merged.price_t.is_not_none()).filter( + pw.this.price_t - pw.this.event_t <= timedelta(days=3) +) + +preview_table(filtered, time_col="event_t") + +# %% [markdown] tags=[] +# Now let's filter for the events with the highest impact and display the resulting table: + +# %% tags=[] +threshold = 3.0 + +highest_impact_events = filtered.filter( + pw.apply_with_type(lambda x: abs(x) >= threshold, bool, pw.this.daily_return) +).select( + pw.this.symbol, + pw.this.headline, + pw.this.daily_return, + event_t=pw.this.event_t.dt.strftime(fmt), +) + +pw.debug.compute_and_print(highest_impact_events) + +# %% [markdown] tags=[] +# ## Conclusion +# +# And there you have it! By using the `asof_join` in Pathway, you have efficiently combined realtime stock price data with corporate event data for multiple companies. This can provide valuable insights for financial analysis. As you can see, Pathway provides a powerful framework for managing and manipulating time-series data. +# +# Thank you for following this tutorial, and we hope you've found it valuable. As always, we welcome your questions and feedback. 
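+
+# %% [markdown] tags=[]
+# As a small, self-contained complement, the sketch below illustrates the default `BACKWARD` direction
+# described at the beginning of this tutorial, using two tiny made-up tables (`events_small` and
+# `prices_small`) instead of the Yahoo Finance data. Each event is matched with the latest price whose
+# timestamp is less than or equal to the event's timestamp. This is only an illustration; adapt the
+# column names and values to your own data.

# %% tags=[]
+prices_small = pw.debug.table_from_markdown(
+    """
+      | t  | price
+    1 | 10 | 100.0
+    2 | 20 | 101.5
+    3 | 30 | 99.0
+    """
+)
+
+events_small = pw.debug.table_from_markdown(
+    """
+      | t  | headline
+    1 | 12 | A
+    2 | 25 | B
+    """
+)
+
+backward_example = events_small.asof_join(
+    prices_small,
+    events_small.t,
+    prices_small.t,
+    how=pw.JoinMode.LEFT,
+    direction=pw.temporal.Direction.BACKWARD,
+).select(
+    headline=events_small.headline,
+    event_t=events_small.t,
+    price=prices_small.price,
+    price_t=prices_small.t,
+)
+
+pw.debug.compute_and_print(backward_example)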
diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.interval-join/__init__.py b/docs/2.developers/4.user-guide/060.temporal-data/.interval-join/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.interval-join/article.py b/docs/2.developers/4.user-guide/060.temporal-data/.interval-join/article.py new file mode 100644 index 00000000..623cac25 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/.interval-join/article.py @@ -0,0 +1,190 @@ +# --- +# title: 'Interval Join' +# description: 'Tutorial on how to do an interval join in Pathway.' +# date: '2023-05-22' +# thumbnail: 'assets/content/tutorials/fleet_eta_interval_join/fleet-eta-interval-join-th.png' +# tags: ['tutorial', 'engineering'] +# keywords: ['interval join', 'ETA'] +# notebook_export_path: notebooks/tutorials/interval_join.ipynb +# --- + +# # Performing Interval Joins in Pathway +# This article offers a comprehensive tutorial on how to use the interval join function in Pathway to accurately compute the Estimated Time of Arrival (ETA) for vehicles. +# +# The practice of temporal data analysis often invokes the use of interval joins, which diverge from the conventional join operation by virtue of a more adaptable matching strategy. Rather than linking records from two tables based purely on a precise key match, interval joins offer a compound inequality based linkage. +# +# In essence, the interval join merges records by comparing the timestamp from each record of two tables and evaluates whether the difference falls within a specified time window or `interval`. +# +# ![Illustration of interval join](assets/content/tutorials/fleet_eta_interval_join/illustrative_interval_join.png) +# +# This operation is especially useful in time-series data processing scenarios. For instance, when processing streaming data, data points may not arrive at the system simultaneously due to differences in data production speed, network latency, or other factors. Interval join comes to the rescue by enabling the system to align data points that are closest in time, thus facilitating real-time data analysis and decision-making. +# +# In this tutorial, you will calculate the estimated time of arrival (ETA) for each vehicle in a fleet given its current location, destination, the route it's taking, and any potential route maintenance delays. +# +# ## 1. Setting Up the Tables +# +# Let's start by defining the tables: vehicle locations, routes, and maintenance schedules. For the sake of this tutorial, you could use `pw.debug.table_from_markdown` function to create tables from markdown formatted string. Please refer to our article about [connectors in Pathway](/developers/user-guide/connecting-to-data/connectors/) to find out more about data connectors. Be assured that this code is compatible with the streaming mode. 
+ +# + +import pathway as pw + +vehicle_locations = pw.debug.table_from_markdown( + """ + | timestamp | vehicle_id | route_id | current_lat | current_long | dest_lat | dest_long + 1 | 1682884074 | 1 | h432af | 12.8716 | 77.5946 | 12.9800 | 77.5950 + 2 | 1682884300 | 1 | h432af | 12.8717 | 77.5946 | 12.9800 | 77.5950 + 3 | 1682884801 | 1 | h577dk | 12.8768 | 77.5947 | 12.9800 | 77.5950 + 4 | 1682885003 | 1 | h577dk | 12.8829 | 77.5947 | 12.9800 | 77.5950 + 5 | 1682992012 | 2 | h120cc | 10.4601 | 78.0937 | 10.4532 | 78.0987 + 6 | 1682992274 | 2 | h120cc | 10.4600 | 78.0948 | 10.4532 | 78.0987 + 7 | 1682992674 | 2 | h120cc | 10.4595 | 78.0959 | 10.4532 | 78.0987""" +) + +routes = pw.debug.table_from_markdown( + """ + | route_id | traffic_speed_km_per_h + 1 | h120cc | 60 + 2 | h432af | 40 + 3 | h577dk | 80 + """ +) + +# Create a maintenance data table +maintenance = pw.debug.table_from_markdown( + """ + | route_id | start_time | delay_minutes + 1 | h120cc | 1682992274 | 30 + 2 | g392rt | 1682884441 | 20 + 3 | h577dk | 1682885777 | 10 + """ +) +# - + +# ## 2. Calculating Distance +# +# To compute the geodesic distance between the vehicle's current location and its destination, a function `calculate_distance` is defined. + +# If you don't have the geopy package, you can uncomment and execute the following cell: + +# + + +# %%capture --no-display +# #!pip install geopy + + +# + +from geopy.distance import geodesic + + +def calculate_distance( + current_lat: float, current_long: float, dest_lat: float, dest_long: float +) -> float: + current_location = (current_lat, current_long) + destination = (dest_lat, dest_long) + return geodesic(current_location, destination).km + + +# - + +# Apply this function to the table: + +vehicle_locations += vehicle_locations.select( + distance_km=pw.apply( + calculate_distance, + **vehicle_locations[["current_lat", "current_long", "dest_lat", "dest_long"]] + ), +) +pw.debug.compute_and_print(vehicle_locations[["distance_km"]]) + +# ## 3. Performing the Interval Join +# +# Pathway has a function called `interval_join` (resp. `interval_join_{outer/left/right}`) which you can use to perform the interval join operation. As mentioned earlier, it is a temporal join operation that allows matching elements of a table `right` whose timestamps fall within a certain time interval relative to the timestamps of `left`'s elements.\ +# Using the left join variant `interval_join_left` retains records with and without delays, thereby perfectly addressing our current scenario. +# +# For simplicity, we'll make the assumption that maintenance operations are delocalized throughout the routes and they last for a duration of $\Delta$. A delay is incurred only if maintenance event $i$ at time $m_i$ has already began and hasn't ended yet at time $t$ +# i.e. $m_i <= t <= m_i + \Delta$ or equivalently $-\Delta <= m_i - t <= 0$. +# +# It is possible to add exact conditions on top of the temporal join. Here, you need to also join on the `route_id`. +# +# The arguments of the interval_join_left: +# * `other` : the right side table. +# * `self_time_expression` : the time column/expression in the left table. +# * `other_time_expression` : the time column/expression in the right table. +# * `interval` : where other_time_expression-self_time_expression must be. +# * `*on` : a list of equality conditions. 
+ +# + +maintenance_period_sec = 30 * 60 + +records = vehicle_locations.interval_join_left( + maintenance, + pw.left.timestamp, + pw.right.start_time, + pw.temporal.interval(-maintenance_period_sec, 0), + pw.left.route_id == pw.right.route_id, +).select( + vehicle_locations.timestamp, + vehicle_locations.vehicle_id, + vehicle_locations.route_id, + vehicle_locations.distance_km, + delay_sec=60.0 * pw.coalesce(pw.right.delay_minutes, 0), +) + +pw.debug.compute_and_print(records) +# - + +# After joining the tables, all the columns from the left table are kept and filling the missing values in the right table is accomplished using `pw.coalesce`. +# +# To compute the total delay in seconds for each pair (`timestamp`, `vehicle_id`), we perform a groupby and apply the sum reducer on the column `delay_sec`. +# + +# + +records = records.groupby(records.timestamp, records.vehicle_id,).reduce( + records.timestamp, + records.vehicle_id, + pw.reducers.unique(records.route_id), + pw.reducers.unique(records.distance_km), + delay_sec=pw.reducers.sum(pw.this.delay_sec), +) + +pw.debug.compute_and_print(records[["timestamp", "vehicle_id", "delay_sec"]]) + +# - + +# ## 4. Calculating ETA +# +# Next, you're set to do a standard join operation to combine the records and routes based on the common 'route\_id' column. An inner join is used, implying that only records that have a match in both the records and routes data will be retained. + + +records = records.join(routes, pw.left.route_id == pw.right.route_id).select( + pw.left.timestamp, + pw.left.vehicle_id, + pw.left.delay_sec, + pw.left.distance_km, + pw.right.traffic_speed_km_per_h, +) + +# Let's add the delay in seconds to the timestamp and transform it to datetimes. + +# Finally, you can calculate ETA considering both the travel time (based on the distance to the destination and the speed of the route) and any maintenance delays: + +records += records.select( + eta_sec=pw.this.delay_sec + + pw.this.distance_km / (pw.this.traffic_speed_km_per_h / 60.0 / 60.0), +) +records += records.select( + eta=(pw.this.timestamp + pw.cast(int, pw.this.eta_sec)).dt.from_timestamp(unit="s"), +) + +# As your final step, you can clean the output table by keeping the most interesting columns. + +output = records[["timestamp", "vehicle_id", "eta"]] + +pw.debug.compute_and_print(output) + + +# Arrival estimates are computed for each record. The next logical step involves identifying the most recent estimates from each vehicle. This can be achieved by applying a filter to select the latest timestamps associated with each vehicle. We encourage you to explore this further as part of your learning journey with our tutorial. + +# ## Conclusion +# +# In this tutorial, you have learned how to use interval joins in Pathway to handle scenarios where you'd need to join tables based on intervals or timestamps. You have seen how to use these methods to calculate the ETA for vehicles considering their current locations, routes, and any potential maintenance delays. This kind of processing is common in transportation and logistics, and Pathway makes it easy to handle in real time. 
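+
+# As a possible sketch for the exercise suggested above (keeping only the most recent ETA per vehicle),
+# one option is to compute the latest timestamp for each vehicle with a `groupby`/`reduce` and then join it
+# back with the `output` table. This is just one way to approach it, shown here under the assumption that
+# the `output` table defined earlier is available; adapt it as you see fit.
+
+# +
+latest_times = output.groupby(pw.this.vehicle_id).reduce(
+    pw.this.vehicle_id,
+    timestamp=pw.reducers.max(pw.this.timestamp),
+)
+
+# Keep only the rows of `output` whose timestamp is the latest one for their vehicle.
+latest_eta = output.join(
+    latest_times,
+    output.vehicle_id == latest_times.vehicle_id,
+    output.timestamp == latest_times.timestamp,
+).select(output.vehicle_id, output.timestamp, output.eta)
+
+pw.debug.compute_and_print(latest_eta)
+# -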
diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.interval_join_temporal_behavior/__init__.py b/docs/2.developers/4.user-guide/060.temporal-data/.interval_join_temporal_behavior/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.interval_join_temporal_behavior/article.py b/docs/2.developers/4.user-guide/060.temporal-data/.interval_join_temporal_behavior/article.py new file mode 100644 index 00000000..b411aa0c --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/.interval_join_temporal_behavior/article.py @@ -0,0 +1,238 @@ +# --- +# title: Controlling temporal behavior of interval join +# description: An article exploring concepts related to temporal behavior of interval join. +# date: '2023-11-28' +# thumbnail: 'assets/content/tutorials/fleet_eta_interval_join/fleet-eta-interval-join-th.png' +# tags: ['tutorial', 'engineering'] +# keywords: ['interval join', 'behavior', 'late data', 'delay', 'cutoff', 'out-of-order data'] +# notebook_export_path: notebooks/tutorials/interval_join_temporal_behavior.ipynb +# --- + +# %% [markdown] +# # Controlling temporal behavior of interval join +# This article explores concepts related to the temporal behavior of Pathway's interval join, like filtering out +# late records, freeing memory that is no longer needed, or decreasing the frequency of updates. +# +# Interval join is a temporal join that joins events within a specified time interval. +# You can read more about it in [Performing Interval Joins](/developers/user-guide/temporal-data/interval-join) article. +# +# Let's consider a scenario in which you track customers ordering products. Each order is placed at some specific time (`order_time`) and reaches the tracking system written in Pathway at time `__time__`. +# In a real-world scenario, you don't have perfect control over the time between the moment the order is placed and the moment it reaches the data processing engine. Packets can take different routes over the network, or, in rare cases, a sneaky Georgian woman can cut off internet access for [a whole country](https://www.theguardian.com/world/2011/apr/06/georgian-woman-cuts-web-access). As such, times `order_time` can arrive in the processing system out of order, very late, or even not arrive at all. + +# To deliver streaming data in a production system, you can use a connector like Kafka or Redpanda (you can read more about them in [Kafka connector](/developers/user-guide/connectors/kafka_connectors/) or [Redpanda connector](/developers/user-guide/connectors/switching-to-redpanda/) articles). Below, `pw.debug.table_from_markdown` with `__time__` column simulates a connector that delivers data out-of-order to demonstrate configurable _temporal behavior_ of interval join. 
+ +# %% tags=[] +import pathway as pw + +orders = pw.debug.table_from_markdown( + """ + customer | product | order_time | __time__ + Austin | mouse | 120 | 122 + Brenda | keyboard | 120 | 122 + Carl | mouse | 124 | 124 + Evelyn | cable | 128 | 128 + Frank | mouse | 120 | 130 + Min | mouse | 124 | 130 + Nicole | cable | 130 | 132 + Sam | keyboard | 128 | 134 + Theresa | keyboard | 134 | 136 + Yichen | cable | 136 | 138 +""" +) + +discounts = pw.debug.table_from_markdown( + """ + price | product | start_time | __time__ + 42 | mouse | 120 | 122 + 128 | keyboard | 118 | 122 + 8 | cable | 128 | 134 + 135 | keyboard | 132 | 140 + 10 | cable | 122 | 150 +""" +) + +# %% [markdown] +# The store gives discounts on some products and they all last six time units since `start_time` (incl. `start_time`). +# The clients place orders but they get a guarantee that the order is executed only if the product is discounted. +# +# ![Customers orders with their event and processing times](assets/content/tutorials/interval_join_temporal_behavior/orders.svg) +# +# To get the price paid by a customer, you can perform interval join between `orders` and `product` streams. +# An order, to be associated with the product, has to come no earlier than the product +# was discounted and no later than the discount ended. So, it has to satisfy +# `start_time` $\le$ `order_time` $\le$ `start_time+5`. This can be expressed as an interval join: + + +# %% tags=[] +result = discounts.interval_join( + orders, + discounts.start_time, + orders.order_time, + pw.temporal.interval(0, 5), + discounts.product == orders.product, +).select( + orders.customer, + orders.product, + discounts.price, + orders.order_time, + discounts.start_time, +) +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# As time progresses, new orders and products arrive, and you get new information about the prices paid for the products. +# Note that two customers were not able to purchase a product: +# - Sam wanted to buy a keyboard at time $128$, but it was not discounted then. It was discounted only at times $[118, 122]$ and $[132, 136]$. +# - Yichen wanted to buy a cable at time $136$, but it also wasn't discounted. +# +# As such, their orders are not present in the `result` table. If you want to include all orders (also those that can't be executed), you can use `interval_join_right`. + +# %% [markdown] +# ## Forgetting old records +# As mentioned before, you don't have any control over the time between the event creation (an example event can be a store that registers that a product X is available at price Y, starting from time Z) and the event arrival at the processing engine. In our scenario, an example of such an event is the `cable` entry in the `products` table that was discounted at time $122$ and only reached Pathway at time $150$. + +# In principle, you don't know if more old records won't come in the future. As such, to guarantee that a join on such possibly late data returns correct answers, the processing engine needs to store all the records in its memory. + +# Practically, keeping all the old records just to handle some very overdue orders may be a price you are not willing to pay, and it's better to ignore such orders while cleaning memory from some old entries. + +# To make that trade-off possible, Pathway provides the `behavior` parameter for `interval_join`, which defines its temporal behavior. Roughly speaking, it allows you to tell Pathway to ignore the records that are too late. 
Subsequently, that allows you to forget the records that you know won't be joined with any new incoming records in the future. +# To be more precise: if you set the `behavior` to e.g. `pw.temporal.common_behavior(cutoff=6)`, Pathway will ignore all records that have times less or equal to maximal already seen time minus $6$. Small remark: the maximal already seen time is held separately for each side of the join, as it allows expressing a join with historical data slightly easier. + +# %% tags=[] +result = discounts.interval_join( + orders, + discounts.start_time, + orders.order_time, + pw.temporal.interval(0, 5), + discounts.product == orders.product, + behavior=pw.temporal.common_behavior(cutoff=6), +).select( + orders.customer, + orders.product, + discounts.price, + orders.order_time, + discounts.start_time, +) +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# Let's see what happens in this case. The final result doesn't contain Frank's order. +# His order was performed at time $120$ and could be joined with the mouse being discounted at time $120$. +# However, the maximal seen time in the `orders` stream when Frank's order arrived was $128$ +# (`order_time` of Evelyn's order). All new records with `order_time` less +# or equal to $128-6=122$ had to be ignored. Note that Min's order came to Pathway at +# the same time, but its `order_time` was $124$ $(>122)$, so it was not ignored. +# +# The `cutoff` threshold doesn't have to be that tight. Setting a higher `cutoff` will +# allow you to process more late records, but the memory footprint of an interval join might +# increase then. It'll just store all records that are above the maximal seen time minus `cutoff`. + +# %% [markdown] +# ## Keeping only the most up-to-date data +# Imagine you want to create a dashboard with the most recent orders. You don't want to +# display (or even store) old orders. Don't worry! Pathway can solve this problem too. +# It is enough to set the `keep_results` parameter of `common_behavior` to `False`. +# Then, all records with event time no larger than the maximal seen time minus `cutoff` will be removed from the output. Let's have a look at how it works: + +# %% tags=[] +result = discounts.interval_join( + orders, + discounts.start_time, + orders.order_time, + pw.temporal.interval(0, 5), + discounts.product == orders.product, + behavior=pw.temporal.common_behavior(cutoff=8, keep_results=False), +).select( + orders.customer, + orders.product, + discounts.price, + orders.order_time, + discounts.start_time, +) +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# In the end, the maximal seen time in the `orders` stream is $136$. That's why all +# records with a time less or equal to $136-8=128$ are forgotten. + +# %% [markdown] +# ## Stabilizing the stream +# Another feature of temporal behavior is the ability to delay the results production. +# It can be useful if the input stream is unstable, with several possible updates to the input records, and the output that should not contain every intermediate result. +# Let's return to our shop scenario and consider a situation in which product prices are updated. For instance, it can be caused by a store employee entering an incorrect price first and later fixing it. +# +# In this example, a special column `__diff__` is used to tell Pathway whether a record +# should be added ($1$) or removed ($-1$). Remember that `pw.debug.table_from_markdown` +# is used to simulate a streaming behavior. 
In a real system, the way of deleting entries +# depends on an input connector you plan to use. + + +# %% tags=[] +discounts_with_updates = pw.debug.table_from_markdown( + """ + id | price | product | start_time | __time__ | __diff__ + 1 | 42 | mouse | 120 | 122 | 1 + 2 | 128 | keyboard | 118 | 122 | 1 + 1 | 42 | mouse | 120 | 124 | -1 + 1 | 43 | mouse | 120 | 124 | 1 + 3 | 8 | cable | 128 | 134 | 1 + 3 | 8 | cable | 128 | 138 | -1 + 3 | 10 | cable | 128 | 138 | 1 + 4 | 135 | keyboard | 132 | 140 | 1 + 5 | 10 | cable | 122 | 150 | 1 +""" +) + +result = discounts_with_updates.interval_join( + orders, + discounts_with_updates.start_time, + orders.order_time, + pw.temporal.interval(0, 5), + discounts_with_updates.product == orders.product, +).select( + orders.customer, + orders.product, + discounts_with_updates.price, + orders.order_time, + discounts_with_updates.start_time, +) +pw.debug.compute_and_print_update_stream(result) + + +# %% [markdown] +# The above script is run without any delays. Because of that, in the output the price Austin +# paid for a mouse is updated from $42$ at time $122$ to $43$ at time $124$. A similar situation +# happens to Evelyn - one price is present at times $134, 136$, and a new price is present from time $138$. +# If you are willing to wait until the results stabilize, you can +# use the `delay` parameter of temporal behavior to buffer the results before producing them. +# Let's set it to $4$ and see what happens. + +# %% tags=[] +result = discounts_with_updates.interval_join( + orders, + discounts_with_updates.start_time, + orders.order_time, + pw.temporal.interval(0, 5), + discounts_with_updates.product == orders.product, + behavior=pw.temporal.common_behavior(delay=4), +).select( + orders.customer, + orders.product, + discounts_with_updates.price, + orders.order_time, + discounts_with_updates.start_time, +) +pw.debug.compute_and_print_update_stream(result) + +# %% [markdown] +# Now, records wait for maximal seen time to become at least `record_time+4` before being +# joined. (By `record_time+4` I mean `order_time+4` for `orders` stream, `start_time+4` +# for `products` stream). Thanks to the delay, the stream can stabilize, and there are +# no price fluctuations in the output. Of course, you should choose a proper value of `delay` +# parameter depending on the times in your application. + +# %% [markdown] +# You could also combine `delay` and `cutoff` to stabilize the stream and forget old records. +# +# Thank you for following this tutorial. I hope you now have a better understanding of +# the ways to control the temporal behavior of the interval join. diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.window-join/__init__.py b/docs/2.developers/4.user-guide/060.temporal-data/.window-join/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.window-join/article.py b/docs/2.developers/4.user-guide/060.temporal-data/.window-join/article.py new file mode 100644 index 00000000..ff139928 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/.window-join/article.py @@ -0,0 +1,172 @@ +# --- +# title: 'Window Join' +# description: 'Tutorial on how to do a window join in Pathway.' 
+# date: '2023-06-14' +# thumbnail: '/assets/content/tutorials/clickstream_window_join/clickstream-window-join-th.png' +# tags: ['tutorial', 'engineering'] +# keywords: ['window', 'behavior analysis', 'clickstream', 'conversion rate'] +# notebook_export_path: notebooks/tutorials/window_join.ipynb +# --- + +# # User Behavior Analysis with Window Join on Clickstream Data +# +# Welcome to this detailed tutorial on how to analyze user behavior by correlating clickstream data with purchase data using Pathway's window join functionality. Joining clickstream data with purchase data can be particularly useful for businesses in analyzing the success of product page visits converting into purchases, improving their marketing strategies and product offerings. +# +# The `window_join` function from Pathway associates records from two different tables that occur within the same time frame, specified by a window of time, creating sets of temporally correlated records. +# +# In a `window_join`, all pairs of records that fall within the same window and satisfy the specified conditions are joined together. This operation is performed for each window whether it is a sliding window, a session window or a tumbling window. In the case of a sliding window, this means the window slides across the time axis, capturing different subsets of records at each step. +# +# ::article-img +# --- +# src: '/assets/content/tutorials/clickstream_window_join/window_join_illustrative.png' +# alt: 'Illustration of Window Join' +# class: 'mx-auto' +# --- +# :: +# Consider an e-commerce example: the window join can be used to identify instances where product page visits and purchases occurred within the same hour. The time window and conditions could be adjusted to cater to different business questions and analysis requirements. This makes the window join an incredibly powerful tool for temporal data analysis, allowing us to track and analyze patterns over time and within specific timeframes. +# +# In our hypothetical e-commerce example, there are two datasets: +# +# * Clickstream data: It captures the user's interaction with the website. Each row records a user visit to a specific product page. +# +# * Purchase data: It records all successful purchases made by the users. +# +# The goal here is to monitor the evolution of conversion rates for each product page. In this context, a conversion refers to a scenario where a page visit and a purchase occur within a defined time window. You will be using the window_join functionality provided by Pathway for this purpose. +# +# Let's break down the process into step-by-step instructions. +# +# ## Setting Up the Data +# +# First, import the necessary Pathway library and load the clickstream and purchase data. 
+ +import pathway as pw + +clickstream_data = pw.debug.table_from_markdown( + """ + | user_id | session_id | datetime | page_url +1 | 0x1f8b4c | 0x64a0c7 | 2023-06-06T14:00:12 | /home +2 | 0x1f8b4c | 0x64a0c7 | 2023-06-06T14:01:38 | /products/0x11b87b +3 | 0x2eaf7f | 0x22e5b3 | 2023-06-06T14:18:32 | /products/0x04g7d5 +4 | 0x2eaf7f | 0xf508e6 | 2023-06-06T14:19:17 | /products/0x04g7d5 +5 | 0x31a32e | 0x13f6c4 | 2023-06-06T14:21:55 | /products/0x11b87b +6 | 0x442d6e | 0x31d4a2 | 2023-06-06T14:23:51 | /products/0x11b87b +7 | 0x5c2d6e | 0x13f6c4 | 2023-06-06T14:29:59 | /products/0x04g7d5 +""" +) +purchase_data = pw.debug.table_from_markdown( + """ + | purchase_id | user_id | datetime | product_url + 1 | 0x0a1b2c | 0x1f8b4c| 2023-06-06T14:02:15 | /products/0x11b87b + 2 | 0x0b1a2d | 0x32ad44| 2023-06-06T14:03:25 | /products/0x04g7d5 + 3 | 0x0c1b3d | 0x11da93| 2023-06-06T14:23:11 | /products/0x11b87b + 4 | 0x0d1e3f | 0x2eaf7f| 2023-06-06T14:27:32 | /products/0x04g7d5 + 5 | 0x023cc3 | 0x5c2d6e| 2023-06-06T14:45:56 | /products/0x04g7d5 + 6 | 0x0e1f4g | 0x442d6e| 2023-06-06T14:55:22 | /products/0x11b87b +""" +) + +# ## Preprocessing the Data +# Records that do not pertain to product page visits are filtered out, and the datetime string column is converted to a datetime object. This process ensures that the datasets involved are not only relevant to the analysis but also formatted appropriately for the window_join function. + +# + +fmt = "%Y-%m-%dT%H:%M:%S" + +clickstream_data = clickstream_data.with_columns( + datetime=pw.this.datetime.dt.strptime(fmt) +).filter(pw.this.page_url.str.startswith("/products/")) + + +purchase_data = purchase_data.with_columns(datetime=pw.this.datetime.dt.strptime(fmt)) + +# - + +# ## Perform the Window Join +# The critical step in this process is performing a window join on the prepared data. The idea is to monitor the fluctuations of the hourly conversion rates. To achieve this, a sliding window is an appropriate tool. The frequency at which new values are produced can be parameterized by the choice of the hop. By opting for a left join, purchases without any corresponding clickstream data are omitted, effectively disregarding purchases made through other channels. + +matched_data = clickstream_data.window_join_left( + purchase_data, + clickstream_data.datetime, + purchase_data.datetime, + pw.temporal.sliding(hop=pw.Duration("10m"), duration=pw.Duration("1h")), + pw.left.user_id == pw.right.user_id, + pw.left.page_url == pw.right.product_url, +).select( + window_start=pw.left._pw_window_start, + user_id=pw.left.user_id, + page_url=pw.left.page_url, + product_url=pw.right.product_url, + purchase_id=pw.right.purchase_id, +) + + +# Let's dissect the `window_join_left` function and its arguments: +# +# This function joins the clickstream data (left) with the purchase data (right) in each window. The arguments of this function are: +# +# * `purchase_data`: The right side of the join. +# * `clickstream_data.datetime` and `purchase_data.datetime`: These are the time expressions that will be used for joining the datasets. +# * `pw.temporal.sliding(hop=Duration('10m'), duration=Duration('1h'))`: This creates a sliding window of 1 hour with a hop duration of 10 minutes. It means that a new window is created every 10 minutes, and each window spans 1 hour. +# * `pw.left.user_id == pw.right.user_id` and `pw.left.page_url == pw.right.product_url`: These conditions are used to join the datasets. Only those records that meet these conditions will be joined together. 
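# If you also wanted to keep purchases that have no matching product-page visit (for example,
# purchases made through other channels), the same call pattern should work with the outer variant
# of the join - only the method name changes, as in the sketch below. The rest of this tutorial
# sticks to the left join.
#
# ```python
# matched_all = clickstream_data.window_join_outer(
#     purchase_data,
#     clickstream_data.datetime,
#     purchase_data.datetime,
#     pw.temporal.sliding(hop=pw.Duration("10m"), duration=pw.Duration("1h")),
#     pw.left.user_id == pw.right.user_id,
#     pw.left.page_url == pw.right.product_url,
# )
# ```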
+# +# This approach helps in identifying instances where a user visited a product page and made a purchase of the same product within that hour. For more accuracy, it is logical to expect purchases to occur after the page visits. To ensure this, you can filter out the records where the purchase datetime is earlier than the clickstream datetime or use another type of temporal join known as an `interval_join`. + +from datetime import datetime + +view_window_start = datetime(2023, 6, 6, 13, 30, 0) +pw.debug.compute_and_print( + matched_data.filter(pw.this.window_start == view_window_start) +) + +# ## Calculating Conversion Rates +# +# The next step is to calculate the conversion rate, i.e., the proportion of product page visits that resulted in a purchase. Prior to that, it's crucial to first address a potential data discrepancy that could skew our analysis - the presence of duplicate matches within each window. Let's consider a situation where a user visits a product page twice and makes a purchase within the same time window. This would lead to the generation of two matches - a redundancy you'd want to avoid. To ensure the robustness of the data, you could employ a comprehensive groupby operation across all columns, effectively eliminating these duplicate entries. By retaining the `purchase_id` and `user_id` columns, you'll ascertain that within each window, each purchase and user is unique and accurately represented. +# +# Here's how you can do it: + +# + +matched_data = matched_data.groupby(*pw.this).reduce(*pw.this) + +# - + +# Next, the `groupby()` function is used to group the data by the `window_start` and the `page_url`. Following this, the `reduce()` function is applied to compute the aggregate of conversions and the page visits per group. Determining the number of visits involves a simple tally for each window and page. When it comes to tracking conversions, you would count only those purchase_ids that are not set to None. + +# + +traffic_threshold = 2 + +conversions = ( + matched_data.groupby(pw.this.page_url, pw.this.window_start) + .reduce( + pw.this.page_url, + pw.this.window_start, + conversions=pw.reducers.sum(pw.cast(int, pw.this.purchase_id.is_not_none())), + visits=pw.reducers.count(), + ) + .filter(pw.this.visits >= traffic_threshold) +) + +# - + +# Nonetheless, to guarantee the statistical significance of the data, only product pages that receive a minimum amount of traffic will be taken into consideration. For the small dataset at hand, a traffic threshold has been established at 2 visits. + +pw.debug.compute_and_print( + conversions.filter(pw.this.window_start == view_window_start) +) + +# Finally, you could calculate the conversion rate for each product: + +# + + +results = conversions.select( + pw.this.page_url, + pw.this.window_start, + conversion_rate=pw.this.conversions / pw.this.visits, +) + +pw.debug.compute_and_print(results) +# - + +# By printing the results, you now have a clear overview of the conversion rate for each product page within the specified time window. This powerful insight can help you understand which product pages are performing well and which ones need improvement. +# +# And that concludes the demonstration of the comprehensive, data-driven approach to analyzing user behavior and conversion rates on your e-commerce website using Pathway. 
+# +# If you found this demonstration useful and want to dive deeper into Pathway's capabilities, we recommend exploring tutorials about [Interval Joins](/developers/user-guide/temporal-data/interval-join) and [Asof Joins](/developers/user-guide/temporal-data/asof-join), which are alternative ways for merging time-based datasets. As always, feel free to reach out to our community for any questions and further discussions. Happy analyzing! diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.windowby-reduce-manual/__init__.py b/docs/2.developers/4.user-guide/060.temporal-data/.windowby-reduce-manual/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.windowby-reduce-manual/article.py b/docs/2.developers/4.user-guide/060.temporal-data/.windowby-reduce-manual/article.py new file mode 100644 index 00000000..981e0e68 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/.windowby-reduce-manual/article.py @@ -0,0 +1,135 @@ +# --- +# title: Windowby Reduce manu[a]l +# description: Windowby Reduce manu[a]l +# notebook_export_path: notebooks/tutorials/windowby_manual.ipynb +# --- + +# # Windowby - Reduce +# In this manu\[a\]l, you will learn how to aggregate data with the windowby-reduce scheme. +# +# Pathway offers powerful features for time series data manipulation. One such feature is the `windowby` function, which allows for intricate data segmentation based on specified criteria. +# +# The `windowby` function can operate in three distinct modes—session, sliding, and tumbling—which are determined by the type of windowing function you pass to it. +# * Session Window: Groups adjacent elements based on activity and inactivity periods. +# * Sliding Window: Groups elements in overlapping windows of a specified length. +# * Tumbling Window: Groups elements in non-overlapping windows of a specified length. +# +# +# +# ::article-img +# --- +# src: 'assets/content/documentation/table-operations/windowby-types.png' +# alt: 'Illustration of Window types' +# class: 'mx-auto' +# --- +# :: +# +# This guide focuses on exploring these different types, demonstrating how each one can be used to achieve unique and important data analysis tasks. +# +# The data we're going to use is about... drumroll please... chocolate consumption! Let's suppose we have a dataset that tracks the amount of chocolate eaten during the day by a group of chocoholics. So, without further ado, let's get started. 
+ +import pathway as pw + +# + +fmt = "%Y-%m-%dT%H:%M:%S" + +table = pw.debug.table_from_markdown( + """ + | time | name | chocolate_bars + 0 | 2023-06-22T09:12:34 | Fudge_McChoc | 2 + 1 | 2023-06-22T09:23:56 | Ganache_Gobbler | 2 + 2 | 2023-06-22T09:45:20 | Truffle_Muncher | 1 + 3 | 2023-06-22T09:06:30 | Fudge_McChoc | 1 + 4 | 2023-06-22T10:11:42 | Ganache_Gobbler | 2 + 5 | 2023-06-22T10:32:55 | Truffle_Muncher | 2 + 6 | 2023-06-22T11:07:18 | Fudge_McChoc | 3 + 7 | 2023-06-22T11:23:12 | Ganache_Gobbler | 1 + 8 | 2023-06-22T11:49:29 | Truffle_Muncher | 2 + 9 | 2023-06-22T12:03:37 | Fudge_McChoc | 4 + 10 | 2023-06-22T12:21:05 | Ganache_Gobbler | 3 + 11 | 2023-06-22T13:38:44 | Truffle_Muncher | 3 + 12 | 2023-06-22T14:04:12 | Fudge_McChoc | 1 + 13 | 2023-06-22T15:26:39 | Ganache_Gobbler | 4 + 14 | 2023-06-22T15:55:00 | Truffle_Muncher | 1 + 15 | 2023-06-22T16:18:24 | Fudge_McChoc | 2 + 16 | 2023-06-22T16:32:50 | Ganache_Gobbler | 1 + 17 | 2023-06-22T17:58:06 | Truffle_Muncher | 2 +""" +).with_columns(time=pw.this.time.dt.strptime(fmt)) +# - + +# ## Temporal Session Windowing +# The `session` windowing function is designed for grouping together adjacent time events based on a specific condition. This can either be a maximum time difference between events or a custom condition defined by you. +# +# For instance, let's say you are curious about the binge-eating sessions of the chocoholics. You'd want to group all consecutive records where the gap between the chocolate eating times is less than or equal to some period of time. +# +# Let's check out an example: + +# + +from datetime import timedelta + +result = table.windowby( + table.time, + window=pw.temporal.session(max_gap=timedelta(hours=2)), + instance=table.name, +).reduce( + pw.this.name, + session_start=pw.this._pw_window_start, + session_end=pw.this._pw_window_end, + chocolate_bars=pw.reducers.sum(pw.this.chocolate_bars), +) + +# Print the result +pw.debug.compute_and_print(result, include_id=False) +# - + +# ## Temporal Sliding Windowing +# +# Next, let's slide into sliding windows. Sliding windows move through your data at a specific step (hop) and create a window of a specific duration. This is like sliding a magnifying glass over your data to focus on specific chunks at a time. +# +# Let's find the chocolate consumption within sliding windows of duration 10 hours, sliding every 3 hours. This could be handy for identifying peak chocolate-eating times! + +# + +result = table.windowby( + table.time, + window=pw.temporal.sliding(duration=timedelta(hours=10), hop=timedelta(hours=3)), + instance=table.name, +).reduce( + name=pw.this._pw_instance, + window_start=pw.this._pw_window_start, + window_end=pw.this._pw_window_end, + chocolate_bars=pw.reducers.sum(pw.this.chocolate_bars), +) + +# Print the result +pw.debug.compute_and_print(result, include_id=False) + +# - + +# This gives you detailed insights about chocolate consumption over different time windows. + +# ## Temporal Tumbling Windowing +# +# Finally, let's tumble through tumbling windows. Tumbling windows divide our data into distinct, non-overlapping intervals of a given length. +# +# Let's divide the time series into tumbling windows of 5 hours each to see how our chocolate consumption varies over distinct periods. 
+ +# + +result = table.windowby( + table.time, + window=pw.temporal.tumbling(duration=timedelta(hours=5)), + instance=table.name, +).reduce( + name=pw.this._pw_instance, + window_start=pw.this._pw_window_start, + window_end=pw.this._pw_window_end, + chocolate_bars=pw.reducers.sum(pw.this.chocolate_bars), +) + +# Print the result +pw.debug.compute_and_print(result, include_id=False) +# - + +# ## Conclusion +# +# In this guide, you've mastered the use of the windowby-reduce scheme in the Pathway library, a robust tool for time-series data aggregation. The three types of window functions—session, sliding, and tumbling—have been unveiled, each with its unique way of segmenting data. A playful example of chocolate consumption illuminated their practical application. As you continue to delve into data analysis, check out the tutorial [Detecting suspicious user activity with Tumbling Window group-by](/developers/showcases/suspicious_activity_tumbling_window), which utilizes the tumbling window function to spot unusual user behavior. Continue exploring, and elevate your data analysis prowess. diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.windows_temporal_behavior/__init__.py b/docs/2.developers/4.user-guide/060.temporal-data/.windows_temporal_behavior/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/060.temporal-data/.windows_temporal_behavior/article.py b/docs/2.developers/4.user-guide/060.temporal-data/.windows_temporal_behavior/article.py new file mode 100644 index 00000000..9ccbe722 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/.windows_temporal_behavior/article.py @@ -0,0 +1,167 @@ +# --- +# title: Controlling temporal behavior of windows +# description: An article exploring concepts related to temporal behavior of windows. +# date: '2024-01-08' +# thumbnail: 'assets/content/tutorials/clickstream_window_join/clickstream-window-join-th.png' +# keywords: ['windowby', 'behavior', 'late data', 'delay', 'cutoff', 'out-of-order data'] +# notebook_export_path: notebooks/tutorials/windows_temporal_behavior.ipynb +# --- + +# # Controlling temporal behavior of windows + +# In this article you will learn how to use windows effectively, by specifying their temporal behaviors. + +# Temporal behaviors in Pathway are crucial for bounding memory consumption, maintaining proper tradeoff between latency and efficiency, or adjusting windows for your needs. To read more about behaviors and the motivation behind them read our [guide](/developers/user-guide/temporal-data/behaviors/). This article goes into detail on how to define the behavior using `common_behavior` and `exactly_once_behavior` and what impact they have on the result of [windows](/developers/user-guide/temporal-data/windows-manual). + +# The examples in this article use the sliding windows, but you can also use behaviors with tumbling windows. + +# ## Event Time vs Processing Time + +# In the context of temporal behavior it is important to distinguish between an event time and a processing time. The event time is when the event happens, e.g. if your data are orders in the online shop, the event time is the time when the order happened. This information has to be present in your data because Pathway doesn't know when the event happened. Thus event time can be any time you assign to your data. +# +# The only time Pathway is aware of is when the record arrives to the Pathway engine. This time is called processing time. 
While the processing time of entries in a stream is always nondecreasing (because the time goes forward), due to latency the event time may be out of order. In extreme cases, this can manifest via events with _high_ latency between their event time and processing time, which we shortly call _late data_. +# +# When grouping data in windows, you usually want to consider the event time, and the temporal behavior is based on it, but the order in which the events are processed impacts the results. + +# ![Event time vs processing time](/assets/content/documentation/behavior-guide/event-time-vs-processing-time.svg) + +# + +# ## Dataset + +# To try out the temporal behaviors of windows you need an example Pathway Table with both processing time and event time. You can generate it using `pw.debug.table_from_markdown`, which takes a table specification in markdown format. If it has a column named `__time__`, Pathway will use it as a processing time, which allows you to see how the temporality of your data affects the outcome of the computation. The following code creates a table with logs. Other than the `__time__` column, it also has the `event_time`, which says when the event described by the log happened, and the `message` column. In this case, both `__time__` and `event_time` are given as timestamps. +# +# Remarks: +# - while the processing time for the `table_from_markdown` method always needs to be given as a timestamp, the event_time can be any of [various types that are supported by the windowing mechanism](/developers/api-docs/pathway-stdlib-temporal/#pathway.stdlib.temporal.windowby) +# - the `table_from_markdown` method needs the processing time to be passed in a column with a special name `__time__`, but the column holding event_time is passed as a parameter to the [`windowby`](/developers/api-docs/pathway-stdlib-temporal/#pathway.stdlib.temporal.windowby) function, and here it is called event_time just to keep the example self-explanatory. + +import pathway as pw + +t = pw.debug.table_from_markdown( + """ + event_time | message | __time__ + 360 | Processing_started | 362 + 362 | Task_completed_successfully | 362 + 366 | Error_occurred_during_processing | 368 + 370 | Data_received_from_sensor | 410 + 370 | Database_connection_established | 370 + 370 | File_saved_successfully | 372 + 372 | Processing_completed | 374 + 376 | Request_received_from_user | 396 + 382 | Task_in_progress | 382 + 382 | Warning_Low_memory | 392 +""" +) + +# Consider the following example scenario - you are given the table as defined above, and you need to count the number of logs that fall into 10-second windows, with windows starting every 4 seconds. +# +# To that end, you can use sliding windows. To keep things simple, start with a piece of code that only groups data into windows, without specifying temporal behaviors. As you can see in the code snippet below, you can do that using `windowby` with `sliding` window of `duration` set to 10 and `hop` set to 4. For the result, keep information about the start and the end of each window and the number of logs that are in those windows. + +result = t.windowby( + t.event_time, + window=pw.temporal.sliding(duration=10, hop=4), +).reduce( + window_start=pw.this._pw_window_start, + window_end=pw.this._pw_window_end, + n_logs=pw.reducers.count(), +) + +# When you use [`pw.debug_compute_and_print`](/developers/api-docs/debug/#pathway.debug.compute_and_print) to print the results, you will only get the final result, after all input rows are processed. 
+ +pw.debug.compute_and_print(result) + +# To understand how the result changed when new rows were processed, it is useful to use [`pw.debug.compute_and_print_update_stream`](/developers/api-docs/debug/#pathway.debug.compute_and_print_update_stream) function. It shows you every change made to the Table, with column `__diff__` denoting whether the row was added or removed. + +pw.debug.compute_and_print_update_stream(result) + +# ## What time is it? + +# The behaviors depend on the "current time" of an operator, in this article denoted as _now_. It is defined as the maximum already seen time by an operator in the already processed data (when a new batch of data arrives it is processed using the value of _now_ obtained from previous batches). In the context of windows, this time is taken from the column you use for grouping data in windows - usually event time. For example, `delay` sets a shift in time, and the window will be computed once _now_ is at least `delay` after the beginning of the window. + +# ## Common Behavior + +# The general way to define temporal behaviors in Pathway is by using `pw.temporal.common_behavior`. It allows you to set `delay`, `cutoff` and `keep_results` parameters. The `delay` and `cutoff` parameters represent time duration and their type should be compatible with the time column passed to `windowby`. This means that if your time column has type `int` or `float` then `delay` and `cutoff` should also have type, respectively, int or float. If instead, the time column has type [`DatetimeUtc`](/developers/api-docs/pathway/#pathway.DateTimeUtc) or [`DatetimeNaive`](/developers/api-docs/pathway/#pathway.DateTimeNaive), then `delay` and `cutoff` should have type [`Duration`](/developers/api-docs/pathway/#pathway.Duration). To understand the motivation of these parameters read our [guide on behaviors](/developers/user-guide/temporal-data/behaviors/). + +# ### Delay + +# When you set the `delay` to be non-zero, the engine will wait before first calculating the result of each window. To be more precise, the window will be calculated, when _now_ is at least `window_start + delay`. If `delay` is not provided, it defaults to `None` which disables the delay mechanism. + +# ![Illustration of delay](/assets/content/tutorials/windows_behavior/window-behavior-delay.svg) + +# + +# You can use it to stagger calculations - this allows for more rows to be processed at once, rather than recomputing the result after each row arrives to the engine. If you set the `delay` in the log example to be 4, you will see that the update stream becomes shorter. + +result_delay = t.windowby( + t.event_time, + window=pw.temporal.sliding(duration=10, hop=4), + behavior=pw.temporal.common_behavior(delay=4), +).reduce( + window_start=pw.this._pw_window_start, + window_end=pw.this._pw_window_end, + n_logs=pw.reducers.count(), +) +pw.debug.compute_and_print_update_stream(result_delay) + +# You can notice in the `__time__` column an unexpected timestamp, that is `18446744073709551614`. That is because of the use of debug mode. As the input ended, the engine triggers the computation of the last window by setting _now_ to be maximum possible time. It won't happen in the streaming mode because the processing there never ends. + +# ### Cutoff + +# Cutoff determines when the result of the window will no longer be updated, even if there is a change to a data point inside that window. This should not be before the windows closes - in such case you would shorten the window. 
When the `cutoff` is set, the window is no longer updated when _now_ is later than `window_end + cutoff`. If the `cutoff` is not provided, it defaults to `None` which disables the cutoff mechanism. + +# ![Illustration of cutoff](/assets/content/tutorials/windows_behavior/window-behavior-cutoff.svg) + +# + +# Now add `cutoff=4` to the log example. You should see that the row that has processing time `410` no longer impacts the results. When you use `cutoff` omitting such late points means that you get different results than if you processed everything in batch, as the data that comes after the cutoff of a window will not be used in calculations for this window. This, however, is necessary for efficient memory consumption - without setting `cutoff` all data that ever was processed needs to be kept in memory, in case some very late event arrives and a window needs to be recomputed. When you use the cutoff mechanism you inform the engine when it can clear the memory. + +result_cutoff = t.windowby( + t.event_time, + window=pw.temporal.sliding(duration=10, hop=4), + behavior=pw.temporal.common_behavior(cutoff=4), +).reduce( + window_start=pw.this._pw_window_start, + window_end=pw.this._pw_window_end, + n_logs=pw.reducers.count(), +) +pw.debug.compute_and_print_update_stream(result_cutoff) + +# Note that since the time when cutoff triggers is based only on the window end and `cutoff` value, an event belonging to multiple windows can be late - and ignored in calculations - for one window, but on time for another. In the above example, you can notice that at time `396`. At this time the event with `event_time` equal to `376` arrives to the engine, so it belongs to 3 windows - starting at times `368`, `372` and `376`. But since for the first of these windows, we are past its cutoff when this event arrives, only the other two windows are recalculated. + +# ### Keep_results + +# The final argument of `common_behavior` - `keep_results` is only relevant if you use the cutoff mechanism. When set to `True`, its default value, the rows corresponding to windows already past cutoff are kept in the output table. You can see that by looking at the final state of the `result_cutoff` Table from the previous Section - it contains a record for each window. + +pw.debug.compute_and_print(result_cutoff) + +# If you set `keep_results=False`, however, once the window is past its cutoff, the record for this window is removed from the result Table, so, in the end, you are left only with the last few windows. The example use case is [log monitoring](/developers/showcases/realtime-log-monitoring), where you want to raise alerts based only on very recent windows. + +result_keep_results = t.windowby( + t.event_time, + window=pw.temporal.sliding(duration=10, hop=4, origin=360), + behavior=pw.temporal.common_behavior(cutoff=4, keep_results=False), +).reduce( + window_start=pw.this._pw_window_start, + window_end=pw.this._pw_window_end, + n_logs=pw.reducers.count(), +) +pw.debug.compute_and_print(result_keep_results) + +# By checking the output of `compute_and_print_update_stream` you can see that each window was calculated at some point, but some of them were later removed. + +pw.debug.compute_and_print_update_stream(result_keep_results) + +# ## Exactly Once Behavior + +# For windows that you want to calculate exactly once, Pathway offers an easier way of defining behavior with `pw.temporal.exactly_once_behavior` function. It takes one optional argument, `shift`. 
Then a window will be calculated at time `_pw_window_end + shift`, and after that all changes to this window will be ignored. It is equivalent to using `pw.temporal.common_behavior` with `delay` set to `duration + shift` (`duration` is an argument to both [sliding](/developers/api-docs/temporal/#pathway.stdlib.temporal.sliding) and [tumbling](/developers/api-docs/temporal/#pathway.stdlib.temporal.tumbling) windows for setting the length of the window) and `cutoff` to `shift`. + +result_exactly_once = t.windowby( + t.event_time, + window=pw.temporal.sliding(duration=10, hop=4, origin=360), + behavior=pw.temporal.exactly_once_behavior(shift=2), +).reduce( + window_start=pw.this._pw_window_start, + window_end=pw.this._pw_window_end, + n_logs=pw.reducers.count(), +) +pw.debug.compute_and_print_update_stream(result_exactly_once) diff --git a/docs/2.developers/4.user-guide/060.temporal-data/10.windows-manual.md b/docs/2.developers/4.user-guide/060.temporal-data/10.windows-manual.md new file mode 120000 index 00000000..759a190f --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/10.windows-manual.md @@ -0,0 +1 @@ +.windowby-reduce-manual/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/060.temporal-data/20.behaviors.md b/docs/2.developers/4.user-guide/060.temporal-data/20.behaviors.md new file mode 100644 index 00000000..4e744d77 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/20.behaviors.md @@ -0,0 +1,52 @@ +--- +title: 'Late data and cutoffs' +description: 'How to define the behavior of your streamed data in Pathway' +--- + + +# Understanding Temporal Behavior in Pathway + +When moving from batch to streaming you need to understand the nature of data coming over time. Pathway makes it easy to switch from batch to streaming by guaranteeing out-of-the-box the correctness of computations, by updating a result whenever new data points are inputted. This correctness comes with a price, and you may want to trade it for better latency or memory consumption. In Pathway, you can set the behavior of the temporal operations to determine the tradeoff between accuracy, latency, and memory consumption that will suit your application. + +When working with streaming data, your events will happen over time and the result of your application will be based on data that has been processed so far. In reality, the situation will be even more complicated - the time at which event occurs will be different than the time at which it is processed, and the difference between those two - latency - will vary from event to event making it hard to predict. It is caused by different speeds and levels of reliability of channels you will use to send data to a streaming system. + + + + +When some data is late, what should we do? +It's impossible to know if there is no data or if the data is late. +Should we wait indefinitely, ready to update the results, or should we finalize the computations as fast as possible and free the resources? +Also, when computing aggregated values, such as an average. Should we start the computation as soon as we receive the first data point, or should we wait to have enough data? +The default behavior of Pathway is to start the computation right away and wait for potential late data to ensure correctness: by default, the results Pathway produces are always consistent with the received data. +However, this has a cost: Pathway needs to keep track of old data in case a late data point comes and the result needs to be recalculated. 
+ +For example, if you compute the exact number of unique website visitors over 1-hour intervals and get an entry from 1 day ago, you must remember all users from that time. It is okay, but imagine getting an entry from 1 year ago and keeping all the data! Doing this might be ridiculous, but it is the only way to guarantee the correctness if the maximal latency is unknown. Unfortunately, this can make the memory consumption unbounded. The solution is to specify a temporal behavior to let Pathway know which data is essential for you and which can be ignored. + +### Staggering updates to improve efficiency + +Because data don't come all simultaneously, there is a tradeoff between latency and efficiency - it could be better to wait for more data before doing the computations. +For example, having an output update at each data reception can be costly for regularly and frequently incoming data, and delaying the update might be more efficient. +Also, it may be wise to wait to have enough data before computing aggregated statistics. + +### Processing Incomplete Data + +When you aggregate the data, by default you start getting results when the first record gets processed. It provides low latency but in some situations can be undesirable. In the previous example of counting unique website visitors, you are fine with getting the best estimate on the result before the interval ends. On the other hand, if you want to have alerts based on suspiciously low power usage to detect potential outages, you will trigger alerts, just because you work with incomplete data. The solution would be to wait until most data should have been already collected. + +### Outdated Data Becoming Irrelevant +This last issue concerns anomaly detection use cases. Imagine that you aggregate data into [windows](/developers/user-guide/temporal-data/windows-manual), i.e., clusters of data within some duration of time from each other. Now, you analyze data inside each window, and if it is suspicious in some way, you raise an alert. Naturally, your alert should only be based on the latest or few latest windows. To accomplish that, you want to forget data if they are no longer relevant. + +All these issues can be solved in Pathway by using temporal behaviors! They will allow you to specify when computations should occur, whether to keep updating based on late data, or if you want to remove outdated results from the output of an operator. + +## Defining temporal behaviors in Pathway + +Temporal behavior in Pathway is specified using 3 parameters - `delay`, `cutoff` and `keep_results`. + +The purpose of `delay` is to inform the engine to wait for a given amount of time before any computation is done after a new record arrives. There are two reasons to do that. The first one is to avoid recomputation by buffering the data, by which you specify your desired tradeoff between latency and efficiency. The second one is distinguishing between windows for which you want the latest result, even if it is based on incomplete data, or if you want to wait until most of the data has arrived. You can see the example of the latter in the [From Jupyter to Deploy](/developers/user-guide/exploring-pathway/from-jupyter-to-deploy) tutorial. + +`cutoff` is used to specify how long to wait for late data. It sets the time, after which the results of computation results will no longer be updated, even if late data arrives, thus allowing Pathway to clear memory. 
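As a minimal sketch of how these two parameters fit together (the toy table and the chosen values are for illustration only), you pass them to a temporal operator through its `behavior` argument:

```python
import pathway as pw

events = pw.debug.table_from_markdown(
    """
    event_time | __time__
    2          | 2
    3          | 4
    14         | 16
    """
)

result = events.windowby(
    events.event_time,
    window=pw.temporal.tumbling(duration=10),
    # produce a window only once the operator's time reaches window_start + 2,
    # and stop updating it (freeing the associated memory) after window_end + 6
    behavior=pw.temporal.common_behavior(delay=2, cutoff=6),
).reduce(
    window_start=pw.this._pw_window_start,
    n_events=pw.reducers.count(),
)

pw.debug.compute_and_print(result)
```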
+ +Flag `keep_results` allows you to specify whether the computation results should be kept after `cutoff`. When `keep_results=True`, which is its default value, the operator's output is kept, but it is no longer updated with late data. When you set `keep_results=False`, not only will the results not be updated, but they will be removed from the output. It is useful for the anomaly detection use case, an example of which you can check in [`Log Monitoring`](/developers/showcases/realtime-log-monitoring). + +To understand what these arguments exactly mean for specific operators, read our tutorials on how to use behaviors with [Interval Joins](/developers/user-guide/temporal-data/temporal_behavior/) and Windows . + +To set these three parameters, provide them as an argument to [`pw.temporal.common_behavior`](/developers/api-docs/temporal/#pathway.stdlib.temporal.common_behavior). Pathway also provides a "shortcut" for windows you want to calculate exactly once, after they have already closed, by using [`pw.temporal.exactly_once_behavior`](/developers/api-docs/temporal/#pathway.stdlib.temporal.exactly_once_behavior). You can read more about `pw.temporal.exactly_once_behavior` in our tutorial on using behaviors with Windows. diff --git a/docs/2.developers/4.user-guide/060.temporal-data/25.windows_with_behaviors.md b/docs/2.developers/4.user-guide/060.temporal-data/25.windows_with_behaviors.md new file mode 120000 index 00000000..ed0717a5 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/25.windows_with_behaviors.md @@ -0,0 +1 @@ +.windows_temporal_behavior/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/060.temporal-data/30.asof-join.md b/docs/2.developers/4.user-guide/060.temporal-data/30.asof-join.md new file mode 120000 index 00000000..1f6d3c85 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/30.asof-join.md @@ -0,0 +1 @@ +.asof-join/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/060.temporal-data/40.interval-join.md b/docs/2.developers/4.user-guide/060.temporal-data/40.interval-join.md new file mode 120000 index 00000000..65aa9bc9 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/40.interval-join.md @@ -0,0 +1 @@ +.interval-join/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/060.temporal-data/50.window-join.md b/docs/2.developers/4.user-guide/060.temporal-data/50.window-join.md new file mode 120000 index 00000000..b846eb39 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/50.window-join.md @@ -0,0 +1 @@ +.window-join/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/060.temporal-data/60.temporal_behavior.md b/docs/2.developers/4.user-guide/060.temporal-data/60.temporal_behavior.md new file mode 120000 index 00000000..7e208c17 --- /dev/null +++ b/docs/2.developers/4.user-guide/060.temporal-data/60.temporal_behavior.md @@ -0,0 +1 @@ +.interval_join_temporal_behavior/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/080.deployment/10.docker-deployment.md b/docs/2.developers/4.user-guide/080.deployment/10.docker-deployment.md new file mode 100644 index 00000000..549893f1 --- /dev/null +++ b/docs/2.developers/4.user-guide/080.deployment/10.docker-deployment.md @@ -0,0 +1,137 @@ +--- +title: Docker Deployment of Pathway +description: 'A guide about how to deploy Pathway using Docker containers' +--- + +# Docker Deployment of Pathway + +## Introduction 
+ +Deployment is a critical step that should never be overlooked. + +Pathway is fully compatible with Python and leverages its flexibility to provide various deployment options. + +> Pathway is fully Python-compatible: you can use any existing Python deployment method. + + +Whether starting a new project or adding Pathway to an existing one, it allows you to choose the best approach that suits your needs. + +In this article, we will explore how to deploy a Pathway project using Docker. + +## Why Docker? + + +Pathway is meant to be deployed in a containerized manner. +Single-machine deployments can easily be achieved using Docker. +The deployment can run concurrently on multiple cores using multiple processes or threads. +We provide a pathway spawn command to launch multi-process and multi-threaded jobs. +The choice between threads and multiple processes depends on the nature of the computation. +While communication between threads is faster, Python-heavy workloads may require multi-process parallelism to bypass the GIL. + +## Prerequisites + +Before we dive into the deployment process, ensure that you have Docker installed on your system. +You can download and install Docker from the official website [Docker Installation Guide](https://docs.docker.com/engine/install/). + +## Using the Pathway image + +To deploy your Pathway application, you can use the [Pathway Docker image](https://hub.docker.com/r/pathwaycom/pathway). +It includes all the dependencies required to run Pathway. + +### Using a Dockerfile + +Using a Dockerfile is an easy way to configure your project using Docker. +To use Pathway, you can set the image to use the Pathway Docker image using the `FROM` command: + +```dockerfile +FROM pathwaycom/pathway:latest + +# Set working directory +WORKDIR /app + +# Copy requirements file and install dependencies +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the rest of the application code +COPY . . + +# Command to run the Pathway script +CMD [ "python", "./your-script.py" ] +``` + +Note that the Pathway Docker image contains everything it needs to run a Pathway application. +If you don't use any other library than Pathway, then there is no need for a `requirements.txt` file. + +You can then build and run the Docker image: + +```bash +docker build -t my-pathway-app . +docker run -it --rm --name my-pathway-app my-pathway-app +``` + +### Using multi-threading and multi-processing + +Pathway allows you to use multi-threading and multi-processing using Pathway CLI. +To run a Pathway application using 2 processes and 3 threads, replace this command: + +```dockerfile +CMD [ "python", "./your-script.py" ] +``` + +by this one: + +```dockerfile +CMD ["pathway", "spawn", "--processes", "2", "--threads", "3", "python", "./your-script.py"] +``` + +### Running a single Python script + +When dealing with single-file projects, creating a full-fledged `Dockerfile` might seem unnecessary. +In such scenarios, you can execute a Python script directly using the Python Docker image. +For example: + +```bash +docker run -it --rm --name my-pathway-app -v "$PWD":/app pathwaycom/pathway:latest python my-pathway-app.py +``` + +## Using a Python image + +Alternatively, Pathway is fully Python-compatible: you can use any existing Python deployment method. 
+If you prefer using a standard Python image and installing Pathway via pip, you can use such a Dockerfile: + +```dockerfile +FROM --platform=linux/x86_64 python:3.10 + +# Set working directory +WORKDIR /app + +# Copy requirements file and install dependencies +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the rest of the application code +COPY . . + +# Command to run the Pathway script +CMD [ "python", "./your-script.py" ] +``` + +⚠️ Pathway is not available on Windows and requires a Python 3.10+ installation. +For compatibility reasons, you should use x86_64 Linux container and a Python 3.10+ image. + +Otherwise, it's the same as with the Pathway image, except that you don't need to install Pathway. +You can build and run your image with the same commands as before: + +```bash +docker build -t my-pathway-app . +docker run -it --rm --name my-pathway-app my-pathway-app +``` + +## Example: Log Monitoring with Docker + +You can see how to deploy a Pathway project with Docker by looking at our [Realtime Server Log Monitoring](/developers/showcases/realtime-log-monitoring). + +## Deploying on the cloud +If you want to scale your Pathway application, you may be interested in deploying Pathway in the cloud. +You can learn more about this on the [dedicated page](/developers/user-guide/deployment/cloud-deployment). \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/080.deployment/20.cloud-deployment.md b/docs/2.developers/4.user-guide/080.deployment/20.cloud-deployment.md new file mode 100644 index 00000000..1f63ed9d --- /dev/null +++ b/docs/2.developers/4.user-guide/080.deployment/20.cloud-deployment.md @@ -0,0 +1,23 @@ +--- +title: Cloud Deployment of Pathway +description: 'A guide about how to deploy Pathway using the cloud' +--- + +# Cloud Deployment of Pathway + +Local and docker deployment may be not enough. +If you want to scale your Pathway application, you may be interested in our Pathway for Enterprise. +Pathway for Enterprise is specially tailored towards end-to-end data processing and real time intelligent analytics. +It scales using distributed computing on the cloud and supports Kubernetes deployment. + +## Deploying on the cloud +Most cloud platforms offer robust support for Docker containers and/or Python deployment, allowing you to deploy your Pathway project on these cloud environments without encountering compatibility issues. +You can easily benefit from the scalability and flexibility of cloud deployment. + +### Data streaming for real-time enterprise AI apps with Azure +You can see how to build a real-time data pipeline with Azure Event Hubs, Pathway, and Azure OpenAI in our associated [GitHub repository](https://github.com/pathway-labs/azure-openai-real-time-data-app). + +## Distributed deployment +A multi-server (distributed) deployment can use Kubernetes and its cloud implementations. Pathway assumes a stateful set deployment with all pods present for a successful operation. Pathway enterprise offering covers distributed multi-server deployment for production use. Support with integration into existing helm charts and k8s tooling is provided. + +You can learn more about Pathway Enterprise [here](/features/). 
\ No newline at end of file diff --git a/docs/2.developers/4.user-guide/080.deployment/30.integration.md b/docs/2.developers/4.user-guide/080.deployment/30.integration.md new file mode 100644 index 00000000..3261ffa9 --- /dev/null +++ b/docs/2.developers/4.user-guide/080.deployment/30.integration.md @@ -0,0 +1,18 @@ +--- +title: CI/CD +description: 'A guide about how to do CI/CD with Pathway' +--- + +# Easy testing and CI/CD + +Pathway is fully Python-compatible: you can use any of your favorite Python integration tools. + +Whether starting a new project or adding Pathway to an existing one, it allows you to choose the best approach that suits your needs. + +Pathway can seamlessly be integrated into your CI/CD chain as it is inherently compatible with popular tools such as [mypy](https://mypy-lang.org/) or [pytest](https://pytest.org/). +Your Pathway pipelines can be automatically tested, built, and deployed, like any other Python workflow. + +You can run Pathway tests on offline data snippets locally in any CI/CD pipeline with Python. +Tests can cover the handling of temporal (late, out of order) aspects of data by comparing results on multiple revisions. +Pathway supports several session-replay mechanisms, such as the demo API. +These allow predictably recreating streaming scenarios within standard CI/CD pipelines (Jenkins, GitHub Actions, etc.) \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/080.deployment/70.prometheus-monitoring.md b/docs/2.developers/4.user-guide/080.deployment/70.prometheus-monitoring.md new file mode 100644 index 00000000..425fa74b --- /dev/null +++ b/docs/2.developers/4.user-guide/080.deployment/70.prometheus-monitoring.md @@ -0,0 +1,68 @@ +--- +title: 'Monitoring Pathway with Prometheus' +description: 'Tutorial on how to set up monitoring of Pathway instance' +date: '2023-05-12' +thumbnail: '/assets/content/blog/th-prometheus.png' +tags: ['tutorial', 'engineering'] +keywords: ['monitoring', 'Prometheus', 'metrics'] +--- + +# Monitoring Pathway with Prometheus + +In this tutorial, you will learn how to monitor a Pathway application with Prometheus. +This will allow you to reliably oversee the execution of your streaming pipeline. + +Proper monitoring of Pathway application is inevitable for business-critical production deployment. +One has to ensure that all the current and previous bottlenecks, like drops in latency and throughput or any anomalous behaviors +are easily identifiable, so that all the issues can be immediately resolved. +This can be achieved by integrating Pathway with Prometheus. + +The setup is straightforward. You can set up your own monitoring process by following these steps: + +1. Download and unpack Prometheus from its [official website](https://prometheus.io/download/) +2. Configure it to scrape the Pathway endpoint by editing the `prometheus.yml` inside the unpacked directory +```yaml +scrape_configs: + - job_name: pathway + scrape_interval: 5s + static_configs: + - targets: ['localhost:20000'] +``` + + +> `20000` is the default Pathway monitoring http port. You can configure this setting via the `PATHWAY_MONITORING_HTTP_PORT` environment variable. + +> You can monitor all the processes (when running in multi-processing mode) by adding numbers to the port (e.g. 20002 corresponds to the 3rd process) + +3. Run prometheus `./prometheus --config.file=prometheus.yml` +4. Run any Pathway program with http monitoring enabled i.e. with `pw.run(with_http_server=True)`, e.g. 
+ +```python +import time + +import pathway as pw + + +class InfiniteStream(pw.io.python.ConnectorSubject): + def run(self): + while True: + self.next_json({"value": 1}) + self.commit() + time.sleep(0.100) + +class InputSchema(pw.Schema): + value: int + +if __name__ == "__main__": + t = pw.io.python.read(InfiniteStream(), schema=InputSchema, format="json") + t = t.reduce(sum=pw.reducers.sum(t.value)) + pw.io.null.write(t) + pw.run(with_http_server=True) + +``` +5. Your metrics should now be available at http://localhost:20000/metrics + +Finally navigate to the Prometheus interface and query for some metric, e.g. http://localhost:9090/graph?g0.expr=output_latency_ms&g0.tab=0&g0.range_input=5m (9090 is the default Prometheus port) to verify that everything works correctly. + + +Graph image \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/.consistency/__init__.py b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/.consistency/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/.consistency/article.py b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/.consistency/article.py new file mode 100644 index 00000000..59f40b0c --- /dev/null +++ b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/.consistency/article.py @@ -0,0 +1,181 @@ +# --- +# title: 'Data model consistency' +# description: 'Computations in Pathway are expressed as if static data were loaded into the system. Pathway delivers consistent results by explicitly reasoning about time: every processed input message bears a timestamp, and each output message specifies exactly for which input times it was computed.' +# notebook_export_path: notebooks/tutorials/consistency.ipynb +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Consistency of the Pathway data model +# +# Computations in Pathway are expressed as if static data were loaded into the system. When streaming changes, Pathway produces inputs consistent with the state of all inputs at a given point in time. +# +# Pathway delivers consistent results by explicitly reasoning about time: every processed input message bears a timestamp, and each output message specifies exactly for which input times it was computed. In other words, each output produced by Pathway is the final answer that would have been given if all sources were read up to the indicated cutoff times, and the computation was carried in entirety. No intermediate results are shown. Updates to the outputs will be sent only when new data is input into the system. +# +# This consistency behavior requires specifying an update schedule for each input. For instance, an interactive system may react to user input every 500 milliseconds and update the data to be displayed every 10 seconds. Then, fast interactive manipulations are possible and the data shown lags by at most 10 seconds. +# %% [markdown] +# ## Running example + +# Consider a Kafka topic in which each message contains simultaneous events in the banking system. Think of a cash transfer - two things occur at once: money leaves account A (a debit) and lands in account B (a credit). 
+# Clearly, a transaction touches two accounts and consists of two events that must be processed atomically (either both or none at all), or some outputs, e.g., account balances, will be inconsistent. +# Kafka is atomic with respect to individual messages. Thus, this event-pack design ensures atomic delivery of all events that form a transaction. +# However, processing event-packs is complex, and logic is much easier if the event-pack stream is unpacked into a single message stream (e.g., we can then group events by account and compute the balances). + +# Since Kafka only guarantees the atomic processing of single messages, consistency is lost once the event-packs are flattened into a stream of individual events. +# On the other hand, Pathway guarantees consistency. +# If a Pathway computation unpacks the event-pack into individual events, all messages that form a transaction will be consistently processed, ensuring that every produced output depends on either all or no events that were grouped together into a single event-pack. +# +# Let’s simulate such a scenario by implementing a simple connector. + + +# %% +import pathway as pw +import time +import random + +random.seed(42) + + +class InputStream(pw.io.python.ConnectorSubject): + def run(self): + for _ in range(0, 10): + a, b = random.sample(range(1, 10), 2) + amount = random.randint(1, 100) + self.next_json( + { + "events": [ + {"account_id": a, "amount": -amount, "event_type": "debit"}, + {"account_id": b, "amount": amount, "event_type": "credit"}, + ] + } + ) + time.sleep(random.choice((0, 1))) + + +# %% [markdown] +# +# Each message within this stream encapsulates a series of events occurring within the banking system during transaction executions. +# +# Now, let's bring this data into a table using the code below: + + +# %% +class Schema(pw.Schema): + events: pw.Json + + +input = pw.io.python.read(InputStream(), schema=Schema) +# %% [markdown] +# +# So far so good, each transaction has been loaded simultaneously into a single row of the stream. +# To make things more complicated, let's calculate the totals of all inflows and outflows for each account individually. To do so, you can first flatten all the events, divide them by type, and then group them by account number. + +# %% +# pw.unwrap changes the type of the column from Optional[T] to T +events = input.flatten(pw.this.events).select( + event_type=pw.unwrap(pw.this.events["event_type"].as_str()), + account_id=pw.unwrap(pw.this.events["account_id"].as_int()), + amount=pw.unwrap(pw.this.events["amount"].as_int()), +) + +credits = ( + events.filter(pw.this.event_type == "credit") + .groupby(pw.this.account_id) + .reduce(pw.this.account_id, balance=pw.reducers.sum(pw.this.amount)) +) + +debits = ( + events.filter(pw.this.event_type == "debit") + .groupby(pw.this.account_id) + .reduce(pw.this.account_id, balance=pw.reducers.sum(pw.this.amount)) +) +# %% [markdown] +# +# Events from the same transactions have been divided into independent streams. However, this doesn't imply a loss of consistency. Pathway's secret sauce lies in keeping everything consistent across dataflow nodes. +# To illustrate this fact, let's calculate the balance for each account. 
+ +# %% +account_balance = credits.join_outer( + debits, pw.left.account_id == pw.right.account_id +).select( + pw.this.account_id, + balance=pw.coalesce(pw.left.balance, 0) + pw.coalesce(pw.right.balance, 0), +) + +# %% [markdown] +# Given that money was transferred from one account to another without any external funds entering the system, the total balance across all accounts should sum to 0. We could, of course, use a `reduce` expression, but for a clearer understanding of the system's functioning, let's utilize Python output instead. +# +# %% +import logging + +logging.basicConfig(level=logging.INFO, force=True) + + +class TotalBalance: + account_balance: dict[int, int] = {} + + def on_change(self, key, row, time, is_addition): + account_id = row["account_id"] + balance = row["balance"] + + # When a row changes, two messages are created: + # one indicating the removal of an old value + # and another indicating the insertion of a new value. + if is_addition: + logging.info(f"Account {account_id} balance: {balance}") + self.account_balance[account_id] = balance + else: + logging.info(f"Removing account {account_id} balance: {balance}") + assert self.account_balance[account_id] == balance + del self.account_balance[account_id] + + def on_time_end(self, time): + logging.info(f"Total: {sum(self.account_balance.values())}") + + def on_end(self): + self.account_balance.clear() + + +output = TotalBalance() + +pw.io.subscribe( + account_balance, + on_change=output.on_change, + on_time_end=output.on_time_end, + on_end=output.on_end, +) + + +# %% [markdown] +# `TotalBalance` stores the current balance for each account. Method `on_change` updates and prints the status of a specific account whenever a change occurs in the table. Method `on_time_end` is invoked when a processing time ends. + +# %% [markdown] +# All that's left is to run the pipeline: + +# %% +# _MD_COMMENT_START_ +pw.run(monitoring_level=pw.MonitoringLevel.NONE) +# _MD_COMMENT_END_ +# _MD_SHOW_pw.run() + +# %% [markdown] +# You can observe that Pathway processed all the events intended to be simultaneous at the same time, and the sum of operations always equals 0. +# +# # Persistency guarantees +# +# Pathway persists intermediate results recording the state of inputs with each saved datum. When restarted from a checkpoint, the saved state is loaded into memory first. Then all inputs are replayed starting at times recorded in the checkpoint. To avoid data loss, all streaming inputs should be buffered into a persistent message queue which allows multiple reads to recent items, such as a topic in Apache Kafka. +# +# Pathway gives "at least once" output data delivery guarantee for the data output in the different runs. More precisely, if some of the lines were outputted to a data sink in a non-closed Pathway's data batch, these output lines may appear on the output after the program has been re-run. +# +# The enterprise version of Pathway supports "exactly once" message delivery on selected combinations of input and output connectors which enable the use of the 2-phase commit protocol. 
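# %% [markdown]
# As a hedged, minimal sketch (not part of the original example), the checkpointing described above can be enabled by passing a persistence config to `pw.run`. The `./checkpoints/` directory below is an arbitrary example location; a filesystem or S3 backend would both work.

# %%
# Minimal persistence setup sketch: store checkpoints in a local directory.
# On restart, Pathway would load this state and replay the inputs from the
# positions recorded in the last checkpoint.
persistence_backend = pw.persistence.Backend.filesystem("./checkpoints/")
persistence_config = pw.persistence.Config.simple_config(persistence_backend)
# Passing it to the run call above would look like:
# pw.run(persistence_config=persistence_config)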
diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/10.worker-architecture.md b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/10.worker-architecture.md new file mode 100644 index 00000000..6437544e --- /dev/null +++ b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/10.worker-architecture.md @@ -0,0 +1,67 @@ +--- +title: Architecture +description: 'This page explains the architecture of Pathway' +--- + +# Pathway Architecture + + + + + + +Pathway uses Python to conveniently describe the computations to be performed. +The computations are then optimized and converted into basic operations executed by the Rust dataflow engine. + +## Worker architecture + +By design, each Pathway worker runs the same dataflow on different subsets (shards, partitions) of the data. +This means that every worker (thread or process) runs precisely the same Python script that builds the dataflow and builds the same low-level dataflow to be executed. +Workers know their identity, a unique sequential number (ID of the worker), and use it to determine which data shards are their responsibility. +Workers selectively read only the data from their partition for supporting data sources, such as a partitioned Kafka topic. +For non-partitioned data, a single worker is responsible for reading it and forwarding parts of it to other workers. + +Workers communicate with each other as needed (using appropriate methods: shared memory for threads, sockets for processes, and workers on different machines). + +Workers send each other data to be processed and exchange progress information. Every node in the dataflow tracks its progress and efficiently (using the topology of the dataflow) notifies its peers when a portion of input data is processed. +This is important for consistency: every result produced by Pathway depends on a known prefix of the input data stream. + +The basic dataflow design concepts for Pathway follow those laid out in the foundational work of Microsoft Naiad ([SOSP 2013 paper](https://sigops.org/s/conferences/sosp/2013/papers/p439-murray.pdf)). The communication primitives, concepts of time within the system, and in-memory state representation are based on those of [Timely + Differential Dataflow](https://github.com/TimelyDataflow/differential-dataflow). + +A broader background on how Pathway relates to other distributed systems and dataflows is provided in the [arXiv preprint introducing Pathway](https://arxiv.org/pdf/2307.13116.pdf). + +## Architecture diagram + +The following diagram sketches a multi-worker Pathway deployment. +Each worker has a copy of the dataflow, a local state storage used by the stateful operators, and the input and output connectors for which it is responsible. +Workers exchange data and computation progress. +Each worker asynchronously saves the state to a permanent location (e.g., S3). +Upon failure, all workers determine the last snapshot they wrote and then rewind their computations to this snapshot. + +![Multi-worker Pathway deployment diagram](assets/content/worker_architecture.png) + +## Distributed deployment + +A multi-server (distributed) deployment can use Kubernetes and its cloud implementations (AKS, EKS). +Pathway assumes a stateful set deployment with all pods present for a successful operation. +Pathway enterprise offering covers distributed multi-server deployment for production use. +Support with integration into existing helm charts and k8s tooling is provided. 
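To give a rough intuition for the sharding idea described above, here is a small, purely conceptual Python sketch; it is an illustration under simplifying assumptions, not Pathway's actual internal code. A worker that knows its ID and the total number of workers can decide which keys belong to its partition by hashing.

```python
# Conceptual illustration only -- not Pathway's internal implementation.
# A worker knows its own ID and the total worker count, and claims the
# keys that hash into its partition.
from zlib import crc32


def owns_key(key: str, worker_id: int, num_workers: int) -> bool:
    """Return True if the worker with this ID is responsible for the key."""
    return crc32(key.encode()) % num_workers == worker_id


# Example: with 4 workers, worker 2 only keeps the keys assigned to it.
keys = ["alice", "bob", "carol", "dave"]
my_keys = [k for k in keys if owns_key(k, worker_id=2, num_workers=4)]
print(my_keys)
```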
\ No newline at end of file
diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/20.persistence.md b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/20.persistence.md
new file mode 100644 index 00000000..6bfd441b
--- /dev/null
+++ b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/20.persistence.md
@@ -0,0 +1,93 @@
# Persistence in Pathway

The Pathway framework helps you build computational pipelines easily and flexibly. During development, you will most likely need to save the state of the computation at some point. This is where persistence comes in.

For example, persistence may be needed for one or several of the following reasons:
* To preserve the state of the computation, so that a restarted program continues working from the place where it stopped;
* To be able to recover from a failure without recomputing the whole data pipeline from scratch.

## Example

Suppose that we're solving a simple wordcount task. The input consists of a directory of CSV files, which needs to be polled constantly. Each file consists of a header and several lines, each containing a word.

The output should be a file containing a sequence of JSON lines, each with the fields "word" and "count", denoting the word and the number of its occurrences.

Given that, the problem can be solved with the following small Pathway program:

```python
import pathway as pw


class InputSchema(pw.Schema):
    word: str


words = pw.io.csv.read("inputs/", schema=InputSchema)
word_counts = words.groupby(words.word).reduce(words.word, count=pw.reducers.count())
pw.io.jsonlines.write(word_counts, "result.jsonlines")
pw.run()
```

This program is not persistent. That means that, when restarted, it will start the computation from the beginning, scanning the files one by one, processing the word counts, and outputting them.

However, it can easily be made persistent.

The only essential building block that Pathway needs is the persistent storage configuration. Suppose that we store the intermediate state in a local directory called `state`. Then, we obtain the following persistence config:

```python
persistence_backend = pw.persistence.Backend.filesystem("./state/")
persistence_config = pw.persistence.Config.simple_config(persistence_backend)
```

Now, the persistence config needs to be passed to the `pw.run` method. It can be done as follows:

```python
pw.run(persistence_config=persistence_config)
```

With these modifications, the program becomes persistent. That means that, when restarted, it will continue the computation and the output from the place where it stopped last time. Now let's understand the main concepts of the persistence setup in more detail.

Please note that there is also a [tutorial](/developers/user-guide/persistence-consistency-scaling/persistence_recovery/) with a step-by-step explanation available.

## Main Concepts

### Persistent Storage

First, Pathway needs storage which can persist the internal state, along with checkpoints and meta-information. The persistence mechanism combines internal state serialization with a write-ahead logging mechanism. It can store the data either locally or in S3.

The storage is configured with a [`pw.persistence.Config`](/developers/api-docs/persistence-api/#pathway.persistence.Config) class instance, which should be passed as an argument to the `pw.run` command that executes the data pipeline.
The persistence config accumulates several settings:
* metadata storage, denoting the storage to be used for the metadata: the times advanced, the current positions in the data sources, and a description of the computational graph. The size of the metadata doesn't depend on the volume of data being processed;
* snapshot storage, denoting the storage to be used for the data snapshot. The data snapshot is a bigger structure, with a size that depends on the volume of data;
* snapshot interval, denoting the desired freshness of the maintained snapshot. This interval is a tradeoff between taking too many resources away from the computation and having an outdated snapshot: it defines how close new updates may be to the last dumped state for the engine to be allowed to postpone storing them.

Both the metadata and snapshot storages are configurable with the class [`pw.persistence.Backend`](/developers/api-docs/persistence-api/#pathway.persistence.Backend), which has two methods for S3 and filesystem storage configuration.

The config itself is created via its constructor, which accepts the keyword arguments `metadata_storage` and `snapshot_storage`, both of the type `pw.persistence.Backend`, and optionally the snapshot interval given in milliseconds.

Please note that if the same backend is used for both, there is a convenience method `simple_config` that creates a persistent storage config from just a single backend.

This, however, is not the only thing needed for persistence to work, which brings us to persistent IDs.

### Persistent IDs

To persist certain input sources, Pathway requires them to have the `persistent_id` parameter. This identifier denotes the actual data source and is expected not to change across reruns of the program.

The motivation behind the persistent ID is that the data source can change as much as the computation requires, as long as it contains data with the same schema. For example, it is possible that:
* the data format in the source changes. For instance, JSON was used, but the newer entries are in CSV;
* the path to the data source changes. For instance, the logs parsed by Pathway are now stored on a different volume;
* some fields are renamed in the data source. For example, the field "date" was renamed to "datetime" to have a more accurate name.

All in all, the variety of possible changes can be huge. Still, by using the same `persistent_id`, the engine knows that the data still corresponds to a certain table.

These IDs can be assigned in two different ways. The first one is automatic generation. It assigns the IDs to the sources based on the order in which they are added. For example, if the program first reads a dataset from CSV and then reads a stream of events from Kafka, two persistent IDs are generated automatically: the first points to the dataset and the second points to the stream of events. Please note that this approach is only fine if the code is not going to be updated. Otherwise, the sources may change, which results in different persistent IDs that don't correspond to the old ones.

The second way can be a bit more involved, but it allows more flexibility: the persistent IDs can be assigned manually in the input connector. To do so, one needs to specify the string parameter `persistent_id`. For example, the word count program above relied on automatic `persistent_id` generation.
However, it would also be possible to specify it explicitly this way: +```python +words = pw.io.csv.read("inputs/", schema=InputSchema, persistent_id="words_source") +``` + +## Requirements and assumptions + +The framework currently maintains a snapshot of the internal state, along with the metadata required for recovery. When a program starts, Pathway first looks for persisted checkpoints. It then loads the snapshot data and the information about the next unread offsets in the data sources. Due to this, there is a requirement for the data sources to be persistent, so that if the program is terminated for whatever reason, the next entries can be re-read on restart. Please note that most of the data sources comply with this requirement: that is, S3, Kafka topic, or filesystem entries can easily be re-read if the program restarts. + +The guarantee, given by Pathway's persistent recovery, is at-least-once delivery. As stated in the section "Transactional changes" of "Update streams" [article](/developers/documentation/input-and-output-streams/stream-of-updates/#transactional-changes), internally, the inputs are split into smaller transactional batches. If the program is interrupted during execution, the outputs corresponding to the unclosed transactional batches may duplicate. However, it shouldn't be the case for graceful program termination, where exactly-once semantic can be guaranteed. diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/21.persistence_recovery.md b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/21.persistence_recovery.md new file mode 100644 index 00000000..dec5ca1e --- /dev/null +++ b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/21.persistence_recovery.md @@ -0,0 +1,370 @@ +--- +title: 'Persistence in Pathway: how to resume the interrupted computation' +description: 'Tutorial on persistence in Pathway: how to set it up and how to use it to recover the interrupted computation' +# notebook_export_path: notebooks/tutorials/persistence_recovery.ipynb +date: '2023-11-02' +thumbnail: '' +tags: ['tutorial', 'engineering', 'dataflow management'] +keywords: ['persistence', 'recovery'] +--- + +# Persistence in Pathway: resume the interrupted computation + +In this article, you will learn how to resume the interrupted Pathway computation. We will start with a fairly simple Pathway program, which doesn't persist in its state. + +It will be run as a separate process in the streaming mode, that is, waiting for the inputs all the time. + +In another process, we will run the streamer: a simple script that puts the data into the input directory every so often. So, we will expect that the Pathway program fetches the data from the streamer and uses it in the computation. + +We will then terminate the computation and show that when being re-run, the Pathway program without persistence indeed starts and provides the results from the very beginning. + +Finally, we demonstrate how to easily make a persistent program from the one we've had before and will see that it catches up where we stopped last time. + +## Sample task + +First of all, we need to pick a task which we will be solving. In this tutorial, we will do the basic word counting. + +It will work as follows: the Pathway program will scan the directory in the filesystem, containing the CSV files. The expected format of any CSV file is just one column named `word` and containing exactly one word. 
The output, for example, can be a JSON Lines file containing the stream of changes for a collection of two columns: the word and the count of this word.

## Preparing the environment

First things first, we will need some methods to help us test both variants: the simple one and the persistent one. The first of these "helpers" is the cleanup routine. We will store inputs in the directory `inputs`, and when doing our experiments, we need to ensure that we start with an empty one.

So, we obtain the following method:


```python
import multiprocessing
import os
import shutil
import subprocess
import time


def clean_input_directory():
    if os.path.exists("inputs/"):
        shutil.rmtree("inputs/")
    os.mkdir("inputs/")


clean_input_directory()
```

Next, we will need the streamer method we were talking about before. The goal of the streamer is to emit a new file into the inputs directory every so often. For our purposes, it is convenient to use input files containing only one word.

Proceeding with the interface and the implementation, let's also note a few things that are important for us:
* We provide a way to specify the emit interval of the new files: it is the variable `interval_sec` in the method signature;
* We provide a way to specify the exact number of input blocks, denoted by the variable `how_many` in the signature;
* Finally, we want a stable and reproducible way to generate the words themselves. So, we emit them in a round-robin manner from the list called `words`.

Eventually, we obtain the following method. Please note that `time.time()` has microsecond precision on most platforms and thus gives us convenient and unique file names:


```python
def generate_inputs(interval_sec, how_many, words):
    for file_id in range(how_many):
        time.sleep(interval_sec)
        with open(f"inputs/{time.time()}", "w") as f:
            f.write(f"word\n{words[file_id % len(words)]}")
```

## Solving the problem in Pathway

Now let's create a simple Pathway program that does the word count for us. For now, we don't try to be persistent.

First of all, we need to define the schema for the entries that will be read from the input. This schema is given by the class `InputSchema` in the next section.

Then, let's proceed with the description of the computational logic. It consists of three basic steps:

1. Defining the input connector: since we read from a directory of CSV files, we can use the `pw.io.csv.read` method. As a source, let's point to the directory `inputs/` (as you can see above, the files are streamed there). We also pass the schema and an auto-commit duration of 10 milliseconds just to ensure that we send the updates to the engine often;
2. Defining the computation: in fact, word counting is a group-by operation on the stream of words. So, we add a group-by operation with a reducer that takes the count of each word and attaches this count to it;
3. Defining the output connector: the result obtained in the second step should now be sent to the output. Since we write it to the filesystem in the JSON Lines format, we can use the `pw.io.jsonlines.write` method.

For convenience, we don't use the top-like view for monitoring the execution, so we disable it by passing a keyword argument to the `pw.run` method.

Last but not least, we simulate the environment where the Pathway program is run as a separate process, as it usually is.
To do so we save the code into a separate file named `wordcount.py` and then use `subprocess.Popen` class to run it. + + +```python +%%writefile wordcount.py + +import pathway as pw + + +class InputSchema(pw.Schema): + word: str + + +if __name__ == "__main__": + words = pw.io.csv.read( + "inputs/", + schema=InputSchema, + autocommit_duration_ms=10 + ) + word_counts = words.groupby(words.word).reduce(words.word, count=pw.reducers.count()) + pw.io.jsonlines.write(word_counts, "result.jsonlines") + pw.run(monitoring_level=pw.MonitoringLevel.NONE) +``` + + Overwriting wordcount.py + + +The code that runs the computation, in this case, would look as follows: + + +```python +def run_pathway_wordcount_program(): + pd = subprocess.Popen(["python", "wordcount.py"]) + return pd +``` + +Now let's test the program and the streamer together. To do so, we first start the generation of 200 input files with an interval of 50 milliseconds between their writes. These files will alternatively contain strings "hello" and "world". + +Having that given, in 10 seconds we will have 200 input files, 100 of which will contain the word "hello" while the other 100 will contain the word "world". + +Right after launching the streaming, we launched our Pathway program. We wait for 5 seconds, which is not enough for the streamer to produce all input blocks and then we terminate it with the kill method of `subprocess.Popen`. And then we wait for another five seconds for the streamer to complete. + + +```python +# Start streaming inputs +pd_gen = multiprocessing.Process( + target=generate_inputs, + args=(0.05, 200, ["hello", "world"]) +) +pd_gen.start() + +# Run Pathway program +pd_comp = run_pathway_wordcount_program() +time.sleep(5) +pd_comp.kill() +pd_gen.join() +``` + +Let's see the results! Reading the whole file can be tedious, and we only need to see its tail. + + +```python +!tail -5 result.jsonlines +``` + + {"word":"hello","count":49,"diff":1,"time":1699279664772} + {"word":"world","count":48,"diff":-1,"time":1699279664822} + {"word":"world","count":49,"diff":1,"time":1699279664822} + {"word":"hello","count":49,"diff":-1,"time":1699279664874} + {"word":"hello","count":50,"diff":1,"time":1699279664874} + + +As we can see, the results are incomplete. The counts are 50 per word, while the produced amount was 100 for each of these words. No doubt, the reason was that the program was killed during its' normal execution. + +But let's run it again. Now we can see that all files are present in the directory. Our goal is to see that now it produces the expected result. + + +```python +pd_comp = run_pathway_wordcount_program() +time.sleep(5) +pd_comp.kill() + +!tail -5 result.jsonlines +``` + + {"word":"hello","count":98,"diff":1,"time":1699279678510} + {"word":"world","count":97,"diff":-1,"time":1699279678512} + {"word":"world","count":100,"diff":1,"time":1699279678512} + {"word":"hello","count":98,"diff":-1,"time":1699279678512} + {"word":"hello","count":100,"diff":1,"time":1699279678512} + + +But now we would also like to see where it started. So we can check the first lines of the output: + + +```python +!head -5 result.jsonlines +``` + + {"word":"world","count":3,"diff":1,"time":1699279678436} + {"word":"hello","count":4,"diff":1,"time":1699279678436} + {"word":"world","count":3,"diff":-1,"time":1699279678438} + {"word":"world","count":4,"diff":1,"time":1699279678438} + {"word":"hello","count":4,"diff":-1,"time":1699279678438} + + +As we can see, the program started the computations from the beginning. 
Let's see if we can avoid this.

# Introducing Persistence

Persistence is the way for the program to remember where it stopped the computation, the reads, and the output during the last execution.

The main idea is that Pathway periodically dumps its state to the given data storage backend. When restarted, Pathway first looks for a previously made snapshot of the data. If it finds this snapshot, it is loaded into the engine, which then doesn't have to redo the reads and the processing for the data preserved there.

Persistence is easy to get started with. Now we will walk you through its setup, after which we will show how the program resumes the computation.

## Persistence Backend

The persistence mechanism saves a snapshot of the computation. This snapshot consists of some raw data, whose size can be proportional to the input size, and some metadata containing smaller bits of information. These entities must be stored in durable storage.

Pathway currently gives two choices for this storage:
* the filesystem, namely a folder on your local drive;
* an S3 bucket where you can allocate the root directory.

In this demo, we will use locally hosted persistent storage, which is the first of the options above.

Since you may want to rerun this demo several times, it's a good idea to have a helper that cleans this locally hosted storage. We will suppose that it is hosted in the directory named `PStorage`:


```python
def clean_persistent_storage():
    if os.path.exists("./PStorage"):
        shutil.rmtree("./PStorage")


clean_persistent_storage()
```

Now we can proceed with the configuration of the persistence. It consists of two lines.

In the first one, we create the config for the persistent backend. Since the config is filesystem-based, we use the `pw.persistence.Backend.filesystem` method to create it. It only requires the path where the storage can be hosted.

The second line is where we create the persistence config itself. In our case, we host the snapshot and the metadata in the same place, so we simply use the `pw.persistence.Config.simple_config` method.

All in all, we have the following configuration code:

```python
backend = pw.persistence.Backend.filesystem("./PStorage")
persistence_config = pw.persistence.Config.simple_config(backend)
```

## Persistent IDs

The second (and optional) thing we need to do is the persistent ID assignment. The persistent IDs are required for the engine to match the data sources between different runs.

In principle, the persistent ID assignment can be done automatically by the engine. In this case, it will assign the persistent IDs to the sources in the order of their appearance and construction. However, this is not generally recommended if you need to change your Pathway program and the data source in the future.

For the sake of completeness, in this tutorial we will demonstrate the manual persistent ID assignment. The only difference from the non-persistent variant of the input is the parameter `persistent_id`, which should be passed to the `pw.io.csv.read` method. So, if we name the data source `words_data_source`, the assignment may look as follows:

```python
pw.io.csv.read(
    ...,
    persistent_id="words_data_source"
)
```

## Revisited Pathway Program

Now we can apply the described changes to the program that we had at the beginning.
We still need a way to run it in a separate process that can be interrupted, so the interface stays the same. At the same time, we have the following persistent code saved as the wordcount program: + + +```python +%%writefile wordcount.py + +import pathway as pw + + +class InputSchema(pw.Schema): + word: str + + +if __name__ == "__main__": + words = pw.io.csv.read( + "inputs/", + schema=InputSchema, + autocommit_duration_ms=10, + persistent_id="words_input_source", # Changed: now persistent_id is assigned here + ) + word_counts = words.groupby(words.word).reduce(words.word, count=pw.reducers.count()) + pw.io.jsonlines.write(word_counts, "result.jsonlines") + + backend = pw.persistence.Backend.filesystem("./PStorage") + persistence_config = pw.persistence.Config.simple_config(backend) + pw.run( + monitoring_level=pw.MonitoringLevel.NONE, + persistence_config=persistence_config, # Changed: now persistence_config is passed here + ) +``` + + Overwriting wordcount.py + + +Let's test it! Like the last time, let's generate the 200 files with 50 millisecond intervals, each containing either the word "hello" or the word "world" in an alternating manner. Similarly, we will terminate the Pathway program before it manages to read all input files because not all of them will be in place: + + +```python +# Clean the old files: remove old results and inputs +!rm -rf result.jsonlines +clean_input_directory() + +# Start streaming inputs +pd_gen = multiprocessing.Process( + target=generate_inputs, + args=(0.05, 200, ["hello", "world"]) +) +pd_gen.start() + +# Run Pathway program +pd_comp = run_pathway_wordcount_program() +time.sleep(5) +pd_comp.kill() +``` + +As a check, we can observe what we have on the output. The streamer wouldn't be able to produce much more than half of the input files by the time the program is interrupted: + + +```python +!tail -5 result.jsonlines +``` + + {"word":"hello","count":49,"diff":1,"time":1699279708352} + {"word":"world","count":48,"diff":-1,"time":1699279708402} + {"word":"world","count":49,"diff":1,"time":1699279708402} + {"word":"hello","count":49,"diff":-1,"time":1699279708452} + {"word":"hello","count":50,"diff":1,"time":1699279708452} + + +But now let's run the program again! Our goal is to see, with the fill input, where it will start to produce the results. We are running it for 5 seconds because, in the streaming mode, it won't finish by itself. + + + +```python +pd_comp = run_pathway_wordcount_program() +time.sleep(5) +pd_comp.kill() +``` + +Let's check the results. This time we need a few first and a few last lines of the output. + +The last ones are needed to check if the result is correct: the last count for each word should be equal to 100. + +The first lines are needed to see where the program started the computations from. + + +```python +!head -5 result.jsonlines +!echo "===" +!tail -5 result.jsonlines +``` + + {"word":"world","count":49,"diff":-1,"time":1699279716584} + {"word":"world","count":51,"diff":1,"time":1699279716584} + {"word":"hello","count":50,"diff":-1,"time":1699279716584} + {"word":"hello","count":51,"diff":1,"time":1699279716584} + {"word":"world","count":51,"diff":-1,"time":1699279716586} + === + {"word":"hello","count":99,"diff":1,"time":1699279716634} + {"word":"world","count":98,"diff":-1,"time":1699279716636} + {"word":"world","count":100,"diff":1,"time":1699279716636} + {"word":"hello","count":99,"diff":-1,"time":1699279716636} + {"word":"hello","count":100,"diff":1,"time":1699279716636} + + +As we can see, the results are correct! 
+ +Moreover, now the results are produced from the counts around 50 (which may vary from run to run), which means that we did not have to recompute and output the previously computed and outputted data. + +Please note that the first lines of the output above may intersect with the last lines of the previous run. Here we can observe the "at-least-once" semantics, with duplicated deliveries happening for the transaction minibatch which wasn't committed by the time the initial computation was interrupted. + +# Conclusion + +Here we demonstrated persistence in action on a very simple example. + +While being a powerful mechanism, persistence is also capable of solving different tasks. For instance, in the later tutorial, we will show how it can handle the data source change under certain conditions. + diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/22.persistence_restart_with_new_data.md b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/22.persistence_restart_with_new_data.md new file mode 100644 index 00000000..bcb87886 --- /dev/null +++ b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/22.persistence_restart_with_new_data.md @@ -0,0 +1,382 @@ +--- +title: 'Persistence in Pathway: how to add new data without full computation restart' +description: 'Tutorial on persistence in Pathway: how to run the finished computation with new inputs added' +# notebook_export_path: notebooks/tutorials/persistence_restart_with_new_data.ipynb +date: '2023-11-14' +thumbnail: '' +tags: ['tutorial', 'engineering', 'dataflow management'] +keywords: ['persistence', 'optimization', 'restart'] +--- + +# Restarting Pathway computation with the new data added + +Persistence allows you to pick up computation where it left off, and the presence of a snapshot makes handling data additions between reruns smoother, avoiding the need to restart the entire computation. + +This capability is valuable in various scenarios. For instance, real-time logs stored on pods may only become available in hourly increments. In other cases, logs are collected periodically through a cron-like job, such as every ten minutes, every hour, or every day. Typically, analytics need to be rebuilt each time new data appears. + +This tutorial will explore how to incorporate such additions using a simple example of log statistics calculation. You will start by computing user sessions on historical data and demonstrate how to integrate new information from real-time logs. + +![figure.png](/assets/content/tutorials/persistence-restart-with-new-data.png) + +## Scenario + +Suppose that you have the website access log. The log is rather simple and consists of the access time in the format of [UNIX timestamp](https://en.wikipedia.org/wiki/Unix_time) and the user's login. + +You would like to analyze the users' behavior on the website. One approach is to create sessions, defined as consecutive, non-extendable time intervals during which the user interacts with the website. Let's assume users spend a maximum of five minutes on a single page before moving to the next page or leaving. Hence, two visits are considered part of the same session if there's no more than a five-minute (or 300-second) gap between them. + +The objective is to manage a set of sessions, ensuring each log entry belongs to precisely one session and no two sessions can be merged. + +To accommodate adding new data and rerunning the data pipeline, you need to establish a stream of updates on the output. 
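Before moving on, here is a small plain-Python sketch that makes the session rule above concrete. It is only an illustration of the 300-second gap definition; the actual pipeline later in this tutorial uses `pw.temporal.session` instead.

```python
# Illustration of the session rule: consecutive accesses belong to the same
# session when they are at most 300 seconds apart.
MAX_GAP = 300


def split_into_sessions(access_times: list[int]) -> list[list[int]]:
    sessions: list[list[int]] = []
    for t in sorted(access_times):
        if sessions and t - sessions[-1][-1] <= MAX_GAP:
            sessions[-1].append(t)  # extend the current session
        else:
            sessions.append([t])  # start a new session
    return sessions


# Three accesses 200 seconds apart form one session; an access 1000 seconds
# later starts a new one.
print(split_into_sessions([0, 200, 400, 1400]))  # [[0, 200, 400], [1400]]
```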
+ +## What to do with updated data + +Imagine you have efficiently processed all the logs in the directory using the static mode, and the program has completed successfully. However, since logs are received regularly, you face the challenge of repeating this computation on new log portions that may be generated later or arrive with a delay. + +Starting the computation from scratch and receiving the entire stream of updates from the beginning would be costly. The following discussion will teach you how to handle the computation on new, as-yet-unprocessed log files efficiently. + +## Preparing inputs + +First, you need to take care of the input files the engine will parse. You can start by defining the schema. In Pathway format, its definition looks as follows: + + +```python +import pathway as pw + + +class InputSchema(pw.Schema): + user_id: str + access_time: int +``` + + +That is, the access time is an integer, denoting the number of seconds passed since the beginning of [UNIX epoch](https://en.wikipedia.org/wiki/Epoch_(computing)). The login, naturally, is a string. + +Now, you can start creating the inputs that correspond to this schema. Let's place them in a directory called `logs`. You first need to create it and clean off the old files, so you need to run a couple of Unix commands: + + +```python +!rm -rf logs +!mkdir logs +``` + +You can now place a few input files in the newly created directory. Let's generate two log files, each with access entries of a single day. Of course, in a real-life scenario, the span and the number of different users can be more significant: + + +```python +%%writefile logs/part_one.csv +user_id,access_time +alice,1701102730 +bob,1701102743 +alice,1701103035 +bob,1701102774 +bob,1701102805 +alice,1701103037 +``` + + Writing logs/part_one.csv + + +The second file has the same format: + + +```python +%%writefile logs/part_two.csv +user_id,access_time +alice,1701103040 +alice,1701103331 +alice,1701103618 +``` + + Writing logs/part_two.csv + + +With this setup, you can solve the problem with Pathway and get some initial results. + +The script below first reads the data according to the schema and then uses a [temporal function](/developers/api-docs/temporal/) to build the unexpendable user-session intervals: + + +```python +%%writefile script.py +import pathway as pw + + +class InputSchema(pw.Schema): + user_id: str + access_time: int + + +if __name__ == "__main__": + table = pw.csv.read("logs/", mode="static", schema=InputSchema) + sessions = table.windowby( + pw.this.access_time, + window=pw.temporal.session(max_gap=300), + shard=pw.this.user_id, + ).reduce( + user_id=table.user_id, + session_length=( + pw.reducers.max(table.access_time) - pw.reducers.min(table.access_time) + ), + session_start=pw.reducers.min(table.access_time), + session_finish=pw.reducers.max(table.access_time), + count=pw.reducers.count(), + ) + pw.csv.write(sessions, "sessions.csv") + pw.run(monitoring_level=pw.MonitoringLevel.NONE) +``` + + Overwriting script.py + + +The script can then be run with a simple console command: + + +```python +!python script.py +``` + + sys:1: UserWarning: pathway.csv has been moved to pathway.io.csv + /usr/local/lib/python3.10/dist-packages/pathway/internals/table.py:1189: UserWarning: Key sets of self and other in update_cells are the same. Using with_columns instead of update_cells. 
+ warnings.warn( + [2023-11-29T15:01:32]:INFO:Preparing Pathway computation + [2023-11-29T15:01:32]:INFO:CsvFilesystemReader-0: 0 entries (1 minibatch(es)) have been sent to the engine + [2023-11-29T15:01:32]:INFO:CsvFilesystemReader-0: 9 entries (2 minibatch(es)) have been sent to the engine + [2023-11-29T15:01:32]:WARNING:CsvFilesystemReader-0: Closing the data source + [2023-11-29T15:01:32]:INFO:FileWriter-0: Done writing 0 entries, time 1701270092168. Current batch writes took: 0 ms. All writes so far took: 0 ms. + [2023-11-29T15:01:32]:INFO:FileWriter-0: Done writing 3 entries, closing data sink. Current batch writes took: 0 ms. All writes so far took: 0 ms. + + +And the results can be seen with the usage of `cat`: + + +```python +!cat sessions.csv +``` + + user_id,session_length,session_start,session_finish,count,time,diff + "bob",62,1701102743,1701102805,3,1701270092168,1 + "alice",0,1701102730,1701102730,1,1701270092168,1 + "alice",583,1701103035,1701103618,5,1701270092168,1 + + +As you can see, the program works correctly. + +Specifically, there are two sessions for the user `alice`: the first involves accessing a single page, and the second spans more than 5 minutes, resulting from multiple website accesses, each with less than a 5-minute interval before the next one. + +As for the user `bob`, there is one session encompassing all of their access events. + +Now, let's dive deeper into the problem of adding more data. + +# Processing newly arrived data in the rerun + +Logs may arrive at varying intervals, with a cron-like job delivering data every ten or thirty minutes. In our ideal scenario, you'd like to avoid recalculating results from the beginning and instead focus on processing the new data. But how can you achieve this? + +[Persistence](/developers/api-docs/persistence-api/) can help here. The idea is to store the state of previous calculations so that when new files are added later, there's no need to start from scratch. + +So you need to do two things. First, you need to "name" input sources by assigning persistent IDs to them. This way, when the program recovers, it can accurately match the operator with the correct data dump. + +Here, it is optional because the data processing pipeline doesn't change. It will be shown for the fullness of the example that the only difference is the parameter `persistent_id`: + + +```python +access_entries = pw.io.csv.read( + "logs/", + schema=InputSchema, + mode="static", + persistent_id="logs" +) +``` + +The second thing is mandatory: you must set up the persistent storage. The persistent storage is where Pathway stores its dump of data in computations. It can be hosted in S3-like data storage and locally. + +Let's store it locally in the folder `./PStorage`. To correctly handle the notebook's reruns, let's clean it first: + + +```python +!rm -rf PStorage +``` + +Then, you can create the script for persistent config. This config will consist of two lines. + +The first line denotes the backend. Since it's a filesystem-based one, you can use the method `pw.persistence.Backend.filesystem` to pass the path to the stores. + +The second line denotes persistence config. It can be constructed with a method `pw.persistence.Config.simple_config`, which accepts the backend settings. 
+ + +```python +persistence_backend = pw.persistence.Backend.filesystem("./PStorage") +persistence_config = pw.persistence.Config.simple_config(persistence_backend) +``` + +With the changes described above, the code will look as follows: + + +```python +%%writefile script.py +import pathway as pw + + +class InputSchema(pw.Schema): + user_id: str + access_time: int + + +if __name__ == "__main__": + table = pw.io.csv.read( + "logs/", + mode="static", + schema=InputSchema, + persistent_id="logs", # Change: persistent ID assigned + ) + sessions = table.windowby( + pw.this.access_time, + window=pw.temporal.session(max_gap=300), + shard=pw.this.user_id, + ).reduce( + user_id=table.user_id, + session_length=( + pw.reducers.max(table.access_time) - pw.reducers.min(table.access_time) + ), + session_start=pw.reducers.min(table.access_time), + session_finish=pw.reducers.max(table.access_time), + count=pw.reducers.count(), + ) + pw.io.csv.write(sessions, "sessions.csv") + + # Change: persistence config created + persistence_backend = pw.persistence.Backend.filesystem("./PStorage") + persistence_config = pw.persistence.Config.simple_config(persistence_backend) + pw.run( + monitoring_level=pw.MonitoringLevel.NONE, + + # Change: persistence config passed to the method + persistence_config=persistence_config, + ) +``` + + Overwriting script.py + + +Now, you can run it so that it provides the initial artifacts to the persistent storage. + + +```python +!python script.py +``` + + /usr/local/lib/python3.10/dist-packages/pathway/internals/table.py:1189: UserWarning: Key sets of self and other in update_cells are the same. Using with_columns instead of update_cells. + warnings.warn( + [2023-11-29T15:01:38]:INFO:Preparing Pathway computation + [2023-11-29T15:01:38]:INFO:No time has been advanced in the previous run, therefore no data read from the snapshot + [2023-11-29T15:01:38]:INFO:Reached the end of the snapshot. Exiting the rewind after 0 entries + [2023-11-29T15:01:38]:INFO:Seek the data source to the frontier OffsetAntichain { antichain: {} } + [2023-11-29T15:01:38]:INFO:CsvFilesystemReader-logs: 0 entries (1 minibatch(es)) have been sent to the engine + [2023-11-29T15:01:38]:INFO:CsvFilesystemReader-logs: 9 entries (2 minibatch(es)) have been sent to the engine + [2023-11-29T15:01:38]:WARNING:CsvFilesystemReader-logs: Closing the data source + [2023-11-29T15:01:38]:INFO:FileWriter-0: Done writing 0 entries, time 1701270098482. Current batch writes took: 0 ms. All writes so far took: 0 ms. + [2023-11-29T15:01:38]:INFO:Save offset: (Empty, FilePosition { total_entries_read: 11, path: "/content/logs/part_two.csv", bytes_offset: 71 }) + [2023-11-29T15:01:38]:INFO:FileWriter-0: Done writing 3 entries, closing data sink. Current batch writes took: 0 ms. All writes so far took: 0 ms. + + +Now, let's introduce some new logs. These logs will include access entries for times after the previous data. To demonstrate that the results only capture the new changes, we'll focus solely on the access events for user `bob`. + + +```python +%%writefile logs/part_three.csv +user_id,access_time +bob,1701104640 +bob,1701104931 +bob,1701105218 +``` + + Writing logs/part_three.csv + + +Let's run the script with the new entries added: + + +```python +!python script.py +``` + + /usr/local/lib/python3.10/dist-packages/pathway/internals/table.py:1189: UserWarning: Key sets of self and other in update_cells are the same. Using with_columns instead of update_cells. 
+ warnings.warn( + [2023-11-29T15:01:47]:INFO:Preparing Pathway computation + [2023-11-29T15:01:47]:INFO:Merge the current state with block: StoredMetadata { frontiers: OffsetAntichainCollection { antichains: {250563226431679957366536773997412061550: OffsetAntichain { antichain: {Empty: FilePosition { total_entries_read: 11, path: "/content/logs/part_two.csv", bytes_offset: 71 }} }} }, storage_types: {250563226431679957366536773997412061550: CsvFilesystem}, last_advanced_timestamp: 1701270098483 } + [2023-11-29T15:01:47]:INFO:Merge the current state with block: StoredMetadata { frontiers: OffsetAntichainCollection { antichains: {250563226431679957366536773997412061550: OffsetAntichain { antichain: {Empty: FilePosition { total_entries_read: 11, path: "/content/logs/part_two.csv", bytes_offset: 71 }} }} }, storage_types: {250563226431679957366536773997412061550: CsvFilesystem}, last_advanced_timestamp: 1701270098483 } + [2023-11-29T15:01:47]:INFO:Reached the end of the snapshot. Exiting the rewind after 9 entries + [2023-11-29T15:01:47]:INFO:Seek the data source to the frontier OffsetAntichain { antichain: {Empty: FilePosition { total_entries_read: 11, path: "/content/logs/part_two.csv", bytes_offset: 71 }} } + [2023-11-29T15:01:47]:INFO:CsvFilesystemReader-logs: 0 entries (1 minibatch(es)) have been sent to the engine + [2023-11-29T15:01:47]:INFO:CsvFilesystemReader-logs: 3 entries (2 minibatch(es)) have been sent to the engine + [2023-11-29T15:01:47]:WARNING:CsvFilesystemReader-logs: Closing the data source + [2023-11-29T15:01:48]:INFO:Save offset: (Empty, FilePosition { total_entries_read: 15, path: "/content/logs/part_three.csv", bytes_offset: 65 }) + [2023-11-29T15:01:48]:INFO:FileWriter-0: Done writing 1 entries, closing data sink. Current batch writes took: 0 ms. All writes so far took: 0 ms. + + +And let's check the results. + + +```python +!cat sessions.csv +``` + + user_id,session_length,session_start,session_finish,count,time,diff + "bob",578,1701104640,1701105218,3,1701270107982,1 + + +As you can see, the results were generated only for the modified entries. Specifically, there is no update for the user `alice` since their session data remains unchanged. + +In contrast, there is an update for the user `bob`; they now have a new session. This fresh session, spanning 578 seconds with three events, is reflected in the output. + +To further confirm that only the differences from unprocessed additions are considered each time, let's add one more file. For instance, you can extend the last session of the user `alice`: + + +```python +%%writefile logs/part_four.csv +user_id,access_time +alice,1701103338 +alice,1701103629 +alice,1701103916 +``` + + Writing logs/part_four.csv + + +And the run the script again: + + +```python +!python script.py +``` + + /usr/local/lib/python3.10/dist-packages/pathway/internals/table.py:1189: UserWarning: Key sets of self and other in update_cells are the same. Using with_columns instead of update_cells. 
+ warnings.warn( + [2023-11-29T15:01:52]:INFO:Preparing Pathway computation + [2023-11-29T15:01:52]:INFO:Merge the current state with block: StoredMetadata { frontiers: OffsetAntichainCollection { antichains: {250563226431679957366536773997412061550: OffsetAntichain { antichain: {Empty: FilePosition { total_entries_read: 11, path: "/content/logs/part_two.csv", bytes_offset: 71 }} }} }, storage_types: {250563226431679957366536773997412061550: CsvFilesystem}, last_advanced_timestamp: 1701270098483 } + [2023-11-29T15:01:52]:INFO:Merge the current state with block: StoredMetadata { frontiers: OffsetAntichainCollection { antichains: {250563226431679957366536773997412061550: OffsetAntichain { antichain: {Empty: FilePosition { total_entries_read: 15, path: "/content/logs/part_three.csv", bytes_offset: 65 }} }} }, storage_types: {250563226431679957366536773997412061550: CsvFilesystem}, last_advanced_timestamp: 1701270107983 } + [2023-11-29T15:01:52]:INFO:Merge the current state with block: StoredMetadata { frontiers: OffsetAntichainCollection { antichains: {250563226431679957366536773997412061550: OffsetAntichain { antichain: {Empty: FilePosition { total_entries_read: 11, path: "/content/logs/part_two.csv", bytes_offset: 71 }} }} }, storage_types: {250563226431679957366536773997412061550: CsvFilesystem}, last_advanced_timestamp: 1701270098483 } + [2023-11-29T15:01:52]:INFO:Merge the current state with block: StoredMetadata { frontiers: OffsetAntichainCollection { antichains: {250563226431679957366536773997412061550: OffsetAntichain { antichain: {Empty: FilePosition { total_entries_read: 15, path: "/content/logs/part_three.csv", bytes_offset: 65 }} }} }, storage_types: {250563226431679957366536773997412061550: CsvFilesystem}, last_advanced_timestamp: 1701270107983 } + [2023-11-29T15:01:52]:INFO:Reached the end of the snapshot. Exiting the rewind after 12 entries + [2023-11-29T15:01:52]:INFO:Seek the data source to the frontier OffsetAntichain { antichain: {Empty: FilePosition { total_entries_read: 15, path: "/content/logs/part_three.csv", bytes_offset: 65 }} } + [2023-11-29T15:01:52]:INFO:CsvFilesystemReader-logs: 0 entries (1 minibatch(es)) have been sent to the engine + [2023-11-29T15:01:52]:INFO:CsvFilesystemReader-logs: 3 entries (2 minibatch(es)) have been sent to the engine + [2023-11-29T15:01:52]:WARNING:CsvFilesystemReader-logs: Closing the data source + [2023-11-29T15:01:53]:INFO:FileWriter-0: Done writing 0 entries, time 1701270112954. Current batch writes took: 0 ms. All writes so far took: 0 ms. + [2023-11-29T15:01:53]:INFO:Save offset: (Empty, FilePosition { total_entries_read: 19, path: "/content/logs/part_four.csv", bytes_offset: 71 }) + [2023-11-29T15:01:53]:INFO:FileWriter-0: Done writing 2 entries, closing data sink. Current batch writes took: 0 ms. All writes so far took: 0 ms. + + + +```python +!cat sessions.csv +``` + + user_id,session_length,session_start,session_finish,count,time,diff + "alice",583,1701103035,1701103618,5,1701270112954,-1 + "alice",881,1701103035,1701103916,8,1701270112954,1 + + +In this scenario, the situation is more complex. The last session of the user `alice` was extended. Consequently, the first event signifies its removal since it's no longer accurate due to the new events. + +The second event represents the insertion of the new extended session. The updated session lasts 881 seconds instead of 583 and includes eight events instead of five. 
By applying the removal of the old session and the insertion of the new one, you arrive at a valid updated snapshot. + +In summary, you've learned how persistence can optimize your computations. Whether running in static mode or streaming, it is a valuable tool for expediting analytics and conserving resources. diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/30.consistency.md b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/30.consistency.md new file mode 120000 index 00000000..1d7ad954 --- /dev/null +++ b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/30.consistency.md @@ -0,0 +1 @@ +.consistency/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/40.pathway-vs-flink.md b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/40.pathway-vs-flink.md new file mode 100644 index 00000000..1f630c22 --- /dev/null +++ b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/40.pathway-vs-flink.md @@ -0,0 +1,234 @@ +--- +title: Pathway vs Flink +description: 'This page compares Pathway and Flink' +toc: false +--- + +# Comparison with Flink + +Pathway is a Python framework with a unified engine for batch and streaming data processing. +Why should you choose Pathway instead of any other existing streaming engines, such as Apache Flink? + +To assist you in your choice, here is a blueprint of Pathway features is provided below, together with a comparison to Apache Flink. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Feature +Pathway +Apache Flink +
General +
Processing TypeStream and batch (with the same engine).
Guarantees of same results returned whether running in batch or streaming.
Capacity for asynchronous stream processing and API integration.
Stream and batch (with different engines).
Programming language APIsPython, SQLJVM (Java, Kotlin, Scala), SQL, Python
Programming APITable APIDataStream API and Table API, with partial compatibility
Software integration ecosystems/plugin formats.Python,
C binary interface (C, C++, Rust),
REST API.
JVM
Ease of development +
How to QuickStartGet Python.
Do `pip install pathway`.
Run directly.
Get Java.
Download and unpack Flink packages.
Start a local Flink Cluster with `./bin/start-local.sh`.
Use netcat to start a local server.
Submit your program to the server for running.
Running local experiments with dataUse Pathway locally in VS Code, Jupyter, etc.Based on local Flink clusters
CI/CD and TestingUsual CI/CD setup for Python (use GitHub Actions, Jenkins etc.)
Simulated stream library for easy stream testing from file sources.
Based on local Flink cluster integration into CI/CD pipelines
Interactive work possible?Yes, data manipulation routines can be interactively created in notebooks and the Python REPLCompilation is necessary, breaking data-scientist's flow of work
Performance
ScalabilityHorizontal* and vertical scaling.
Scales to thousands of cores and terabytes of application state.
Standard and custom libraries (including ML library) are scalable.
Horizontal and vertical scaling.
Scales to thousands of cores and terabytes of application state.
Most standard libraries (including ML library) do not parallelize in streaming mode.
Performance for basic tasks (groupby, filter, single join)Delivers high throughput and low latency.Slower than Pathway in benchmarks.
Transformation chain length in batch computing1000+ transformations possible, iteration loops possibleMax. 40 transformations recommended (in both batch and streaming mode).
Fast advanced data transformation (iterative graph algorithms, machine learning)In batch and streaming mode.No; restricted subset possible in batch mode only.
Parameter tuning requiredInstance sizing only.
Possibility to set window cut-off times for late data.
Considerable tuning required for streaming jobs.
Architecture and deployment
Distributed Deployment (for Kubernetes or bare metal clusters)Pool of identical workers (pods).*
Sharded by data.
Includes a JobManager and pool of TaskManagers.
Work divided by operation and/or sharded by data.
Dataflow handling and communicationEntire dataflow handled by each worker on a data shard, with asynchronous communication when data needs routing between workers.
Backpressure built-in.
Multiple communication mechanisms depending on configuration.
Backpressure handling mechanisms needed across multiple workers.
Internal Incremental Processing ParadigmCommutative
(based on record count deltas)
Idempotent
(upsert)
Primary data structure for stateMulti-temporal Log-structured merge-tree (shared arrangements).
In-memory state.
Log-structured merge-tree.
In-memory state.
State ManagementIntegrated with computation.
Cold-storage persistence layer optional.
Low checkpointing overhead.*
Integrated with computation.
Cold-storage persistence layer optional.
Semantics of stream connectorsInsert / UpsertInsert / Upsert
Message Delivery GuaranteesEnsures exactly-once delivery guarantees for state and outputs (if enabled)Ensures exactly-once delivery guarantees for state and outputs (if enabled)
ConsistencyConsistent, with exact progress tracking. Outputs reflect all data contained in a prefix of the source streams. All messages are atomically processed, if downstream systems have a notion of transaction no intermediate states are sent out of the system.Eventually consistent, with approximate progress tracking using watermarks. Outputs may reflect partially processed messages and transient inconsistent outputs may be sent out of the system.
Processing out-of-order dataSupported by default.
Outputs of built-in operations do not depend on data arrival order (unless they are configured to ignore very late data).
Event times used for windowing and temporal operations.
Supported or fragile, depending on the scenario. Event time processing supported in addition to arrival time and approximate watermarking semantics.
Fault toleranceRewind-to-snapshot.
Partial failover handled transparently in hot replica setups.*
Rewind-to-snapshot.
Support for partial failover present or not depending on scheduler.
Monitoring systemPrometheus-compatible endpoint on each pod
Logging systemIntegrates with Docker and Kubernetes Container logs
Machine Learning support
Language of ML library implementationPython / PathwayJVM / Flink
Parallelism support by ML librariesML libraries scale vertically and horizontallyMost ML libraries are not built for parallelization
Supported modes of ML inferenceCPU Inference on worker nodes.
Asynchronous Inference (GPU/CPU).
Alerting of results updates after model change.
CPU Inference on worker nodes.
Supported modes of ML learningAdd data to the training set.
Update or delete data in the training set.
Revise past classification decisions.
Add data to the training set.
Representative real-time Machine Learning libraries.Classification (including kNN), Clusterings, graph clustering, graph algorithms, vector indexes, signal processing.
Geospatial libraries, spatio-temporal data, GPS and trajectories.*
Possibility to integrate external Python real-time ML libraries.
Classification (including kNN), Clusterings, vector indexes.
Support for iterative algorithms (iterate until convergence, gradient descent, etc.)YesNo
API Integration with external Machine Learning models and LLMsYesNo / fragile
Typical Analytics and Machine Learning use casesData fusion
Monitoring and alerting (rule-based or ML-powered)
IoT and logs data observability (rule-based or ML-powered)
Trajectory mining*
Graph learning
Recommender systems
Ontologies and dynamic knowledge graphs.
Real-time data indexing (vector indexes).
LLM-enabled data pipelines and RAG services.
Low-latency feature stores.
Monitoring and alerting (rule-based)
IoT and logs data observability (rule-based)
API and HTTP microservices
REST/HTTP API integrationNon-blocking (Asynchronous API calls) supported in addition to Synchronous calls.Blocking (Synchronous calls)
Acting as microservice hostProvides API endpoint mechanism for user queries.
Supports registered queries (API session mechanism, alerting).
No
Use as low-latency feature storeYes, standalone. From 1ms latency.Possible in combination with Key-value store like Redis. From 5ms latency.
Requires manual versioning/consistency checks.
+ +\* Features only available in the enterprise version of Pathway. See also [Feature comparison](/features). diff --git a/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/_dir.yml b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/_dir.yml new file mode 100644 index 00000000..3002464d --- /dev/null +++ b/docs/2.developers/4.user-guide/100.persistence-consistency-scaling/_dir.yml @@ -0,0 +1 @@ +title: 'Persistence, Consistency, Scaling' diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-classes/__init__.py b/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-classes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-classes/article.py b/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-classes/article.py new file mode 100644 index 00000000..27fc7875 --- /dev/null +++ b/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-classes/article.py @@ -0,0 +1,142 @@ +# --- +# title: 'Building a tree with transformer classes' +# description: 'Example on how transformer classes work by implementing a tree' +# notebook_export_path: notebooks/tutorials/transformer_tree_example.ipynb +# --- + +# # Implementing a tree using transformer classes +# A more advanced example on how transformer classes work by implementing a tree. +# +# Pathway's transformer class is a powerful tool to perform advanced operations on tables. +# In the following, we are going to show you how to use a transformer class to implement a tree and compute recursive operations on it. +# +# We strongly advise you to read our [introduction on transformer classes](/developers/user-guide/diving-deeper/transformer-introduction/) and the [simple examples](/developers/user-guide/diving-deeper/transformer-example/) before reading further. +# +# ## Pathway Data Structures \& Algorithms 101: How to represent a Tree? +# +# Let's take a look at one of the simplest graph-like data structures: a tree. Let's encode tree nodes into a table with columns: +# +# 1. Node ID +# 2. A value `val` of integer type, stored in nodes of the tree. +# 3. The node's parent ID - which can be Null for the root. +# +# To do this, in Pathway you can write the following schema for the considered table (ID's are implicit and don't need to be defined). + +# + +from __future__ import annotations + +from typing import Optional + +import pathway as pw + + +class Nodes(pw.Schema): + val: int + parent: Optional[pw.Pointer[Nodes]] + + +# - + +# ## Transformer Classes acting on a single row +# +# You would now like to compute some basic statistics on the tree. For example, is a given node the root? In Python, this would follow through a simple row operation: +# +# ```py +# # We would want to add this logic as a "method" to the `nodes` schema +# +# def is_root(self): +# return self.parent is None +# ``` +# +# How to make a transformer which takes a table following the schema `nodes` and "gives it" the method above? The answer is a Transformer Class which acts on a single table argument called `nodes`, and adds the `is_root` logic as an output argument. We call our transformer `tree_node_roots`: + + +@pw.transformer +class tree_node_roots: + class nodes(pw.ClassArg, input=Nodes): + val = pw.input_attribute() + parent = pw.input_attribute() + + @pw.output_attribute + def is_root(self): + return self.parent is None + + +# Let's provide a quick explanation of what happens here. 
+# You can specify `Nodes` as input for the class `nodes` to enforce that the rows of the table are of type `Nodes`. +# You link the parameters of the class `nodes` to the ones of `Nodes` with the `pw.input_attribute()` function. Note that the names of the parameters (`val` and `parent` in the example) must be exactly the same as the column names of the input table. +# Finally, you declare the different columns of the resulting table using the annotation `pw.output_attribute` on different functions. Each function defines a column in the output table and the value of the function is going to be used to as the value: the name of the function defines the name of the column. +# +# You can now use `tree_node_roots` as a transformer, and call `tree_node_roots(TN)` for a table `TN` of nodes to get the required output columns, just as you would for any other transformer. + +# + +tree = pw.debug.table_from_markdown( + """ + | val | parent_label + 0 | 0 | + 1 | 1 | 0 + 2 | 2 | 0 + 3 | 3 | 1 + 4 | 4 | 1 + 5 | 5 | 2 + 6 | 6 | 2 + """ +) +tree += tree.select(parent=tree.pointer_from(tree.parent_label, optional=True)) +pw.debug.compute_and_print(tree) + +result = tree_node_roots(tree).nodes +pw.debug.compute_and_print(result) + + +# - + +# ## Transformer Classes acting on multiple rows +# +# Now, let's try something which shows the power of Pathway a bit more. Suppose you would like to see how many steps away a node is from its root. Let's call this the `level` of a node. How would you compute this? +# +# Logically, the `level` of a node is higher by 1 unit than the `level` of its parent. So, the solution can be obtained by recursion. +# +# Recursion is perhaps something you would think twice about before [attempting in SQL](https://medium.com/swlh/recursion-in-sql-explained-graphically-679f6a0f143b). In Pathway, recursion is natively supported, and efficient to use where the "recursion stack" does not change much for old data rows as new data arrives. +# +# The transformer which does just what we want is provided below. + + +@pw.transformer +class tree_node_roots_and_levels: + class nodes(pw.ClassArg, input=Nodes): + val = pw.input_attribute() + parent = pw.input_attribute() + + @pw.output_attribute + def is_root(self): + return self.parent is None + + @pw.output_attribute + def level(self): + if self.is_root: + return 0 + else: + return 1 + self.transformer.nodes[self.parent].level + + +# Most of the logic is contained in the final line, `1 + self.transformer.nodes[self.parent].level`. +# +# You obtain the following table: + +result = tree_node_roots_and_levels(tree).nodes +pw.debug.compute_and_print(result) + +# A small side note: you might simply have wanted to write here `1 + self.parent.level` instead, however, this would be missing information about the table that `self.parent` lives in. This table is identified through `self.transformer.nodes`. +# +# Though making the syntax a bit more verbose, identifying objects through both a table, and a row identifier, helps to avoid confusion. +# +# You will see why this is useful in this [article](/developers/user-guide/diving-deeper/transformer-example/) where we introduce Transformer Classes that use not just one, but two or more arguments. These will allow us to work with a `matchings` table and a `profiles` table, indicating a pair of nodes for which the required computation should be performed. +# +# +# ## Conclusion +# +# In this guide, you learned how to write transformer classes building a tree and computing some basic operations on that tree. 
This is useful for defining row-based logic for tables, oblivious of the fact that we are operating on top of data streams. +# You can take a look at our [tour of Pathway's transformers](/developers/user-guide/diving-deeper/transformer-example/) in which you will find a list of examples of transformers. +# +# You can also check our [connectors](/developers/user-guide/connecting-to-data/connectors/) to connect your data into Pathway. diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-examples/__init__.py b/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-examples/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-examples/article.py b/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-examples/article.py new file mode 100644 index 00000000..07fc6cf3 --- /dev/null +++ b/docs/2.developers/4.user-guide/110.diving-deeper/.transformer-examples/article.py @@ -0,0 +1,242 @@ +# --- +# title: "Examples of transformer classes" +# description: 'Several examples of Pathway transformer classes' +# notebook_export_path: notebooks/tutorials/transformers.ipynb +# --- + +# # A tour of Pathway's transformer classes +# +# In this section, we will go through several examples of Pathway transformer classes. +# This should give you a good overview of how to handle them and how useful they are. +# +# We will not go into implementation details, so you are strongly encouraged to read [our introduction](/developers/user-guide/diving-deeper/transformer-introduction/) first. +# +# In the following, we are going to see how to use transformer classes to perform [simple operations on a single row](#simple-operations-on-a-single-row), use [transformers as a method](#transformers-as-a-method), and use transformers to [combine several tables at once](#transformer-classes-using-two-different-tables). +# +# ## Our guinea pig +# +# You will experiment on the following table: + +# + +from typing import Any + +import pathway as pw + +guinea_pig = pw.debug.table_from_markdown( + """ + | val | aux + 0 | 0 | 10 + 1 | 1 | 11 + 2 | 2 | 12 + 3 | 3 | 13 + 4 | 4 | 14 + 5 | 5 | 15 + 6 | 6 | 16 + """ +) + + +# - + +# ## Simple operations on a single row +# +# First, you are going to perform simple operations on the table: adding a given number, obtaining the squared value, and performing the sum of two columns. +# +# ### Adding 10 to each value: + +# + +@pw.transformer +class add_ten: + class table(pw.ClassArg): + val = pw.input_attribute() + + @pw.output_attribute + def result(self) -> float: + return self.val + 10 + + +result = add_ten(guinea_pig).table +pw.debug.compute_and_print(result) + + +# - + +# As you can see only the column `val` has been taken into account. 
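+#
+# If you also want the output to carry other columns, one option (a minimal sketch reusing the
+# pattern above; the class name `add_ten_keep_aux` and the output names are only illustrative) is
+# to declare them as input attributes and re-emit them as output attributes:
+#
+# ```python
+# @pw.transformer
+# class add_ten_keep_aux:
+#     class table(pw.ClassArg):
+#         val = pw.input_attribute()
+#         aux = pw.input_attribute()
+#
+#         @pw.output_attribute
+#         def result(self) -> float:
+#             return self.val + 10
+#
+#         @pw.output_attribute
+#         def aux_value(self):
+#             # pass the original `aux` value through unchanged
+#             return self.aux
+# ```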
+# +# ### Obtaining the squared value of each value: + +# + +@pw.transformer +class squared_value: + class table(pw.ClassArg): + val = pw.input_attribute() + + @pw.output_attribute + def result(self) -> float: + return self.val * self.val + + +result = squared_value(guinea_pig).table +pw.debug.compute_and_print(result) + + +# - + +# ### Summing two columns + +# + +@pw.transformer +class summing_columns: + class table(pw.ClassArg): + val = pw.input_attribute() + aux = pw.input_attribute() + + @pw.output_attribute + def result(self) -> float: + return self.val + self.aux + + +result = summing_columns(guinea_pig).table +pw.debug.compute_and_print(result) + + +# - + +# Those three results can be obtained by a unique transformer: + +# + +@pw.transformer +class combined_transformer: + class table(pw.ClassArg): + val = pw.input_attribute() + aux = pw.input_attribute() + + @pw.output_attribute + def result_add(self) -> float: + return self.val + 10 + + @pw.output_attribute + def result_squared(self) -> float: + return self.val * self.val + + @pw.output_attribute + def result_sum(self) -> float: + return self.val + self.aux + + +result = combined_transformer(guinea_pig).table +pw.debug.compute_and_print(result) + + +# - + +# Finally, you can use the new values inside the same transformer to perform more advanced operations: + +# + +@pw.transformer +class reusing_transformer: + class table(pw.ClassArg): + val = pw.input_attribute() + + @pw.output_attribute + def result_add(self) -> float: + return self.val + 10 + + @pw.output_attribute + def result_double(self) -> float: + return self.result_add + self.result_add + + +result = reusing_transformer(guinea_pig).table +pw.debug.compute_and_print(result) + + +# - + +# ## Transformers as a method +# +# You are not bound to static computation as transformers provide a way to obtain methods as new values. +# This is done using the `method` keyword: + +# + +@pw.transformer +class method_transformer: + class table(pw.ClassArg): + val: float = pw.input_attribute() + + @pw.method + def method_result(self, arg) -> float: + return self.val + arg + + +method_table = method_transformer(guinea_pig).table +result = method_table.select(res=method_table.method_result(10)) +pw.debug.compute_and_print(result) +# - + +# ## Transformer Classes using two different tables +# +# Now you might want to do something more complicated which requires two different tables. +# +# You have a table `matchings` which contains pairs of values `a` and `b` and a table `profiles` which contains the profile of each value of the pairs. +# You want to compute, for each pair, the sum of the profiles of the values of the pair. +# +# First, you need the tables: + + +# + +profiles = pw.debug.table_from_markdown( + """ + | profile + 0 | 1 + 1 | 10 + 2 | 100 + 3 | 1000 + """ +) + +matchings = pw.debug.table_from_markdown( + """ + | a | b + 0 | 0 | 2 + 1 | 1 | 3 + """ +) +matchings = matchings.select( + a=profiles.pointer_from(matchings.a), b=profiles.pointer_from(matchings.b) +) + + +# - + +# Now, you can do a transformer which takes the two tables as parameters. +# To access a given table inside the transformer, use the notation `self.transformer.my_table`. 
+ +# + +@pw.transformer +class using_two_tables: + class profiles_table(pw.ClassArg): + profile: float = pw.input_attribute() + + class matchings_table(pw.ClassArg): + a: pw.Pointer = pw.input_attribute() + b: pw.Pointer = pw.input_attribute() + + @pw.output_attribute + def sum_profiles(self) -> float: + pa = self.transformer.profiles_table[self.a].profile + pb = self.transformer.profiles_table[self.b].profile + return pa + pb + + +result = using_two_tables( + profiles_table=profiles, matchings_table=matchings +).matchings_table +pw.debug.compute_and_print(result) +# - + +# ## Other topics +# +# We hope these examples make you feel comfortable using Pathway transformer classes. You can take a look at our advanced example of [transformer classes on a tree](/developers/user-guide/diving-deeper/transformer-recursion/). +# +# To continue your exploration of Pathway, you can also check out our [connectors](/developers/user-guide/connecting-to-data/connectors/), or see directly how to use Pathway to implement classic algorithms such as [PageRank](/developers/showcases/pagerank). diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/10.table-in-pathway.md b/docs/2.developers/4.user-guide/110.diving-deeper/10.table-in-pathway.md new file mode 100644 index 00000000..3668399a --- /dev/null +++ b/docs/2.developers/4.user-guide/110.diving-deeper/10.table-in-pathway.md @@ -0,0 +1,91 @@ +--- +title: Table - What is it +description: 'This pages explains what are the Tables in Pathway' +--- + +# What is a Table in Pathway +The Table is the central data abstraction used in Pathway. +It is used to represent the data and model the transformations done on it. + + +In Pathway, data is stored in Tables ([`pw.Table`](/developers/api-docs/pathway-table/#pathway.internals.table.Table)). +The Table is a two-dimensional data structure, similar to a SQL table or a pandas' DataFrame. +However, Tables in Pathway have a very different nature as they store potentially infinite and ever-changing streaming data. +You can think of it as a "changing data frame": the number of rows and the content changes with time as new data comes into the system. + + +![Summary of properties of Pathway Table](/assets/content/documentation/Table.svg) + + +## Main elements of a Table +Pathway Tables are composed of three key elements: **rows**, **columns**, and **indexes** (ids). + +**Rows** represent individual records or observations, with each row containing a unique set of values across various attributes. +**Columns**, on the other hand, define the distinct attributes or fields associated with the data, specifying the characteristics of each record. +The intersection of a row and a column, known as a **cell**, holds a single data value. +The structure of a Table, defined by its columns and their data types, is called a **schema** ([`pw.Schema`](/developers/api-docs/pathway/#pathway.Schema)). + +The third vital component is the **index** (or id), which serves as a unique pointer to the associated row in the Table. +Each row has its own unique index stored in the column `id` (`id` is a reserved column name). +This index is crucial for referencing and accessing specific records within the Table. +The indexes are either automatically attributed or based on **primary keys**. +You can specify a set of columns as **primary keys**: the indexes will be computed based on those columns. 
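+
+As a minimal sketch (the connector, directory and column names below are illustrative and not part of this guide), a schema can mark a column as a primary key using `pw.column_definition(primary_key=True)`, and the ids of the resulting Table are then derived from that column:
+
+```python
+import pathway as pw
+
+
+class UserSchema(pw.Schema):
+    # rows sharing the same `user_id` get an id computed from this primary key
+    user_id: str = pw.column_definition(primary_key=True)
+    score: int
+
+
+# the ids of `users` are based on the `user_id` column instead of being auto-generated
+users = pw.io.csv.read("./users/", schema=UserSchema)
+```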
+ +![Table updating at reception of new data](/assets/content/documentation/table-example.svg) + +## Specificities of Tables +Tables in Pathway possess a distinctive dual nature: **immutability and dynamic adaptability**. + +Tables are immutable: when operations such as [`select`](/developers/user-guide/data-transformation/table-operations/#select-and-notations) or other transformers are applied to the Table, rather than modifying the existing data, a new Table is generated. +Immutability ensures that the original data remains unchanged. + +![Table updating at reception of new data](/assets/content/documentation/table-immutable.svg) + +Simultaneously, Tables are designed to seamlessly handle the ever-changing nature of data streams by incorporating new rows dynamically. +Whenever a new entry arrives, the Tables are updated by adding, removing, or updating the rows. +The updates are propagated to all the Tables of the pipeline. +The data has changed, not the Table: it still represents the same transformation over the same data stream. + +![Table updating at reception of new data](/assets/content/documentation/table-update.svg) + + +## How to create a Table +You can create a Table either by using an **input connector** or by **modifying an existing Table**. +If you want to connect to an external data source, you can use Pathway [**input connectors**](/developers/user-guide/connecting-to-data/connectors/) to store the input data in a Table. +Otherwise, a Table can be derived from an already existing Table by using [transformers](/developers/user-guide/data-transformation/table-operations/). +You can also create Tables with artificial data streams using the [demo module](/developers/user-guide/connecting-to-data/artificial-streams/). + +![Creating a Table in Pathway](/assets/content/documentation/table-creation.svg) + + +## How to access the data +Tables store dynamic data streams so accessing data in Pathway differs from static data frameworks. +With Pathway, you first set up your pipeline, with input connectors and transformers to define the transformations you want to do on your data. +At that point, the pipeline is built, but no data is ingested: your Tables are empty. +You then use [`pw.run()`](/developers/api-docs/pathway/#pathway.run) to ingest the data: the input is dynamically ingested in the Tables. + +The static nature of the usual "print" is not compatible with the dynamic nature of the Tables in Pathway. +Of course, we could print a snapshot of the Table, but this would defeat the whole purpose of data stream processing: any update occurring after the snapshot would be lost. + +You can output the Table, which represents a data stream, to external services using [**output connectors**](/developers/user-guide/connecting-to-data/connectors/). +Every change in the Table will be forwarded to the chosen service (Kafka, Postgres, etc.). +Alternatively, you can visualize the Table "live" using the [`pw.Table.show` operator](/developers/api-docs/pathway-table/#pathway.internals.table.Table.show) (see our [Jupyter example](/developers/showcases/live_data_jupyter/)). +Finally, you might want to test your pipeline with static data. +In that case, you can print the data using `pw.debug.compute_and_print` which will compute the Table with the available data and print the results. +You can learn more on how to use `pw.debug.compute_and_print` in our ["Your first realtime app" article](/developers/user-guide/introduction/first_realtime_app_with_pathway/#use-static-data-to-debug). 
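+
+As a quick illustration (a minimal sketch using an inline debug table rather than a real data source), the two ways of accessing the data look like this:
+
+```python
+import pathway as pw
+
+t = pw.debug.table_from_markdown(
+    """
+    | value
+ 1  | 10
+ 2  | 20
+    """
+)
+result = t.select(doubled=pw.this.value * 2)
+
+# Debugging with static data: compute the Table once and print it.
+pw.debug.compute_and_print(result)
+
+# In a streaming pipeline, you would instead attach an output connector
+# (CSV here, as an example) and start the computation with pw.run():
+# pw.io.csv.write(result, "./output_stream.csv")
+# pw.run()
+```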
+
+
+## Operations on the data
+Pathway supports most of the common basic operations on Tables, such as mathematical and boolean operations, filters, or apply.
+You can learn more about those basic operations in the [associated guide](/developers/user-guide/data-transformation/table-operations/).
+These include [standard join](/developers/user-guide/data-transformation/join-manual/), [groupby](/developers/user-guide/data-transformation/groupby-reduce-manual/) and [windowby](/developers/user-guide/temporal-data/windows-manual/). Pathway also provides temporal join operations such as [ASOF join](/developers/user-guide/temporal-data/asof-join/) or [interval joins](/developers/user-guide/temporal-data/interval-join/).
diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/20.id-universes.md b/docs/2.developers/4.user-guide/110.diving-deeper/20.id-universes.md
new file mode 100644
index 00000000..345b1074
--- /dev/null
+++ b/docs/2.developers/4.user-guide/110.diving-deeper/20.id-universes.md
@@ -0,0 +1,23 @@
+---
+title: 'Universes'
+description: 'Description of the data concept of universes in Pathway.'
+---
+
+## Universes
+
+The **universe** of a table is a central concept in Pathway.
+In a nutshell, **the universe of a table is the collection of the ids of that table**.
+It is used to determine whether some operations can be performed: if you want to update the cells of one table `t1` with the values contained in a table `t2`, you have to make sure that each id of `t2` is also in `t1`.
+
+![Universe](/assets/content/documentation/Universe.svg)
+
+Simple, right?
+
+Simple, but there is a bit more to this than meets the eye! Pathway aims at working on ever-changing data tables: Pathway's engine ensures that each update in the data is taken into account in real time on each table.
+In this context, the set of ids is neither static nor stable: **universes are used to ensure operations are performed on compatible data tables**, despite potential instability.
+
+For example, two data tables from two different data sources may have the same set of ids at some point but may diverge with upcoming updates.
+By default, tables from different data sources will not have the same universe.
+
+Pathway tries to infer whether two tables have the same universe, but depending on how the tables are processed, the inference may fail.
+You may want to force two tables to have the same universe (or one table to have a universe which is a subset of the universe of the second), either because Pathway's inference failed or because you know that the two tables actually derive from the same data source: you can specify it manually with `t1.promise_universe_is_equal_to(t2)`.
\ No newline at end of file
diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/30.transformer-introduction.md b/docs/2.developers/4.user-guide/110.diving-deeper/30.transformer-introduction.md
new file mode 100644
index 00000000..7b967ff2
--- /dev/null
+++ b/docs/2.developers/4.user-guide/110.diving-deeper/30.transformer-introduction.md
@@ -0,0 +1,109 @@
+---
+title: 'Introducing transformer classes'
+description: 'Transformer syntax allows you to express pipelines of transformations on entire (and ever-changing) data tables'
+---
+
+# Introduction to transformer classes
+
+A quick introduction to Pathway's transformer classes.
+
+Pathway's transformer syntax allows you to express pipelines of transformations on entire (and ever-changing) **data tables**. In Pathway, transformers behave like functions, whose arguments are Pathway Tables.
If you have used Spark SQL or Kafka Streams in the past, the syntax should feel familiar.
+
+In addition to this, Pathway also natively supports transformers defined on **data rows**. This is achieved through an object-oriented (ORM) view of rows in data. These are known as Transformer Classes.
+
+Transformer Classes are used for easy implementation of data-structure querying operations, defining APIs in Data Products, and on-demand computations.
+
+Transformer classes provide a way to achieve row-centric operations in Pathway where the use of `apply` is not sufficient or not convenient.
+Using transformer classes is the easiest way to do advanced computations involving pointers between fields of tables.
+
+
+
+# Transformers 101: how to make a map
+
+To create a transformer class, declare a class annotated with `@pw.transformer`.
+In that class, you can declare other classes: each class defines *one input table* and *one output table*.
+
+
+First, you can access and use the values of the input table by declaring the fields existing in the table: `val = pw.input_attribute()`.
+Note that the variable `val` must have exactly the same name as the targeted column of the input table.
+
+You can then define an output field by using the annotation `@pw.output_attribute` before a function: the name of the function will be the column name in the output table and the return value of the function will be the value stored in that column.
+
+As an example, let's consider the following transformer doing a map: the transformer takes a table with a column named `col_name` as input, applies a given function `f` to each row, and stores the output values in a new column named `col_name_output`:
+
+```python
+import pathway as pw
+
+@pw.transformer
+class my_transformer:
+    class my_table(pw.ClassArg):
+        col_name = pw.input_attribute()
+
+        @pw.output_attribute
+        def col_name_output(self):
+            return f(self.col_name)
+```
+
+In this transformer, the class `my_table` takes one input table whose columns will be matched to the parameters defined using `pw.input_attribute()` and will output a table whose columns are defined by the functions annotated with `@pw.output_attribute`.
+
+To test our transformer, let's consider this toy table `t`:
+
+    col_name
+ 0  x
+ 1  y
+ 2  z
+
+Let's apply the transformer to the table `t`, and extract the resulting table stored in `my_table`:
+
+```python
+t_map = my_transformer(my_table=t).my_table
+```
+
+You obtain the following table:
+
+    col_name_output
+ 0  f(x)
+ 1  f(y)
+ 2  f(z)
+
+
+# Why use transformers?
+
+Now that you are familiar with transformer classes and their basic syntax, let's see how they can be useful.
+Using transformer classes to do simple maps is overkill: a map can be done in one line with Pathway:
+
+```python
+t_map = t.select(col_name_output=pw.apply(f, t.col_name))
+```
+
+So one natural question you might ask yourself is 'why use transformer classes?'.
+
+It is true that when doing single-row operations, using `apply` is the way to go.
+Transformer classes are made for more advanced operations, in particular operations involving different tables.
+While using `apply` is limited to row-centric operations, transformer classes are able to perform look-ups and recursive operations on rows.
+Furthermore, **inside a transformer class, you can easily access any table referenced by a class** by doing `self.transformer.table_name`.
+
+For instance, if you need to add the values of two different tables, things get more complicated with only standard operations.
+It is possible to make a `join` and then use `apply`, but it would result in copying the values into a new table before doing the sum.
+This does not scale well on large datasets.
+On the other hand, using a transformer class allows you to do it without having to create a new table.
+You can check out how easy it is to use transformer classes to [combine several tables at once](/developers/user-guide/diving-deeper/transformer-example/#transformer-classes-using-two-different-tables).
+
+# Complexity
+
+While transformer classes allow you to work with different rows from different tables at once, this comes at a price.
+Indeed, using transformer classes may have **up to quadratic complexity** in the number of dependencies.
+Here, dependencies refer to the rows responsible for look-ups: all the rows you are accessing and using except for the one referred to by `self`.
+As a rule of thumb, try to limit the number of row dependencies per row to not more than a dozen or so.
+
+**Transformer classes are not meant to access too many rows at once.**
+For complex operations involving many rows simultaneously, you may prefer to use a `join` to obtain a single row containing all the relevant values and then use the standard `pw.apply`.
+
+
+# Going further
+
+Transformer classes are a key component of the Pathway programming framework.
+
+If you want to learn more about transformer classes, you can see our [basic examples of transformer classes](/developers/user-guide/diving-deeper/transformer-example/) or our [advanced example](/developers/user-guide/diving-deeper/transformer-recursion/) on how to make a tree using transformer classes.
+
+You can also take a look at our [connectors](/developers/user-guide/connecting-to-data/connectors/) to see how to connect different data sources to Pathway.
\ No newline at end of file diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/40.transformer-example.md b/docs/2.developers/4.user-guide/110.diving-deeper/40.transformer-example.md new file mode 120000 index 00000000..9af9a592 --- /dev/null +++ b/docs/2.developers/4.user-guide/110.diving-deeper/40.transformer-example.md @@ -0,0 +1 @@ +.transformer-examples/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/110.diving-deeper/50.transformer-recursion.md b/docs/2.developers/4.user-guide/110.diving-deeper/50.transformer-recursion.md new file mode 120000 index 00000000..647e4d31 --- /dev/null +++ b/docs/2.developers/4.user-guide/110.diving-deeper/50.transformer-recursion.md @@ -0,0 +1 @@ +./.transformer-classes/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/115.llm-xpack/.vectorstore_pipeline/article.py b/docs/2.developers/4.user-guide/115.llm-xpack/.vectorstore_pipeline/article.py new file mode 100644 index 00000000..3f859dcf --- /dev/null +++ b/docs/2.developers/4.user-guide/115.llm-xpack/.vectorstore_pipeline/article.py @@ -0,0 +1,358 @@ +# --- +# title: "Always up-to-date Vector Data Indexing pipeline" +# description: '' +# aside: true +# author: 'pathway' +# article: +# date: '2023-12-15' +# thumbnail: '' +# tags: ['showcase', 'llm', 'data-pipeline'] +# keywords: ['LLM', 'RAG', 'GPT', 'OpenAI', 'Google Docs', 'KNN', 'Vector store', 'langchain', 'llama-index', 'vectordb', 'vectore store langchain', 'retriever', 'unstructured'] +# notebook_export_path: notebooks/showcases/live_vector_indexing_pipeline.ipynb +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.16.1 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Always up-to-date data indexing pipeline +# +# This showcase shows how to use Pathway to deploy a live data indexing pipeline, which can be queried like a typical vector store. However, under the hood, Pathway updates the index on each data change, always giving up-to-date answers. +# +# ::article-img +# --- +# src: '/assets/content/showcases/vectorstore/vectorstore_doc.png' +# alt: 'Pathway data indexing pipeline' +# class: 'mx-auto' +# zoomable: true +# --- +# :: +# +# Pathway Vectorstore enables building a document index on top of you documents without the +# complexity of ETL pipelines, managing different containers for storing, embedding, and serving. +# It allows for easy to manage, always up-to-date, LLM pipelines accesible using a RESTful API +# and with integrations to popular LLM toolkits such as Langchain and LlamaIndex. +# +# +# In this article, we will use a simple document processing pipeline that: +# 1. Monitors several data sources (files, S3 folders, cloud storages) for data changes. +# 2. Parses, splits and embeds the documents. +# 3. Builds a vector index for the data. +# +# However, If you prefer not to create the pipeline from the ground up and would like to check out the functionality, +# take a look at our [`managed pipelines`](https://cloud.pathway.com/docindex) in action. +# +# We will connect to the index using a `VectorStore` client, which allows retrieval of semantically similar documents. + +# %% [markdown] +# ## Prerequisites +# +# Install the `pathway` package. You can also install the `unstructured` package to use the most powerful `unstructured.io`-based parser. +# +# Then download sample data. 
+ +# %% +# _MD_SHOW_!pip install pathway litellm +# # !pip install unstructured[all-docs] +# _MD_SHOW_!mkdir -p sample_documents +# _MD_SHOW_![ -f sample_documents/repo_readme.md ] || wget 'https://gist.githubusercontent.com/janchorowski/dd22a293f3d99d1b726eedc7d46d2fc0/raw/pathway_readme.md' -O 'sample_documents/repo_readme.md' + +# _MD_COMMENT_START_ +if 1: # group to prevent isort messing up + import json + import os + + from common.shadows import fs + + os.environ["OPENAI_API_KEY"] = json.loads( + fs.open("vault://kv.v2:deployments@/legal_rag_demo").read() + )["OPENAI_KEY"] +# _MD_COMMENT_END_ + +# %% +import logging +import sys +import time + +logging.basicConfig(stream=sys.stderr, level=logging.WARN, force=True) + +# %% [markdown] +# ## Building the data pipeline +# +# First, make sure you have an API key with an LLM provider such as OpenAI. + +# %% +import getpass +import os + +if "OPENAI_API_KEY" not in os.environ: + os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:") + +# %% [markdown] +# We will now assemble the data vectorization pipeline, using a simple `UTF8` file parser, a character splitter and an embedder from the [Pathway LLM xpack](/developers/user-guide/llm-xpack/overview). +# +# First, we define the data sources. We use the files-based one for simplicity, but any supported `pathway` [connector](/developers/api-docs/pathway-io/), such as [s3](/developers/api-docs/pathway-io/s3/) or [Google Drive](/developers/api-docs/pathway-io/gdrive/#pathway.io.gdrive.read) will also work. +# +# Then, we define the embedder and splitter. +# +# Last, we assemble the data pipeline. We will start it running in a background thread to be able to query it immediately from the demonstration. Please note that in a production deployment, the server will run in another process, possibly on another machine. For the quick-start, we keep the server and client as different threads of the same Python process. + +# %% +import pathway as pw + +# This creates a connector that tracks files in a given directory. +data_sources = [] +data_sources.append( + pw.io.fs.read( + "./sample_documents", + format="binary", + mode="streaming", + with_metadata=True, + ) +) + +# This creates a connector that tracks files in Google Drive. +# Please follow the instructions at /developers/user-guide/connectors/gdrive-connector/ to get credentials. +# data_sources.append( +# pw.io.gdrive.read(object_id="17H4YpBOAKQzEJ93xmC2z170l0bP2npMy", service_user_credentials_file="credentials.json", with_metadata=True)) + +# %% +# We now build the VectorStore pipeline + +from pathway.xpacks.llm.embedders import OpenAIEmbedder +from pathway.xpacks.llm.splitters import TokenCountSplitter +from pathway.xpacks.llm.vector_store import VectorStoreClient, VectorStoreServer + +PATHWAY_PORT = 8765 + +# Choose document transformers +text_splitter = TokenCountSplitter() +embedder = OpenAIEmbedder(api_key=os.environ["OPENAI_API_KEY"]) + +# The `PathwayVectorServer` is a wrapper over `pathway.xpacks.llm.vector_store` to accept LangChain transformers. +# Fell free to fork it to develop bespoke document processing pipelines. 
+vector_server = VectorStoreServer( + *data_sources, + embedder=embedder, + splitter=text_splitter, +) +# _MD_SHOW_vector_server.run_server(host="127.0.0.1", port=PATHWAY_PORT, threaded=True, with_cache=False) +# _MD_SHOW_time.sleep(30) # Workaround for Colab - messages from threads are not visible unless a cell is running + +# %% [markdown] +# We now instantiate and configure the client + +# %% +client = VectorStoreClient( + host="127.0.0.1", + port=PATHWAY_PORT, +) + +# %% [markdown] +# And we can start asking queries + +# %% +query = "What is Pathway?" +# _MD_SHOW_docs = client(query) +# _MD_SHOW_docs + + +# %% [markdown] +# **Your turn!** Now make a change to the source documents or make a fresh one and retry the query! + +# %% [markdown] +# ## Integrations +# +# ### Langchain +# +# This currently is submitted to Langchain in [a PR](https://github.com/langchain-ai/langchain/pull/14859) +# + +# %% +# _MD_SHOW_!pip install langchain +# _MD_SHOW_!pip install langchain-openai + +# %% [markdown] +# ```python +# # PathwayVectorClient in Langchain is waiting for merging +# # in https://github.com/langchain-ai/langchain/pull/14859 +# from langchain.vectorstores import PathwayVectorClient +# +# # PathwayVectorClient implements regular VectorStore API of LangChain +# client = PathwayVectorClient(host="127.0.0.1", port=PATHWAY_PORT) +# docs = client.similarity_search("What is Pathway?") +# ``` + +# %% +# Here we show how to configure a server that uses LangChain document processing components + +# _MD_SHOW_from langchain_openai import OpenAIEmbeddings +# _MD_SHOW_from langchain.text_splitter import CharacterTextSplitter + +# Choose proper LangChain document transformers +# _MD_SHOW_text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) +# _MD_SHOW_embeddings_model = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"]) + +# Use VectorStoreServer.from_langchain_components to create a vector server using LangChain +# document processors +# _MD_SHOW_vector_server = VectorStoreServer.from_langchain_components( +# _MD_SHOW_ *data_sources, +# _MD_SHOW_ embedder=embeddings_model, +# _MD_SHOW_ splitter=text_splitter, +# _MD_SHOW_) +# _MD_SHOW_vector_server.run_server(host="127.0.0.1", port=PATHWAY_PORT+1, threaded=True, with_cache=False) +# _MD_SHOW_time.sleep(30) # colab workaround + +# %% +# You can connect to the Pathway+LlamaIndex server using any client - Pathway's, Langchain's or LlamaIndex's! +# _MD_SHOW_client = VectorStoreClient( +# _MD_SHOW_ host="127.0.0.1", +# _MD_SHOW_ port=PATHWAY_PORT+1, +# _MD_SHOW_) + +# _MD_SHOW_client.query("pathway") + +# %% [markdown] +# ### LlamaIndex +# +# Pathway is fully integrated with LlamaIndex! We show below how to instantiate a Llama-Index +# retriever that queries the Pathway VectorStoreServer +# and how to configure a server using LlamaIndex components. +# +# For more information see `Pathway Retriever` +# [cookbook](https://docs.llamaindex.ai/en/stable/examples/retrievers/pathway_retriever.html). 
+# %% +# _MD_SHOW_!pip install llama-index llama-index-retrievers-pathway llama-index-embeddings-openai + +# %% +# You can connect to the PathwayVectorStore using a llama-index compatible retriever +# _MD_SHOW_from llama_index.retrievers.pathway import PathwayRetriever + +# PathwayRetriever implements the Retriever interface +# _MD_SHOW_pr = PathwayRetriever(host="127.0.0.1", port=PATHWAY_PORT) +# _MD_SHOW_pr.retrieve(str_or_query_bundle="What is Pathway?") + +# %% +# Here we show how to configure a server that uses LlamaIndex document processing components + +# _MD_SHOW_from llama_index.embeddings.openai import OpenAIEmbedding +# _MD_SHOW_from llama_index.core.node_parser import TokenTextSplitter + +# Choose proper LlamaIndex document transformers +# _MD_SHOW_embed_model = OpenAIEmbedding(embed_batch_size=10) + +# _MD_SHOW_transformations_example = [ +# _MD_SHOW_ TokenTextSplitter( +# _MD_SHOW_ chunk_size=150, +# _MD_SHOW_ chunk_overlap=10, +# _MD_SHOW_ separator=" ", +# _MD_SHOW_ ), +# _MD_SHOW_ embed_model, +# _MD_SHOW_] + +# Use VectorStoreServer.from_llamaindex_components to create a vector server using LlamaIndex +# document processors +# _MD_SHOW_vector_server = VectorStoreServer.from_llamaindex_components( +# _MD_SHOW_ *data_sources, +# _MD_SHOW_ transformations=transformations_example, +# _MD_SHOW_) +# _MD_SHOW_vector_server.run_server(host="127.0.0.1", port=PATHWAY_PORT+2, threaded=True, with_cache=False) +# _MD_SHOW_time.sleep(30) # colab workaround + +# %% +# You can connect to the Pathway+LlamaIndex server using any client - Pathway's, Langchain's or LlamaIndex's! +# _MD_SHOW_client = VectorStoreClient( +# _MD_SHOW_ host="127.0.0.1", +# _MD_SHOW_ port=PATHWAY_PORT+2, +# _MD_SHOW_) + +# _MD_SHOW_client.query("pathway") + +# %% [markdown] +# ## Advanced topics +# +# ### Getting information on indexed files + +# %% [markdown] +# `PathwayVectorClient.get_vectorstore_statistics()` gives essential statistics on the state of the vector store, like the number of indexed files and the timestamp of the last updated one. You can use it in your chains to tell the user how fresh your knowledge base is. + +# %% +# _MD_SHOW_client.get_vectorstore_statistics() + +# %% [markdown] +# ### Filtering based on file metadata +# +# We support document filtering using [jmespath](https://jmespath.org/) expressions, for instance: + +# %% +# take into account only sources modified later than unix timestamp +# _MD_SHOW_docs = client(query, metadata_filter="modified_at >= `1702672093`") + +# take into account only sources modified later than unix timestamp +# _MD_SHOW_docs = client(query, metadata_filter="owner == `james`") + +# take into account only sources with path containing 'repo_readme' +# _MD_SHOW_docs = client(query, metadata_filter="contains(path, 'repo_readme')") + +# and of two conditions +# _MD_SHOW_docs = client(query, metadata_filter="owner == `james` && modified_at >= `1702672093`") + +# or of two conditions +# _MD_SHOW_docs = client(query, metadata_filter="owner == `james` || modified_at >= `1702672093`") + +# %% [markdown] +# ### Configuring the parser +# +# The vectorization pipeline supports pluggable parsers. If not provided, defaults to `UTF-8` parser. You can find available parsers [here](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/parser.py). 
+# An example parser that can read PDFs, Word documents and other formats is provided with `parsers.ParseUnstructured`: + +# %% +# # !pip install unstructured[all-docs] # if you will need to parse complex documents + +# %% [markdown] +# ```python +# from pathway.xpacks.llm import parsers +# +# vector_server = VectorStoreServer( +# *data_sources, +# parser=parsers.ParseUnstructured(), +# embedder=embeddings_model, +# splitter=text_splitter, +# ) +# ``` + +# %% [markdown] +# ### Configuring the cache +# +# The Pathway vectorizing pipeline comes with an embeddings cache: +# ```python +# vector_server.run_server(..., with_cache=True) +# ``` +# +# The default cache configuration is the locally hosted disk cache, stored in the `./Cache` directory. However, it can be customized by explicitly specifying the caching backend chosen among several persistent backend [options](/developers/api-docs/persistence-api/#pathway.persistence.Backend). + + +# %% [markdown] +# ### Running in production +# +# A production deployment will typically run the server in a separate process. We provide a template application under [`templates`](https://github.com/langchain-ai/langchain/tree/master/templates/rag-pathway). We recommend running the Pathway data indexing pipeline in a container-based deployment environment like Docker or Kubernetes. For more info, see [Pathway's deployment guide](/developers/user-guide/deployment/docker-deployment/). +# +# ::shoutout-banner +# --- +# href: "https://discord.gg/pathway" +# icon: "ic:baseline-discord" +# --- +# #title +# Discuss tricks & tips for RAG +# #description +# Join our Discord community and dive into discussions on tricks and tips for mastering Retrieval Augmented Generation +# :: diff --git a/docs/2.developers/4.user-guide/115.llm-xpack/.vectorstore_pipeline/sample_documents/repo_readme.md b/docs/2.developers/4.user-guide/115.llm-xpack/.vectorstore_pipeline/sample_documents/repo_readme.md new file mode 100644 index 00000000..c73ce844 --- /dev/null +++ b/docs/2.developers/4.user-guide/115.llm-xpack/.vectorstore_pipeline/sample_documents/repo_readme.md @@ -0,0 +1,5 @@ +Pathway is an open framework for high-throughput and low-latency real-time data processing. It is used to create Python code which seamlessly combines batch processing, streaming, and real-time API's for LLM apps. Pathway's distributed runtime (🦀-🐍) provides fresh results of your data pipelines whenever new inputs and requests are received. + + +In the first place, Pathway was designed to be a life-saver (or at least a time-saver) for Python developers and ML/AI engineers faced with live data sources, where you need to react quickly to fresh data. Still, Pathway is a powerful tool that can be used for a lot of things. If you want to do streaming in Python, build an AI data pipeline, or if you are looking for your next Python data processing framework, keep reading. + diff --git a/docs/2.developers/4.user-guide/115.llm-xpack/10.overview.md b/docs/2.developers/4.user-guide/115.llm-xpack/10.overview.md new file mode 100644 index 00000000..e02f8d6d --- /dev/null +++ b/docs/2.developers/4.user-guide/115.llm-xpack/10.overview.md @@ -0,0 +1,181 @@ +--- +title: 'Overview of LLM xpack' +description: 'Introduction to the Pathway LLM xpack' +date: '2024-02-14' +thumbnail: '' +tags: ['tutorial', 'LLM'] +keywords: ['LLM', 'GPT', 'OpenAI', 'Gemini'] +--- + +# Overview of LLM xpack + +The LLM xpack provides you all the tools you need to use Large Language Models in Pathway. 
Wrappers for most common LLM services and utilities are included, making working with LLMs as easy as it can be.
+
+## Wrappers for LLMs
+
+Out of the box, the LLM xpack provides wrappers for text generation and embedding LLMs. For text generation, you can use native wrappers for the OpenAI chat model and HuggingFace models running locally. Many other popular models, including Azure OpenAI, HuggingFace (when using their API) or Gemini, can be used with the wrapper for LiteLLM. To check the full list of providers supported by LiteLLM, see the [LiteLLM documentation](https://docs.litellm.ai/docs/providers).
+
+Each wrapper is a [UDF class](/developers/api-docs/pathway#pathway.UDF). To use it, first create an instance of the wrapper, which you can then apply to a column with prompts. For OpenAI, you create a wrapper with the [`OpenAIChat` class](/developers/api-docs/pathway-xpacks-llm/llms/#pathway.xpacks.llm.llms.OpenAIChat).
+```python
+model = OpenAIChat(
+    model="gpt-3.5-turbo",
+    api_key=os.environ["OPENAI_API_KEY"], # Read OpenAI API key from environmental variables
+)
+# Send queries from column `messages` in table `query` to OpenAI
+responses = query.select(result=model(pw.this.messages))
+```
+
+### Preparing queries
+`OpenAIChat` expects messages to be in the format required by the [OpenAI API](https://platform.openai.com/docs/api-reference/chat/create) - that is, a list of dictionaries, where each dictionary is one message in the conversation so far. If you want to ask single questions, use [`pw.xpacks.llm.llms.prompt_chat_single_qa`](/developers/api-docs/pathway-xpacks-llm/llms/#pathway.xpacks.llm.llms.prompt_chat_single_qa) to wrap them.
+
+```python
+from pathway.xpacks.llm.llms import prompt_chat_single_qa
+
+
+model = OpenAIChat(
+    model="gpt-3.5-turbo",
+    api_key=os.environ["OPENAI_API_KEY"], # Read OpenAI API key from environmental variables
+)
+# Column `prompt` holds strings with questions to be sent to OpenAI chat
+responses = query.select(result=model(prompt_chat_single_qa(pw.this.prompt)))
+```
+
+### Model Parameters
+The OpenAI API takes a number of parameters, including `model` and `api_key` used in the code stubs above. `OpenAIChat` allows you to set their default values during the initialization of the class, but you can also override them during application.
+
+```python
+model = OpenAIChat(
+    model="gpt-3.5-turbo",
+    api_key=os.environ["OPENAI_API_KEY"], # Read OpenAI API key from environmental variables
+    max_tokens=200, # Set default value of max_tokens to be 200
+)
+# As max_tokens is not overridden here, the default value of 200 will be used
+responses = query.select(result=model(prompt_chat_single_qa(pw.this.prompt)))
+# Now the value of max_tokens is taken from the column `max_tokens`, overriding the default value set when initializing OpenAIChat
+responses = query.select(result=model(prompt_chat_single_qa(pw.this.prompt), max_tokens=pw.this.max_tokens))
+```
+
+### What about other models?
+So far we focused on the wrapper for the OpenAI chat model, but other wrappers work in the same way!
+
+Pathway has two more wrappers for chat models - [`LiteLLMChat`](/developers/api-docs/pathway-xpacks-llm/llms/#pathway.xpacks.llm.llms.LiteLLMChat) and [`HFPipelineChat`](/developers/api-docs/pathway-xpacks-llm/llms/#pathway.xpacks.llm.llms.HFPipelineChat). For example, to use Gemini with LiteLLM, create an instance of `LiteLLMChat` and then apply it to the column with messages to be sent over the API.
+ +```python +model = LiteLLMChat( + model="gemini/gemini-pro", # Choose the model you want + api_key=os.environ["GEMINI_API_KEY"], # Read GEMINI API key from environmental variables +) +# Ask Gemini questions from `prompt` column +responses = query.select(result=model(prompt_chat_single_qa(pw.this.prompt))) +``` + +With the wrapper for LiteLLM, Pathway allows you to use many popular LLMs. For models from HuggingFace that you want to run locally, Pathway gives a separate wrapper (for calling HuggingFace through API, use LiteLLM wrapper), called `HFPipelineChat`. When an instance of this wrapper is created, it initializes a HuggingFace `pipeline`, so any [arguments to the `pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.pipeline) - including the name of the model - must be set during the initialization of `HFPipelineChat`. Any parameters to `pipeline.__call__` can be as before set during initialization or overridden during application. + +```python +model = HFPipelineChat( + model="gpt2", # Choose the model you want +) +responses = query.select(result=model(prompt_chat_single_qa(pw.this.prompt))) +``` + +You can check an example with HuggingFace running locally in the [llm-app repository](https://github.com/pathwaycom/llm-app/blob/main/examples/pipelines/local/app.py). + +Pathway also comes with wrappers for embedding models - [`OpenAIEmbedder`](/developers/api-docs/pathway-xpacks-llm/embedders/#pathway.xpacks.llm.embedders.OpenAIEmbedder), [`LiteLLMEmbedder`](/developers/api-docs/pathway-xpacks-llm/embedders/#pathway.xpacks.llm.embedders.LiteLLMEmbedder) and [`SentenceTransformersEmbedder`](/developers/api-docs/pathway-xpacks-llm/embedders/#pathway.xpacks.llm.embedders.SentenceTransformerEmbedder). Each of them can be applied to a column of strings and returns a column with a list of floats - the embeddings. + +```python +embedder = OpenAIEmbedder( + model="text-embedding-ada-002", # model for embedding + api_key=os.environ["OPENAI_API_KEY"], # Read OpenAI API key from environmental variables +) +# calculate embedding for column `text` in table `documents` +responses = documents.select(result=embedder(pw.this.text)) +``` + +### Asynchrony +Wrapper for OpenAI and LiteLLM, both for chat and embedding, are asynchronous, and Pathway allows you to set three parameters to set their behavior. These are: +- `capacity`, which sets the number of concurrent operations allowed, +- `retry_strategy`, which sets the strategy for handling retries in case of failures, +- `cache_strategy`, which defines the cache mechanism. + + +These three parameters need to be set during the initialization of the wrapper. + +```python +model = OpenAIChat( + capacity=5, # maximum concurrent operations is 5 + # in case of failure, retry 5 times, each time waiting twice as long before retrying + retry_strategy=pw.asynchronous.ExponentialBackoffRetryStrategy(max_retries=5, initial_delay=1000, backoff_factor=2), + # if PATHWAY_PERSISTENT_STORAGE is set, then it is used to cache the calls + cache_strategy=pw.asynchronous.DefaultCache(), + model="gpt-3.5-turbo", + api_key=os.environ["OPENAI_API_KEY"], # Read OpenAI API key from environmental variables +) +responses = query.select(result=model(prompt_chat_single_qa(pw.this.prompt))) +``` + +### Creating a Pathway LLM pipeline +You can now combine these wrappers to create a LLM pipeline using Pathway. To learn how to do this, read [our tutorial](/developers/user-guide/llm-xpack/llm-app-pathway). 
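+
+For a taste of how the pieces fit together before the full tutorial, here is a minimal, self-contained sketch; the toy `queries` table and the chosen model names are placeholders for illustration only:
+
+```python
+import os
+
+import pandas as pd
+import pathway as pw
+from pathway.xpacks.llm.embedders import OpenAIEmbedder
+from pathway.xpacks.llm.llms import OpenAIChat, prompt_chat_single_qa
+
+# A toy table with a single question; in a real pipeline this would come from a connector.
+queries = pw.debug.table_from_pandas(pd.DataFrame({"prompt": ["What is Pathway?"]}))
+
+chat = OpenAIChat(model="gpt-3.5-turbo", api_key=os.environ["OPENAI_API_KEY"])
+embedder = OpenAIEmbedder(model="text-embedding-ada-002", api_key=os.environ["OPENAI_API_KEY"])
+
+# Answer each question and, separately, compute its embedding.
+answers = queries.select(answer=chat(prompt_chat_single_qa(pw.this.prompt)))
+vectors = queries.select(vector=embedder(pw.this.prompt))
+
+pw.debug.compute_and_print(answers)
+```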
+
+## Preparing documents for LLMs
+
+The Pathway xpack for LLMs provides tools for preparing your documents and texts in order to use them with LLMs. You can use [`ParseUnstructured`](/developers/api-docs/pathway-xpacks-llm/parsers/#pathway.xpacks.llm.parsers.ParseUnstructured) for parsing your documents into texts and [`TokenCountSplitter`](/developers/api-docs/pathway-xpacks-llm/splitters/#pathway.xpacks.llm.splitters.TokenCountSplitter) for dividing texts into smaller chunks.
+
+### Parsing documents
+Use the [`ParseUnstructured` class](/developers/api-docs/pathway-xpacks-llm/parsers/#pathway.xpacks.llm.parsers.ParseUnstructured) to parse documents in Pathway. Underneath, it uses the [Unstructured](https://unstructured.io/) library to parse your documents. To use it, you need to read the contents of a file into a Pathway Table using any connector of your choice. Then, apply an instance of the `ParseUnstructured` class to get a Pathway Table with the parsed content of the documents. `ParseUnstructured` has an argument `mode` which takes one of three values: `single`, `paged` or `elements`. If set to `single`, the whole document is returned as one string; if set to `paged`, there is a string for each page in the document; and if set to `elements`, Unstructured's division into elements is preserved. The `mode` argument can be set either during initialization or application of `ParseUnstructured`.
+
+```python
+import os
+import pathway as pw
+from pathway.xpacks.llm.parsers import ParseUnstructured
+
+files = pw.io.fs.read(
+    os.environ.get("DATA_DIR"),
+    mode="streaming",
+    format="binary",
+    autocommit_duration_ms=50,
+)
+parser = ParseUnstructured(mode="elements")
+documents = files.select(elements=parser(pw.this.data))
+```
+
+For each document, `ParseUnstructured` returns a list of tuples with the parsed text and [associated metadata returned from Unstructured](https://unstructured-io.github.io/unstructured/metadata.html). If you want each text string to be in a separate row of the table, use the [`flatten`](/developers/api-docs/pathway-table/#pathway.Table.flatten) function.
+
+```python
+documents = documents.flatten(pw.this.elements) # flatten list into multiple rows
+documents = documents.select(text=pw.this.elements[0], metadata=pw.this.elements[1]) # extract text and metadata from tuple
+```
+
+### Splitting texts
+Once you have some texts in a Pathway Table, you can use the [`TokenCountSplitter` class](/developers/api-docs/pathway-xpacks-llm/splitters/#pathway.xpacks.llm.splitters.TokenCountSplitter) to divide them into smaller chunks. It tries to split the text in such a way that each part has between `min_tokens` and `max_tokens` tokens, but without cutting sentences in half.
+
+`TokenCountSplitter` has three parameters - `min_tokens`, `max_tokens` and `encoding` - and each of them can be overridden during the call of the function. `min_tokens` and `max_tokens`, as mentioned above, set the minimum and maximum length of each chunk, whereas `encoding` is the name of the [tiktoken encoding](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb) to be used.
+
+```python
+from pathway.xpacks.llm.splitters import TokenCountSplitter
+
+splitter = TokenCountSplitter(min_tokens=100, max_tokens=300, encoding="cl100k_base")
+texts = documents.select(chunk=splitter(pw.this.text))
+```
+
+`TokenCountSplitter` returns data in the same format as `ParseUnstructured` - that is, for each row it returns a list of tuples, where each tuple consists of a string with the text of a chunk and a dictionary with associated metadata.
+
+With these tools it is easy to create in Pathway a pipeline serving as a Vector Store, one which updates on each data change. You can check such an example in [the llm-app repository](https://github.com/pathwaycom/llm-app/blob/main/examples/pipelines/unstructured/app.py). As it is a common pipeline, Pathway provides a [`VectorStoreServer` class](/developers/api-docs/pathway-xpacks-llm/vectorstore/#pathway.xpacks.llm.vector_store.VectorStoreServer) which implements this pipeline.
+
+## Ready-to-use Vector Store
+
+Pathway Vector Store enables building a document index on top of your documents and allows for easy-to-manage, always up-to-date LLM pipelines accessible using a RESTful API. It maintains an index of your documents and allows for querying for documents closest to a given query. It is implemented using two classes - [`VectorStoreServer`](/developers/api-docs/pathway-xpacks-llm/vectorstore/#pathway.xpacks.llm.vector_store.VectorStoreServer) and [`VectorStoreClient`](/developers/api-docs/pathway-xpacks-llm/vectorstore/#pathway.xpacks.llm.vector_store.VectorStoreClient).
+
+The `VectorStoreServer` class implements the pipeline for indexing your documents and runs an HTTP REST server for nearest neighbors queries. You can use `VectorStoreServer` by itself to use Pathway as a Vector Store, and you then query it using REST. Alternatively, use `VectorStoreClient`, which implements wrappers for the REST calls, to query `VectorStoreServer`.
+
+You can learn more about the Vector Store in Pathway in a [dedicated tutorial](/developers/user-guide/llm-xpack/vectorstore_pipeline).
+
+::shoutout-banner
+---
+href: "https://discord.gg/pathway"
+icon: "ic:baseline-discord"
+---
+#title
+Discuss tricks & tips for RAG
+#description
+Join our Discord community and dive into discussions on tricks and tips for mastering Retrieval Augmented Generation
+::
diff --git a/docs/2.developers/4.user-guide/115.llm-xpack/20.llm-app-pathway.md b/docs/2.developers/4.user-guide/115.llm-xpack/20.llm-app-pathway.md
new file mode 100644
index 00000000..ef5016db
--- /dev/null
+++ b/docs/2.developers/4.user-guide/115.llm-xpack/20.llm-app-pathway.md
@@ -0,0 +1,349 @@
+---
+title: 'Build an LLM App'
+description: 'Querying a live document database with LLMs and Pathway'
+author: 'mohamed'
+aside: true
+article:
+  date: '2023-07-20'
+  thumbnail: '/assets/content/showcases/llm-app/vector-index-th.png'
+  tags: ['showcase', 'llm']
+keywords: ['LLM', 'RAG', 'AWS S3', 'Slack', 'GPT', 'OpenAI', 'KNN', 'HTTP connector', 'unstructured']
+---
+
+View in Github
+
+# Build an LLM App with Pathway
+
+In this blog series, learn how to construct a dynamic, real-time LLM App using Pathway. Explore key features like real-time document indexing from S3 storage and other solutions, adaptive learning from updated documentation, and managing user sessions. Dive into this exciting combination of technologies that brings a responsive, knowledge-growing application to life.
+
+
+Pathway makes handling realtime data easy.
In this showcase we are going to demonstrate how Pathway can be used to build a chatbot answering questions about the Pathway documentation. This interactive application will exhibit dynamic adaptability to changing data sources: +1. User queries, to which responses must be generated in realtime, +2. Documentation entries, which should be incrementally re-indexed after each change. + +The app in action can be seen in the video below: + +::article-img +--- +src: '/assets/content/showcases/llm-app/video-th.png' +alt: 'Build your LLM App without a vector database (in 30 lines of code)' +class: 'mx-auto' +href: 'https://www.youtube.com/watch?v=kcrJSk00duw' +--- +:: + + +The LLM (Large Language Model) App first reads a corpus of documents stored in S3. It preprocesses them and builds a vector index. It then listens to user queries coming as HTTP REST requests. Each query uses the index to retrieve relevant documentation snippets and uses the OpenAI API to provide a response in natural language. The bot is reactive to changes to the corpus of documents: once new snippets are provided, it reindexes them and starts to use the new knowledge to answer subsequent queries. + + +::article-img +--- +src: '/assets/content/showcases/llm-app/llm-architecture.png' +alt: 'Illustration of LLMs in Pathway' +class: 'mx-auto' +zoomable: true +--- +:: + +In this series of posts we will explain below how Pathway can be used to: +- Implement a microservice that responds in realtime to HTTP REST queries +- Implement a realtime document indexing pipeline directly reading data from S3-compatible storage, without having to query a vector document database +- Extend the query building process to handle user sessions and beta tests for new models +- Reuse exactly the same code for offline evaluation of the system. + +For an in-depth exploration of our app's underlying code, visit our GitHub repository at [llm-app](https://github.com/pathwaycom/llm-app). Ensure you install the necessary dependencies with `poetry` by following the steps on the README file before diving in. We value transparency and collaborative learning, and invite you to explore and contribute to this open-source platform. + +::shoutout-banner +--- +href: "/solutions/rag-pipelines" +icon: "heroicons:code-bracket-square-solid" +--- +#title +Realtime Document AI pipelines +#description +Get started with Pathway Realtime Document AI pipelines with our step-by-step guide, from setup to live document sync. Explore built-in features like Similarity Search, Vector Index, and more! +:: + +## Warmup: answering user queries without context +The simplest way to get started with a conversational AI model using Pathway is to create an application that answers user queries without any context. This application will leverage a RESTful API and apply a Large Language Model. + +**Key Insights from This Section** +- How to use a REST connector. +- Apply an LLM or any custom model on a user query. + + +The corresponding code can be located in the `examples/pipelines` directory. Now, we'll proceed with importing `Pathway`. + + +```python [contextless/app.py] +import os +import pathway as pw +from pathway.xpacks.llm.embedders import OpenAIEmbedder +from pathway.xpacks.llm.llms import OpenAIChat, prompt_chat_single_qa +``` + + +```python [contextless/app.py] + +# REST Connector config. 
+HTTP_HOST = os.environ.get("PATHWAY_REST_CONNECTOR_HOST", "127.0.0.1") +HTTP_PORT = os.environ.get("PATHWAY_REST_CONNECTOR_PORT", "8080") + +# LLM model parameters +# For OPENAI API +API_KEY = os.environ["OPENAI_API_KEY"] +# Specific model from OpenAI. You can also use gpt-3.5-turbo for faster responses. +MODEL_LOCATOR = "gpt-4" +# Controls the stochasticity of the openai model output. +TEMPERATURE = 0.0 +# Max completion tokens +MAX_TOKENS = 50 +``` + +Firstly, we define the input schema for our application. This is done using pw.Schema, which helps to enforce the structure of the data being processed by Pathway. Our schema, QueryInputSchema, expects a query (the question or prompt from the user) and a user (the identifier for the user). Then, we establish a RESTful connection using `pw.io.http.rest_connector`. + + +```python [contextless/app.py] +class QueryInputSchema(pw.Schema): + query: str + user: str + + +query, response_writer = pw.io.http.rest_connector( + host=HTTP_HOST, + port=int(HTTP_PORT), + schema=QueryInputSchema, + autocommit_duration_ms=50, +) +``` + +Here, `query` will be a Pathway stream that receives input from HTTP requests. `response_writer` is a function that we can use to write responses back to the HTTP client. +We can now construct the main query pipeline for our application. The model to use here is GPT4 from OpenAI API. + + +```python [contextless/app.py] +model = OpenAIChat( + api_key=API_KEY, + model=MODEL_LOCATOR, + temperature=TEMPERATURE, + max_tokens=MAX_TOKENS, + retry_strategy=pw.asynchronous.FixedDelayRetryStrategy(), + cache_strategy=pw.asynchronous.DefaultCache(), +) + +response = query.select( + query_id=pw.this.id, result=model(prompt_chat_single_qa(pw.this.query)) +) + +response_writer(response) +pw.run() +``` +```shellscript +poetry run ./run_examples.py contextless +``` +On a different terminal: +```shellscript +curl --data '{"user": "user", "query": "How to connect to Kafka in Pathway?"}' http://localhost:8080/ +``` + +::article-img +--- +src: '/assets/content/showcases/llm-app/contextless_curl.png' +alt: 'Run LLM on a user query.' +class: 'mx-auto' +--- +:: + + +## Context Enhancement for Better Responses + +Despite `GPT-4`'s extensive training, it may not recognize certain context-specific elements, such as Pathway documentation. The solution lies in adding pertinent documents to the context. This is where the role of a vector database becomes crucial. + + +**Essential Learnings from This Section** + +- Use an S3 input connector in Pathway. +- Generating vector embeddings using an LLM. +- Creating a k-Nearest Neighbors (k-NN) powered Index. +- Expanding the bot's capabilities to respond to user queries. + + +In our illustration, we'll consider a scenario where documents are stored in JSON Lines files within an AWS S3 bucket, though it could equally apply to a local directory using [`jsonlines`](/developers/api-docs/pathway-io/jsonlines/) reader. Each document is represented as a separate line within these files. The JSON Lines format is particularly advantageous for managing large data sets that cannot fit into memory all at once. Each line in a JSON Lines file contains a separate, independent JSON object. This makes the format especially suitable for handling and streaming large data, as it doesn't require loading the entire files into memory. + + +For each document and each query, we calculate embeddings using a pre-trained language model. 
These embeddings are numerical representations of the documents and they are used to find the documents that are most relevant to each query. Pathway offers API integration with premier LLM service providers, including but not limited to OpenAI and HuggingFace. You can import the model interface for the provider of your choice, specify the api key and the model id to call. By default the embedder is `text-embedding-ada-002` from OpenAI which returns vectors of dimension `1536`. Please check out [openai-model-endpoint-compatibility](https://platform.openai.com/docs/models/model-endpoint-compatibility) for more information on the available models. + + +```python [contextful/app.py] +from pathway.stdlib.ml.index import KNNIndex + + +EMBEDDER_LOCATOR = "text-embedding-ada-002" +EMBEDDING_DIMENSION = 1536 + +embedder = OpenAIEmbedder( + api_key=api_key, + model=EMBEDDER_LOCATOR, + retry_strategy=pw.asynchronous.FixedDelayRetryStrategy(), + cache_strategy=pw.asynchronous.DefaultCache(), +) + + +class DocumentInputSchema(pw.Schema): + doc: str + +documents = pw.io.s3.read( + "llm_demo/data/", + aws_s3_settings=pw.io.s3.AwsS3Settings( + bucket_name="pathway-examples", + region="eu-central-1", + ), + format="json", + schema=DocumentInputSchema, + mode="streaming" +) + + +enriched_documents = documents + documents.select( + data=embedder(text=pw.this.doc) +) + +query += query.select( + data=embedder(text=pw.this.query), +) + + +``` + + | query | data + ^X1MXHYY... | How to connect to Kafka in Pathway? | [-0.00027798660448752344, 0.0035375410225242376, -0.00889557134360075...] + + +To achieve efficient retrieval of relevant documents, we leverage the power of [KNN (K-Nearest Neighbors)](/developers/showcases/lsh/lsh_chapter1) indexing. By constructing an index using the generated embeddings, the KNN model allows us to quickly identify the documents that bear the most similarity to a given query. This technique is significantly faster and more efficient than conducting individual comparisons between the query and every document. + + +```python [contextful/app.py] +index = KNNIndex(enriched_documents, d=EMBEDDING_DIMENSION) + +query_context = query + index.get_nearest_items( + query.data, k=3, collapse_rows=True + ).select(documents_list=pw.this.doc) + +``` + + | query | documents_list + ^X1MXHYY... | How to connect to Kafka in Pathway? | ('The documentation describes a ...', 'The pw.io.debezium.read() func...', 'This documentation lists the a...') + + +By implementing the `build_prompt` function, we consolidate the query and associated documents into one coherent string, allowing the model to use the given documents for contextual understanding when generating its response. This procedure also provides an opportunity to include specific directives and guidelines for the Large Language Model (LLM) to adhere to. + + +```python [contextful/app.py] +@pw.udf +def build_prompt(documents, query) -> str: + docs_str = "\n".join(documents) + prompt = ( + f"Given the following documents : \n {docs_str} \nanswer this query: {query}" + ) + return prompt + + +prompt = query_context.select( + prompt=build_prompt(pw.this.documents_list, pw.this.query) +) + + +``` + + | prompt + ^X1MXHYY... | Given the following documents... + + +Ultimately, we invoke the `GPT-4` model with these thoughtfully crafted prompts and observe the sophistication of its generated responses. 
+ + +```python [contextful/app.py] +response = prompt.select( + query_id=pw.this.id, + result=model( + pw.this.prompt, + ), +) + +response_writer(response) +pw.run() +``` + +```shellscript +poetry run ./run_examples.py contextful_s3 +``` + +```shellscript +curl --data '{"user": "user", "query": "How to connect to Kafka in Pathway?"}' http://localhost:8080/ +``` + +::article-img +--- +src: '/assets/content/showcases/llm-app/contextful_curl.png' +alt: 'Query result after using pathway index.' +class: 'mx-auto' +--- +:: + +## Real-time Adaptability: Automatic Updates with Pathway + +A remarkable feature of Pathway is its automatic adaptability to changes. This feature makes Pathway an effective and efficient tool for real-time document indexing and query answering. + +Once you have preprocessed your corpus and created the index, Pathway automatically detects any changes in the document directory and updates the vector index accordingly. This real-time reactivity ensures that app's responses are always based on the most recent and relevant information available. + +Let's put this feature to the test. Consider a scenario where you initially query the system with "How to run large language models with Pathway?". Since the bot doesn't have any context about LLMs in Pathway, it wouldn't provide a satisfactory response at this point. + +```shellscript +curl --data '{"user": "user", "query": "How to use LLMs in Pathway?"}' http://localhost:8080/ +``` + +::article-img +--- +src: '/assets/content/showcases/llm-app/contextful_react_pre_curl.png' +alt: 'Query before adding new docs to data source.' +class: 'mx-auto' +--- +:: + +Next, we add some additional documents which provide context about Pathway to our S3 bucket. + +```shellscript +aws s3 cp documents_extra.jsonl s3://pathway-examples/llm_demo/data/ +``` + +Now, when you query the system with the same question again, Pathway automatically detects the newly added documents, updates the vector index, and the bot can provide a more appropriate response. + +```shellscript +curl --data '{"user": "user", "query": "How to use LLMs in Pathway?"}' http://localhost:8080/ +``` + + +::article-img +--- +src: '/assets/content/showcases/llm-app/contextful_react_post_curl.png' +alt: 'Query after adding new docs to data source.' +class: 'mx-auto' +--- +:: + +This real-time adaptability of Pathway is truly a game-changer when it comes to keeping your AI models updated with the latest data. + +At this point, you should have a complete pipeline that not only sifts continuously through your document database to find the most relevant documents for a given query but also calls upon a Generative AI model to generate a detailed and coherent response based on these relevant documents. + +The power of Pathway lies in its flexibility and robustness - you can tweak this pipeline to suit a variety of other applications, from customer support to medical literature review. The possibilities are truly endless. 
+ +::shoutout-banner +--- +href: "https://discord.gg/pathway" +icon: "ic:baseline-discord" +--- +#title +Discuss tricks & tips for RAG +#description +Join our Discord community and dive into discussions on tricks and tips for mastering Retrieval Augmented Generation +:: diff --git a/docs/2.developers/4.user-guide/115.llm-xpack/30.vectorstore_pipeline.md b/docs/2.developers/4.user-guide/115.llm-xpack/30.vectorstore_pipeline.md new file mode 120000 index 00000000..7aa379aa --- /dev/null +++ b/docs/2.developers/4.user-guide/115.llm-xpack/30.vectorstore_pipeline.md @@ -0,0 +1 @@ +.vectorstore_pipeline/article.md \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/115.llm-xpack/_dir.yml b/docs/2.developers/4.user-guide/115.llm-xpack/_dir.yml new file mode 100644 index 00000000..e9c85187 --- /dev/null +++ b/docs/2.developers/4.user-guide/115.llm-xpack/_dir.yml @@ -0,0 +1 @@ +title: 'LLM xpack' \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/120.development/10.troubleshooting.md b/docs/2.developers/4.user-guide/120.development/10.troubleshooting.md new file mode 100644 index 00000000..c939e84b --- /dev/null +++ b/docs/2.developers/4.user-guide/120.development/10.troubleshooting.md @@ -0,0 +1,125 @@ +--- +title: 'Troubleshooting' +description: '' +--- + +# Troubleshooting +This page provides a guide to common issues encountered when using a Pathway. + +As with any powerful tool, there is a learning curve to using Pathway effectively. +This guide will outline some of the most common problems developers encounter when working with Pathway, along with tips and best practices for avoiding these pitfalls. +Whether you are new to Pathway or an experienced user, this guide will help you to optimize your workflow and avoid common mistakes. +By following these guidelines, you can save yourself time and frustration and get the most out of Pathway. + +So let's get started! + +:article-toc + +## Package versioning +If a module is missing or you encounter an issue reproducing the examples displayed on the website, you will likely need the latest version of Pathway. +The best solution is to reinstall it. + +⚠️ Pathway requires Python 3.10 or higher and runs on Linux and MacOS. + +Possible error messages: +- `This is not the real Pathway package.` +- `ModuleNotFoundError: No module named 'pathway.stdlib'` + +Solution: +- Reinstall Pathway by uninstalling it first: +```bash +pip uninstall pathway && pip install pathway +``` +- If the above solution does not work, try installing Pathway with `--force-reinstall`: +```bash +pip install --force-reinstall pathway +``` + +You can access your Pathway version using `pw.__version_` or `pathway --version` in the CLI. + +⚠️ Windows is currently not supported. + +### Docker on MacOS +When using docker on MacOS, make sure to have a linux/x86_64 platform: +- `FROM --platform:linux/x86_64 python:3.10` + +## Windows users + +⚠️ Pathway is currently not supported on Windows. +Windows users may want to use Windows Subsystem for Linux (WSL), docker, or a VM. + +You can also try these steps in an online notebook environment like [Colab](https://colab.research.google.com/). + + +## Nothing happens / missing output + +### The application terminates. + +You launch your Pathway application, and nothing happens: no outputs, no errors. +The application terminates without error, but the expected output, whether stdout or a CSV file, is empty. + +Your application likely builds the dataflow, but doesn't launch a computation using it. 
You need to trigger the computation: + - streaming mode: use `pw.run()` (in addition to the use of output connectors), + - static mode: print your table using `pw.debug.compute_and_print`. + +Explanation: + - Pathway's operators are used to build a pipeline modeled by a [dataflow](/developers/user-guide/introduction/concepts#dataflow). + The data is ingested only when the computation is started. + In the streaming mode, the computation is launched using `pw.run()`, while in the static mode, the computation is triggered at each output connector call. + See our [article](/developers/user-guide/connecting-to-data/streaming-and-static-modes/) about streaming and static modes for more details. + + ### The application does not terminate + +Similarly to above, you launch your Pathway application, and nothing happens. +The application is running, there is no error but, at the same time, the output remains empty. + +You input data is likely to be at fault: + - is your incoming data stream on? + - are you correctly connected to it? +For example, let's suppose you have a data stream with a column colA. +If you are connecting to this data source with an incorrect schema, expecting a colB instead of colA for instance, +the data may be coming but it will not be ingested as the system expects values for colB and not colA. +This will not trigger any error as the connection is working, expecting correct entries. + + +## Different universes + +Error message: +- `ValueError: universes do not match` + +Explanation: + - The error comes from the combination of two tables with different universes. + The universe is the set of indexes of each table. + Some operations, such as [`update_cells`](/developers/api-docs/pathway/#update_cellsother) or [`update_rows`](/developers/api-docs/pathway/#update_rowsother), require the universes to be identical, or at least one should be a subset of the other. + You can read our [article](/developers/user-guide/diving-deeper/id-universes) about universes. + Pathway will raise an error when it is impossible to infer whether two tables have the same universe. + You can manually assert that the two universes are compatible with `unsafe_promise_same_universe_as` or `unsafe_promise_universe_is_subset_of`. + +Solutions: +- You can force the given operation by giving a manual guarantee that the universe will be the same: +```python +T1=T1.unsafe_promise_same_universe_as(T2) +# OR +T1=T1.unsafe_promise_universe_is_subset_of(T2) +``` + + + +# Still blocked? + + +We hope this guide has helped identify and avoid common mistakes when using Pathway. +Our team is always happy to help you find a solution to your problem and ensure that you get the most out of Pathway. +If you have any questions or encounter issues not covered in this guide, don't hesitate to get in touch with us on our [Discord channel](https://discord.com/invite/pathway). diff --git a/docs/2.developers/4.user-guide/120.development/30.get-help.md b/docs/2.developers/4.user-guide/120.development/30.get-help.md new file mode 100644 index 00000000..21f6a0fc --- /dev/null +++ b/docs/2.developers/4.user-guide/120.development/30.get-help.md @@ -0,0 +1,97 @@ +--- +title: Get help +description: 'This page references where to look for help if you have an issue using Pathway' +--- + +::landing-section +#title +How can we help you? + +#default +We're here to help you make the most of your experience. Whether you're a beginner or an experienced developer, this page provides resources to assist you with common issues and inquiries. 
+ + ::card-grid + #default + ::landing-card + --- + icon: 'heroicons:bolt-solid' + button: + text: 'Get Pathway' + href: '/developers/user-guide/introduction/welcome/' + --- + #title + Getting started + + #default + Trouble installing Pathway? Take a look a our installation guide. + :: + ::landing-card + --- + icon: 'heroicons:book-open-20-solid' + button: + text: 'See the API docs' + href: '/developers/api-docs/pathway/' + --- + #title + API docs. + + #default + Wondering how to use a specific feature of Pathway? The answer to your question is likely in the API docs. + :: + ::landing-card + --- + icon: 'heroicons:wrench-screwdriver-20-solid' + button: + text: 'See more' + href: '/developers/user-guide/development/troubleshooting/' + --- + #title + Troubleshooting + + #default + You got an error? Try to debug your code with the troubleshooting guide. + :: + :: + ::card-grid + #default + ::landing-card + --- + icon: 'uil:github' + button: + text: 'Open a GitHub issue' + href: 'https://github.com/pathwaycom/pathway/issues' + --- + #title + GitHub issues + + #default + If you've identified a bug, have a feature request, or need technical support, open an issue on GitHub. + :: + ::landing-card + --- + icon: 'uil:discord' + button: + text: 'Join us on Discord' + href: 'https://github.com/pathwaycom/' + --- + #title + Connect with Us on Discord + + #default + Join our community on Discord to connect with other users and get real-time assistance from our team. + :: + ::landing-card + --- + icon: 'heroicons:chat-bubble-left-ellipsis-20-solid' + button: + text: 'Send us an email' + href: 'mailto:contact@pathway.com' + --- + #title + Contact us + + #default + For private inquiries or matters that are not suitable for public discussion, feel free to reach out to us via email. + :: + :: +:: \ No newline at end of file diff --git a/docs/2.developers/4.user-guide/120.development/_dir.yml b/docs/2.developers/4.user-guide/120.development/_dir.yml new file mode 100644 index 00000000..ccdbbacb --- /dev/null +++ b/docs/2.developers/4.user-guide/120.development/_dir.yml @@ -0,0 +1 @@ +title: 'Help And Updates' diff --git a/docs/2.developers/7.showcases/.adaptive-rag/.gitignore b/docs/2.developers/7.showcases/.adaptive-rag/.gitignore new file mode 100644 index 00000000..6abfbd0a --- /dev/null +++ b/docs/2.developers/7.showcases/.adaptive-rag/.gitignore @@ -0,0 +1 @@ +adaptive-rag-contexts.jsonl* diff --git a/docs/2.developers/7.showcases/.adaptive-rag/__init__.py b/docs/2.developers/7.showcases/.adaptive-rag/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.adaptive-rag/article.py b/docs/2.developers/7.showcases/.adaptive-rag/article.py new file mode 100644 index 00000000..54c90c8d --- /dev/null +++ b/docs/2.developers/7.showcases/.adaptive-rag/article.py @@ -0,0 +1,332 @@ +# --- +# title: 'Cheap RAGs up for grabs: How we cut LLM costs without sacrificing accuracy?' +# description: 'Cheap RAGs up for grabs: How we cut LLM costs without sacrificing accuracy?' 
+# aside: true +# article: +# thumbnail: '/assets/content/blog/adaptive-rag-plots/visual-abstract.png' +# tags: ['showcase', 'llm'] +# date: '2024-03-28' +# related: false +# notebook_export_path: notebooks/showcases/adaptive_rag_question_answering.ipynb +# author: +# name: Jan Chorowski +# description: CTO +# img: "/assets/authors/jan-chorowski.jpg" +# linkedin: "https://www.linkedin.com/in/janchorowski/" +# extra_info: joint work with Jacek Kowalski, Szymon Dudycz +# keywords: ['LLM', 'RAG', 'Adaptive RAG', 'prompt engineering', 'prompt', 'explainability'] +# --- + +# # Cheap RAGs up for grabs: How we cut LLM costs without sacrificing accuracy? + +# ![Visual abstract](/assets/content/blog/adaptive-rag-plots/visual-abstract.png) +# + +# **TLDR**: We demonstrate how to dynamically adapt the number of documents in a RAG prompt using feedback from the LLM. This allows a 4x cost reduction of RAG LLM question answering while maintaining good accuracy. We also show that the method helps explain the lineage of LLM outputs. We provide runnable code examples implemented in the [Pathway](https://pathway.com/developers) [LLM expansion pack](/developers/user-guide/llm-xpack/overview), feel free to [skip directly to the code](#trying-it-out). + +#
+
#
# What are blue boxes?
#
# Blue boxes, such as this one, provide additional insights which can be skipped. However, they are all eager to be read!
#
+ + +# ## Introduction +# +# Retrieval Augmented Generation (RAG) allows Large Language Models (LLMs) to answer questions based on knowledge not present in the original training set. At [Pathway](pathway.com) we use RAG to build [document intelligence solutions](/solutions/rag-pipelines) that answer questions based on private document collections, such as a repository of legal contracts. We are constantly working on improving the accuracy and explainability of our models while keeping the costs low. In this blog post, we share a trick that helped us reach those goals. +# +# A typical RAG Question Answering procedure works in two steps. First the question is analyzed and a number of relevant documents are retrieved from a database, typically using a similarity search inside a vector space created by a neural embedding model. Second, retrieved documents are pasted, along with the original question, into a prompt which is sent to the LLM. Thus, the LLM answers the question within a relevant context. +# +# Practical implementations of the RAG procedure need to specify the number of documents put into the prompt. A large number of documents increases the ability of the LLM to provide a correct answer, but also increases LLM costs, which typically grow linearly with the length of the provided prompt. The prompt size also influences model explainability: retrieved context documents explain and justify the answer of the LLM and the fewer context documents are needed, the easier it is to verify and trust model outputs. +# +# Thus the context size, given by the number of considered documents in a RAG setup, must be chosen to balance costs, desired answer quality, and explainability. However, can we do better than using the same context size regardless of the question to be answered? Intuitively, not all questions are equally hard and some can be answered using a small number of supporting documents, while some may require the LLM to consult a larger prompt. We can confirm this by running a question answering experiment. +# + +#
+# +# Experiment details and prompt +# +# +# We base our evaluation on the Stanford Question Answering Dataset ([SQUAD](https://rajpurkar.github.io/SQuAD-explorer/)) which we convert into a RAG open-domain question answering task. We take the first 50 Wikipedia articles that form the SQUAD dataset. As in SQUAD, each article is split into paragraphs and we select for testing one SQUAD question for each paragraph, which yields nearly 2500 tuples containing a paragraph, a query, and a ground-truth answer. +# +# We form a document knowledge base from all wikipedia paragraphs indexed using their vector representations computed using [OpenAI ADA-002](https://platform.openai.com/docs/guides/embeddings) neural embedding model. To answer a question we vectorize the question using ADA-002 embedder, then select closest paragraphs according to the cosine distance. We use the retrieved paragraphs to construct the following prompt inspired by LlamaIndex [QueryCitationEngine](https://docs.llamaindex.ai/en/stable/examples/query_engine/citation_query_engine/): +# +# ``` +# Use the below articles to answer the subsequent question. +# If the answer cannot be found in the articles, write +# "I could not find an answer." Do not answer in full sentences. +# When referencing information from a source, cite the appropriate +# source(s) using their corresponding numbers. Every answer should +# include at least one source citation. +# +# Only cite a source when you are explicitly referencing it. For example: +# "Source 1: +# The sky is red in the evening and blue in the morning. +# Source 2: +# Water is wet when the sky is red. +# Query: When is water wet? +# Answer: When the sky is red [2], which occurs in the evening [1]." +# Now it's your turn. +# ``` +# +# Unless stated otherwise, all results use gpt-3.5-turbo in the gpt-3.5-turbo-0613 variant. +# +# The LLM can respond using a paraphrase of the ground-truth answer. To properly score paraphrases we canonicalize answers by removing using [nltk](https://www.nltk.org/) all stopwords and stemming all remaining words. We then count an answer as correct if all remaining stemmed words in the ground truth are found in the LLM RAG response. While this method yields some false negatives, we found it to be simple and robust enough to tune RAG question answering systems. +#
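#
# To make the scoring rule in the box above concrete, here is a rough, hypothetical sketch of the canonicalization step; it assumes nltk's English stopword list and the Porter stemmer, which approximate (but need not exactly match) the evaluation code:
#
# ```python
# import re
#
# from nltk.corpus import stopwords  # requires nltk.download("stopwords")
# from nltk.stem import PorterStemmer
#
# _stemmer = PorterStemmer()
# _stopwords = set(stopwords.words("english"))
#
#
# def canonicalize(text: str) -> set[str]:
#     # Lowercase, keep alphanumeric tokens, drop stopwords, stem the rest.
#     words = re.findall(r"[a-z0-9]+", text.lower())
#     return {_stemmer.stem(w) for w in words if w not in _stopwords}
#
#
# def is_correct(ground_truth: str, llm_answer: str) -> bool:
#     # Count an answer as correct if every stemmed ground-truth word occurs in the response.
#     return canonicalize(ground_truth) <= canonicalize(llm_answer)
# ```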
+ +# The chart below shows a typical relation between accuracy and supporting context size for a RAG question answering system using a budget friendly LLM (e.g. gpt-3.5-turbo): + +# ![Accuracy of base RAG](/assets/content/blog/adaptive-rag-plots/base-rag-accuracy.png) + +# We can notice a few things. First, even with one supporting document the model is right 68% of the time. The accuracy rises by more than 10 percentage points to nearly 80% when we provide it with 10 context documents. We then see diminishing returns: increasing the prompt size to 50 documents brings the accuracy just above 82%. With 100 context documents the model is getting slightly worse. The LLM has trouble finding the correct information in the large prompt provided to it, this phenomenon has been [widely observed](https://arxiv.org/abs/2307.03172) and LLM providers are actively working on expanding both [supported](https://www.anthropic.com/news/100k-context-windows) [context](https://openai.com/blog/new-models-and-developer-products-announced-at-devday) [sizes](https://blog.google/technology/ai/google-gemini-next-generation-model-february-2024/#gemini-15) and the [accuracy](https://twitter.com/AnthropicAI/status/1656700156518060033?lang=en) of [information](https://github.com/gkamradt/LLMTest_NeedleInAHaystack) [retrieval](https://twitter.com/JeffDean/status/1758146211029405951) from [context](https://twitter.com/alexalbert__/status/1764722513014329620). +# +# We can get further insights into the operation of the RAG LLM by performing error analysis: categorizing each error as a `do not know` if the LLM refuses to provide an answer, or a `hallucination` if the LLM provides a wrong answer: + +# ![Accuracy of base RAG](/assets/content/blog/adaptive-rag-plots/mistake-analysis.png) + +# We can see two opposite trends: as the number of supporting documents grows, we get fewer `Do not know`s. However, at the same time, the number of hallucinated answers slightly raises. Those trends can be explained with the following hypothesis: The more context we provide to the LLM, the higher chances there are that the model will retrieve some information (instead of answering with a `Do not know`), but a larger prompt may also confuse the model and promote wrong answers (hallucinations). + +#
+# +# Examples of model behavior +# +# Further insight may be gained from analyzing model errors on a few concrete examples. We will look at the answers when the LLM is provided with a varying number of context documents: two (documents 0-1), five (documents 0-4), ten, and twenty. The table is not a representative sample of model behavior: we only list questions on which the model answered wrong with 2 context documents. +# +# We see that for many questions the LLM knows the correct response after seeing sufficiently many context documents. Moreover, it indicates that the smaller context is insufficient. We will use this to formulate an Adaptive RAG strategy next. +# +#

# | Question | Ground truth SQUAD answer | Answer for 2 context docs | Answer for 5 context docs | Answer for 10 context docs | Answer for 20 context docs |
# |---|---|---|---|---|---|
# | What was undertaken in 2010 to determine where dogs originated from? | extensive genetic studies | Genetic studies were undertaken in 2010 to determine where dogs originated from [0]. | Genetic studies were undertaken in 2010 to determine where dogs originated from [0]. | Genetic studies were undertaken in 2010 to determine where dogs originated from [1]. | Extensive genetic studies were undertaken in 2010 to determine where dogs originated from [0]. |
# | When it is burned what does hydrogen make? | water | I could not find an answer. | Water [4] | Water [2] | Water [2]. |
# | What fraction of New Yorkers in the private sector are employed by foreign companies? | One out of ten | I could not find an answer. | One out of ten private sector jobs in New York City is with a foreign company [0]. | One out of ten private sector jobs in New York City is with a foreign company [0]. | One out of ten private sector jobs in New York City is with a foreign company [0]. |
# | Deshin Shekpa was the head of what school? | the Karma Kagyu school | I could not find an answer. | I could not find an answer. | Karma Kagyu [3] | I could not find an answer. |
# | Middle Devonian limestone exists between Plymstock and what location? | Cremyll | I could not find an answer. | West Hoe [0] | West Hoe [0] | I could not find an answer. |
# | How much precipitation on average falls within the city? | 35.9 inches | The city receives 49.9 inches (1,270 mm) of precipitation annually [0]. | The city receives about 35.9 inches (91.2 cm) of precipitation annually [2]. | The city receives about 35.9 inches (91.2 cm) of precipitation annually [2] | The city receives about 35.9 inches (91.2 cm) of precipitation annually [2]. |
# | Who was added to the tour in season 12? | a semi-finalist who won a sing-off | I could not find an answer. | I could not find an answer. | A semi-finalist who won a sing-off [6]. | A semi-finalist who won a sing-off was added to the tour in season 12 [6]. |
# | What is the name of the Buddhist doctrine that states that phenomena arise together in a mutually interdependent web of cause and effect? | pratītyasamutpāda, (Sanskrit; Pali: paticcasamuppāda; Tibetan Wylie: rten cing 'brel bar 'byung ba; Chinese: 緣起) | The Buddhist doctrine that states that phenomena arise together in a mutually interdependent web of cause and effect is called pratītyasamutpāda [0]. | The doctrine is called pratītyasamutpāda [0]. | The doctrine is called pratītyasamutpāda [0] | The doctrine is called pratītyasamutpāda [0]. |
# | Which actress carried the torch for part of the route? | Sulaf Fawakherji | Gabriela Sabatini [1] | I could not find an answer. | Sulaf Fawakherji [8] | Maria Nafpliotou [10] |
# | Who provides a virtual server service? | Internet hosting services | I could not find an answer. | A virtual server service is provided by internet hosting services [0] | A virtual server service is provided by internet hosting services [0]. | I could not find an answer. |
#
+
# ## Adaptive RAG
# We can use the model’s refusal to answer questions as a form of model introspection, which enables an adaptive RAG question answering strategy:
#
# ::card
# #title
# Adaptive RAG Idea
# #description
# Ask the LLM with a small number of context documents. If it refuses to answer, repeat the question with a larger prompt.
# ::
#
# This RAG scheme adapts to the hardness of the question and the quality of the retrieved supporting documents using the feedback from the LLM - for most questions a single LLM call with a small prompt is sufficient, and there is no need for auxiliary LLM calls to e.g. guess an initial supporting document count for a question. However, a fraction of questions will require re-asking or re-re-asking the LLM.
#
# ### How to expand the prompt? Linearly? Exponentially?
#
# To turn the adaptive RAG intuition into a practical system we need to specify some vital details. For starters, a good prompt expansion scheme must balance:
#
# - low typical question cost: more than 60% of questions need only one supporting document
# - low hard question cost: the LLM will be called multiple times to answer a hard question. The whole cost of handling the question should be low
# - acceptable latency on hard questions: the number of LLM answer retries should grow slowly with question difficulty
#
# These design criteria can be met by expanding the prompt according to a geometric series, such as doubling the number of documents in the prompt on each retry. It has the following advantages:
# - We can start with just one supporting document: easy questions are answered fast and cheap.
# - The number of sequential LLM calls (latency) grows logarithmically (read: slowly) with the final supporting context size. With a prompt doubling strategy only 6 rounds are needed to reach the accuracy plateau in Figure 1 with $64$ supporting documents.
# - The total cost of repeated calls to the LLM with expanding prompts only grows linearly. The summed count of documents in the 6 prompts needed to reach 64 supporting documents is $1+2+4+\dots+64 = 127 = 2\cdot 64 - 1$. For math nerds, recall that for an expansion factor $k$ the total cost of geometric LLM retries is $\sum_{i=0}^{n} k^i = \frac{k\cdot k^n - 1}{k-1} \leq \frac{k}{k-1}k^n$. In other words, the total cost is only a factor of $\frac{k}{k-1}$ larger than the cost of doing the last LLM query on the large context needed to answer the question.
#
# Of course, the cost-vs-latency tradeoff can be further tuned by choosing a different prompt expansion factor. In contrast, a linear prompt expansion strategy which grows the prompt by a constant number of documents (e.g. first use 5, then 10, then 15, then 20, ...) will result in the latency growing linearly (read: much faster than logarithmically) and the LLM cost growing quadratically (again, much faster than linearly) with the final prompt size!
#
# ### How should subsequent prompts relate to each other?
#
# This design question is less intuitive. Suppose the LLM didn't find an answer when provided with the $K$ most relevant documents from the knowledge base. The next prompt could:
# - include the $K$ documents which were not sufficient along with the next $K$ documents (overlapping prompts strategy), or
# - ignore the $K$ insufficient documents and instead try the next $2K$ documents (non-overlapping prompts strategy).
#
# The overlapping prompts strategy is closer to the original experiment with RAG performance over different context sizes.
# With a doubling expansion strategy, the $i$-th call to the LLM sees the $2^i$ most relevant documents. The non-overlapping prompts strategy at first sight is cheaper: the $i$-th call to the LLM sees documents $2^i, \dots, 2\cdot 2^i - 1$. However, this means that the LLM sees more and more irrelevant documents.
#
# To answer this question, we experimentally compare the two adaptive strategies along with the base RAG.
# Both adaptive RAG strategies were tested up to 100 retrieved documents.
#
# ![Comparison of accuracy of Adaptive RAG and Base RAG](/assets/content/blog/adaptive-rag-plots/adaptive-comparison-accuracy.png)
#
# We see that the accuracy of base RAG and the overlapping expanding prompt strategy is very similar up to about 20 retrieved documents (at 50 consulted articles the baseline RAG gets 1 percentage point better). However, the cost versus accuracy plot below clearly indicates that adaptive RAG is significantly cheaper, which in our opinion makes up for the slightly lower accuracy, similarly to the wide preference for using smaller or `turbo` models to save on running costs.
#
# In contrast, the non-overlapping prompt creation strategy is clearly less accurate. This shows that the LLM needs the most relevant documents even if they are not sufficient by themselves to formulate a confident answer.
#
# ![Comparison of efficiency of Adaptive RAG and Base RAG](/assets/content/blog/adaptive-rag-plots/adaptive-comparison-efficiency.png)
#
# The cost vs accuracy plot clearly indicates that the two adaptive RAGs are significantly more efficient than the basic variant, despite being able to consult more articles if needed. The non-overlapping adaptive RAG strategy is less accurate: even after using all 100 retrieved context documents, it cannot reach the peak performance of the overlapping prompt creation strategy. This settles the second design decision :).
#
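#
# To make the chosen strategy concrete, here is a rough sketch of the overlapping geometric expansion loop in plain Python; `retrieve` and `ask_llm` are hypothetical stand-ins for the retriever and the RAG prompt call, not Pathway APIs (a ready-made Pathway implementation is used in the "Trying it out" section below):
#
# ```python
# def answer_with_geometric_retries(question, retrieve, ask_llm, n_start=1, factor=2, max_rounds=6):
#     n_docs = n_start
#     for _ in range(max_rounds):
#         # Overlapping strategy: always take the top n_docs most relevant documents.
#         docs = retrieve(question, k=n_docs)
#         answer = ask_llm(question, docs)
#         if "I could not find an answer" not in answer:
#             return answer
#         # Geometric expansion: multiply the context size before retrying.
#         n_docs *= factor
#     return "I could not find an answer."
# ```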
+
#
# Side-note: how repeatable are the ratios of hallucinations vs do not knows across different models?
#
#
# While the `do not knows` and `hallucinations` both count as errors, their severity is very different: a `do not know` is merely not useful to the user; it just indicates that further actions are needed to answer the question. On the other hand, a hallucination is directly harmful: it propagates false knowledge. We thus believe that scoring models based on their accuracy alone is insufficient and that it is beneficial to separately count refusals to answer and hallucinations.
#
# While doing experiments for this article we have found that different models in the same family, e.g. the gpt-3.5-turbo variants, offer widely different performance, with newer models seemingly tuned to more confidently return hallucinated answers.
#
# ![Mistake analysis of GPT 3.5 Turbo 0125](/assets/content/blog/adaptive-rag-plots/mistake-analysis-gpt-3.5-turbo-0125.png)
# ![Mistake analysis of GPT 3.5 Turbo 1106](/assets/content/blog/adaptive-rag-plots/mistake-analysis-gpt-3.5-turbo-1106.png)
# ![Mistake analysis of GPT 4 0613](/assets/content/blog/adaptive-rag-plots/mistake-analysis-gpt-4-0613.png)
#
# From the `do not know` vs `hallucinations` plots we see that newer models maintain or improve answer accuracy; however, they are calibrated differently: there is a visible preference to answer, even if wrongly, rather than to refuse. This breaks the self-introspection of the models, which indicates whether the question should be retried with a larger prompt or terminated.
#
# Proper LLM calibration is an important topic for us at Pathway and we are actively working on it. Stay tuned for a follow-up blog post concentrating on calibration.
#
+ +# ## Trying it out +# +# We provide an implementation of the Adaptive RAG in the [Pathway](https://pathway.com/developers) data processing framework. Pathway is your one-stop-shop for building realtime data processing pipelines, from simple ETL to synchronizing and indexing document collections into knowledge bases. The Pathway [LLM Xpack](/developers/user-guide/llm-xpack/overview/) is a set of pipeline components that are useful in working with LLMs: [auto-updating vector stores](/developers/user-guide/llm-xpack/vectorstore_pipeline/), [RAGs](/solutions/rag-pipelines) and [many more LLM examples](https://github.com/pathwaycom/llm-app). +# If you are interested in how Adaptive RAG is implemented inside Pathway, you can [dive into the internals directly here](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/question_answering.py#L37). + +# As a prerequisite to run the code, install necessary packages and download sample data which will be used. + +# + +# Uncomment, if you need to install Pathway and OpenAI packages +# # !pip install pathway +# # !pip install openai +# Download `adaptive-rag-contexts.jsonl` with ~1000 contexts from SQUAD dataset +# !wget -q -nc https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/adaptive-rag-contexts.jsonl + +# If you want to use cache, set `PATHWAY_PERSISTENT_STORAGE environmental variable +# # !export PATHWAY_PERSISTENT_STORAGE=".cache" +# _MD_COMMENT_START_ +if 1: # group to prevent isort messing up + import json + import os + + from common.shadows import fs + + os.environ["OPENAI_API_KEY"] = json.loads( + fs.open("vault://kv.v2:deployments@/legal_rag_demo").read() + )["OPENAI_KEY"] +# _MD_COMMENT_END_ + +# + +import getpass +import os + +import pandas as pd + +import pathway as pw +from pathway.stdlib.indexing import VectorDocumentIndex +from pathway.xpacks.llm.embedders import OpenAIEmbedder +from pathway.xpacks.llm.llms import OpenAIChat +from pathway.xpacks.llm.question_answering import ( + answer_with_geometric_rag_strategy_from_index, +) + +# Set OpenAI API Key +if "OPENAI_API_KEY" in os.environ: + api_key = os.environ["OPENAI_API_KEY"] +else: + api_key = getpass.getpass("OpenAI API Key:") + +# Parameters for OpenAI models +embedder_locator: str = "text-embedding-ada-002" +embedding_dimension: int = 1536 +chat_locator: str = "gpt-3.5-turbo-16k-0613" +max_tokens: int = 120 +temperature: float = 0.0 + +# Set up OpenAI Embedder and Chat +embedder = OpenAIEmbedder( + api_key=api_key, + model=embedder_locator, + retry_strategy=pw.udfs.FixedDelayRetryStrategy(), + cache_strategy=pw.udfs.DefaultCache(), +) + +model = OpenAIChat( + api_key=api_key, + model=chat_locator, + temperature=temperature, + max_tokens=max_tokens, + retry_strategy=pw.udfs.FixedDelayRetryStrategy(), + cache_strategy=pw.udfs.DefaultCache(), +) + + +# + +# Change logging configuration. 
Uncomment this if you want to see requests to OpenAI in the logs +# import logging +# import sys + +# logging.basicConfig(stream=sys.stderr, level=logging.INFO, force=True) + + +# + +# Load documents in which answers will be searched +class InputSchema(pw.Schema): + doc: str + + +documents = pw.io.fs.read( + "adaptive-rag-contexts.jsonl", + format="json", + schema=InputSchema, + json_field_paths={"doc": "/context"}, + mode="static", +) + +# Create table with questions +df = pd.DataFrame( + { + "query": [ + "When it is burned what does hydrogen make?", + "What was undertaken in 2010 to determine where dogs originated from?", + ] + } +) +query = pw.debug.table_from_pandas(df) + +# + +# Main part of the code - creating index of documents and running adaptive RAG! + +# Index for finding closest documents +index = VectorDocumentIndex( + documents.doc, documents, embedder, n_dimensions=embedding_dimension +) + +# Run Adaptive RAG +result = query.select( + question=query.query, + result=answer_with_geometric_rag_strategy_from_index( + query.query, + index, + documents.doc, + model, + n_starting_documents=2, + factor=2, + max_iterations=5, + ), +) + +# uncomment this line if you want to run the calculations and print the result +# pw.debug.compute_and_print(result) +# - + +# The result is a table with an answer to each question. If you want to see how many documents were needed to obtain the answer, you can check that in the logs. To print the logs with requests uncomment the cell which sets `logging.basicConfig(stream=sys.stderr, level=logging.INFO, force=True)`. + +# Then, when you run the code, you would see entries like this in the logs: +# ``` +# INFO:pathway.xpacks.llm.llms:{"_type": "openai_chat_request", "kwargs": {"temperature": 0.0, "max_tokens": 120, "model": "gpt-3.5-turbo-16k-0613"}, "id": "a7556bb5", "messages": [{"content": "\n Use the below articles to answer the subsequent question. If the answer cannot be found in the articles, write \"I could not find an answer.\" Do not answer in full sentences.\n When referencing information from a source, cite the appropriate source(s) using their corresponding numbers. Every answer should include at least one source citation.\n Only cite a source when you are explicitly referencing it. For example:\n\"Source 1:\nThe sky is red in the evening and blue in the morning.\nSource 2:\nWater is wet when the sky is red.\n\n Query: When is water wet?\nAnswer: When the sky is red [2], which occurs in the evening [1].\"\n Now it's your turn.\n------\nSource 1: One of the first quantum effects to be explicitly noticed (but not understood at the time) was a Maxwell observation involving hydrogen, half a century before full quantum mechanical theory arrived. Maxwell observed that the specific heat capacity of H2 unaccountably departs from that of a diatomic gas below room temperature and begins to increasingly resemble that of a monatomic gas at cryogenic temperatures. According to quantum theory, this behavior arises from the spacing of the (quantized) rotational energy levels, which are particularly wide-spaced in H2 because of its low mass. These widely spaced levels inhibit equal partition of heat energy into rotational motion in hydrogen at low temperatures. Diatomic gases composed of heavier atoms do not have such widely spaced levels and do not exhibit the same effect.\n\nSource 2: In 1671, Robert Boyle discovered and described the reaction between iron filings and dilute acids, which results in the production of hydrogen gas. 
In 1766, Henry Cavendish was the first to recognize hydrogen gas as a discrete substance, by naming the gas from a metal-acid reaction \"flammable air\". He speculated that \"flammable air\" was in fact identical to the hypothetical substance called \"phlogiston\" and further finding in 1781 that the gas produces water when burned. He is usually given credit for its discovery as an element. In 1783, Antoine Lavoisier gave the element the name hydrogen (from the Greek \u1f51\u03b4\u03c1\u03bf- hydro meaning \"water\" and -\u03b3\u03b5\u03bd\u03ae\u03c2 genes meaning \"creator\") when he and Laplace reproduced Cavendish's finding that water is produced when hydrogen is burned.\n------\nQuery: When it is burned what does hydrogen make?\nAnswer:", "role": "system"}]} +# INFO:pathway.xpacks.llm.llms:{"_type": "openai_chat_response", "response": "When hydrogen is burned, it produces water [2].", "id": "a7556bb5"} +# ``` +# This shows that question about water used only 2 sources. On the other hand, for the question about dogs, you will see multiple requests to openai, with the final one being: +# ``` +# INFO:pathway.xpacks.llm.llms:{"_type": "openai_chat_request", "kwargs": {"temperature": 0.0, "max_tokens": 120, "model": "gpt-3.5-turbo-16k-0613"}, "id": "3f61fa40", "messages": [{"content": "\n Use the below articles to answer the subsequent question. If the answer cannot be found in the articles, write \"I could not find an answer.\" Do not answer in full sentences.\n When referencing information from a source, cite the appropriate source(s) using their corresponding numbers. Every answer should include at least one source citation.\n Only cite a source when you are explicitly referencing it. For example:\n\"Source 1:\nThe sky is red in the evening and blue in the morning.\nSource 2:\nWater is wet when the sky is red.\n\n Query: When is water wet?\nAnswer: When the sky is red [2], which occurs in the evening [1].\"\n Now it's your turn.\n------\nSource 1: The coats of domestic dogs are of two varieties: \"double\" being common with dogs (as well as wolves) originating from colder climates, made up of a coarse guard hair and a soft down hair, or \"single\", with the topcoat only.\n\nSource 2: Dogs have lived and worked with humans in so many roles that they have earned the unique nickname, \"man's best friend\", a phrase used in other languages as well. They have been bred for herding livestock, hunting (e.g. pointers and hounds), rodent control, guarding, helping fishermen with nets, detection dogs, and pulling loads, in addition to their roles as companions. In 1957, a husky-terrier mix named Laika became the first animal to orbit the Earth.\n\nSource 3: In 1758, the taxonomist Linnaeus published in Systema Naturae a categorization of species which included the Canis species. Canis is a Latin word meaning dog, and the list included the dog-like carnivores: the domestic dog, wolves, foxes and jackals. The dog was classified as Canis familiaris, which means \"Dog-family\" or the family dog. On the next page he recorded the wolf as Canis lupus, which means \"Dog-wolf\". In 1978, a review aimed at reducing the number of recognized Canis species proposed that \"Canis dingo is now generally regarded as a distinctive feral domestic dog. 
Canis familiaris is used for domestic dogs, although taxonomically it should probably be synonymous with Canis lupus.\" In 1982, the first edition of Mammal Species of the World listed Canis familiaris under Canis lupus with the comment: \"Probably ancestor of and conspecific with the domestic dog, familiaris. Canis familiaris has page priority over Canis lupus, but both were published simultaneously in Linnaeus (1758), and Canis lupus has been universally used for this species\", which avoided classifying the wolf as the family dog. The dog is now listed among the many other Latin-named subspecies of Canis lupus as Canis lupus familiaris.\n\nSource 4: There have been two major trends in the changing status of pet dogs. The first has been the 'commodification' of the dog, shaping it to conform to human expectations of personality and behaviour. The second has been the broadening of the concept of the family and the home to include dogs-as-dogs within everyday routines and practices.\n\nSource 5: Domestic dogs have been selectively bred for millennia for various behaviors, sensory capabilities, and physical attributes. Modern dog breeds show more variation in size, appearance, and behavior than any other domestic animal. Dogs are predators and scavengers, and like many other predatory mammals, the dog has powerful muscles, fused wrist bones, a cardiovascular system that supports both sprinting and endurance, and teeth for catching and tearing.\n\nSource 6: In 14th-century England, hound (from Old English: hund) was the general word for all domestic canines, and dog referred to a subtype of hound, a group including the mastiff. It is believed this \"dog\" type was so common, it eventually became the prototype of the category \"hound\". By the 16th century, dog had become the general word, and hound had begun to refer only to types used for hunting. The word \"hound\" is ultimately derived from the Proto-Indo-European word *kwon- \"dog\".\n\nSource 7: The cohabitation of dogs and humans would have greatly improved the chances of survival for early human groups, and the domestication of dogs may have been one of the key forces that led to human success.\n\nSource 8: Medical detection dogs are capable of detecting diseases by sniffing a person directly or samples of urine or other specimens. Dogs can detect odour in one part per trillion, as their brain's olfactory cortex is (relative to total brain size) 40 times larger than humans. Dogs may have as many as 300 million odour receptors in their nose, while humans may have only 5 million. Each dog is trained specifically for the detection of single disease from the blood glucose level indicative to diabetes to cancer. To train a cancer dog requires 6 months. A Labrador Retriever called Daisy has detected 551 cancer patients with an accuracy of 93 percent and received the Blue Cross (for pets) Medal for her life-saving skills.\n------\nQuery: What was undertaken in 2010 to determine where dogs originated from?\nAnswer:", "role": "system"}]} +# ``` +# which contains 8 sources. + +# ::shoutout-banner +# --- +# href: "https://discord.gg/pathway" +# icon: "ic:baseline-discord" +# --- +# #title +# Discuss tricks & tips for RAG +# #description +# Join our Discord community and dive into discussions on tricks and tips for mastering Retrieval Augmented Generation +# :: + +# ## Summary +# +# We have shown a simple and effective strategy to reduce RAG costs by adapting the number of supporting documents to LLM behavior on a given question. 
The approach builds on the ability of LLMs to know when they don’t know how to answer. With proper LLM confidence calibration the adaptive RAG is as accurate as a large context base RAG, while being much cheaper to run. +# +# Beyond cost savings, the adaptive RAG offers better model explainability - questions are answered using a small number of supporting documents, helping to see which documents are truly relevant to the answers and allowing better tracing of LLM answer lineages. +# +# At Pathway we are very excited about inventing new improvements to foundational LLM techniques and we are actively working on providing an integrated RAG pipeline in which the document index and LLMs collaborate to return the best answers, complete with self-tuning indexes to better rank documents for the needs of the LLM. [Connect with me](https://www.linkedin.com/in/janchorowski/) or [drop me a line](mailto:jan.chorowski@pathway.com) if you are interested! And share your cost-cutting strategies with me. +# diff --git a/docs/2.developers/7.showcases/.alerting_on_significant_changes/__init__.py b/docs/2.developers/7.showcases/.alerting_on_significant_changes/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.alerting_on_significant_changes/article.py b/docs/2.developers/7.showcases/.alerting_on_significant_changes/article.py new file mode 100644 index 00000000..fd2fe2e3 --- /dev/null +++ b/docs/2.developers/7.showcases/.alerting_on_significant_changes/article.py @@ -0,0 +1,172 @@ +# --- +# title: Alerting on significant changes +# description: Event stream processing +# author: 'mateusz' +# aside: true +# article: +# date: '2023-11-16' +# tags: ['tutorial', 'data-pipeline'] +# keywords: ['alert', 'deduplication', 'monitoring'] +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Alerting on significant changes +# +# In many monitoring environments, especially those dealing with complex systems, it's common for multiple alerts to be triggered for the same underlying issue. This creates the necessity for alert deduplication, with rules matching specific business needs. In this tutorial we will show how to design and implement such deduplication mechanism in Pathway handling real-time streaming data. +# +# For the sake of this tutorial, let's assume we observe a simple stream of floating-point values and our business rule is to notify only whenever the maximal observed value is 30% larger than the previously alerted value. 
+# +# ## Sample data +# Let's generate some static data, which we will convert to a stream later on: + + +# %% +# GENERATE AND PLOT SAMPLE DATA +import matplotlib.pyplot as plt +import numpy as np + +np.random.seed(10) +nb_points = 70 + +# Generate x values +x = np.linspace(0, nb_points - 1, nb_points).astype(int) + +# Generate y values with a globally increasing trend and periodic pattern +trend = 0.3 * x**1.1 # Globally increasing trend +periodic_pattern = 10 * np.sin(2 * np.pi * x / 20) +noise = np.random.normal(0, 1, nb_points) + +# Combine trend and periodic pattern to create y values +y = trend + periodic_pattern + noise + + +# PLOTTING +def set_params_plot(): + plt.xlabel("time") + plt.ylabel("value") + plt.xticks([], []) + plt.yticks([], []) + plt.title("") + + +# Plot the data points +plt.subplot(2, 1, 1) +set_params_plot() +plt.plot(x, y) + +plt.show() + +# %% [markdown] +# Great! The rule mentioned at the beginning should discover the peaks in this data. Let's see how to use Pathway to create an alerting application notifying us about these peaks. +# + +# %% [markdown] +# We start by creating a stream out of the above data + +# %% +import pathway as pw + +value_functions = { + "time": lambda i: int(x[i]), + "value": lambda i: float(y[i]), +} + + +class InputSchema(pw.Schema): + time: int + value: float + + +input = pw.demo.generate_custom_stream( + value_functions, + schema=InputSchema, + nb_rows=len(x), + input_rate=50, + autocommit_duration_ms=10, +) + +# %% [markdown] +# To track the maximum value, we could write `input.groupby().reduce(max=pw.reducers.max(input.value))`. Here we want to keep track also *when* this maximum occured, therefore we use the `argmax_rows` utility function. + +# %% +reduced = pw.utils.filtering.argmax_rows(input, what=input.value) + + +# %% [markdown] +# The newly defined `reduced` table will contain only at most a single row, which will be automatically updated by Pathway with a current maximum. This is not yet what we want - adding alerting callback listening for changes to the above table, would result in excessive notifications. +# + +# %% [markdown] +# We would want to keep a state with the previous maximum value and see if the change is significant, e.g. if a new maximum is 30% larger than the previous one. Such rule can be expressed as a plain Python function returning `True` if we want to accept new maximum and somehow save it in the state +# + + +# %% +def accept_larger_max(new_max: float, prev_max: float) -> bool: + return ( + new_max > prev_max * 1.3 + ) # your custom business rule for deduplicating alerts + + +# %% [markdown] +# All you have to do now is to use the `pw.stateful.deduplicate` function to tell Pathway to use your newly defined rule. New values pushed by the stream to the `col` column will be compared to the previously accepted value using the `acceptor` function which we just wrote. Pathway will keep the needed state (i.e. previously accepted value) and perform all the necessary updates for you. + +# %% +result = pw.stateful.deduplicate(reduced, col=reduced.value, acceptor=accept_larger_max) + +# %% [markdown] +# Now we can send the alerts to e.g. Slack. We can do it similarily as in the [realtime log monitoring tutorial](/developers/showcases/realtime-log-monitoring#scenario-2-sending-the-alert-to-slack) by using `pw.io.subscribe`. +# +# Here, for testing purposes, instead of sending an alert, we will store the accepted maxima in the list. 
+ +# %% +alerts = [] + + +def send_alert(key, row, time, is_addition): + if is_addition: + alerts.append( + row + ) # change here to send slack message instead of appending to a list + + +# %% +pw.io.subscribe(result, send_alert) + +# %% [markdown] +# Let's run the program. Since the stream we defined is bounded (and we set high `input_rate` in the `generate_custom_stream`), the call to `pw.run` will finish quickly. Hovever, in most usecases, you will be streaming data (e.g. from kafka) indefinitely. + +# %% +pw.run(monitoring_level=pw.MonitoringLevel.NONE) + +# %% [markdown] +# Let's see the results and plot them on the dataset, to see what the alerts are: + +# %% +alerts + +# %% +# plot alerted points on top of the data +plt.subplot(2, 1, 1) +set_params_plot() +plt.plot(x, y) +plt.plot([r["time"] for r in alerts], [r["value"] for r in alerts], "x") +plt.show() + +# %% [markdown] +# Great, we won't be overwhelmed with excessive notifications! +# One downside is that initially we get some alerts, but this is how we defined our deduplication rule! It is easy to fix it by e.g. by considering maxima above a given threshold. +# +# The presented deduplication functionality can be used in many other contexts - e.g. in transportation to filter GPS positions of devices so that we keep only relevant measurements which are sufficiently distant apart. diff --git a/docs/2.developers/7.showcases/.bellman_ford/__init__.py b/docs/2.developers/7.showcases/.bellman_ford/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.bellman_ford/article.py b/docs/2.developers/7.showcases/.bellman_ford/article.py new file mode 100644 index 00000000..30d6626a --- /dev/null +++ b/docs/2.developers/7.showcases/.bellman_ford/article.py @@ -0,0 +1,194 @@ +# --- +# title: Bellman-Ford Algorithm +# description: Article explaining step-by-step how to implement the Bellman-Ford algorithm in Pathway. +# aside: true +# article: +# date: '2022-11-03' +# thumbnail: '/assets/content/blog/th-bellman-ford.png' +# tags: ['tutorial', 'machine-learning'] +# author: 'pathway' +# notebook_export_path: notebooks/tutorials/bellman_ford.ipynb +# keywords: ['iterate', 'fixed-point', 'shortest-path', 'graph'] +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # The Bellman-Ford Algorithm +# This article explains step-by-step how the Bellman–Ford algorithm may be implemented in Pathway. +# +# ## Introduction +# +# The [Bellman-Ford algorithm](https://en.wikipedia.org/w/index.php?title=Bellman%E2%80%93Ford_algorithm&oldid=1088801570) computes the shortest paths from a single source vertex to all the other +# vertices in a weighted graph. +# A weighted graph is composed of a set of points, called *vertices*, which are connected via *edges*. Each edge is associated to a value, called either *weight* or *distance*. +# For instance, the set of all the cities and the roads which connect them form such a graph. In that example, the Bellman-Ford algorithm would help to find the fastest way, in terms of distance, to go from a given city to another. +# +# This article is also a perfect place to familiarize yourself with several constructs used in Pathway. 
+# +# ## Code +# First things first - imports 🙂 +# %% +import math + +import pathway as pw + + +# %% [markdown] +# ### I/O Data +# The input is a weighted graph so it is natural to split representation of the data +# into two parts: Vertices and Edges. Their schemas: +# %% +class Vertex(pw.Schema): + is_source: bool + + +class Edge(pw.Schema): + u: pw.Pointer[Vertex] + v: pw.Pointer[Vertex] + dist: int + + +# %% [markdown] +# These schemas have a natural interpretation. You can think of the `Edge` schema as of +# a blueprint of a table that has 3 columns: `u`, `v` (foreign keys) and `dist`. +# The output schema: + + +# %% +class DistFromSource(pw.Schema): + dist_from_source: int + + +# %% [markdown] +# *Note:* The schemas inherit from `pw.Schema` special class. +# +# *Note:* You might wonder why output schema has only one column `dist_from_source`. +# Actually, you can join schemas together to create a new one. And so, the output schema +# is `Vertex + DistFromSource`. (Look for that type annotation later in the code.) +# +# ### The algorithm +# The Bellman-Ford algorithm performs some number of relaxations until it reaches a [fixed point](https://en.wikipedia.org/wiki/Fixed_point_(mathematics) "Wipedia article of 'fixed point'"). +# +# #### Relaxations +# Each node checks if a path via it would make any so-far-optimal path to some other +# node shorter. + + +# %% +def bellman_ford_step( + vertices_dist: pw.Table[DistFromSource], edges: pw.Table[Edge] +) -> pw.Table[DistFromSource]: + relaxed_edges = edges + edges.select( + dist_from_source=vertices_dist.ix(edges.u).dist_from_source + edges.dist + ) + vertices_dist = vertices_dist.update_rows( + relaxed_edges.groupby(id=relaxed_edges.v).reduce( + dist_from_source=pw.reducers.min(relaxed_edges.dist_from_source), + ) + ) + + return vertices_dist + + +# %% [markdown] +# #### Fixed point +# The relaxations are iterated until a fixed point is reached. In this case, reaching a +# fixed point means that no new (shorter) path was found in the last iteration. +# %% +def bellman_ford(vertices: pw.Table[Vertex], edges: pw.Table[Edge]): + vertices_dist: pw.Table[DistFromSource] = vertices.select( + dist_from_source=pw.if_else(vertices.is_source, 0.0, math.inf) + ) + + fixed_point = pw.iterate( + lambda iterated, edges: dict( + iterated=bellman_ford_step(vertices_dist=iterated, edges=edges) + ), + # The `pw.iterate_universe` stanza informs iterate that `vertices_dist` grows with each loop iteration. Without it, the system assumes that iterations don't change the set of indices of a table. + iterated=pw.iterate_universe(vertices_dist), + edges=edges, + ).iterated + + return fixed_point.join(vertices, fixed_point.id == vertices.id).select( + vertices.key, fixed_point.dist_from_source + ) + + +# %% [markdown] +# +# ## Tests +# +# Now, let's see the code in action. The following test case runs Bellman-Ford +# algorithm on a graph depicted below. 
+# +# Graph image +# %% +# a directed graph +vertices = pw.debug.table_from_markdown( + """ + | key | is_source +1 | 1 | True +2 | 2 | False +3 | 3 | False +4 | 4 | False +5 | 5 | False +6 | 6 | False +7 | 7 | False +""" +).with_id_from(pw.this.key) + +edges = pw.debug.table_from_markdown( + """ + | u | v | dist +11 | 1 | 2 | 100 +12 | 1 | 3 | 200 +13 | 1 | 4 | 300 +14 | 3 | 5 | 100 +15 | 3 | 6 | 500 +16 | 5 | 6 | 100 +17 | 6 | 3 | -50 +""" +).with_columns( + u=vertices.pointer_from(pw.this.u), + v=vertices.pointer_from(pw.this.v), +) + +# %% [markdown] +# Pathway automatically reindexes the tables, so we need a key column of the `vertices` table and we need ask Pathway to reindex the table using those. +# In practice, Pathway uses pointers so the keys are automatically converted into pointers. +# +# For the edges, we have to convert the keys into their references in order to be able to use `vertices_dist.ix(edges.u)` as `ix` only works with pointers. + +# %% +pw.debug.compute_and_print(bellman_ford(vertices, edges)) + +# %% [markdown] +# That was a simple introduction to writing code and tests in Pathway. +# +# Feel free to take this code and experiment with it 🙂 Do you see any possibility to +# improve the code? (What happens when there is a negative cycle in the graph?) +# + +# %% [markdown] +# ## Summary +# The code above follows a pattern that is quite frequent when working with Pathway: +# - Define I/O data types +# - Define transformations on tables +# - Iterate the transformation until a fixed point is reached +# - usually transforms the data by a simple one-liner. +# - for example ```iterate(lambda foo, bar: {foo=fn(foo, bar), bar=bar}, foo=input_table_1, bar=input_table2).foo``` +# +# *Note:* It is also possible to define transformations from the perspective of +# a row and not of a table. Curious? Take a look at [the transformer class tutorial](/developers/user-guide/diving-deeper/transformer-recursion/) or some of our other cookbooks, like this one on +# [Pagerank](/developers/showcases/pagerank). diff --git a/docs/2.developers/7.showcases/.event_stream_processing_time_between_occurrences/__init__.py b/docs/2.developers/7.showcases/.event_stream_processing_time_between_occurrences/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.event_stream_processing_time_between_occurrences/article.py b/docs/2.developers/7.showcases/.event_stream_processing_time_between_occurrences/article.py new file mode 100644 index 00000000..f3ef6389 --- /dev/null +++ b/docs/2.developers/7.showcases/.event_stream_processing_time_between_occurrences/article.py @@ -0,0 +1,100 @@ +# --- +# title: Time between events in a multi-topic event stream +# description: Event stream processing +# author: 'przemek' +# aside: true +# article: +# date: '2022-11-01' +# thumbnail: '/assets/content/blog/th-time-between-events-in-a-multi-topic-event-stream.png' +# tags: ['tutorial', 'data-pipeline'] +# keywords: ['event stream', 'multi-topic', 'Debezium', 'ordering', 'sort'] +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Time between events in a multi-topic event stream +# +# We are processing a stream of events on different topics. 
+# We want to compute, for each event, how much time has passed since the previous event on the same topic. +# The issue is that the events do not appear *in the order of their timestamps at input*! +# With Pathway there is no need to worry about that! +# +# First we use Debezium to synchronize the input data from a database table with the following columns: +# * timestamp - logical date-time when the event happened +# * topic_id - topic in question +# * message - message content. + +# %% +import pathway as pw + +# _MD_COMMENT_START_ +# DO NOT MODIFY THIS WITHOUT MODIFYING the following file: +# public/pathway/python/pathway/tests/test_gist_event_streaming_time_between_occurrences.py # noqa E501 + + +def table_from_debezium(**kwargs): + return pw.Table.empty( + timestamp=int, # datetime.datetime + message=str, + topic_id=int, + ) + + +pw.io.debezium.read = table_from_debezium +del table_from_debezium + + +def table_to_postgres(*args, **kwargs): + pass + + +pw.io.postgres.write = table_to_postgres +del table_to_postgres +# _MD_COMMENT_END_ +events = pw.io.debezium.read( + rdkafka_settings={ + "group.id": "$GROUP_NAME", + "bootstrap.servers": "clean-panther-8776-eu1-kafka.upstash.io:9092", + "session.timeout.ms": "6000", + }, + topics=["important_events"], +) + +# %% [markdown] +# Then we sort the events from the table. Pathway provides a `sort` function to sort a table according to its `key` column: in this case we are going to sort according to the timestamps of the events. In addition, each topic is mapped to an `instance` field, which allows us to work on different streams simultaneously. +# The `prev` and `next` pointers are automatically extracted. + +# %% +sorted_events = events.sort(key=events.timestamp, instance=events.topic_id) + +# %% [markdown] +# Finally, we process events in order of their timestamps at input. +# %% +events_with_prev = events.having(sorted_events.prev) +differences = events_with_prev.select( + delta=events.restrict(events_with_prev).timestamp - pw.this.timestamp +) + +pw.io.postgres.write( + differences, + postgres_settings={ + "host": "localhost", + "port": "5432", + "dbname": "transactions", + "user": "pathway", + "password": "my_password", + }, + table_name="events_processed", +) +"do not print cell output _MD_SKIP_"; # fmt: skip diff --git a/docs/2.developers/7.showcases/.interval_over_gaussian_filter/article.py b/docs/2.developers/7.showcases/.interval_over_gaussian_filter/article.py new file mode 100644 index 00000000..9cce7822 --- /dev/null +++ b/docs/2.developers/7.showcases/.interval_over_gaussian_filter/article.py @@ -0,0 +1,490 @@ +# --- +# title: 'Real-time [Low-latency] Signal Processing in Streaming Mode: how to apply a Gaussian filter with irregular sampling in your Data Streams.' +# description: 'Tutorial on signal processing: how to apply a Gaussian filter with Pathway using windowby and intervals_over' +# notebook_export_path: notebooks/tutorials/gaussian-filtering-python.ipynb +# author: 'olivier' +# aside: true +# article: +# date: '2023-10-17' +# thumbnail: '/assets/content/tutorials/time_series/thumbnail-gaussian.png' +# tags: ['tutorial', 'Time Series'] +# related: ['/blog/signal-processing', '/developers/showcases/upsampling'] +# keywords: ['signal processing', 'Gaussian filter', 'out-of-order data', 'irregular sampling', 'intervals_over', 'window'] +# --- + +# # Gaussian Filtering in Python: Working with out-of-order data points +# +# In this tutorial, you will learn how to perform signal processing on out-of-order signal data. 
Specifically, you will apply a Gaussian filter on a signal data stream with irregular sampling.
+# This is an important skill to master because most real-world signal processing scenarios do not follow the regular and precise timing assumptions on which many standard signal processing systems are built.
+# Read our blog on [the challenges of real-time signal processing](/blog/signal-processing/) for more context.
+#
+# The tutorial will walk you through the following 3 steps:
+# 1. Preprocess the noisy data using interval joins;
+# 2. Apply a Gaussian filter to smooth the signal;
+# 3. Dive deeper into how Pathway handles out-of-order data points.
+#
+# ## Signal Processing with Pathway
+#
+# Standard DSP systems do not support the addition of late data points because they assume perfectly regular sampling. This allows these systems to wait until all signals have arrived before starting the computation.
+# But in real-world and real-time settings, these assumptions are rarely guaranteed. Data points may arrive late or, worse, not at all, in which case the computation would never start.
+# Pathway allows you to manipulate streaming data as if it were static, i.e., as if all the data were already gathered.
+# **You don't have to worry about late or out-of-order data: Pathway will handle it by updating its results whenever late or out-of-order points are added to the system.**
+#
+# However, you still have to deal with data misalignment.
+# Therefore, **you must consider the signal as a time series**, and data points should be seen as a list of (time, value) pairs rather than an array of regularly sampled values.
+# While the time variable makes signal processing more complex, Pathway offers several functions to make signal processing on realtime data streams as simple as it can be.
+#
+# ## Gaussian Filtering in Python: Preprocess the data
+#
+# Because of the out-of-order data points, the use of IIR ([Infinite Impulse Response](https://en.wikipedia.org/w/index.php?title=Infinite_impulse_response&oldid=1079405495)) filters, which depend on the sequential order of data, is impractical as their outputs cannot be efficiently recomputed when late points arrive.
+# Instead, Pathway offers ways to easily implement FIR ([Finite Impulse Response](https://en.wikipedia.org/w/index.php?title=Finite_impulse_response&oldid=1172063361)) filters, where each data point influences the filter's response for a limited duration.
+# Consequently, you should primarily employ kernel smoothing techniques to process signals in a manner that accommodates unordered or non-sequential data points, ensuring accurate and efficient analysis.
+#
+# Furthermore, without sample numbers, you cannot rely on a prev/next order among your data points: not only may some points be missing, but there may also be a large temporal gap between two consecutive points.
+# You need to connect each point to other points that are close in time.
+# To do so, Pathway provides [interval joins](/developers/user-guide/temporal-data/interval-join).
+#
+# Interval joins merge records by comparing the timestamp from each record of two tables and evaluating whether the difference falls within a specified time window or `interval`.
+# +# ![Illustration of interval join](assets/content/tutorials/fleet_eta_interval_join/illustrative_interval_join.png) +# +# Interval joins are a great tool to gather all the close points together: +# +# ```python +# results = table_a.interval_join_left( +# table_b, +# pw.left.timestamp, +# pw.right.timestamp, +# pw.temporal.interval(-50, 50), +# pw.left.timestamp != pw.right.timestamp +# ).select( +# pw.left.timestamp, +# neighbor_timestamp=pw.right.timestamp, +# ) +# ``` +# +# However, they create an entry for each point in the interval: +# +# ``` +# timestamp | neighbor_timestamp +# ------------------------------ +# ... +# 100 | 77 +# 100 | 97 +# 100 | 132 +# ... +# ------------------------------ +# ``` +# +# To gather all the entries corresponding to the same interval, we could use a `groupby` followed by a `reduce`. +# +# ```python +# results = results.groupby(results.timestamp).reduce( +# results.timestamp, +# neighbors=pw.reducers.tuple(pw.this.neighbors_timestamps), +# ) +# ``` +# +# ``` +# timestamp | neighbors +# ------------------------------ +# ... +# 100 | (77, 97, 132) +# ... +# ------------------------------ +# ``` +# +# In Pathway, you can obtain the same results directly using `windowby` with `intervals_over`. +# Both the `interval_join` and `intervals_over` approach arrive at the same result. The `intervals_over` is preferred in this case because of its compact syntax. We will use the `intervals_over` in this tutorial. +# +# ```python +# results = table_a.windowby( +# table_a.timestamp, +# window=pw.temporal.intervals_over( +# at=table_b.timestamp, lower_bound=-50.0, upper_bound=50.0 +# ), +# ).reduce( +# pw.this._pw_window_location, +# neighbors=pw.reducers.tuple(pw.this.timestamp), +# ) +# ``` +# +# ## 2. Apply Gaussian Filtering with Pathway +# +# Now that you know how to aggregate the data, let's do some signal processing! +# +# Let's start with a simple Gaussian filter in Pathway. +# A Gaussian filter is a linear filter used to reduce the noise of a signal. +# The filter works by smoothing the signal using a convolution, applying a Gaussian function to a local segment of the signal. +# I will not delve into details: if you are interested you can check out the [Wikipedia article](https://en.wikipedia.org/w/index.php?title=Gaussian_filter&oldid=1161463896). +# +# To apply a Gaussian filter, you first need to aggregate the local segments: each point is associated with its surrounding counterparts. +# This aggregation is done using an *interval join* in data streaming processing (note: an interval join is usually done on two different data streams, but it is possible to do a self-interval join. See our [tutorial](/developers/user-guide/temporal-data/interval-join) for more info about interval joins). +# You can easily perform a regular interval join using Pathway with `interval_join`, but in this case you will need to aggregate the points to perform a convolution, so we will use the `windowby` and `intervals_over` syntax we saw earlier. +# +# Before entering the details, let's generate some data to play with. +# +# ### The data +# +# For this example, you will work on static data. +# However, in a real use case, your data source might be a data stream. +# Fortunately, you won't need to change the code, as Pathway works transparently with static and streaming data: you will only need to update your input connector to connect to your live data. +# +# The raw data needs to be noisy to see the impact of a Gaussian filter. 
+# Let's generate noisy data by first generating "perfect data": data ranging from 0 to 10000 for the timestamps, and the values are generated by adding to an increasing trend some periodic patterns. +# Then, some normally distributed noise is added to both. + +import matplotlib.pyplot as plt + +# + +import numpy as np + +np.random.seed(10) +nb_points = 1000 + +# Generate x values +x = np.linspace(0, 10000, nb_points) +x_noise = np.random.normal(0, 2, nb_points) +x = x + x_noise +x = np.sort(x) + +# Generate y values with a globally increasing trend and periodic patterns +trend = 0.01 * x # Globally increasing trend +periodic_pattern = 50 * np.sin(2 * np.pi * x / 1000) # Periodic pattern +periodic_pattern2 = 70 * np.sin(2 * np.pi * (x % 10) / 1000 + np.pi) # Periodic pattern +noise = (0.001 * 10000) * np.random.normal(0, 1, nb_points) + +# Combine trend and periodic pattern to create y values +y = trend + periodic_pattern + periodic_pattern2 + noise + + +# - + +# Let's plot the signal: + + +# + +def set_params_plot(): + plt.xlabel("time") + plt.ylabel("value") + plt.xticks([], []) + plt.yticks([], []) + plt.title("") + + +# Plot the data points +plt.subplot(2, 1, 1) +set_params_plot() +plt.plot(x, y) + +plt.show() +# - + +# Now you can convert this data to Pandas and load it to Pathway + +import pandas as pd + +# + +import pathway as pw + + +def load_to_pathway(x, y): + time_series_pd = pd.DataFrame({"x": x, "y": y}) + return pw.debug.table_from_pandas(time_series_pd) + + +time_series = load_to_pathway(x, y) +# - + +time_series.typehints() + +# And that's it! Now, let's see how to build and apply a Gaussian filter in Pathway. +# +# ### Computing aggregations with `windowby` +# +# You want to obtain an aggregation of the closest points for each point. +# With streaming data, you do not have any assumption on the distribution of surrounding points. +# You can ask for a fixed number of previous/next points, but you'd take the risk to obtain points with very different timestamps. +# So the solution is to ask for the data points in an interval around your point. +# +# To do so, you can use `windowby`, that takes three arguments: +# 1. The column used for windowing: the timestamp in the column `x`. +# 2. The window: in your case you will use an `intervals_over` window. +# 3. The optional sharding: not relevant here. +# +# For `intervals_over`, you need to set the parameter `at`, defining the column of times for which windows are to be created and the `lower_bound` and the `upper_bound` of the interval. +# Let's try with an interval of length 100: + +points_within_50 = time_series.windowby( + time_series.x, + window=pw.temporal.intervals_over( + at=time_series.x, lower_bound=-50.0, upper_bound=50.0 + ), +).reduce( + pw.this._pw_window_location, + xs=pw.reducers.ndarray(pw.this.x), + ys=pw.reducers.ndarray(pw.this.y), +) + +# ### Gaussian Filtering in Python as a UDF +# +# Now that you have the intervals, you need the Gaussian filter. +# In Pathway, you can easily define a User-Defined Function (UDF) with the `@pw.udf` annotation. +# To perform a Gaussian filter, you need the data points gathered by the `windowby` and the associated time (called `window_location`). 
+# The times and the values are kept in separate columns, so we takes two different arrays as input: + +# + +import scipy.stats + + +# UDF for Gaussian filter +@pw.udf +def gaussian_filter( + times: np.ndarray, + values: np.ndarray, + window_location, +) -> float: + stdev = (max(times) - min(times)) / 2 + gaussian_distribution = scipy.stats.norm(window_location, stdev) + + coefficients = gaussian_distribution.pdf(times) + normalized_coefficients = coefficients / sum(coefficients) + return np.dot(values, normalized_coefficients) + + +# - + +# With the UDF, you can directly apply the filter to each interval, without using `pw.apply`: + +# Applying the Gaussian filter +time_series_filtered = points_within_50.select( + x=pw.this._pw_window_location, + y=gaussian_filter(pw.this.xs, pw.this.ys, pw.this._pw_window_location), +) + +# ### Displaying the output +# +# Let's plot the results. +# Let's first output the data in Pandas and then plot the timeseries. +# In practice you'd use a [output connector](/developers/user-guide/connecting-to-data/connectors/) to output the results to Kafka or Postgres. + +time_series_filtered_pd = pw.debug.table_to_pandas(time_series_filtered) +time_series_filtered_pd = time_series_filtered_pd.sort_values(by=["x"]) + +# + +plt.subplot(2, 1, 1) +plt.plot(x, y) +plt.plot(time_series_filtered_pd.x, time_series_filtered_pd.y) +set_params_plot() + +plt.subplot(2, 1, 2) +plt.plot(time_series_filtered_pd.x, time_series_filtered_pd.y, color="#ff7f0e") +set_params_plot() + +plt.tight_layout() +plt.show() +# - + +# See how smooth the new plot is compared to the noisy raw data! + +# ## Gaussian Filtering in Python: Handling Late and Out-of-Order data points +# +# Now that your pipeline is ready, Pathway will manage late and out-of-order points by updating its results whenever new (or late, in this case) data points come into the system. +# +# But what does it mean exactly? What happens when late points arrive? +# +# As a reminder, a standard DSP system will retain the data until all the data is ready to apply the Gaussian filter. +# **In a streaming system, you cannot afford to wait for data points for an undetermined time: some data points may be lost and never arrive! +# Pathway computes an output with whatever data is available then and revises the result whenever new data points arrive.** +# +# Let's see the impact on late data points in our example by delaying the arrival of several points. +# +# For clarity, let's focus on a small sample of the data: + +# + +sample_size = 20 +x_sample = x[0:sample_size] +y_sample = y[0:sample_size] +time_series_filtered_x_sample = time_series_filtered_pd.x.tolist()[0:sample_size] +time_series_filtered_y_sample = time_series_filtered_pd.y.tolist()[0:sample_size] + +# Plot the data points +plt.subplot(2, 1, 1) +set_params_plot() +plt.plot(x_sample, y_sample, "x-", label="Data sample") +plt.plot( + time_series_filtered_x_sample, + time_series_filtered_y_sample, + "x-", + label="Smoothed signal", +) +plt.legend() + +plt.show() +# - + +# Until now, the data was considered static and ingested all at once in a single batch: let's switch to streaming data by adding timestamp to the data! +# +# You will create two batches of points: one with all the "on-time" data points arriving at time 0, and another one with "late" data points arriving at a later time. 
+# To make the computation interesting, the late data points should be in the middle of the signal: + +# + +init_late = 6 +length_late = 7 + +# Extracting the late points +x_sample_late = x_sample[init_late : init_late + length_late] +y_sample_late = y_sample[init_late : init_late + length_late] + +# Removing the late points from the original sample +x_sample_ontime = x_sample.copy().tolist() +del x_sample_ontime[init_late : init_late + length_late] +y_sample_ontime = y_sample.copy().tolist() +del y_sample_ontime[init_late : init_late + length_late] + +# The on-time data points have a timestamp of 0, the other ones have a timestamp of 1 +time = [0 for i in range(len(x_sample_ontime))] +for i in range(len(x_sample_late)): + time.append(1) + +out_of_order_x_sample = x_sample_ontime + x_sample_late.tolist() +out_of_order_y_sample = y_sample_ontime + y_sample_late.tolist() + +out_of_order_time_series_pd = pd.DataFrame( + {"x": out_of_order_x_sample, "y": out_of_order_y_sample, "time": time} +) +out_of_order_time_series_pd.to_csv("./ooo_ts.csv", index=False) +# - + +# Let's plot the resulting data sample: the late points, in red, have been removed from the original time series: + +# + +plt.subplot(2, 1, 1) +set_params_plot() +plt.plot(x_sample_ontime, y_sample_ontime, "x-", label="On-time data points") +plt.plot(x_sample_late, y_sample_late, "x", color="red", label="Late data points") +plt.legend() + +plt.show() + + +# - + +# Now, you will apply the filter to this data source. +# You first need to create a pipeline to apply the Gaussian filter, as done in the previous sections: + + +def smooth_table(table): + points_within_50 = table.windowby( + table.x, + window=pw.temporal.intervals_over( + at=table.x, lower_bound=-50.0, upper_bound=50.0 + ), + ).reduce( + pw.this._pw_window_location, + xs=pw.reducers.ndarray(pw.this.x), + ys=pw.reducers.ndarray(pw.this.y), + ) + # Applying the Gaussian filter + time_series_filtered = points_within_50.select( + x=pw.this._pw_window_location, + y=gaussian_filter(pw.this.xs, pw.this.ys, pw.this._pw_window_location), + ) + return time_series_filtered + + +# Then, you need to read the input source. +# You are not working with a static data source anymore, but a streaming one. +# You can use `pw.demo.replay_csv_with_time` to replay a data source in a CSV file respecting the time between updates based on a timestamp columns (more information about this function in our dedicated [article](/developers/user-guide/connecting-to-data/artificial-streams/#replaying-static-csv-files-with-replay_csv-and-replay_csv_with_time)). Here the time is contained in the column `time`: + + +# + +# For streaming data, you need to define the schema +class InputSchema(pw.Schema): + x: float + y: float + time: int + + +ooo_input_table = pw.demo.replay_csv_with_time( + path="ooo_ts.csv", + schema=InputSchema, + time_column="time", + unit="s", + autocommit_ms=100, +) +ooo_smoothed_table = smooth_table(ooo_input_table) +pw.io.csv.write(ooo_smoothed_table, "ooo_results.csv") +pw.run() +# - + +# Let's take a look at the results: + +results = pd.read_csv("./ooo_results.csv") +results = results.sort_values(by=["x"]) +results.head() + +# There are two new columns: `time` and `diff`. +# The column `time` corresponds to the processing time at which Pathway generated the output, and `diff` represents whether the output was added or removed. +# An update is decomposed in two simultaneous atomic operations: the removal of the last value (`diff==-1`) and the addition of the new one (`diff==1`). 
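#
# As a side note: if you ever need to rebuild the final table from such an insert/delete log (for instance when post-processing the output CSV offline), one possible sketch with pandas is to sum the `diff` values per row content and keep the rows whose total is positive (column names as in this tutorial):
#
# ```python
# import pandas as pd
#
# log = pd.read_csv("./ooo_results.csv")
# # each (x, y) pair is logged with diff=+1 when added and diff=-1 when retracted
# totals = log.groupby(["x", "y"], as_index=False)["diff"].sum()
# final_state = totals[totals["diff"] > 0][["x", "y"]]
# ```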
+# +# Here we have only two batches, so there are only two different values in the column `time`. +# +# Let's see the results of the first batch when the "on-time" data points were added to the system: + +# The first batch has only additions, no need to filter on `diff` (all values are equal to 1). +first_results = results[results["time"] == min(results["time"])] + +plt.subplot(2, 1, 1) +plt.plot(x_sample_ontime, y_sample_ontime, "x-", label="On-time data points") +plt.plot(first_results.x, first_results.y, "x-", label="Smoothed time series") +plt.legend() +set_params_plot() + +# As you can see, there is a gap in the middle of the time series as some points are missing. +# However, the Gaussian filter did its job by smoothing the signal with the data points it had at its disposal. +# +# Now, let's see what was the output when the second batch of points was ingested: + +late_results = results[results["time"] == max(results["time"])] +# We only focus on the new values, i.e., with diff==1 +late_results = late_results[late_results["diff"] == 1] + +plt.subplot(2, 1, 1) +plt.plot(x_sample, y_sample, "x-", label="Total data points") +plt.plot( + late_results.x, late_results.y, "x-", color="red", label="Updates of the smoothing" +) +plt.legend() +set_params_plot() + +# You can see that the gap has disappeared, replaced by the late data points and the associated smoothed signal. +# +# Let's compare both outputs: + +plt.subplot(2, 1, 1) +plt.plot(x_sample, y_sample, label="Total data points") +plt.plot( + first_results.x, first_results.y, "x-", color="#ff7f0e", label="Original smoothing" +) +plt.plot(late_results.x, late_results.y, "x-", color="red", label="Updated smoothing") +plt.legend() +set_params_plot() +# _MD_COMMENT_START_ +# !rm ooo_results.csv +# !rm ooo_ts.csv +# _MD_COMMENT_END_ + +# The arrival of late data points has triggered an update of the output: not only the gap has been filled, but existing values have been updated to take into account the incoming data points. +# Furthermore, the smoothing has been updated only for data points close to the late points: Pathway did not apply the filter to the whole signal from scratch, but only updated the local parts that were impacted by the arrival of the late points. +# +# This is the power of Pathway: you can build your pipeline as if the data were static, and Pathway will take care of the late and out-of-order data for you. +# **Your results will always be up-to-date compared to the latest available data points.** + +# ## Gaussian Filtering in Python: Conclusion +# +# Signal processing is a fundamental tool for extracting meaningful insights from data, but signal processing on real-world data requires data alignment. +# Pathway provides all you need to tackle the challenges of real-world digital signal processing on data streams. +# Now that you understand the differences between idealized signal processing and data stream processing, I'm sure you can leverage Pathway's capabilities to process, filter, and analyze data efficiently, even when dealing with imperfect sampling rates and data imperfections. +# +# If you want to learn more about DSP in Pathway, try our [tutorial](/developers/showcases/upsampling/) about manipulating different data sources by *upsampling* a data source to match the sampling rate of given data source. 
diff --git a/docs/2.developers/7.showcases/.interval_over_upsampling/article.py b/docs/2.developers/7.showcases/.interval_over_upsampling/article.py new file mode 100644 index 00000000..b28f29fd --- /dev/null +++ b/docs/2.developers/7.showcases/.interval_over_upsampling/article.py @@ -0,0 +1,335 @@ +# --- +# title: 'Real-time [Low-latency] Signal Processing in Streaming Mode: how to combine your Data Streams with upsampling.' +# description: 'Tutorial on signal processing: how to do upsampling with Pathway using windowby and intervals_over' +# notebook_export_path: notebooks/tutorials/upsampling.ipynb +# author: 'olivier' +# aside: true +# article: +# date: '2023-10-18' +# thumbnail: '/assets/content/tutorials/time_series/thumbnail-time-series.png' +# tags: ['tutorial', 'Time Series'] +# related: ['/blog/signal-processing', '/developers/showcases/gaussian_filtering_python'] +# keywords: ['signal processing', 'upsampling', 'out-of-order data', 'resampling', 'interpolation', 'intervals_over', 'window'] +# --- + +# # Upsampling your Signal Processing Data: transforming multiple data streams. +# Learn how to perform upsampling on multiple data streams. The data streams will be modelled as time series data and will have different sampling rates to begin with. +# Using upsampling techniques in Python, you will align the sampling rates of the multiple data sources in order to perform data analysis over the entire stream. +# +# It is common practice to use [Digital Signal Processing](/glossary/digital-signal-processing) (DSP) in order to analyze and manipulate time series data. +# The time series can be considered as signals and by applying filters, you can remove the noise and do feature extraction on your data. +# However, when working with multiple time series you are likely to run into the issue of data misalignment due to different sampling rates. +# This issue needs to be addressed if you want to extract valuable insights from your time series data using digital signal processing. +# In this article, you will learn how to use upsampling to align the sampling rates of your multiple time series data sources. +# +# ## Upsampling: Working with Messy Real-World Data +# This tutorial builds on the [Gaussian Filtering](/developers/showcases/gaussian_filtering_python) tutorial which explains how you can smooth out a messy, out-of-order data stream. +# While digital signal processing systems assume neatly ordered signals arriving at a regular and precise sampling rate, this is rarely the case in [real-world applications](/blog/iot-data-analytics/). +# Irregular and out-of-order data streams are a common occurrence in the real world of data processing. +# Data streams may be sampled at irregular intervals, leading to variations in the time between samples. The resulting data is noisy and contains gaps. +# +# This upsampling tutorial adds another layer of complexity: you will learn how work with *multiple* data sources simultaneously. +# This is an important skill to master because analyzing multiple data sources together is a common method to improve the quality of an analysis. +# For example, you can improve the accuracy of any metric by averaging the measures of different sensors measuring it. +# +# In addition to the noisy, late, and out-of-order data points, you will now face a new challenge: **data streams have different sampling rates.** +# This will result in significant data misalignment which you will have to deal with in order to extract meaningful information from your data. 
+# This challenge requires a different approach to data processing compared to idealized signal processing, where the focus is on precise time alignment.
+# In particular, to align two data sources with different sampling rates, you will need to do something called **upsampling**.
+#
+# ## What is Upsampling?
+#
+# [Upsampling](/glossary/upsampling) is a [signal processing](/glossary/signal-processing) technique used to increase the sampling rate of a signal.
+# Upsampling is generally used to improve a signal; applications include improving the quality of images, increasing the accuracy of digital filters, and data enrichment.
+#
+# ## Upsampling: Combining Data Streams with Different Sampling Rates
+#
+# In many real-world scenarios, you may need to process data from multiple sources, each with its own sampling rate.
+# To work on both signals at the same time, it's important that both signals have the same sampling rate. This can be done with [resampling](/glossary/resampling).
+# Resampling is the process of changing the sampling rate or the number of samples in a signal or data set. There are two primary types of resampling:
+# 1. **Upsampling** consists of increasing the sampling rate by adding new data points between existing ones to achieve a higher temporal or spatial resolution. It is used to increase the clarity of a signal or to match the sampling rate of one signal to that of another for compatibility. New values are obtained by **interpolation**.
+# 2. **[Downsampling](/glossary/downsampling)**, on the other hand, consists of decreasing the sampling rate by removing data points. It is typically used to reduce data volume, decrease computational complexity, and meet application storage or transmission constraints.
+#
+# For example, consider a simple scenario: you are training to run your first marathon.
+# You would like to monitor your performance, so you buy a GPS watch.
+# After your first run, you read out the data and notice that sometimes the measurements of the watch are not very precise.
+# Fortunately, you have another source of GPS data points: your phone!
+# You plan to join the two streams to increase their accuracy.
+#
+# Unfortunately, the two time series are very different.
+# While your phone outputs roughly one value per second, your GPS watch has only one recording every five seconds.
+# You will need to upsample your watch's data to match the sampling rate of your phone.
+#
+# Don't worry, Pathway allows you to aggregate and process data from these diverse sources easily.
+# Let's see how to do upsampling with Pathway.
+#
+# ### The data
+# First, we need data. For privacy reasons, we will not work on real GPS traces, but will generate artificial data instead.
+# For simplicity, let's focus on a single non-normalized variable. It can represent the latitude, for example.
+# You need two time series with the same distribution but different sampling rates.
+ +# + +import numpy as np +import matplotlib.pyplot as plt + + +# Generate x values +def generate_x_values(nb_points): + x = np.linspace(0, 10000, nb_points) + x_noise = np.random.normal(0, 2, nb_points) + x = x + x_noise + x = np.sort(x) + return x + + +nb_points_A = 1000 +nb_points_B = 200 + +# Generate x values +x_A = generate_x_values(nb_points_A) +x_B = generate_x_values(nb_points_B) + +# Generate y values with periodic patterns +periodic_pattern_A = 50 * np.sin(2 * np.pi * x_A / 1000) # Periodic pattern +periodic_pattern_B = 50 * np.sin(2 * np.pi * x_B / 1000) # Periodic pattern +noise_A = (0.001 * 10000) * np.random.normal(0, 1, nb_points_A) +noise_B = (0.001 * 10000) * np.random.normal(0, 1, nb_points_B) + +# Combine trend and periodic pattern to create y values +y_A = periodic_pattern_A + noise_A +y_B = periodic_pattern_B + noise_B + + +# - + +# Let's plot the signal: + + +# + +def set_params_plot(): + plt.xlabel("time") + plt.ylabel("value") + plt.xticks([], []) + plt.yticks([], []) + plt.title("") + + +# Plot the data points +plt.subplot(2, 1, 1) +plt.plot(x_A, y_A, "x", label="time series A") +plt.legend() +set_params_plot() + +plt.subplot(2, 1, 2) +plt.plot(x_B, y_B, "x", color="#ff7f0e", label="time series B") +plt.legend() +set_params_plot() + +plt.tight_layout() +plt.show() + + +# - + +# We can clearly see that the trend is the same, but the first source has clearly more data points. +# +# Let's zoom on the first data points: + + +# + +def set_params_plot_zoom(): + plt.xlabel("time") + plt.ylabel("value") + plt.xticks([], []) + plt.yticks([], []) + plt.xlim([0, 300]) + plt.title("") + + +plt.subplot(3, 1, 1) +plt.plot(x_A, y_A, "x", label="time series A") +plt.plot(x_B, y_B, "x", color="#ff7f0e", label="time series B") +plt.legend(loc=4) +set_params_plot_zoom() + +plt.tight_layout() +plt.show() +# - + +# As we can see, not only the sampling rates are not the same, but the times are not aligned. +# +# Let's load the data in Pathway before doing the upsampling: + +# + +import pathway as pw +import pandas as pd + + +def load_to_pathway(x, y): + time_series_pd = pd.DataFrame({"x": x, "y": y}) + return pw.debug.table_from_pandas(time_series_pd) + + +data_stream_A = load_to_pathway(x_A, y_A) +data_stream_B = load_to_pathway(x_B, y_B) +# - + +# ## Upsampling with `windowby` +# +# While you could [combine the data streams](/developers/showcases/combining_time_series) by keeping the union of the timestamps, this would mean altering two time series. Instead, upsampling alters only one data stream and thereby limits the imprecision introduced by the operation. +# +# To do upsampling, you want to obtain an aggregation of the closest points of time series B for each point of the time series A, and then do an interpolation to obtain the value at the given time for time series B. +# +# However, when dealing with streaming data, you do not have any assumption on the distribution of surrounding points. +# You can ask for a fixed number of previous/next points, but you'd take the risk to obtain points with very different timestamps. +# The solution is to ask for the data points in an interval around your point using a `windowby` with `intervals_over`. +# If you need a quick reminder on how it works, don't hesitate to read the [tutorial about Gaussian filter](/developers/showcases/gaussian_filtering_python#interval-joins-and-windowby). +# +# Unlike applying a Gaussian filter, upsampling requires using two different tables. 
+# You will do the `windowby` over the time series B (the one you want to upsample) and compute the intervals over the time series A (the one on which you want to align, with the higher sampling rate). +# You must keep all the times and values within the intervals to be able to do the interpolation. +# +# We set the length of the interval to 200 to be sure to have enough data points to do the interpolation: + +upsampled_stream = data_stream_B.windowby( + data_stream_B.x, + window=pw.temporal.intervals_over( + at=data_stream_A.x, lower_bound=-100.0, upper_bound=100.0 + ), +).reduce( + x=pw.this._pw_window_location, + xs=pw.reducers.ndarray(pw.this.x), + ys=pw.reducers.ndarray(pw.this.y), +) + + +# Now, let's define the interpolation as a UDF: + + +@pw.udf +def linear_interpolate(times: np.ndarray, values: np.ndarray, time_new: int) -> float: + idx = np.argsort(times) + times = times[idx] + values = values[idx] + return np.interp(time_new, times, values, left=values[0], right=values[-1]) + + +# With this, you simply need to apply the interpolation to each window: + +upsampled_stream = upsampled_stream.select( + pw.this.x, y=linear_interpolate(pw.this.xs, pw.this.ys, pw.this.x) +) + +upsampled_stream_pd = pw.debug.table_to_pandas(upsampled_stream) +upsampled_stream_pd = upsampled_stream_pd.sort_values(by=["x"]) + +# Let's plot the resulting plots: + +# + +plt.subplot(2, 1, 1) +plt.plot(x_A, y_A, "x", label="time series A") +plt.legend() +set_params_plot() + +plt.subplot(2, 1, 2) +plt.plot( + upsampled_stream_pd.x, + upsampled_stream_pd.y, + "x", + color="#ff7f0e", + label="time series B (upsampled)", +) +plt.legend() +set_params_plot() + +plt.tight_layout() +plt.show() +# - + +# Now the two time series have the same sampling rate! +# +# Let's zoom in to see the difference more precisely: + +# + +plt.subplot(3, 1, 1) +plt.plot(x_A, y_A, "x", label="time series A") +plt.plot(x_B, y_B, "x", color="#ff7f0e", label="time series B (raw)") +plt.legend() +set_params_plot_zoom() + +plt.subplot(3, 1, 2) +plt.plot(x_A, y_A, "x", label="time series A") +plt.plot( + upsampled_stream_pd.x, + upsampled_stream_pd.y, + "x", + color="#ff7f0e", + label="time series B (upsampled)", +) +plt.legend() +set_params_plot_zoom() + +plt.tight_layout() +plt.show() +# - + +# The new time series has been successfully upsampled: the values are sampled at the same times as the time series A. +# Because the time series has been upsampled using interpolation, the obtained time series is smoother: + +# + +plt.subplot(2, 1, 1) +plt.plot(x_A, y_A, label="time series A") +plt.legend() +set_params_plot() + +plt.subplot(2, 1, 2) +plt.plot( + upsampled_stream_pd.x, + upsampled_stream_pd.y, + color="#ff7f0e", + label="time series B (upsampled)", +) +plt.legend() +set_params_plot() + +plt.tight_layout() +plt.show() +# - + +# You can now combine those two time series. 
+# For example, you can compute the latitude by taking the average latitude at each time: + +average_stream = upsampled_stream.join(data_stream_A, pw.left.x == pw.right.x).select( + pw.this.x, y=(pw.left.y + pw.right.y) / 2 +) + +average_stream_pd = pw.debug.table_to_pandas(average_stream) +average_stream_pd = average_stream_pd.sort_values(by=["x"]) + +# + +plt.subplot(2, 1, 1) +plt.plot(average_stream_pd.x, average_stream_pd.y, color="green", label="Average value") +plt.legend() +set_params_plot() + +plt.tight_layout() +plt.show() +# - + +# ## Upsampling: Late and Out-of-Order Data Points +# +# Standard digital signal processing systems assume robust sampling and would deal with out-of-order and irregularly sampled points by buffering and delaying the response. +# This would be a problem for many real-world data applications in which irregular and out-of-order data is common place. +# The system would stall as there is no way to push a late point through the pipeline. +# +# This approach is impossible in a streaming setup where data points can be delayed indefinitely. +# The best we can do is to do the computation with the information at hand and then update our results whenever late data is received. +# This is exactly what Pathway does: your results are kept up-to-date with respect to the latest available data. +# You can see how it works in practice with [our example about the Gaussian filter](/developers/showcases/gaussian_filtering_python). + +# ## Upsampling: Conclusion +# +# You can now use Pathway to do signal processing on multiple data sources simultaneously, which is one of the common operations of DSP. +# With multiple data sources and filters, you should be able to do most of the common digital signal processing operations with Pathway! +# +# If you have trouble implementing a DSP operation, don't hesitate to join us on [discord](https://discord.com/invite/pathway) so we can help you! diff --git a/docs/2.developers/7.showcases/.linked_list/__init__.py b/docs/2.developers/7.showcases/.linked_list/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.linked_list/article.py b/docs/2.developers/7.showcases/.linked_list/article.py new file mode 100644 index 00000000..caf7f18e --- /dev/null +++ b/docs/2.developers/7.showcases/.linked_list/article.py @@ -0,0 +1,290 @@ +# --- +# title: Manipulating Linked Lists with transformers and transformer classes +# description: A guide to simple linked list operations +# aside: true +# article: +# date: '2023-01-09' +# thumbnail: '' +# tags: ['tutorial', 'connectors'] +# keywords: ['linked list', 'data structure', 'transformer classes'] +# author: 'pathway' +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Manipulating Linked Lists with transformers and transformer classes +# +# ## Prerequisites +# +# Understanding this recipe requires some familiarity with [transformer classes](/developers/user-guide/diving-deeper/transformer-introduction). +# +# ## Introduction +# +# In this recipe, we are going to use Pathway to implement one of the most common data structure: a [linked list](https://en.wikipedia.org/w/index.php?title=Linked_list&oldid=1113983256). +# We will discuss how to model the list itself, and how to write algorithms operating on lists. 
+# +# ## Code +# First things first - imports and constants. + +# %% +from __future__ import annotations + +from typing import Callable, Optional + +# %% +import pathway as pw +from pathway.tests.utils import T, assert_table_equality + +# %% [markdown] +# ## Linked list +# +# As a quick reminder, a linked list is a set of nodes which are connected by unidirectional links. +# Each node contains a pointer to the next one, the first node being called the *head* and the last one the *final node* or the tail. + + +# %% [markdown] +# ### I/O Data +# We define a simple schema where each node of the list points to either a next one, or is None if it's a final node. +# We also define an output schema for our transformer. +# +# The output of the transformer describes two algorithms useful for us. +# One is a numerical column named `len`, computing the number of next pointers until the end of the list. +# The second is a callable column, which for a particular node `X` and an int value `steps`, returns which node is `steps` next-jumps in front of node `X`. +# %% +class Node(pw.Schema): + next: Optional[pw.Pointer[Node]] + + +class Output(pw.Schema): + len: int + forward: Callable[..., Optional[pw.Pointer[Node]]] + + +# %% [markdown] +# ### Transformer logic +# We build a transformer class that operates on linked lists, named `linked_list_transformer`. Note the use of the keyword `class`. +# This transformer takes one table as the input, named `linked_list` below. Note that it is also defined using the nested `class` keyword. +# Since we plan the input table to be of schema `Node` (e.g. single column named `next` of proper type), we reflect this in two places: +# * in `input` parameter for `class linked_list` +# * in `next` attribute capturing a single column of the input table. +# +# The output attributes `len` and `forward` are defined as methods of the appropriate class, +# and are defined using recursive logic. +# %% + + +@pw.transformer +class linked_list_transformer: + class linked_list(pw.ClassArg, input=Node): + next = pw.input_attribute() + + @pw.output_attribute + def len(self) -> int: + if self.next is None: + return 1 + else: + return 1 + self.transformer.linked_list[self.next].len + + @pw.method + def forward(self, steps) -> Optional[pw.Pointer[Node]]: + if steps == 0: + return self.id + elif self.next is not None: + return self.transformer.linked_list[self.next].forward(steps - 1) + else: + return None + + +# %% [markdown] +# ### Inverting list +# Below we build a transformer that takes a linked list as the input, and returns a reversed list. +# This is effectively achieved by swapping `id` and `next` columns. But extra care is required +# in dealing with `Node` values: first we need to filter out rows with `next` being `None`, and +# then a proper `None` needs to be added to the output. +# %% + + +def reverse_linked_list(nodes: pw.Table[Node]) -> pw.Table[Node]: + reversed = ( + (filtered := nodes.filter(nodes.next.is_not_none())) + .select(next=filtered.id) + .with_id(filtered.next) + ) + return nodes.select(next=None).update_rows(reversed) + + +# %% [markdown] +# ### Tests +# We present some easy test cases here. 
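+# But first, a small, purely illustrative invocation on a two-node list (the labels are
+# arbitrary) shows how the transformer is called; it prints only the computed `len` column:
+
+# %%
+tiny = T(
+    """
+    label | next_label
+    a | b
+    b |
+    """,
+    id_from=["label"],
+)
+tiny += tiny.select(next=tiny.pointer_from(tiny.next_label, optional=True))
+tiny_list = linked_list_transformer(linked_list=tiny).linked_list
+pw.debug.compute_and_print(tiny_list.select(tiny_list.len))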
+# %% +def test_linked_list_len(): + nodes = T( + """ + label | next_label + 1 | 2 + 2 | 3 + 3 | 4 + 4 | 5 + 5 | 6 + 6 | 7 + 7 | 8 + 8 | + """, + id_from=["label"], + ) + nodes += nodes.select(next=nodes.pointer_from(nodes.next_label, optional=True)) + expected = T( + """ + | len + 1 | 8 + 2 | 7 + 3 | 6 + 4 | 5 + 5 | 4 + 6 | 3 + 7 | 2 + 8 | 1 + """ + ) + + ret = linked_list_transformer(linked_list=nodes).linked_list + + assert_table_equality(ret.select(ret.len), expected) + + +# %% +test_linked_list_len() + + +# %% + + +def test_linked_list_forward(): + nodes = T( + """ + label | next_label + 1 | 2 + 2 | 3 + 3 | 4 + 4 | 5 + 5 | 6 + 6 | 7 + 7 | 8 + 8 | + """, + id_from=["label"], + ) + nodes += nodes.select(next=nodes.pointer_from(nodes.next_label, optional=True)) + + linked_list = linked_list_transformer(linked_list=nodes).linked_list + + queries = T( + """ + node_label | steps + 1 | 0 + 2 | 1 + 6 | 2 + 6 | 3 + 8 | 0 + 8 | 2 + """ + ) + ret = queries.select( + result=linked_list.ix_ref(queries.node_label).forward(queries.steps) + ) + expected = T( + """ + result_label + 1 + 3 + 8 + None + 8 + None + """ + ) + expected = expected.select( + result=nodes.pointer_from(expected.result_label, optional=True) + ) + assert_table_equality(ret, expected) + + +# %% +test_linked_list_forward() + +# %% + + +def test_linked_list_reversal(): + nodes = T( + """ + label | next_label + 1 | 2 + 2 | 3 + 3 | 4 + 4 | 5 + 5 | 6 + 6 | 7 + 7 | 8 + 8 | + """, + id_from=["label"], + ) + nodes += nodes.select(next=nodes.pointer_from(nodes.next_label, optional=True)) + + expected = T( + """ + label | next_label + 1 | + 2 | 1 + 3 | 2 + 4 | 3 + 5 | 4 + 6 | 5 + 7 | 6 + 8 | 7 + """, + id_from=["label"], + ) + expected = expected.select( + next=expected.pointer_from(expected.next_label, optional=True) + ) + + ret = reverse_linked_list(nodes) + + assert_table_equality(ret, expected) + + +# %% +test_linked_list_reversal() + +# %% [markdown] +# ## Summary +# As an added bonus, observe that all transformers actually work even when on the input we provide multiple linked lists at once. +# For example, let's consider the following table: +# +# | | next | +# |---|------| +# | 1 | 2 | +# | 2 | 3 | +# | 3 | None | +# | 4 | 5 | +# | 5 | 6 | +# | 6 | None | +# | 7 | 8 | +# | 8 | 9 | +# | 9 | None | +# +# This table is a valid input and would be properly reversed by a `reverse_linked_list` transformer. +# As always, feel free to play and experiment with this code or head to the next section. 
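+# %% [markdown]
+# For instance, here is a minimal sketch (reusing the helpers defined above; the labels and
+# expected output are illustrative) that reverses the three independent lists from the table above:
+
+# %%
+def test_multiple_linked_lists_reversal():
+    nodes = T(
+        """
+        label | next_label
+        1 | 2
+        2 | 3
+        3 |
+        4 | 5
+        5 | 6
+        6 |
+        7 | 8
+        8 | 9
+        9 |
+        """,
+        id_from=["label"],
+    )
+    nodes += nodes.select(next=nodes.pointer_from(nodes.next_label, optional=True))
+
+    expected = T(
+        """
+        label | next_label
+        1 |
+        2 | 1
+        3 | 2
+        4 |
+        5 | 4
+        6 | 5
+        7 |
+        8 | 7
+        9 | 8
+        """,
+        id_from=["label"],
+    )
+    expected = expected.select(
+        next=expected.pointer_from(expected.next_label, optional=True)
+    )
+
+    assert_table_equality(reverse_linked_list(nodes), expected)
+
+
+# %%
+test_multiple_linked_lists_reversal()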
diff --git a/docs/2.developers/7.showcases/.live_data_jupyter/.gitignore b/docs/2.developers/7.showcases/.live_data_jupyter/.gitignore new file mode 100644 index 00000000..05fac093 --- /dev/null +++ b/docs/2.developers/7.showcases/.live_data_jupyter/.gitignore @@ -0,0 +1 @@ +ticker.csv diff --git a/docs/2.developers/7.showcases/.live_data_jupyter/__init__.py b/docs/2.developers/7.showcases/.live_data_jupyter/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.live_data_jupyter/_posthook.sh b/docs/2.developers/7.showcases/.live_data_jupyter/_posthook.sh new file mode 100644 index 00000000..4af75d40 --- /dev/null +++ b/docs/2.developers/7.showcases/.live_data_jupyter/_posthook.sh @@ -0,0 +1,4 @@ +cat article.md | \ +perl -0777 -pe 's@```python\n# Due to the bug described in https://github.com/jupyter/notebook/issues/1622.*# *pw.run\(\)\n```@Run In Colab@sg' \ +> _article.md +mv _article.md article.md diff --git a/docs/2.developers/7.showcases/.live_data_jupyter/article.py b/docs/2.developers/7.showcases/.live_data_jupyter/article.py new file mode 100644 index 00000000..1cc82e50 --- /dev/null +++ b/docs/2.developers/7.showcases/.live_data_jupyter/article.py @@ -0,0 +1,303 @@ +# --- +# title: "Working with live data streams in Jupyter" +# description: '' +# aside: true +# author: +# name: Avril Aysha +# description: Developer Advocate +# img: /assets/pictures/image_avril.png +# linkedin: https://www.linkedin.com/in/avrilaysha/ +# article: +# date: '2023-11-29' +# thumbnail: '' +# tags: ['tutorial', 'data-pipeline', 'showcase'] +# keywords: ['Jupyter notebook', 'interactivity', 'bokeh', 'panel', 'Bollinger bands', 'vizualization', 'Weighted Average', 'rolling statistics', 'dashboard'] +# notebook_export_path: notebooks/showcases/live-data-jupyter.ipynb +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Working with live data streams in Jupyter +# +# 💡 This notebook is one part of a [full-length tutorial](/developers/user-guide/exploring-pathway/from-jupyter-to-deploy/) depicting a production-grade data science scenario from data exploration to interactive dashboarding and containerized deploy. +# +# ## Jupyter notebooks & streaming data +# Jupyter notebooks are a great tool for working with data. They give you a friendly interactive interface in which you can process data comfortably and visualise it in intuitive, human-readable ways. +# +# Most data analysts use Jupyter notebooks with static (or "batch") workflows. Using Jupyter with streaming workflows can feel intimidating or have mixed results due to inadequate tooling. +# +# In this notebook you will learn how to visualise a live data stream in real-time from the familiar context of a Jupyter Notebook. You will use Pathway, [Bokeh](https://bokeh.org/) and [Panel](https://panel.holoviz.org/) to build a real-time data visualisation that will alert you when the data hits a critical threshold. You can follow along on this page or run the code for yourself [in Colab](https://colab.research.google.com/github/pathwaycom/pathway/blob/main/examples/notebooks/showcases/live-data-jupyter.ipynb) or [from Github](https://github.com/pathwaycom/pathway/blob/main/examples/notebooks/showcases/live-data-jupyter.ipynb). 
+# +# Because Pathway [unites static and stream processing](/developers/user-guide/connecting-to-data/switch-from-batch-to-streaming/#switching-from-batch-to-streaming) in a single syntax, you can use the exact same code for both batch and streaming. This way you can test your workflow with a static snapshot and then simply switch out the data source for a live stream when you want to visualise the full data flow in real-time. +# +# Let's jump in! 🪂 +# +# ![image](https://github.com/pathwaycom/pathway-examples/blob/c20cd69a6c9c87fc70a9082de57666c50f2ab3c2/documentation/from_jupyter_to_deploy/jupyter-demo-final-smallest-compressed.gif?raw=true) +# +# ## What we'll do +# +# In this notebook, we will take a stream of financial data and implement a simple trading algorithm using [Bollinger Bands](https://en.wikipedia.org/wiki/Bollinger_Bands). This will be a helpful example of visualizing streaming data and will also show the usefulness of getting critical alerts in real-time. + +# You don't need to fully understand the algorithm or the financial terminology. Focus on the steps you will to take a live data stream, perform some computation on it in real-time, and output a data visualisation in your Jupyter Notebook that updates on-the-fly. These steps are generalizable to any data streaming use case. +# +# Concretely, we will compute the 1-minute running mean of Volume Weighted Average Price (`vwap`) and the 20-minute volatility, the Volume Weighted Standard Deviation (`wvstd`) on the price time series. This creates two bands around the mean price, with most price movements happening between the bands. Intuitively, when a price approaches the upper band, it is abnormally high and may likely drop - it is a good moment to _SELL_ and we'll raise an alert. Likewise, when the price approaches the lower band, it indicates it is low and may grow to revert to the mean - we'll raise an alert that it's a good moment to _BUY_. For further reliability, the BUY/SELL actions are performed only when there is a significant volume of trades, indicating that the outlying price is not a one-off event. +# %% [markdown] +# ## Imports and setup +# +# First, import the necessary libraries: +# +# - [`pathway`](/developers/user-guide/introduction/welcome/) for data processing, +# - `datetime` for date manipulation, +# - `panel` and `bokeh` for dashboarding. + +# %% +import datetime + +import bokeh.models +import bokeh.plotting +import panel + +import pathway as pw + +# %% [markdown] +# And then fetch the sample data from Github. This data was generated using [polygon.io](https://polygon.io). + +# %% +%%capture --no-display +!wget -nc https://gist.githubusercontent.com/janchorowski/e351af72ecd8d206a34763a428826ab7/raw/ticker.csv + +# %% [markdown] +# ## Data source setup +# +# Create a streaming data source that replays data in a CSV file. This is an easy way to simulate a live data stream without any infrastructure hassle. You can of course use Pathway with a real, production-grade data stream, for example [from Kafka](/developers/user-guide/exploring-pathway/from-jupyter-to-deploy/#kafka-integration) or [Redpanda](/developers/user-guide/connectors/switching-to-redpanda/). +# +# The `input_rate` parameter controls how fast the data is replayed. +# +# 💡 No data processing actually happens when you run this cell. We are building a computational graph that will only be executed at the end of the notebook. This allows Pathway to optimise the computations and perform them as fast as possible when the data starts streaming. 
+ +# %% +fname = "ticker.csv" +schema = pw.schema_from_csv(fname) +data = pw.demo.replay_csv(fname, schema=schema, input_rate=1000) + +# # For static data exploration use +# data = pw.io.csv.read(fname, schema=schema, mode="static") + +# Parse the timestamps +data = data.with_columns(t=data.t.dt.utc_from_timestamp(unit="ms")) + +# %% [markdown] +# ## Switching between static and stream processing +# +# As you probably noticed, the code block above also includes a commented-out section. You can use this line instead of `data = pw.demo.replay_csv(...)` to test the workflow with static data. This is the only change you need to make in your code to switch between static and stream processing. Learn more about Pathway's unified engine and single syntax in [our User Guide](/developers/user-guide/connecting-to-data/switch-from-batch-to-streaming#switching-from-batch-to-streaming). +# +# ## 20-minute rolling statistics +# +# Now it's time to build your trading algorithm. There is no need to fully understand the terminology or the math here. What's most important to grasp is how you are taking a stream of data and performing a windowing transformation to get more analytical value out of the raw data. +# +# Start by creating the first of our two Bollinger Bands: the 20-minute volatility measured as the Volume Weighted Standard Deviation. Use a [`sliding window`](/developers/user-guide/temporal-data/windows-manual/#temporal-sliding-windowing) to compute at every minute the volume weighted price mean and standard deviation aggregate on the past 20 minutes of data. The `behavior` option tells Pathway that the window should emit the statistics only when it is finished - we do not want to see incomplete results. +# +# To compute the standard deviation, use the identity: +# +# $$ +# \sigma(X) = \sqrt{\operatorname E\left[(X - \operatorname E[X])^2\right]} = \sqrt{\operatorname E\left[X^2\right] - (\operatorname E[X])^2}, +# $$ +# +# which is easily expressible using [Pathway reducers](/developers/api-docs/reducers/): we first compute the total $\mathrm{volume}$, $\mathrm{price}$, and $\mathrm{price}^2$. We then postprocess them to obtain the mean ($\mathrm{vwap}$), standard deviation ($\mathrm{vwstd}$), and Bollinger Bands places at $\mathrm{vwap} \pm 2\cdot \mathrm{vwstd}$. +# +# Or in simpler terms: the code block below takes your incoming data stream and calculates important statistics in real-time. These statistics are continually updated as the data comes in so that you can identify critical moments as they happen. + +# %% +minute_20_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.sliding( + hop=datetime.timedelta(minutes=1), duration=datetime.timedelta(minutes=20) + ), + behavior=pw.temporal.exactly_once_behavior(), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + transact_total2=pw.reducers.sum(pw.this.volume * pw.this.vwap**2), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) + .with_columns( + vwstd=(pw.this.transact_total2 / pw.this.volume - pw.this.vwap**2) ** 0.5 + ) + .with_columns( + bollinger_upper=pw.this.vwap + 2 * pw.this.vwstd, + bollinger_lower=pw.this.vwap - 2 * pw.this.vwstd, + ) +) + +# %% [markdown] +# ## 1-minute rolling statistics +# +# Now it's time to compute the second Bollinger Bands: the 1-minute running mean. You will need to compute the mean price over the last minute of trades. 
+# +# The code is analogous to the 20-minute statistics but simpler: you can use a [`tumbling window`](/glossary/tumbling-window) and don't have to compute the standard deviation. + +# %% +minute_1_stats = ( + data.windowby( + pw.this.t, + window=pw.temporal.tumbling(datetime.timedelta(minutes=1)), + behavior=pw.temporal.exactly_once_behavior(), + instance=pw.this.ticker, + ) + .reduce( + ticker=pw.this._pw_instance, + t=pw.this._pw_window_end, + volume=pw.reducers.sum(pw.this.volume), + transact_total=pw.reducers.sum(pw.this.volume * pw.this.vwap), + ) + .with_columns(vwap=pw.this.transact_total / pw.this.volume) +) + +# %% [markdown] +# ## Joining the statistics +# +# Now join the 20-minute and 1-minute statistics, gathering all the information needed for alerting in one place. Alert triggering is now a breeze. + +# %% +joint_stats = ( + minute_1_stats.join( + minute_20_stats, pw.left.t == pw.right.t, pw.left.ticker == pw.right.ticker + ) + .select( + *pw.left, + bollinger_lower=pw.right.bollinger_lower, + bollinger_upper=pw.right.bollinger_upper, + ) + .with_columns( + is_alert=(pw.this.volume > 10000) + & ( + (pw.this.vwap > pw.this.bollinger_upper) + | (pw.this.vwap < pw.this.bollinger_lower) + ) + ) + .with_columns( + action=pw.if_else( + pw.this.is_alert, + pw.if_else(pw.this.vwap > pw.this.bollinger_upper, "sell", "buy"), + "hodl", + ) + ) +) +alerts = joint_stats.filter(pw.this.is_alert) + + +# %% [markdown] +# ## Dashboard creation +# +# Now create a `Bokeh` plot and `Panel` table visualization: the plot shows the Bollinger Bands along with the price running mean and indicates the price of buy and sell decisions. The table gathers all the decisions conveniently for further processing, such as reducing it to compute a historical evaluation of the gains of the strategy. +# +# When the cell is executed, placeholder containers are created for the plot and table visualization. They will be populated with live data when the computation is started (i.e. when running the final `pw.run()` at the end of this notebook). + + +# %% +def stats_plotter(src): + actions = ["buy", "sell", "hodl"] + color_map = bokeh.models.CategoricalColorMapper( + factors=actions, palette=("#00ff00", "#ff0000", "#00000000") + ) + + fig = bokeh.plotting.figure( + height=400, + width=600, + title="20 minutes Bollinger bands with last 1 minute average", + x_axis_type="datetime", + y_range=(188.5, 191), + ) + fig.line("t", "vwap", source=src) + band = bokeh.models.Band( + base="t", + lower="bollinger_lower", + upper="bollinger_upper", + source=src, + fill_alpha=0.3, + fill_color="gray", + line_color="black", + ) + + fig.scatter( + "t", + "vwap", + color={"field": "action", "transform": color_map}, + size=10, + marker="circle", + source=src, + ) + + fig.add_layout(band) + return fig + + +viz = panel.Row( + joint_stats.plot(stats_plotter, sorting_col="t"), + alerts.select(pw.this.ticker, pw.this.t, pw.this.vwap, pw.this.action).show( + include_id=False, sorters=[{"field": "t", "dir": "desc"}] + ), +) +# _MD_SHOW_viz + +# %% [markdown] +# ## Running the computation +# +# All the hard work is done! The final step is to start the Pathway data processing engine using the `pw.run()` command: +# +# %% +%%capture --no-display +pw.run() +# %% [markdown] +# Watch how the dashboard is updated in realtime! The basic Bollinger Bands action trigger seems to be working - the green buy decision markers are frequently followed by the red sell markers at a slightly higher price. 
+# +# While the computation is running, `pathway` prints important statistics such as message processing latency. +# +# Successful evaluation of the code should result in the animation: +# ![image](https://github.com/pathwaycom/pathway-examples/blob/c20cd69a6c9c87fc70a9082de57666c50f2ab3c2/documentation/from_jupyter_to_deploy/jupyter-demo-final-smallest-compressed.gif?raw=true) + +# %% +# _MD_COMMENT_START_ +# The code below is managed using the _posthook.sh please change/delete it accordingly +# Do not change the `# Due to ..` line!!!! +# _MD_COMMENT_END_ +# Due to the bug described in https://github.com/jupyter/notebook/issues/1622 +# Please execute `pw.run()` manually, after the widget above is initialized and visible. + +# pw.run() + +# %% [markdown] +# ## Jupyter Notebooks & Streaming Data in Production +# +# Congratulations! You have succesfully built a live data streaming pipeline with useful data visualisations and real-time alerts, right from a Jupyter notebook 😄 +# +# This is just a taste of what is possible. If you're interested in diving deeper and building a production-grade data science pipeline all the way from data exploration to deployment, you may want to check out the full-length [From Jupyter to Deploy](/developers/user-guide/exploring-pathway/from-jupyter-to-deploy/) tutorial. +# +# ## What else can you do with Pathway? +# +# * Perform machine learning in real time. e.g. [ real-time +# Classification](/developers/showcases/lsh/lsh_chapter1/) , [real-time fuzzy joins](/developers/showcases/fuzzy_join/fuzzy_join_chapter2/) +# +# * Transform unstructured data to structured data using [live LLM pipelines](https://github.com/pathwaycom/llm-app) +# +# * Making [joins](/developers/user-guide/temporal-data/interval-join) simple with timeseries data +# +# And so much more... Read more about what we can do in the [developer docs](/developers/user-guide/introduction/welcome/). +# +# We would love to have you try out [Pathway on GitHub](https://github.com/pathwaycom/pathway). diff --git a/docs/2.developers/7.showcases/.merging-time-series/__init__.py b/docs/2.developers/7.showcases/.merging-time-series/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.merging-time-series/article.py b/docs/2.developers/7.showcases/.merging-time-series/article.py new file mode 100644 index 00000000..206a6c66 --- /dev/null +++ b/docs/2.developers/7.showcases/.merging-time-series/article.py @@ -0,0 +1,480 @@ +# --- +# title: "How to combine two time series" +# description: "Learn how to combine between two time series with different timestamps in Pathway." +# aside: true +# article: +# date: '2023-04-28' +# thumbnail: '/assets/content/tutorials/time_series/thumbnail-time-series.png' +# tags: ['Time Series'] +# keywords: ['time series', 'multiple data sources', 'interpolation', 'connectors'] +# author: 'olivier' +# notebook_export_path: notebooks/tutorials/combining-time-series.ipynb +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Combining two time series in Pathway +# In this article, you will learn how to combine time series in Pathway. 
+# +# With the emergence of IoT technology, we now have a wide range of sensor devices at our disposal that can measure almost anything, from GPS positions to humidity and temperature. +# Since each measurement is timestamped, this generated data are time series: the data can be analyzed and modeled as a sequence of values that change over time. +# +# ![Example of combination of two time series](/assets/content/tutorials/time_series/time_series_join_process.svg) +# +# While combining time series data can be challenging due to the lack of matching timestamps between different datasets, it is a crucial operation. +# Combining values from multiple sensors measuring the same metric can improve measurement accuracy. +# Also, combining various metrics on a common time index allows in-depth analysis and modeling. +# +# ::article-img +# --- +# src: '/assets/content/tutorials/time_series/time_series_1.svg' +# alt: 'Example of two time series with different timestamps' +# width: '500px' +# class: 'mx-auto' +# --- +# :: +# +# In this article, we will explore how to combine time series data using Pathway by calculating the average speed of a fleet of cars. +# So, fasten your seatbelt and get ready to explore the world of time series! +# +# ## Time series +# +# A time series is a type of data that records observations or measurements over time. It can be anything from stock prices, weather data, or the number of customers visiting a store to more complex measures like physiological signals from medical devices. +# Consider a time series as a sequence of data points collected at regular or irregular intervals over time. Each data point in a time series represents a measurement or observation made at a specific point in time. +# +# ::inline +# +# ::article-img +# --- +# src: '/assets/content/tutorials/time_series/single_time_series.svg' +# alt: 'Example of a time series represented as an event stream' +# width: '75px' +# --- +# :: +# +# ::article-img +# --- +# src: '/assets/content/tutorials/time_series/time_series_graph.svg' +# alt: 'Example of a time series represented as a graph' +# width: '350px' +# --- +# :: +# +# ::article-img +# --- +# src: '/assets/content/tutorials/time_series/time_series_table-cropped.svg' +# alt: 'Example of a time series represented as a table' +# width: '150px' +# --- +# :: +# +# :: +# +# +# +# The concept of time series is critical in many real-world applications, from finance and economics to healthcare and meteorology. Analyzing time series data allows us to gain insights into patterns, trends, and relationships between variables over time. For example, we can use time series data to forecast future values, identify anomalies, or monitor changes in a system. +# Time series data can be represented in various formats, from simple spreadsheets to complex databases. However, analyzing and modeling time series data can be challenging due to its complex nature and different types of noise and anomalies. +# +# +# ### Combining time series +# +# Combining time series is the process of combining different time series into a single time series based on a common timestamp or index. In other words, combining time series consists in merging data from various sources into a comprehensive time series, allowing for deeper analysis and modeling. +# +# ![Example of combination of two time series](/assets/content/tutorials/time_series/time_series_join_process.svg) +# +# Combining time series are essential for several reasons. 
Firstly, it can improve the accuracy of the measurements by combining the values of several sensors measuring the same metric. For example, imagine you have multiple sensors measuring the temperature of a room. By combining the time series data from these sensors, you can get a more accurate representation of the temperature in the room. +# +# Secondly, by combining and analyzing various time series data streams, valuable insights can be derived across different domains, enabling performance optimization, predictive maintenance, resource management, and strategic decision-making: +# - Website analytics 💻: Combining time series data on website traffic, user engagement, conversion rates, and marketing campaign metrics can provide insights into user behavior, measure the effectiveness of marketing efforts, and optimize website performance. +# - Health monitoring 🩺: Combining time series data from wearable devices, such as heart rate, sleep patterns, and physical activity, can help track and analyze individuals' health and wellness trends, enabling personalized healthcare interventions. +# - Environmental monitoring 🌡️: Combining time series data from air quality sensors, weather stations, and pollutant levels can provide a comprehensive understanding of the environment's condition, aid in pollution control efforts, and support urban planning initiatives. +# - Supply chain management 📦: Combining time series data on inventory levels, production rates, transportation delays, and customer demand can optimize inventory management, improve delivery schedules, and enhance overall supply chain efficiency. +# - Analyzing stock market data 📈: Combining time series of stock prices, trading volumes, and financial indicators can provide a comprehensive view of the market's behavior and aid in identifying trends, correlations, and investment opportunities. +# - Technical analysis and development 🏎️: Combining time series data on car sensors readings, engine performance, aerodynamic data, and telemetry can aid in the technical analysis and development of Formula 1 cars, leading to improvements in speed, reliability, and overall performance. +# +# Combining time series is an essential operation in time series analysis, but it can be challenging due to the need for matching timestamps between different datasets. Nonetheless, various techniques and tools can help us merge time series data effectively, such as interpolation or merging on the closest timestamp. +# %% [markdown] jp-MarkdownHeadingCollapsed=true +# ## The average speed of a taxi and its passenger +# Imagine you are working for a taxi company, and people are using your app to book a ride. +# You can access the GPS traces of the taxi and the passengers. +# By using those two traces jointly, you can better approximate the average speed of the ride. +# This allows you to identify potential bottlenecks or inefficiencies in the system and take corrective measures to improve traffic flow and reduce travel time. +# +# %% [markdown] +# ## Creating the dataset +# +# Unfortunately, it is hard to find publicly available datasets with the characteristics we are interested in. +# Existing publicly available GPS datasets contain individual taxi traces without the passengers' GPS traces. +# You will generate our pairs of traces from one of such individual traces. 
+# +# ### The dataset: GeoLife +# You will use one trace of the [GeoLife dataset](https://www.microsoft.com/en-us/research/publication/geolife-gps-trajectory-dataset-user-guide/) from Microsoft Research. +# GeoLife is a GPS trajectory dataset containing 17,621 trajectories obtained using different GPS loggers and phones. +# +# ### Generation of a fleet of two cars +# Let's generate two traces by splitting the trace: each GPS data point is alternatively assigned to one of the new traces. +# This ensures that the two traces have the same trajectory while having different timestamps. +# +# You can download the trace of this example by uncommenting and executing the following command: + +# %% +# %%capture --no-display +# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/time-series-raw-trace.plt -O raw-trace.plt + +# %% [markdown] +# To split the trace into two different traces, we simply read the file line by line and write each line into new CSV files: + +# %% +input_trace = "./raw-trace.plt" +trace_taxi = "./trace_taxi.csv" +trace_passenger = "./trace_passenger.csv" +header_row = "lat,lng,const,alt,elapsed-time,date,time\n" + +with open(input_trace, "r") as file: + line = file.readline() + for _ in range(5): + line = file.readline() + + trace1 = open(trace_taxi, "w+") + trace2 = open(trace_passenger, "w+") + trace1.write(header_row) + trace2.write(header_row) + + dataset = 1 + + while line: + line = file.readline() + if dataset: + trace1.write(line) + dataset = 0 + else: + trace2.write(line) + dataset = 1 + trace1.close() + trace2.close() + +# %% [markdown] +# ## Loading the data sources in Pathway +# +# To connect Pathway to the two data sources, you have to use Pathway's input connectors. +# You do not need all the provided data; you can keep only the interesting ones, namely the latitude, longitude, altitude, date, and time: + +# %% +import pathway as pw + +columns = ["lat", "lng", "alt", "date", "time"] + + +class InputSchema(pw.Schema): + lat: float + lng: float + alt: float + date: str + time: str + + +t1 = pw.io.csv.read(trace_taxi, schema=InputSchema, mode="static") +t2 = pw.io.csv.read(trace_passenger, schema=InputSchema, mode="static") + +# %% [markdown] +# Note that you use the static mode in this example, but in practice, you should use the streaming mode. +# You can also [generate a datastream from a static file](/developers/user-guide/connectors/custom-python-connectors). + +# %% [markdown] +# ### Formatting the data +# After loading the raw data, it's important to format it properly to ensure it can be processed efficiently. +# +# First, the time is currently represented by two strings, one for the date and one for the time. +# Having a single timestamp can simplify the data processing task. +# You can use `dateutil` and `datetime` to parse them into a timestamp. + +# %% +from datetime import datetime, time + +from dateutil import parser, tz + +default_date = datetime.combine( + datetime.now(), time(0, tzinfo=tz.gettz("America/New_York")) +) + + +def convert_to_timestamp(date, time) -> int: + datestring = date + " " + time + yourdate = parser.parse(datestring, default=default_date) + return int(datetime.timestamp(yourdate)) + + +def format_table(t): + return t.select( + *pw.this.without(pw.this.date, pw.this.time), + timestamp=pw.apply(convert_to_timestamp, pw.this.date, pw.this.time), + ) + + +t1_timestamp = format_table(t1) +t2_timestamp = format_table(t2) + + +# %% [markdown] +# Now your time series are ready to be combined and processed! 
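+
+# %% [markdown]
+# As a quick, purely illustrative check of the conversion helper, you can call it directly
+# on a date and time written in the same format as the GeoLife trace:
+
+# %%
+print(convert_to_timestamp("2008-10-23", "02:53:04"))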
+ +# %% [markdown] +# ### Obtaining a preview of the data +# +# Pathway is a framework capable of handling both static and streaming data. +# However, it is primarily designed to handle streaming data and enable real-time data processing. +# +# To ensure that the data is being processed correctly, you can define a function `preview_table` to preview the data. +# ⚠️ It's important to note that **this function should not be used in a production environment** since it relies on `pw.debug.compute_and_print` and static data. +# Its primary purpose is to check that the data is being processed correctly and to help with the development and testing phase of the data processing pipeline. +# You should use [Pathway's output connectors](/developers/user-guide/connecting-to-data/connectors/) to access the data in streaming mode. +# +# In this case, you can filter all the entries with a timestamp higher than a given value to only display a small portion of the dataset. + +# %% +def preview_table(table, max_timestamp=1224744825): + table = table.filter(pw.this.timestamp < max_timestamp) + pw.debug.compute_and_print(table) + + +preview_table(t1_timestamp) +preview_table(t2_timestamp) + +# %% [markdown] +# The value 1224744825 is hand-picked to fit this dataset; you can replace it by any desired value. + +# %% [markdown] +# ## Combining the time series and computing the average speed +# To compute the average speed of the traces, you will proceed as follows: +# 1. Concatenate the time series to obtain a table with the timestamps of both traces. +# 2. Add the existing positions: each timestamp should have a position and a missing position. +# 3. Do an interpolation to estimate the missing values. +# 4. Compute the average speed +# +# ### 1. Concatenation +# +# The first step to combine the two time series is to concatenate them: you want a table with all the timestamps. +# +# ![Concatenating the timestamps of two time series](/assets/content/tutorials/time_series/concatenating_timestamps.svg) +# +# You can do it easily in Pathway with `pw.Table.concat_reindex`: + +# %% +merged_timestamps = pw.Table.concat_reindex( + t1_timestamp[["timestamp"]], t2_timestamp[["timestamp"]] +) + +# %% +preview_table(merged_timestamps) + + +# %% [markdown] +# ### 2. Adding existing positions +# Now that you have a table with the timestamps, you must add the positions. +# You can add the positions by doing a left join on the timestamp table. +# +# ![Time series with missing values](/assets/content/tutorials/time_series/time_series_missing_values.svg) +# +# +# You need to rename the columns `lat`, `lng`, and `alt` to `lat_1`, `lng_1`, `alt_1`, `lat_2`, `lng_2`, and `alt_2` to make the distinction between the two data sources. + +# %% +joined_table = ( + merged_timestamps.join_left(t1_timestamp, pw.left.timestamp == pw.right.timestamp) + .select( + *pw.left, + **pw.right[["lat", "lng", "alt"]].with_suffix("_1"), + ) + .join_left(t2_timestamp, pw.left.timestamp == pw.right.timestamp) + .select( + *pw.left, + **pw.right[["lat", "lng", "alt"]].with_suffix("_2"), + ) +) +preview_table(joined_table) + +# %% [markdown] +# Your table now has all the existing positions, but the columns with the positions are half-empty. +# To fill the missing positions, you must compute an interpolation between the previous and last positions. + +# %% [markdown] +# ### 3. Interpolating the positions +# Now, you will fill the missing values by interpolating with the previous and next values you found. 
+# +# ![Filling the missing values](/assets/content/tutorials/time_series/filling_missing_values.svg) +# + +# %% [markdown] +# You need to do a linear interpolation on each column, using the column timestamp as index. +# This can be done using Pathway's `interpolate` function: + +# %% +interpolated_table = joined_table.interpolate( + pw.this.timestamp, + pw.this.lat_1, + pw.this.lng_1, + pw.this.alt_1, + pw.this.lat_2, + pw.this.lng_2, + pw.this.alt_2, +) + +# %% +preview_table(interpolated_table) + +# %% [markdown] +# And voila! You have successfully combined two time series! 🎉 +# +# All you need to do it to compute the average speed now. + +# %% [markdown] +# ### 4. Computing the average speed +# +# To compute the average speed, you will first calculate the speed by dividing the traveled distance by the time spent between the next and previous time. +# +# To simplify the computation of the speed, you can group the different values into two columns `position_1` and `positions_2`: + +# %% +interpolated_table = interpolated_table.select( + pw.this.timestamp, + interpolated_position_1=pw.make_tuple(pw.this.lat_1, pw.this.lng_1, pw.this.alt_1), + interpolated_position_2=pw.make_tuple(pw.this.lat_2, pw.this.lng_2, pw.this.alt_2), +) +preview_table(interpolated_table) + +# %% [markdown] +# To compute the distance, you can use the `pyproj` package, which you can install by uncommenting and executing the following command: + +# %% +# %%capture --no-display +# # !pip install pyproj + +# %% +from pyproj import Geod + +g = Geod(ellps="WGS84") + + +def compute_speed(t_prev, position_prev, t_next, position_next): + try: + _, _, distance_2d = g.inv( + position_prev[1], position_prev[0], position_next[1], position_next[0] + ) + except: + return 0.0 + return float(distance_2d / (t_next - t_prev)) + + +# %% [markdown] +# Note this is the simple 2-dimensional distance, but you can use your favorite distance metric. + +# %% [markdown] +# You need to order the table to obtain the previous and next values. 
+# This can be done with the `sort` function which provides `prev` and `next` columns, containing pointers to the previous and next rows: + +# %% +interpolated_table += interpolated_table.sort(key=pw.this.timestamp) +preview_table(interpolated_table) + + +# %% [markdown] +# Then, you can use transformer classes to compute the instant speed on all the points for both sources: + +# %% +@pw.transformer +class computing_speed: + class ordered_ts(pw.ClassArg): + timestamp = pw.input_attribute() + interpolated_position_1 = pw.input_attribute() + interpolated_position_2 = pw.input_attribute() + prev = pw.input_attribute() + next = pw.input_attribute() + + @pw.output_attribute + def speed_1(self) -> float: + t = self.timestamp + t_prev = t + position_prev = self.interpolated_position_1 + t_next = t + position_next = self.interpolated_position_1 + if self.prev is not None: + t_prev = self.transformer.ordered_ts[self.prev].timestamp + position_prev = self.transformer.ordered_ts[ + self.prev + ].interpolated_position_1 + if self.next is not None: + t_next = self.transformer.ordered_ts[self.next].timestamp + position_next = self.transformer.ordered_ts[ + self.next + ].interpolated_position_1 + return compute_speed(t_prev, position_prev, t_next, position_next) + + @pw.output_attribute + def speed_2(self) -> float: + t = self.timestamp + t_prev = t + position_prev = self.interpolated_position_2 + t_next = t + position_next = self.interpolated_position_2 + if self.prev is not None: + t_prev = self.transformer.ordered_ts[self.prev].timestamp + position_prev = self.transformer.ordered_ts[ + self.prev + ].interpolated_position_2 + if self.next is not None: + t_next = self.transformer.ordered_ts[self.next].timestamp + position_next = self.transformer.ordered_ts[ + self.next + ].interpolated_position_2 + return compute_speed(t_prev, position_prev, t_next, position_next) + + +table_speed = computing_speed(ordered_ts=interpolated_table).ordered_ts +table_speed = pw.Table.from_columns(interpolated_table.timestamp) + table_speed +preview_table(table_speed) + +# %% [markdown] +# Finally, you can compute the average of the two speeds at each timestamp: + +# %% +average_speed_table = table_speed.select( + pw.this.timestamp, + speed=(pw.this.speed_1 + pw.this.speed_2) / 2.0, +) +preview_table(average_speed_table) + +# %% [markdown] +# You have now the speed for all the timestamps. +# +# Although initially designed for static GPS traces, this configuration can easily be adapted for a [streaming setup](/developers/user-guide/connecting-to-data/streaming-and-static-modes/) by modifying the [connectors](/developers/user-guide/connecting-to-data/connectors/). In streaming mode, all calculations will be automatically refreshed whenever a new GPS position is received. + +# %% [markdown] +# ## Conclusions +# Congratulations, you now have the skills to combine time series and unlock new insights in your data! +# +# This article focused on combining two GPS traces to compute the average speed. +# However, the principles we discussed can be applied to a wide range of time series data, from stock prices to weather data to sensor readings. +# +# By combining and analyzing time series data, you can uncover patterns, trends, and relationships that may not be immediately apparent from individual series. +# This can be especially useful in complex modeling scenarios that involve multiple data sources. +# +# For instance, imagine you want to predict weather patterns using a combination of humidity, temperature, and other metrics. 
+# By combining these time series, you could create a more comprehensive picture of the conditions affecting your area's weather patterns. +# +# The possibilities for time series analysis are endless, and if you're interested in learning more, be sure to check out [our article on log monitoring](/developers/showcases/realtime-log-monitoring). +# In it, we explore how to use time series analysis to detect anomalies in your web server's logs. diff --git a/docs/2.developers/7.showcases/.merging-time-series/raw-trace.plt b/docs/2.developers/7.showcases/.merging-time-series/raw-trace.plt new file mode 100644 index 00000000..e163505e --- /dev/null +++ b/docs/2.developers/7.showcases/.merging-time-series/raw-trace.plt @@ -0,0 +1,30 @@ +Geolife trajectory +WGS 84 +Altitude is in Feet +Reserved 3 +0,2,255,My Track,0,0,2,8421376 +0 +39.984702,116.318417,0,492,39744.1201851852,2008-10-23,02:53:04 +39.984683,116.31845,0,492,39744.1202546296,2008-10-23,02:53:10 +39.984686,116.318417,0,492,39744.1203125,2008-10-23,02:53:15 +39.984688,116.318385,0,492,39744.1203703704,2008-10-23,02:53:20 +39.984655,116.318263,0,492,39744.1204282407,2008-10-23,02:53:25 +39.984611,116.318026,0,493,39744.1204861111,2008-10-23,02:53:30 +39.984608,116.317761,0,493,39744.1205439815,2008-10-23,02:53:35 +39.984563,116.317517,0,496,39744.1206018519,2008-10-23,02:53:40 +39.984539,116.317294,0,500,39744.1206597222,2008-10-23,02:53:45 +39.984606,116.317065,0,505,39744.1207175926,2008-10-23,02:53:50 +39.984568,116.316911,0,510,39744.120775463,2008-10-23,02:53:55 +39.984586,116.316716,0,515,39744.1208333333,2008-10-23,02:54:00 +39.984561,116.316527,0,520,39744.1208912037,2008-10-23,02:54:05 +39.984536,116.316354,0,525,39744.1209490741,2008-10-23,02:54:10 +39.984523,116.316188,0,531,39744.1210069444,2008-10-23,02:54:15 +39.984516,116.315963,0,536,39744.1210648148,2008-10-23,02:54:20 +39.984523,116.315823,0,541,39744.1211226852,2008-10-23,02:54:25 +39.984574,116.315611,0,546,39744.1211805556,2008-10-23,02:54:30 +39.984568,116.315407,0,551,39744.1212384259,2008-10-23,02:54:35 +39.984538,116.315148,0,556,39744.1212962963,2008-10-23,02:54:40 +39.984501,116.314907,0,560,39744.1213541667,2008-10-23,02:54:45 +39.984532,116.314808,0,564,39744.121412037,2008-10-23,02:54:50 +39.984504,116.314625,0,569,39744.1214699074,2008-10-23,02:54:55 +39.984485,116.314426,0,574,39744.1215277778,2008-10-23,02:55:00 diff --git a/docs/2.developers/7.showcases/.pagerank/__init__.py b/docs/2.developers/7.showcases/.pagerank/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.pagerank/article.py b/docs/2.developers/7.showcases/.pagerank/article.py new file mode 100644 index 00000000..29192be2 --- /dev/null +++ b/docs/2.developers/7.showcases/.pagerank/article.py @@ -0,0 +1,149 @@ +# --- +# title: Computing PageRank +# description: Demonstration of a PageRank computation +# notebook_export_path: notebooks/tutorials/pagerank.ipynb +# aside: true +# article: +# date: '2022-11-07' +# thumbnail: '/assets/content/blog/th-computing-pagerank.png' +# tags: ['tutorial', 'machine-learning'] +# keywords: ['pagerank', 'graph'] +# author: 'pathway' +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Computing PageRank +# +# ## Introduction +# PageRank is best known for its success in ranking web 
pages in Google Search engine. +# Here is a [quick description](https://en.wikipedia.org/w/index.php?title=PageRank&oldid=1111494883): +# > PageRank works by counting the number and quality of links to a page to determine a +# > rough estimate of how important the website is. The underlying assumption is that +# > more important websites are likely to receive more links from other websites. +# +# In fact, the algorithm outputs a probability distribution that represents the +# likelihood of arriving at any particular page after randomly clicking on links for a while. +# We will simulate this behavior by the following 'surfing the Internet' procedure: +# - Initially, at each page, some amount of people start surfing the internet from that page. +# - In each turn, some users decide to click on a random link and visit a new page. +# - We iterate for a fixed number of rounds. +# +# This article assumes that you are already familiar with some basics of [Pathway transformations](/developers/user-guide/introduction/concepts#processing-the-data-with-transformations). +# +# ## Code +# First things first - imports and constants. + +# %% +from typing import Any + +import pathway as pw + + +# %% [markdown] +# ### I/O Data +# We use an `Edge` schema to represent the graph and `Result` schema to represent the final ranks. +# %% +class Edge(pw.Schema): + u: Any + v: Any + + +class Result(pw.Schema): + rank: float + + +# %% [markdown] +# `pagerank` performs one turn of 'surfing the Internet' procedure by uniformly +# distributing rank from each node to all its adjacent nodes, for a fixed number of rounds. +# +# %% +def pagerank(edges: pw.Table[Edge], steps: int = 5) -> pw.Table[Result]: + in_vertices = edges.groupby(id=edges.v).reduce(degree=0) + out_vertices = edges.groupby(id=edges.u).reduce(degree=pw.reducers.count()) + degrees = pw.Table.update_rows(in_vertices, out_vertices) + base = out_vertices.difference(in_vertices).select(flow=0) + + ranks = degrees.select(rank=6_000) + + grouper = edges.groupby(id=edges.v) + + for step in range(steps): + outflow = degrees.select( + flow=pw.if_else( + degrees.degree == 0, 0, (ranks.rank * 5) // (degrees.degree * 6) + ) + ) + + inflows = edges.groupby(id=edges.v).reduce( + flow=pw.reducers.sum(outflow.ix(edges.u).flow) + ) + + inflows = pw.Table.concat(base, inflows) + + ranks = inflows.select(rank=inflows.flow + 1_000).with_universe_of(degrees) + return ranks + + +# %% [markdown] +# ### Tests +# We present two easy test cases here. +# A test case with a single 3-vertices loop with one backward edge. +# %% +# directed graph +vertices = pw.debug.table_from_markdown( + """ + | + a | + b | + c | + """ +).select() +edges = pw.debug.table_from_markdown( + """ + u | v + a | b + b | c + c | a + c | b + """, +).select(u=vertices.pointer_from(pw.this.u), v=vertices.pointer_from(pw.this.v)) + +pw.debug.compute_and_print(pagerank(edges)) + +# %% [markdown] +# Why these numbers? 3945, 6981, 7069? Feel free to skip the quite mathy explanation below. +# +# Let us calculate what the correct answer should be. +# PageRank actually finds a [stationary distribution](https://en.wikipedia.org/wiki/Markov_chain#Stationary_distribution_relation_to_eigenvectors_and_simplices) +# of a random walk on a graph in which the probability of each move depends only on the +# currently visited state, i.e. it is a Markov Chain. 
+# +# One may think that the transition matrix of the Markov chain in our example is +# $$ +# P=\left(\begin{array}{cc} +# 0.05 & 0.9 & 0.05\\ +# 0.05 & 0.05 & 0.9\\ +# 0.475 & 0.475 & 0.05 +# \end{array}\right) +# $$ +# We move to a new page with probability 5/6 uniformly distributed among all the linked (adjacent) pages, +# and with probability 1/6 we mix uniformly at random. +# The result is a stationary distribution roughly of $(x = ( 0.215 \quad 0.397 \quad 0.388) )$ which is proportional to the rank returned. +# However, we output only the approximation of this result, and our output is not normalized. +# +# ### Summary +# As always, feel free to play and experiment with this code! In case you are looking for cool real-world +# graphs to experiment with, the [Stanford Network Analysis Project](https://snap.stanford.edu/) is an excellent source +# of reference instances, big and small. diff --git a/docs/2.developers/7.showcases/.suspicious_activity_tumbling_window/__init__.py b/docs/2.developers/7.showcases/.suspicious_activity_tumbling_window/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.suspicious_activity_tumbling_window/article.py b/docs/2.developers/7.showcases/.suspicious_activity_tumbling_window/article.py new file mode 100644 index 00000000..3f125131 --- /dev/null +++ b/docs/2.developers/7.showcases/.suspicious_activity_tumbling_window/article.py @@ -0,0 +1,105 @@ +# --- +# title: Tumbling Window group-by - detect suspicious user activity +# description: Detecting suspicious login attempts +# notebook_export_path: notebooks/tutorials/suspicious_user_activity.ipynb +# author: 'przemek' +# aside: true +# article: +# date: '2023-05-30' +# thumbnail: '/assets/content/blog/th-shield.png' +# tags: ['tutorial', 'machine-learning'] +# keywords: ['window', 'tumbling', 'alert'] +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Detecting suspicious user activity with Tumbling Window group-by +# +# In this tutorial you will learn how to perform a [tumbling window](/glossary/tumbling-window) operation to detect suspicious activity. +# +# Your task is to detect suspicious user login attempts during some period of time. +# You have a record of login data. Your goal is to detect suspicious users who have logged in more than 5 times in a single minute. +# +# To do this, you will be using the `windowby` syntax with a `pw.temporal.tumbling()` object. Let's jump in! +# +# Your input data table has the following columns: +# * `username`, +# * whether the login was `successful`, +# * `time` of a login attempt, +# * `ip_address` of a login. +# +# +# Let's start by ingesting the data: +# +# +# First ingest the data. +# %% +# Uncomment to download the required files. +# # %%capture --no-display +# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/suspicious_users_tutorial_logins.csv -O logins.csv + +# %% +import pathway as pw + + +class InputSchema(pw.Schema): + username: str + successful: str + time: int + ip_address: str + + +logins = pw.io.csv.read( + "logins.csv", + schema=InputSchema, + mode="static", +) + +# %% [markdown] +# The CSV data has the string values "True" and "False" in the `successful` column. 
+# +# Let's convert this to a Boolean column: + +# %% +logins = logins.with_columns(successful=(pw.this.successful == "True")) + +# %% [markdown] +# Then, let's filter attempts and keep only the unsuccessful ones. + +# %% +failed = logins.filter(~pw.this.successful) + +# %% [markdown] +# Now, perform a tumbling window operation with a duration of 60 (i.e. 1 minute). +# +# Use the `instance` keyword to separate rows by the `ip_address` value. + +# %% +result = failed.windowby( + failed.time, window=pw.temporal.tumbling(duration=60), instance=pw.this.ip_address +).reduce( + ip_address=pw.this._pw_instance, + count=pw.reducers.count(), +) + +# %% [markdown] +# ...and finally, let's keep only the IP addresses where the number of failed logins exceeded the threshold (5): +# %% +suspicious_logins = result.filter(pw.this.count >= 5) +pw.debug.compute_and_print(suspicious_logins) + +# %% [markdown] +# And that's it! You have used a tumbling window operation to identify suspicious user activity and can now act on this information to increase the security of your platform. +# +# Reach out to us on [Discord](https://discord.gg/pathway) if you'd like to discuss [real time anomaly detection](/glossary/real-time-anomaly-detection) use cases like this one in more detail! diff --git a/docs/2.developers/7.showcases/.suspicious_activity_tumbling_window/logins.csv b/docs/2.developers/7.showcases/.suspicious_activity_tumbling_window/logins.csv new file mode 100644 index 00000000..b4cb65fa --- /dev/null +++ b/docs/2.developers/7.showcases/.suspicious_activity_tumbling_window/logins.csv @@ -0,0 +1,31 @@ +username,successful,time,ip_address +user_B,False,1545730216,50.37.169.241 +user_B,False,1545730191,50.37.169.241 +user_B,False,1545730226,50.37.169.241 +user_B,False,1545730201,50.37.169.241 +user_C,True,1545729904,51.181.222.161 +user_B,False,1545730206,50.37.169.241 +user_A,True,1545766962,197.215.20.132 +user_A,True,1545742450,197.215.20.132 +user_A,True,1545770672,197.215.20.132 +user_A,True,1545769776,197.215.20.132 +user_C,True,1545729904,51.181.222.161 +user_A,True,1545808913,197.215.20.132 +user_B,False,1545730221,50.37.169.241 +user_C,True,1545729904,51.181.222.161 +user_A,True,1545758652,197.215.20.132 +user_C,False,1545729904,51.181.222.161 +user_A,True,1545762855,197.215.20.132 +user_A,True,1545776951,197.215.20.132 +user_C,True,1545729904,51.181.222.161 +user_A,True,1545799825,197.215.20.132 +user_B,False,1545730186,50.37.169.241 +user_B,False,1545730211,50.37.169.241 +user_B,False,1545730231,50.37.169.241 +user_A,True,1545793712,197.215.20.132 +user_C,True,1545729904,51.181.222.161 +user_C,True,1545729904,51.181.222.161 +user_C,True,1545729904,51.181.222.161 +user_C,True,1545729904,51.181.222.161 +user_C,True,1545729904,51.181.222.161 +user_B,False,1545730196,50.37.169.241 diff --git a/docs/2.developers/7.showcases/.user_pairs_fuzzy_join/__init__.py b/docs/2.developers/7.showcases/.user_pairs_fuzzy_join/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/.user_pairs_fuzzy_join/article.py b/docs/2.developers/7.showcases/.user_pairs_fuzzy_join/article.py new file mode 100644 index 00000000..2bcbe2ed --- /dev/null +++ b/docs/2.developers/7.showcases/.user_pairs_fuzzy_join/article.py @@ -0,0 +1,131 @@ +# --- +# title: Mining hidden user pair activity with Fuzzy Join +# description: An example of a cryptocurrency exchange +# author: 'przemek' +# aside: true +# article: +# date: '2023-01-09' +# thumbnail: 
'/assets/content/blog/th-mining-hidden-user-pair-activity-with-fuzzy-join.png'
+#   tags: ['tutorial', 'data-pipeline']
+# keywords: ['fuzzy join', 'alert', 'cryptocurrency', 'bitcoin', 'BTC', 'ETH', 'Ethereum']
+# jupyter:
+#   jupytext:
+#     formats: py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.15.2
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Mining hidden user pair activity with Fuzzy Join
+#
+# In this article, we analyze a stream of transactions in a crypto exchange.
+# We find all the pairs of users A and B such that A sells ETH to B and, in a separate transaction, buys BTC from B.
+#
+# First, we import Pathway and load the two transaction logs.
+
+# %%
+import pathway as pw
+
+# _MD_COMMENT_START_
+
+# DO NOT MODIFY THIS WITHOUT MODIFYING the following file:
+# public/pathway/python/pathway/tests/test_gist_user_pair_fuzzy_join.py # noqa E501
+
+
+def table_from_kafka(**kwargs):
+    return pw.Table.empty(
+        sender=str,
+        receiver=str,
+        currency=str,
+        amount=int,
+        timestamp=pw.DATE_TIME_UTC,
+        usd_estimate=int,
+    )
+
+
+pw.io.kafka.read = table_from_kafka
+del table_from_kafka
+
+
+def table_to_postgres(*args, **kwargs):
+    pass
+
+
+pw.io.postgres.write = table_to_postgres
+del table_to_postgres
+# _MD_COMMENT_END_
+transactions = pw.io.kafka.read(
+    rdkafka_settings={
+        "group.id": "$GROUP_NAME",
+        "bootstrap.servers": "clean-panther-8776-eu1-kafka.upstash.io:9092",
+        "session.timeout.ms": "6000",
+    },
+    topics=["eth_transactions"],
+)
+
+eth_transactions = transactions.filter(pw.this.currency == "ETH")
+btc_transactions = transactions.filter(pw.this.currency == "BTC")
+
+# %% [markdown]
+# Now we just need to find all the pairs of buyers/sellers in both transaction logs and use our `fuzzy_match_tables` to extract the matching pairs.
+
+# %%
+# ETH movements one way
+eth_movement_totals = eth_transactions.groupby(pw.this.sender, pw.this.receiver).reduce(
+    user_A=pw.this.sender,
+    user_B=pw.this.receiver,
+    usd_total_estimate=pw.reducers.sum(pw.this.usd_estimate),
+)
+# BTC movements the other way
+btc_movement_totals = btc_transactions.groupby(pw.this.sender, pw.this.receiver).reduce(
+    user_A=pw.this.receiver,
+    user_B=pw.this.sender,
+    usd_total_estimate=pw.reducers.sum(pw.this.usd_estimate),
+)
+# We run fuzzy join on the two aggregated transaction tables.
+# We project users into different spaces to avoid catching
+# a user sending both ETH and BTC to the same user.
+left_projection = {"user_A": "C1", "user_B": "C2"}
+right_projection = {"user_A": "C1", "user_B": "C2"}
+matches = pw.ml.smart_table_ops.fuzzy_match_tables(
+    eth_movement_totals,
+    btc_movement_totals,
+    left_projection=left_projection,
+    right_projection=right_projection,
+)
+
+# The matched user pairs will be output to a Postgres table.
+matched_users = matches.select(
+    btc_sender=btc_movement_totals.ix(matches.right).user_B,
+    btc_receiver=btc_movement_totals.ix(matches.right).user_A,
+    eth_sender=eth_movement_totals.ix(matches.left).user_A,
+    eth_receiver=eth_movement_totals.ix(matches.left).user_B,
+    confidence=matches.weight,
+)
+
+# %% [markdown]
+# We can now store the resulting table in Postgres, or any other database supported by Pathway.
+ +# %% +pw.io.postgres.write( + matched_users, + postgres_settings={ + "host": "localhost", + "port": "5432", + "dbname": "transactions", + "user": "pathway", + "password": "my_password", + }, + table_name="matched_users_btc_eth_swapping", +) +"do not print cell output _MD_SKIP_"; # fmt: skip +# %% [markdown] +# Would you like to find users that match within given time-window? Take a look at +# recipes on [group-by with a tumbling window](/developers/showcases/suspicious_activity_tumbling_window). diff --git a/docs/2.developers/7.showcases/1.index.md b/docs/2.developers/7.showcases/1.index.md new file mode 100644 index 00000000..2bd75d4f --- /dev/null +++ b/docs/2.developers/7.showcases/1.index.md @@ -0,0 +1,14 @@ +--- +title: "Showcases" +navTitle: "Browse: Showcases" +description: "Pathway showcases" +navigation: true +layout: default +aside: true +toc: false +single: true +--- + +# Showcases + +:ArticlesFromPath{path="developers/showcases"} \ No newline at end of file diff --git a/docs/2.developers/7.showcases/10.suspicious_activity_tumbling_window.md b/docs/2.developers/7.showcases/10.suspicious_activity_tumbling_window.md new file mode 120000 index 00000000..ed6036e2 --- /dev/null +++ b/docs/2.developers/7.showcases/10.suspicious_activity_tumbling_window.md @@ -0,0 +1 @@ +.suspicious_activity_tumbling_window/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/100.gaussian_filtering_python.md b/docs/2.developers/7.showcases/100.gaussian_filtering_python.md new file mode 120000 index 00000000..2e5304c1 --- /dev/null +++ b/docs/2.developers/7.showcases/100.gaussian_filtering_python.md @@ -0,0 +1 @@ +.interval_over_gaussian_filter/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/1015.adaptive-rag.md b/docs/2.developers/7.showcases/1015.adaptive-rag.md new file mode 120000 index 00000000..111aa5c3 --- /dev/null +++ b/docs/2.developers/7.showcases/1015.adaptive-rag.md @@ -0,0 +1 @@ +.adaptive-rag/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/110.upsampling.md b/docs/2.developers/7.showcases/110.upsampling.md new file mode 120000 index 00000000..2a2389f3 --- /dev/null +++ b/docs/2.developers/7.showcases/110.upsampling.md @@ -0,0 +1 @@ +.interval_over_upsampling/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/130.alerting-significant-changes.md b/docs/2.developers/7.showcases/130.alerting-significant-changes.md new file mode 120000 index 00000000..08594191 --- /dev/null +++ b/docs/2.developers/7.showcases/130.alerting-significant-changes.md @@ -0,0 +1 @@ +.alerting_on_significant_changes/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/140.kafka-etl.md b/docs/2.developers/7.showcases/140.kafka-etl.md new file mode 100644 index 00000000..55405ad0 --- /dev/null +++ b/docs/2.developers/7.showcases/140.kafka-etl.md @@ -0,0 +1,362 @@ +--- +title: "Kafka ETL: Processing event streams in Python" +description: "Learn how to build a Kafka ETL pipeline in Python with Pathway and process event streams in real-time." +author: 'olivier' +article: + date: '2024-02-15' + tags: ['showcase', 'data-pipeline'] + thumbnail: '/assets/content/showcases/ETL-Kafka/ETL-Kafka.png' +keywords: ['ETL', 'Kafka', 'datetime', 'time zone', 'timezone', 'concat_reindex'] +--- + +# Kafka ETL: Processing event streams in Python + +In this tutorial, you will learn how to write a Kafka ETL in Python using Pathway, an open-source event processing framework. 
+You will use Pathway connectors and transformations to extract, transform, and load event streams across multiple Kafka topics. + +Python's simplicity and ease of use make it a popular choice for data processing tasks. +However, Kafka, one of the most widely used tools to build ETL pipelines, is implemented in Java and mainly used by Java and Scala users. +Pathway, a Python stream processing framework with a Rust engine, makes building ETL pipeline over Kafka in pure Python simple without compromising on performance. + +Imagine you've been hired by a fraud-detection company. +The company monitors the logs from different servers and raises an alert whenever a pattern is suspicious. +Your job is to manage your company's data and ensure the data received by the data science team is clean and ready to use. +A new challenge emerges when the company expands its monitoring to servers in Paris. +Suddenly, the times you're receiving are no longer uniform. +From New York, they bear the Eastern Standard Time signature, while those from Paris are stamped with Central European Time. + +Until now, the times you received were from servers in New York: +``` +2024-02-05 10:01:52.884548 -0500 +``` + +And this is what the new times from the new servers in Paris look like: +``` +2024-02-05 16:02:34.934749 +0100 +``` + +You must unify these disparate times into a single format to maintain data integrity. + +Enter [ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load), a three-step process used to clean and unify the data before sharing it for training a model or doing business analytics. +First, you need to **extract** (E) the data, then **transform** (T) it, before finally **load**ing (L) it. +This task is crucial, especially when data comes from different data sources, +to ensure the data used within the company follows a given data type and various requirements. + +![ETL](/assets/content/showcases/ETL-Kafka/ETL.svg) + + +This article shows how to create a Kafka ETL pipeline in Python to extract data from multiple topics and transform it to load it into a combined Kafka topic. +More precisely, you will learn how to do the different ETL steps with Pathway: +- (E) extracting different data streams from Kafka using Pathway Kafka input connector, +- (T) converting the times with varying time zones into timestamps using Pathway datetime module, +- (T) concatenating the resulting data streams using the Pathway concatenation function, +- (L) loading the final data stream back into Kafka. + +The full Pathway script is [here](#full-solution) for those in a hurry. + +## ETL architecture with Kafka in, Kafka out + +The logs are sent to two distinct Kafka topics, one per time zone. +You want to use Pathway to do the Kafka ETL: connect to the topics, extract the data, do the time zone +conversion, and concatenate the resulting data streams into one. +Finally, you want to return the result to a third Kafka topic. + +![Kafka ETL with Pathway](/assets/content/showcases/ETL-Kafka/ETL-Kafka.svg) + + +### Docker containers + +The project uses several docker containers: +- one for Kafka +- one for Zookeeper +- one for Pathway, for the ETL +- one imitating company servers, creating the data. It will be called "producer". + +Kafka and Zookeeper are directly managed in the `docker-compose.yml` file. +Pathway and the producer are managed using dedicated docker files. +Here is the final architecture of the project: + +``` +. 
+├── pathway-src/
+│   ├── Dockerfile
+│   ├── etl.py
+│   └── read-results.py
+├── producer-src/
+│   ├── create-stream.py
+│   └── Dockerfile
+├── docker-compose.yml
+└── Makefile
+```
+
+You can find the whole project [on GitHub](https://github.com/pathwaycom/pathway/tree/main/examples/projects/kafka-ETL).
+
+### Data generation
+
+The data is generated using a Python script.
+Every second, a new log is generated with the current datetime.
+The message is randomly associated with one of the two time zones and sent to the associated Kafka topic.
+To simplify log identification, the log also contains a field "message" with the log's number.
+
+
+```python [create-stream.py]
+timezone1 = ZoneInfo("America/New_York")
+timezone2 = ZoneInfo("Europe/Paris")
+
+producer1 = KafkaProducer(
+    bootstrap_servers=["kafka:9092"],
+    security_protocol="PLAINTEXT",
+    api_version=(0, 10, 2),
+)
+producer2 = KafkaProducer(
+    bootstrap_servers=["kafka:9092"],
+    security_protocol="PLAINTEXT",
+    api_version=(0, 10, 2),
+)
+
+def send_message(timezone: ZoneInfo, producer: KafkaProducer, topic: str, i: int):
+    timestamp = datetime.now(timezone)
+    message_json = {"date": timestamp.strftime(str_repr), "message": str(i)}
+    # Send the log to the topic associated with its time zone.
+    producer.send(topic, (json.dumps(message_json)).encode("utf-8"))
+
+for i in range(input_size):
+    if random.choice([True, False]):
+        send_message(timezone1, producer1, topic1, i)
+    else:
+        send_message(timezone2, producer2, topic2, i)
+    time.sleep(1)
+```
+
+You can find the entire file [here](https://github.com/pathwaycom/pathway/tree/main/examples/projects/kafka-ETL/producer-src/create-stream.py).
+
+## ETL with Pathway: Concatenating data streams
+
+Now that the setup is ready, Pathway can manage all the rest!
+
+### Extract from Kafka
+Pathway provides [connectors](/developers/user-guide/connecting-to-data/connectors/) to connect and extract data from different data sources.
+You can connect to Kafka using Pathway Kafka connectors.
+
+In Pathway, data is represented using [tables](/developers/user-guide/introduction/concepts#tables-dynamic-content-with-static-schema) and you need to define the data types of the extracted data using [schemas](/developers/user-guide/types-in-pathway/schema/):
+
+```python
+class InputStreamSchema(pw.Schema):
+    date: str
+    message: str
+```
+
+You need one connector per topic, but the connectors can use the same settings.
+
+```python
+rdkafka_settings = {
+    "bootstrap.servers": "kafka:9092",
+    "security.protocol": "plaintext",
+    "group.id": "0",
+    "session.timeout.ms": "6000",
+    "auto.offset.reset": "earliest",
+}
+
+timestamps_timezone_1 = pw.io.kafka.read(
+    rdkafka_settings,
+    topic="timezone1",
+    format="json",
+    schema=InputStreamSchema,
+    autocommit_duration_ms=100,
+)
+
+timestamps_timezone_2 = pw.io.kafka.read(
+    rdkafka_settings,
+    topic="timezone2",
+    format="json",
+    schema=InputStreamSchema,
+    autocommit_duration_ms=100,
+)
+```
+
+You can learn more about Pathway Kafka connectors in our [dedicated tutorial](/developers/user-guide/connectors/kafka_connectors).
+
+### Transform into a single data stream
+
+Now that you have the logs, you need to do the conversion.
+Dealing with time can be [very frustrating](https://www.youtube.com/watch?v=-5wpm-gesOY).
+Fortunately, Pathway provides all the datetime functions to make it easy.
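+
+To see why a single format string is enough for both time zones, here is a quick plain-Python illustration (not part of the pipeline) using the same format as the project, where `%z` parses the UTC offset:
+
+```python
+from datetime import datetime
+
+str_repr = "%Y-%m-%d %H:%M:%S.%f %z"
+
+# The two example log times shown earlier, one per time zone.
+new_york = datetime.strptime("2024-02-05 10:01:52.884548 -0500", str_repr)
+paris = datetime.strptime("2024-02-05 16:02:34.934749 +0100", str_repr)
+
+# Both become absolute, time-zone-independent POSIX timestamps.
+print(new_york.timestamp(), paris.timestamp())
+```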
+ +Here is how to concatenate the datetime from different time zones into a single data stream with timestamps with Pathway: + +```python +def convert_to_timestamp(table): + table = table.select( + date=pw.this.date.dt.strptime(fmt=str_repr, contains_timezone=True), + message=pw.this.message, + ) + table_timestamp = table.select( + timestamp=pw.this.date.dt.timestamp(unit="ms"), + message=pw.this.message, + ) + return table_timestamp + + +timestamps_timezone_1 = convert_to_timestamp(timestamps_timezone_1) +timestamps_timezone_2 = convert_to_timestamp(timestamps_timezone_2) + +timestamps_unified = timestamps_timezone_1.concat_reindex(timestamps_timezone_2) +``` + +First, since both values (`date` and `message`) are strings, you need to convert the `date` into a `datetime` using the [`strptime`](/developers/api-docs/pathway/#pathway.internals.expressions.DateTimeNamespace.strptime) function of the [datetime module](/developers/api-docs/pathway#pathway.internals.expressions.DateTimeNamespace). +The time zone is automatically determined in this step. +Then, you can convert it to a timestamp (independent of time zone) using the [`timestamp` function](/developers/api-docs/pathway/#pathway.internals.expressions.DateTimeNamespace.timestamp). +Finally, you can concatenate the two tables using the [`concat_reindex` function](/developers/api-docs/pathway#pathway.Table.concat_reindex). + +This is a very simple example of what Pathway can do. +Pathway supports more complex operations such as stateful ([groupby](/developers/user-guide/data-transformation/groupby-reduce-manual), [windows](/developers/user-guide/temporal-data/windows-manual), etc.) and temporal (e.g., [ASOF join](/developers/user-guide/temporal-data/asof-join)) operations. + +### Load to Kafka + +Now that you have successfully transformed the data, you still need to send it back to Kafka. +Using a Kafka output connector should do the trick: + +```python +pw.io.kafka.write( + timestamps_unified, rdkafka_settings, topic_name="unified_timestamps", format="json" +) +``` +The settings are the same as the input connectors since the data is sent to the same Kafka instance. + +### Run it + +Congratulations, your Kafka ETL pipeline is ready! +Until now, you were building the pipeline: defining the connectors and the different operators. +No data is actually loaded into the system so if you were to run your Pathway code now, the pipeline would be built, but there would be no computation. +To run the pipeline by ingesting data, you need to use the Pathway `run` function: + +```python +pw.run() +``` + +Now the input connectors will connect and load the data! +Thanks to its powerful Rust engine, [Pathway computations are extremely fast](/blog/streaming-benchmarks-pathway-fastest-engine-on-the-market/). +You are not bound by the usual limits of Python, Pathway natively supports multithreading, multiprocessing and distributed computing. + +If you are curious about how the pipeline works, don't hesitate to read our [article about Pathway core concepts](/developers/user-guide/introduction/concepts). + +### Read the output + +The unified logs are now available on Kafka's `unified_timestamps` topic. +You can access this topic using your favorite tool. + +Nonetheless, you can easily use Pathway to check everything works well. 
+Create a file `read-results.py` in `pathway-src/` to access the data: + +```python [read-results.py] +table = pw.io.kafka.read( + rdkafka_settings, + topic=topic_name, + schema=InputStreamSchema, + format="json", + autocommit_duration_ms=100, +) +pw.io.csv.write(table, "./results.csv") +pw.run() +``` +The entire script is available [here](https://github.com/pathwaycom/pathway/tree/main/examples/projects/kafka-ETL/pathway-src/read-results.py). +This script reads the data and outputs it as a CSV file: + +```csv [results.csv] +timestamp,message,time,diff +1707217879632.242,"11",1707217879944,1 +1707217876629.236,"8",1707217879944,1 +1707217872469.24,"4",1707217879944,1 +1707217868355.006,"0",1707217879944,1 +1707217870466.797,"2",1707217879944,1 +1707217873626.241,"5",1707217879944,1 +1707217869465.5308,"1",1707217879944,1 +1707217871468.065,"3",1707217879944,1 +1707217874627.24,"6",1707217879944,1 +1707217877630.239,"9",1707217879944,1 +1707217875628.488,"7",1707217879944,1 +1707217878631.242,"10",1707217879944,1 +1707217880633.24,"12",1707217880644,1 +1707217881634.5,"13",1707217881644,1 +1707217882635.752,"14",1707217882644,1 +``` + +The times have been uniformized and are now timestamps. 🎉 + +You can learn more about the output [here](/developers/user-guide/introduction/first_realtime_app_with_pathway/#interpret-the-output). + +## Full solution + +The entire project is publicly available on [GitHub](https://github.com/pathwaycom/pathway/tree/main/examples/projects/kafka-ETL). +Here is the complete `etl.py` file: + +```python [etl.py] +import time + +import pathway as pw + +rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", + "auto.offset.reset": "earliest", +} + +str_repr = "%Y-%m-%d %H:%M:%S.%f %z" + + +class InputStreamSchema(pw.Schema): + date: str + message: str + + +timestamps_timezone_1 = pw.io.kafka.read( + rdkafka_settings, + topic="timezone1", + format="json", + schema=InputStreamSchema, + autocommit_duration_ms=100, +) + +timestamps_timezone_2 = pw.io.kafka.read( + rdkafka_settings, + topic="timezone2", + format="json", + schema=InputStreamSchema, + autocommit_duration_ms=100, +) + + +def convert_to_timestamp(table): + table = table.select( + date=pw.this.date.dt.strptime(fmt=str_repr, contains_timezone=True), + message=pw.this.message, + ) + table_timestamp = table.select( + timestamp=pw.this.date.dt.timestamp(unit="ms"), + message=pw.this.message, + ) + return table_timestamp + + +timestamps_timezone_1 = convert_to_timestamp(timestamps_timezone_1) +timestamps_timezone_2 = convert_to_timestamp(timestamps_timezone_2) + +timestamps_unified = timestamps_timezone_1.concat_reindex(timestamps_timezone_2) + +pw.io.kafka.write( + timestamps_unified, rdkafka_settings, topic_name="unified_timestamps", format="json" +) + +pw.run() +``` + +## Going further with Pathway +Congratulations! +You are now ready to do Kafka ETL with Pathway. + +Your setup probably differs slightly, and your ETL pipeline may require different operators. +Pathway offers many [connectors](/developers/user-guide/connecting-to-data/connectors/) for extracting and loading the data from and to various sources. +In addition to standard [table operations](/developers/user-guide/data-transformation/table-operations/), Pathway also supports temporal operations such as [ASOF joins](/developers/user-guide/temporal-data/asof-join/) and [interval joins](/developers/user-guide/temporal-data/interval-join/). 
+ +Don't hesitate to take a look at [Pathway documentation](/developers/user-guide/introduction/welcome/) and reach out to us on [Discord](https://discord.com/invite/pathway) if you don't find the operator you are looking for. diff --git a/docs/2.developers/7.showcases/150.etl-python-airbyte.md b/docs/2.developers/7.showcases/150.etl-python-airbyte.md new file mode 100644 index 00000000..40148f00 --- /dev/null +++ b/docs/2.developers/7.showcases/150.etl-python-airbyte.md @@ -0,0 +1,177 @@ +--- +title: "Streaming ETL pipelines in Python with Airbyte and Pathway" +description: "How to use Pathway for Airbyte sources." +author: 'sergey' +article: + date: '2024-02-28' + tags: ['showcase', 'data-pipeline'] + thumbnail: '/assets/content/showcases/airbyte/airbyte-diagram-th.png' +keywords: ['airbyte', 'processing', 'airbyte python', 'python etl', 'airbyte etl'] +--- + +# Streaming ETL pipelines in Python with Airbyte and Pathway + +In the world of data management, the Extract, Transform, Load ([ETL](https://en.wikipedia.org/wiki/Extract,_transform,_load)) process plays a crucial role in handling information effectively. ETL involves three key steps: first, extracting data from different sources; next, transforming it to fit specific needs and standards; and finally, loading it into a destination where it can be analyzed and used for decision-making. As businesses increasingly rely on data for insights, mastering the ETL process becomes essential for maximizing the value of information. + +ETL is particularly useful when you don't want to store raw data directly in your warehouse. For example, the [personally identifiable information](https://en.wikipedia.org/wiki/Personal_data) (PII) and sensitive data need to be anonymized before being loaded and analyzed. ETL allows you to process your data, sanitize it, and anonymize it before sending it for further analysis. + +In this article, you will see how to combine Pathway with Airbyte to do a streaming ETL pipeline. + +[Airbyte](https://airbyte.com/) is an open-source data integration platform designed to simplify the process of moving and consolidating data from various sources to data warehouses or other destinations with 350+ input connectors. It is used for the extract step. On the other hand, Pathway is a fast and easy event-processing engine built for Python & ML/AI developers: it will transform and load the data. + +This tutorial will show you how to process the data stream from an Airbyte source with Pathway. You will learn how to set up the **extraction** of an Airbyte data source with [airbyte-serverless](https://github.com/unytics/airbyte_serverless/), how to **transform** it real-time with Pathway, and finally, how to **load** the transformed stream into the storage. + +![ETL pipeline with Airbyte and Pathway](/assets/content/showcases/airbyte/airbyte_diagram.svg) + +For the demonstration, imagine you have the following task: you need to fetch the commits from a given GitHub repository in real-time, with a lag of seconds. Then, you need to process this real-time stream of commits by removing all e-mails from the obtained payloads. Actually, with Pathway, you are not restricted to just filtering some of the data, but you can do joins, use machine learning models, and much more if your task is more complex. + +## Setting up Airbyte sources with airbyte-serverless + +First of all, you need to start reading the stream of commits from the repository. Here you can pick [Pathway](https://github.com/pathwaycom/pathway/) source code. 
To read this repository, the [GitHub connector](https://docs.airbyte.com/integrations/sources/github) provided by Airbyte can be used. + +To configure it, you first need to install the `airbyte-serverless` tool from pip: + +```bash +pip install airbyte-serverless +``` + +> Note that the use of `airbyte-serverless` requires to have Docker installed. + +The configuration of the Airbyte source depends on the source: each source requires its own set of parameters, tokens, and keys. So, the next step is to generate the template with the configuration for the GitHub source. It can be done with the following console command: + +```bash +abs create github --source "airbyte/source-github" +``` + +The file `./connections/github.yaml` now contains the created configuration template. You need to fill the template so that the connector has the repository to read from. + +This configuration can be done as follows. + +First, there are two authentication ways to select from. Let's configure the simpler one: PAT authorization. To do that, you need to remove the uncommented `option_title`, `access_token`, `client_id`, and `client_secret` fields in the config, and then uncomment the section "Another valid structure for credentials". It requires the PAT token, which can be obtained at the [Tokens]() page in GitHub - please note that you need to be logged in. The scope you need in this token is `public_repo` (Access public repositories). + +Then, you also need to set up the repository name in the `repositories` field. As decided before, it's the Pathway source code repository, located at `pathwaycom/pathway`. Finally, you need to remove the unused optional fields, and you're ready to go. + +Eventually, the source configuration looks as follows: + +```yaml +source: + docker_image: "airbyte/source-github" # Here the airbyte connector type is specified + config: + credentials: + option_title: "PAT Credentials" # The second authentication option you've uncommented + personal_access_token: # Taken from https://github.com/settings/tokens + repositories: + - pathwaycom/pathway # Pathway repository + api_url: "https://api.github.com/" + streams: commits +``` + +## Using Airbyte connectors from Pathway to extract data + +Having the source configuration in place, you can proceed with writing some Pathway code. First of all, you need to import Pathway: + +```python +import pathway as pw +``` + +Having it imported, you can configure the data source. It's done in the same way as you usually do with other data sources: + +```python +commits_table = pw.io.airbyte.read( + "./connections/github.yaml", + streams=["commits"], +) +``` + +Here, only two parameters need to be specified: the path to the configuration file and the list of Airbyte streams you want to read. In the GitHub connector, the commits data is provided in the stream named `commits`, as it's specified above. + +Please note that this code will run indefinitely, fetching the new commits when they are made and appending them into the `commits_table`. There is also a way to run this code so that it reads the list of commits that exist at present and terminates, without waiting for the new ones. To do that, you need to set the keyword argument `mode` of the method `pw.io.airbyte.read` to `"static"`. + +In addition, you can control how frequently the read method polls the new data from the source. This frequency is denoted by the parameter `refresh_interval_ms`, defaulting at 1000, hence giving the poll frequency of one second. 
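+
+For example, here is a minimal sketch combining the two options just described (the configuration path and stream name are the ones used in this tutorial; the 5-second refresh value is only an illustration):
+
+```python
+import pathway as pw
+
+# Read the commits that exist now and terminate, without waiting for new ones.
+commits_snapshot = pw.io.airbyte.read(
+    "./connections/github.yaml",
+    streams=["commits"],
+    mode="static",
+)
+
+# Keep listening for new commits, polling the source every 5 seconds.
+commits_live = pw.io.airbyte.read(
+    "./connections/github.yaml",
+    streams=["commits"],
+    refresh_interval_ms=5000,
+)
+```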
+ +## Transforming data streams with Pathway + +Now you have the stream of commits in the form of a [Pathway table](/developers/api-docs/pathway-table/#pathway.Table). Let's process it with the means the Pathway framework has. + +As stated before, it's needed to remove the e-mails from the payload. Since the payload from the Airbyte Github connector is a JSON, one can come up with the following simple algorithm for personal data removal: traverse this JSON with a simple depth-first method, and any time there is an `@` character within a group of non-whitespace characters, remove this non-empty group. We do it this way to keep the approach simple while having the goal of removing all e-mails with the maximum recall, so that there is absolutely no PII: + +This way, the removal code would look as follows: + +```python +import json + + +def remove_emails_from_data(payload): + if isinstance(payload, str): + # The string case is obvious: it's getting split and then merged back after + # the email-like substrings are removed + return " ".join([item for item in payload.split(" ") if "@" not in item]) + + if isinstance(payload, list): + # If the payload is a list, one needs to remove emails from each of its + # elements and then return the result of the processing + result = [] + for item in payload: + result.append(remove_emails_from_data(item)) + return result + + if isinstance(payload, dict): + # If the payload is a dict, one needs to remove emails from its keys and + # values and then return the clean dict + result = {} + for key, value in payload.items(): + # There are no e-mails in the keys of the returned dict + # So, we only need to remove them from values + value = remove_emails_from_data(value) + result[key] = value + return result + + # If the payload is neither str nor list or dict, it's a primitive type: + # namely, a boolean, a float, or an int. It can also be just null. + # + # But in any case, there is no data to remove from such an element. + return payload +``` + +The transformation is done entirely in Python, there are no calls to a Pathway function. To apply it to the Pathway table, you need to use [`pw.apply`](/developers/api-docs/pathway#pathway.apply): + +```python +def remove_emails(raw_commit_data: pw.Json) -> pw.Json: + # First, parse pw.Json type into a Python dict + data = json.loads(raw_commit_data.as_str()) + + # Next, just apply the recursive method to delete e-mails + return remove_emails_from_data(data) + + +commits_table = commits_table.select(data=pw.apply(remove_emails, pw.this.data)) +``` + +Now the `commits_table` contains the data without such personal information as e-mails. + +## Loading data with Pathway connectors + +Finally, you can output the data. To do so, you can pick any of the [output connectors](/developers/api-docs/pathway-io) Pathway has. + +There are plenty options available. Some of them are: + +* A topic in [Kafka](/developers/api-docs/pathway-io/kafka/#pathway.io.kafka.write); +* An endpoint in [Logstash](/developers/api-docs/pathway-io/logstash/); +* A table in [Postgres](/developers/api-docs/pathway-io/postgres/); +* And even a [Python callback](/developers/api-docs/pathway-io#pathway.io.subscribe). + +Of course, there is a simple disk storage option as well. 
For instance, you can use the [jsonlines](/developers/api-docs/pathway-io/jsonlines/#pathway.io.jsonlines.write) output connector to write to a local file: + +```python +pw.io.jsonlines.write(commits_table, "commits.jsonlines") +``` + +This would output the results in the file `commits.jsonlines`. + +Finally, don't forget to run! + +```python +pw.run() +``` + +This is it! Now, your data is anonymized on the fly and stored in the file. This is only a simple example of what you can do with the variety of Airbyte sources and the freedom of real-time pipelines in Pathway. Note that it extends the number of sources you can use by virtually all available sources: [there are 350+](https://airbyte.com/connectors) to try! \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/.gitignore b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/.gitignore new file mode 100644 index 00000000..e21fc533 --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/.gitignore @@ -0,0 +1,3 @@ +*.csv* +!transactionsA.csv +!transactionsB.csv diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/__init__.py b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/article.py b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/article.py new file mode 100644 index 00000000..21b944df --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/article.py @@ -0,0 +1,213 @@ +# --- +# title: 'Part 1: Realtime Fuzzy-Join' +# description: Article introducing Fuzzy Join. +# notebook_export_path: notebooks/showcases/fuzzy_join_part1.ipynb +# aside: true +# article: +# date: '2022-10-18' +# thumbnail: '' +# tags: ['tutorial', 'data-pipeline'] +# keywords: ['Fuzzy join', 'reconciliation', 'unstructured'] +# author: 'pathway' +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] jp-MarkdownHeadingCollapsed=true +# # Realtime Fuzzy-Join in Pathway +# +# ## Part 1: Fuzzy joins: 'errare humanum est' +# +# As the ancient maxim says, ['errare humanum est'](https://en.wiktionary.org/w/index.php?title=errare_humanum_est): to err is human. +# More than two thousands years later, this lesson is still very accurate in our modern world. +# Everyone makes mistakes and writing does not escape this fate: the longer the text the more mistakes there will be. +# However, most mistakes we usually make are small and do not hinder understanding. +# +# Unfortunately, computers, just like accountants, don't like mistakes. Computers cannot cope with mistakes. No matter how small the mistake, the computer will just reject the whole answer and throw an error. +# You have written your 10-digit password but finished with a lower case 'a' instead of a capital 'A'? The passwords obviously do not match, and you shall enter your password again! +# +# While this zero tolerance policy may make sense for security processes, it can be terrible when users have to enter long texts. +# For example, accountants may have to enter long logs of transactions by hand, creating many opportunities for mistakes. 
+# If those logs have to be compared to other logs (e.g. a log automatically generated by a pay station) then mismatches would appear: 'mr' instead of 'Mr'. +# Mistakes can also come from the way the data has been collected: using nicknames instead of full names, different email addresses etc. +# While humans could be able to match those logs despite the mistakes, computers cannot. +# +# Does it mean the computer is helpless in those cases, shifting all the tedious work of matching similar but different entries to human? +# Fortunately not, several mechanisms exist to assist or even perform the matching, and **fuzzy join** is one of them: a fuzzy join is a process which automatically matches entries from different logs despite not having a perfect matching between their keys. + +# %% [markdown] +# ## Fuzzy join in Pathway +# +# Fuzzy join is used to perform a join on datasets when the keys do not match exactly. +# Simple use cases include matching lower case strings with camelCase strings or matching +# floats with some precision threshold. +# +# Pathway's standard library comes with a powerful `smart_fuzzy_join` functionality. +# This tutorial is a showcase of its capabilities. We will develop a Data Application which allows for fuzzy-joining +# two streams of data against each other, and also for maintaining audit entries and updating results on the fly. Here is a sneak preview: +# +# ![Demo animation](https://pathway.com/assets/content/showcases/fuzzy_join/demo.gif) +# +# ## The data +# +# We will be doing the fuzzy-join between two datasets on money transfers’ banking logs. +# When doing banking or bookkeeping, this operation would be known as [reconciliation](https://en.wikipedia.org/w/index.php?title=Reconciliation_(accounting)&oldid=1100237463) of +# two sets of transactions records. +# One dataset comes in a perfectly organized format - csv, the other dataset consists of +# 'human written' lines describing the transactions. +# +# +# Here are samples from the datasets: +# +# **Data sourced automatically from a bank feed, in 'standard' CSV format** +# +# |id |date |amount|recipient |sender |recipient_acc_no |sender_acc_no | +# |------|----------|------|----------|--------------|----------------------------|----------------------------| +# |0 |2020-06-04|8946 |M. Perez |Jessie Roberts|HU30186000000000000008280573|ES2314520000000006226902 | +# |1 |2014-08-06|8529 |C. Barnard|Mario Miller |ES8300590000000002968016 |PL59879710390000000009681693| +# |2 |2017-01-22|5048 |S. Card |James Paletta |PL65889200090000000009197250|PL46193013890000000009427616| +# |3 |2020-09-15|7541 |C. Baxter |Hector Haley |PL40881800090000000005784046|DE84733500000003419377 | +# |4 |2019-05-25|3580 |L. Prouse |Ronald Adams |PL44124061590000000008986827|SI54028570008259759 | +# +# +# The first dataset is sourced automatically from a bank feed. Every few seconds a new batch of transactions is saved to `transactions/formatA/batch_timestamp.csv`. +# +# **Transaction logs entered by hand** +# +# |id |description| +# |---|-----------| +# |0 |Received 8521 € on 2014-08-07 by INTERNATIONAL interest payment from ??? to C. Barnard, recipient acc. no. 000002968016 by BANCO DE MADRID, amount EUR €, flat fee 8 € | +# |1 |EUR 8944 on 2020-06-06 by INTERNATIONAL transfer credited to 00000000008280573 (M. Perez) by BNP Paribas Securities Services, fee EUR 2, amount EUR 8946. | +# |2 |Finally got 5M quid on 2017-01-23 by DOMESTIC payment from Sergio Marquina to Bella Ciao, r. acc. 0000000009197250, oryg. 
amount 5_000_048, fees 5 quid. |
+# |3 |3578 EUR am 2019-05-25 von INTERNATIONAL dividend payment by Pathway Inc. an L. Prouse, Empfängerkonto 8986827, Betrag 3580 EUR |
+# |4 |Received 7540 EUR on 2020-09-15. Invoice, recipient C. Baxter, 0000000005784046, amount EUR 7541, fees EUR 1 |
+#
+#
+# As you can see, it seems that each entry in the first dataset (data sourced automatically) has a corresponding entry in the other dataset (transaction logs entered by hand).
+# In this example, we will use the `smart_fuzzy_join` function from Pathway's standard library to make sure all is correctly matched.
+#
+# ## What are we going to obtain?
+# We want to obtain a table expressing the matchings, e.g. entry 0 of the first table corresponds to entry 1 of the second table.
+# In addition, we will include the confidence, a number expressing how confident we are in the matching.
+#
+# ## Code
+# First things first - imports:
+
+# %%
+import pandas as pd
+
+import pathway as pw
+
+# %% [markdown]
+# And now, here come a few lines of code that read two datasets, try to match rows, and report matchings by writing to a csv file.
+#
+#
+# The data is read from csv files.
+# For the purpose of this demonstration, we will simply print a table with matchings found on the data sample presented above.
+# But the code below also works in a production environment. In production:
+# - All csv files will be dynamically ingested from these directories in their order of appearance.
+# - The output will be updated immediately as new data appears at input.
+
+# %%
+# Uncomment to download the required files.
+# # %%capture --no-display
+# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/fuzzy_join_part_1_transactionsA.csv -O transactionsA.csv
+# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/fuzzy_join_part_1_transactionsB.csv -O transactionsB.csv
+
+# %% [markdown]
+# We use [our csv connectors](/developers/user-guide/connecting-to-data/connectors/) to read the csv files:
+
+
+# %%
+class TransactionsA(pw.Schema):
+    recipient_acc_no: str = pw.column_definition(primary_key=True)
+    date: str
+    amount: str
+    recipient: str
+    sender: str
+    sender_acc_no: str
+
+
+class TransactionsB(pw.Schema):
+    description: str = pw.column_definition(primary_key=True)
+
+
+transactionsA = pw.io.csv.read(
+    "./transactionsA.csv",
+    schema=TransactionsA,
+    mode="static",
+)
+transactionsB = pw.io.csv.read(
+    "./transactionsB.csv",
+    schema=TransactionsB,
+    mode="static",
+)
+pw.debug.compute_and_print(transactionsA)
+pw.debug.compute_and_print(transactionsB)
+
+
+# %% [markdown]
+# Then we use our fuzzy join functions to do the reconciliation between the two tables.
+
+
+# %%
+def match_transactions(transactionsA, transactionsB):
+    matching = pw.ml.smart_table_ops.fuzzy_match_tables(transactionsA, transactionsB)
+    transactionsA_reconciled = (
+        pw.Table.empty(left=pw.Pointer, right=pw.Pointer, confidence=float)
+        .update_rows(transactionsA.select(left=None, right=None, confidence=0.0))
+        .update_rows(
+            matching.select(
+                pw.this.left, pw.this.right, confidence=pw.this.weight
+            ).with_id(pw.this.left)
+        )
+    )
+    return transactionsA_reconciled
+
+
+pw.debug.compute_and_print(match_transactions(transactionsA, transactionsB))
+# %% [markdown]
+# Success, all matchings were found!
+#
+# Super easy: a few lines of code and you flawlessly manage datasets in different formats.
+# Hassle-free.
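+#
+# If you want to run this as a long-lived service instead of printing a one-off result, a minimal sketch (an illustration only) would switch the connectors to streaming mode and write the matchings to a CSV file. It reuses the schemas and `match_transactions` defined above; the directory names follow the `transactions/formatA/` layout mentioned earlier, with `formatB` as a hypothetical counterpart:
+#
+# ```python
+# transactionsA = pw.io.csv.read("./transactions/formatA/", schema=TransactionsA, mode="streaming")
+# transactionsB = pw.io.csv.read("./transactions/formatB/", schema=TransactionsB, mode="streaming")
+# pw.io.csv.write(match_transactions(transactionsA, transactionsB), "matchings.csv")
+# pw.run()
+# ```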
+# +# ## Scaling with Pathway +# +# `smart_fuzzy_join` is able to handle much bigger datasets. +# Feel free to test it on your own data or use the full datasets from this tutorial, +# available [in this Google Spreadsheet](https://docs.google.com/spreadsheets/d/1cXAPcmkq0t0ieIQCBrdKPG2Fq_DimAzzxfHsDWrtdW0/edit?usp=sharing). +# +# +# +# In the tutorial we just printed a matching found on a small data sample. In a dynamic production environment: +# - All csv files will be dynamically ingested from these directories in order of appearance. +# - **The output will be updated immediately as new data appears at input.** +# +# ## Conclusion and follow-up tasks +# +# While errors are human and we are unlikely to stop making some, we can free ourselves of the pain of correcting them each time something goes wrong. +# Sometimes, entries are harder to match and may require help: in that case you can check out our [extension](/developers/showcases/fuzzy_join/fuzzy_join_chapter2) and see how we extend our pipeline with an auditor that supervises the process of reconciliation. +# From now on, you have no excuses for having mismatching logs: 'errare humanum est, perseverare diabolicum'! +# +# +# If you would like to get some more experience with Pathway, you can try those two challenges: +# +# **Challenge 1** +# +# Extend the `match_transactions` function so that, after finding a matching, it extends the first input table (standard csv format) with columns 'fees' and 'currency'. +# +# **Challenge 2** +# +# Try to augment the datasets so that they are still reasonable but `smart_fuzzy_join` fails to find all matchings 😉 diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactions b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactions new file mode 120000 index 00000000..92c103bc --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactions @@ -0,0 +1 @@ +../../../../../../../projects/pathway/experimental/pycremental/docs/data/transactions/ \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactionsA.csv b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactionsA.csv new file mode 100644 index 00000000..2375f61e --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactionsA.csv @@ -0,0 +1,6 @@ +key,date,amount,recipient,sender,recipient_acc_no,sender_acc_no +0,2020-06-04,8946,M. Perez,Jessie Roberts,HU30186000000000000008280573,ES2314520000000006226902 +1,2014-08-06,8529,C. Barnard,Mario Miller,ES8300590000000002968016,PL59879710390000000009681693 +2,2017-01-22,5048,S. Card,James Paletta,PL65889200090000000009197250,PL46193013890000000009427616 +3,2020-09-15,7541,C. Baxter,Hector Haley,PL40881800090000000005784046,DE84733500000003419377 +4,2019-05-25,3580,L. Prouse,Ronald Adams,PL44124061590000000008986827,SI54028570008259759 diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactionsB.csv b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactionsB.csv new file mode 100644 index 00000000..1bade264 --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter1/transactionsB.csv @@ -0,0 +1,6 @@ +key,description +0,"Received EUR 8521 on 2014-08-07 by INTERNATIONAL interest from M. Miller to C. 
Barnard, recipient account 000002968016 by BANCO DE MADRID, amount EUR 8529, fees EUR 8 " +1,"Received EUR 8944 on 2020-06-06 by INTERNATIONAL transaction from J. Roberts to M. Perez, recipient account 00000000008280573 by BNP Paribas Securities Services, amount EUR 8946, fees EUR 2 " +2,"Received EUR 5043 on 2017-01-23 by DOMESTIC payment from J. Paletta to S. Card, recipient account 0000000009197250 by None, amount EUR 5048, fees EUR 5 " +3,"Received EUR 3578 on 2019-05-25 by INTERNATIONAL dividend from R. Adams to L. Prouse, recipient account 0000000008986827 by None, amount EUR 3580, fees EUR 2 " +4,"Received EUR 7540 on 2020-09-15 by INTERNATIONAL invoice from H. Haley to C. Baxter, recipient account 0000000005784046 by None, amount EUR 7541, fees EUR 1 " diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/.gitignore b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/.gitignore new file mode 100644 index 00000000..4aa87c9d --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/.gitignore @@ -0,0 +1,5 @@ +*.csv* +!transactionsA.csv +!transactionsB.csv +!audit1.csv +!audit2.csv \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/__init__.py b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/article.py b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/article.py new file mode 100644 index 00000000..fd59b714 --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/article.py @@ -0,0 +1,245 @@ +# --- +# title: 'Part 2: Realtime Fuzzy-Join' +# description: Article introducing Fuzzy Join. +# notebook_export_path: notebooks/showcases/fuzzy_join_part2.ipynb +# aside: true +# article: +# date: '2022-10-19' +# thumbnail: '' +# tags: ['tutorial', 'data-pipeline'] +# keywords: ['Fuzzy join', 'reconciliation', 'unstructured'] +# author: 'pathway' +# jupyter: +# jupytext: +# formats: py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Realtime Fuzzy-Join in Pathway +# +# ## Part 2: Fuzzy Join - reconciliation with audit: when the computer is not enough +# +# In this article, we are going to show you how Pathway interacts with incremental data flows with a **feedback loop**. +# +# In the [first part of this showcase](/developers/showcases/fuzzy_join/fuzzy_join_chapter1) we explained how `smart_fuzzy_join` may be helpful in bookkeeping. +# Previously, we had a simple pipeline that matched entries of two different tables, such as two logs of bank transfers, in two different formats. +# Many matchings can be inferred automatically, but some can be really tricky without help: while the fans of Harry Potter can instantaneously make the connection between 'You-Know-Who' and 'Voldemort', it is impossible for a computer to do so, at least without help. +# +# Human audit is unavoidable in many areas such as accounting or banking. +# As such, we extend our pipeline with an auditor that supervises the process of reconciliation. +# The auditor may help the system by providing some hints, i.e. suggesting difficult matchings by hand. 
+# +# ## Feedback loop in Pathway +# ![Graph image](https://pathway.com/assets/content/showcases/fuzzy_join/reconciliation_chapter3_trim.png) +# +# This figure represents an architecture with a feedback loop to understand how the pieces work together. +# +# Reconciliation by SmartFuzzyJoin lies at the heart of the architecture: +# - it consumes inputs from 3 sources: +# - two tables with transactions in different formats; +# - a table with manual corrections provided by the auditor; +# - it outputs one table with matched records. +# +# You might think of the auditor as a simple automaton. +# Either they are satisfied with presented results and simply save them in some storage, or they provide some hints for the algorithm to find a better matching. +# +# **Note:** Although the architecture contains a feedback loop, all tables here are either inputs or outputs of the system. +# +# +# ## The data +# +# Human audit is certainly needed to handle the sample dataset below. +# +# **Recipient and sender in a 'standard' CSV format** +# +# |id |recipient|sender | +# |------|---------|-------------| +# |1 |Bill H. |Nancy R. | +# |2 |Harry P. |Hermione G. | +# |3 |Julian S.|Dick F. | +# +# +# **Messages describing the transactions** +# +# |id |message | +# |------|---------| +# |A |Dear William, thank you for your patience. Regards, Ann| +# |B |Dear Colleague! I think they might have sent me your particle! Yours, Richard| +# |C |Paying back for two Chocolate Frogs, cheers Hermione!| +# +# ## Automatic reconciliation +# Let's see how many records we can match without any human help. +# We reuse code from [Part 1 of this showcase](/developers/showcases/fuzzy_join/fuzzy_join_chapter1). +# %% +import pandas as pd + +import pathway as pw + +# %% [markdown] +# We need to read the csv files: + +# %% +# Uncomment to download the required files. +# # %%capture --no-display +# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/fuzzy_join_part_2_transactionsA.csv -O transactionsA.csv +# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/fuzzy_join_part_2_transactionsB.csv -O transactionsB.csv +# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/fuzzy_join_part_2_audit1-v2.csv -O audit1.csv +# # !wget https://public-pathway-releases.s3.eu-central-1.amazonaws.com/data/fuzzy_join_part_2_audit2-v2.csv -O audit2.csv + + +# %% +class TransactionsA(pw.Schema): + key: int = pw.column_definition(primary_key=True) + recipient: str + sender: str + + +class TransactionsB(pw.Schema): + key: str = pw.column_definition(primary_key=True) + message: str + + +transactionsA = pw.io.csv.read( + "./transactionsA.csv", + schema=TransactionsA, + mode="static", +) +transactionsB = pw.io.csv.read( + "./transactionsB.csv", + schema=TransactionsB, + mode="static", +) +pw.debug.compute_and_print(transactionsA) +pw.debug.compute_and_print(transactionsB) + + +# %% [markdown] +# We use the provided column `key` as indexes: Pathway will generate indexes based on those. +# +# We add a wrapper `reconcile_transactions` to replace the generated indexes by the corresponding key. 
+ + +# %% +def match_transactions(transactionsA, transactionsB, by_hand_matching): + matching = pw.ml.smart_table_ops.fuzzy_match_tables( + transactionsA, transactionsB, by_hand_match=by_hand_matching + ) + + transactionsA_reconciled = transactionsA.select( + left=None, right=None, confidence=0.0 + ).update_rows( + matching.select(pw.this.left, pw.this.right, confidence=pw.this.weight).with_id( + pw.this.left + ) + ) + return transactionsA_reconciled + + +def reconcile_transactions( + transactionsA, + transactionsB, + audit=None, +): + by_hand_matching = pw.Table.empty(left=pw.Pointer, right=pw.Pointer, weight=float) + if audit is not None: + by_hand_matching = audit + by_hand_matching = by_hand_matching.select( + left=transactionsA.pointer_from(pw.this.left), + right=transactionsB.pointer_from(pw.this.right), + weight=pw.this.weight, + ) + transactionsA_reconciled = match_transactions( + transactionsA, transactionsB, by_hand_matching + ) + transactionsA_reconciled = transactionsA_reconciled.join_left( + transactionsA, pw.left.left == pw.right.id + ).select(pw.left.right, pw.left.confidence, left_key=pw.right.key) + transactionsA_reconciled = transactionsA_reconciled.join_left( + transactionsB, pw.left.right == pw.right.id + ).select(pw.left.left_key, pw.left.confidence, right_key=pw.right.key) + return transactionsA_reconciled, by_hand_matching + + +matching, _ = reconcile_transactions(transactionsA, transactionsB) +pw.debug.compute_and_print(matching) + + +# %% [markdown] +# Not a perfect matching. It seems that the help of an auditor is needed. +# +# +# ## Incremental reconciliation with an auditor +# The correct matching is 1 - A, 2 - C and 3 - B. Why? [Tip 1](https://en.wikipedia.org/wiki/Bill_%28given_name%29), [Tip 2](https://www.nobelprize.org/prizes/physics/1965/summary/). +# +# Previously, the algorithm identified matching 2 - C correctly but failed to find the connections between the other pairs. +# Now, we run it with a hint - feedback from an auditor. +# +# To include the hint (nothing complicated), we just need to launch our function with the parameter `audit`: +# %% +class AuditSchema(pw.Schema): + left: int + right: str + weight: float + + +audit = pw.io.csv.read("./audit1.csv", schema=AuditSchema, mode="static") +matching, suggested_matchings = reconcile_transactions( + transactionsA, transactionsB, audit +) + +# %% [markdown] +# Here is the author's feedback, the pair 1 - A: + +# %% +pw.debug.compute_and_print(suggested_matchings) +# %% [markdown] +# Given this feedback, we check that the new matching took into account this pair: +# %% +pw.debug.compute_and_print(matching) + +# %% [markdown] +# Still not perfect but better. It seems that more help from the auditor is needed. +# Now, with one more extra hint the algorithm matches all the records correctly. +# %% +audit = pw.io.csv.read("./audit2.csv", schema=AuditSchema, mode="static") +pw.debug.compute_and_print(audit) + +# %% +matching, suggested_matchings = reconcile_transactions( + transactionsA, transactionsB, audit +) + +# %% [markdown] +# This time we provide the last pair, 3 - B: +# %% +pw.debug.compute_and_print(suggested_matchings) +# %% [markdown] +# Given those, we should obtain a full --and hopefully correct -- matching. +# %% +pw.debug.compute_and_print(matching) + +# %% [markdown] +# Bingo! +# +# It may sound long and tedious but in practice most of the matchings should have been done automatically. +# This process is only performed for the few remaining cases, where the linkages are hard to make. 
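+#
+# In a live deployment, the auditor's hints can themselves arrive as a stream, closing the feedback loop described at the beginning. Here is a minimal sketch (an illustration only, not run in this notebook) that reuses `AuditSchema` and `reconcile_transactions` from above and assumes a watched directory `./audit_hints/`, with the transaction connectors switched to streaming mode as well:
+#
+# ```python
+# audit_stream = pw.io.csv.read("./audit_hints/", schema=AuditSchema, mode="streaming")
+# live_matching, _ = reconcile_transactions(transactionsA, transactionsB, audit_stream)
+# pw.io.csv.write(live_matching, "matching.csv")
+# pw.run()
+# ```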
+ +# %% [markdown] +# ## Conclusion +# In conclusion, writing pipelines with a feedback loop is as easy as can be. +# When writing such a data processing algorithm, a tip is to always clearly separate inputs from outputs. +# It is important because the Pathway engine observes inputs for any changes and recalculates parts of the computation when needed. +# +# +# In the next chapter, we will show you how to make a Pathway installation which provides a full Fuzzy-Matching application, complete with frontend. (Coming soon!) +# +# ![Demo animation](/assets/content/showcases/fuzzy_join/demo.gif) diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit1.csv b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit1.csv new file mode 100644 index 00000000..75e2c25d --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit1.csv @@ -0,0 +1,2 @@ +left,right,weight +1,A,1 diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit2.csv b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit2.csv new file mode 100644 index 00000000..5bebf539 --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit2.csv @@ -0,0 +1,3 @@ +left,right,weight +1,A,1 +3,B,1 diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit_transactions b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit_transactions new file mode 120000 index 00000000..ce70466d --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/audit_transactions @@ -0,0 +1 @@ +../../../../../../../projects/pathway/experimental/pycremental/docs/data/audit_transactions/ \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactions b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactions new file mode 120000 index 00000000..92c103bc --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactions @@ -0,0 +1 @@ +../../../../../../../projects/pathway/experimental/pycremental/docs/data/transactions/ \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactionsA.csv b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactionsA.csv new file mode 100644 index 00000000..ea76967d --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactionsA.csv @@ -0,0 +1,4 @@ +key,recipient,sender +1,Bill H.,Nancy R. +2,Harry P., Hermione G. +3,Julian S.,Dick F. \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactionsB.csv b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactionsB.csv new file mode 100644 index 00000000..80001822 --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/.fuzzy_join_chapter2/transactionsB.csv @@ -0,0 +1,4 @@ +key,message +A,"Dear William, thank you for you patience. Regards, Ann" +B,"Dear colleague! I think they might have send me your part! 
Yours, Richard" +C,"Paying back for two Chocolate Frogs, cheers Hermione" \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/1.fuzzy_join_chapter1.md b/docs/2.developers/7.showcases/2.fuzzy_join/1.fuzzy_join_chapter1.md new file mode 120000 index 00000000..120ce911 --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/1.fuzzy_join_chapter1.md @@ -0,0 +1 @@ +.fuzzy_join_chapter1/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.fuzzy_join/2.fuzzy_join_chapter2.md b/docs/2.developers/7.showcases/2.fuzzy_join/2.fuzzy_join_chapter2.md new file mode 120000 index 00000000..3e924c99 --- /dev/null +++ b/docs/2.developers/7.showcases/2.fuzzy_join/2.fuzzy_join_chapter2.md @@ -0,0 +1 @@ +.fuzzy_join_chapter2/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.live_data_jupyter.md b/docs/2.developers/7.showcases/2.live_data_jupyter.md new file mode 120000 index 00000000..fe3f0a6b --- /dev/null +++ b/docs/2.developers/7.showcases/2.live_data_jupyter.md @@ -0,0 +1 @@ +.live_data_jupyter/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/2.twitter.md b/docs/2.developers/7.showcases/2.twitter.md new file mode 100644 index 00000000..42013282 --- /dev/null +++ b/docs/2.developers/7.showcases/2.twitter.md @@ -0,0 +1,232 @@ +--- +title: "Realtime Twitter Analysis App" +description: "Pathway Twitter showcase" +author: 'mateusz' +aside: true +article: + date: '2022-10-31' + thumbnail: '/assets/content/blog/th-twitter.png' + tags: ['tutorial', 'machine-learning', 'showcase'] +keywords: ['Twitter', 'tweets', 'sentiment analysis', 'geolocation', 'influence', 'unstructured'] +github_link: "https://github.com/pathwaycom/pathway/tree/main/examples/projects/twitter" +--- + +# Realtime Twitter Analysis App with Pathway + +![](/assets/content/showcases/twitter/pathway-twitter-mtva.gif) + +Have you ever thought of analyzing public sentiment by looking at +tweets? It is rather cool to see how information spreads in a social +network and across the globe. After all, information shapes the world - +brands care about it, politicians care about it, we at Pathway also care +about it. + +Today we are going to show you how to build a Twitter streaming app for +sentiment analysis in 15 minutes. Sit tight and catch a manul to warm +up! + +::article-img +--- +src: '/assets/content/showcases/twitter/PathwayManul-1.svg' +alt: '' +class: 'mx-auto' +--- +:: + +### What are we going to build? + +The final application displays a map of Twitter users who posted popular +tweets on a given topic (e.g. #vmas tag for the [MTV +Awards](https://www.mtv.com/vma), you can easily configure it to any +topic you want). The size of each dot corresponds to the total influence +of authors located in a given area on this topic. The influence is +computed in real-time based on a current structure of retweets and +replies to tweets (we will see how this is computed by Pathway at the +end of this article). These users are clustered based on the sentiment +their tweets are causing: green denotes positive reactions, cyan - +neutral ones, magenta - negative ones. + +All the results are visible on the map and summarized in the dynamic +table next to it. +![](/assets/content/showcases/twitter/pathway-twitter-mtva-long.gif) + +We are going to use Pathway to achieve this result. Pathway is capable +of handling different sources of live data, including geographical +location data.  
+ +You will now see how Pathway can be used to: + +- process a real-time stream of Twitter data, +- intelligently improve geolocation, +- perform sentiment analysis. + +Thanks to Pathway, we can observe trends in real-time - the results are +constantly updated as new tweets are posted. The app also allows for +inspecting the past (by easily moving a time-window slider). + +A unique feature of Pathway is that previously generated insights are +constantly updated when new data is analyzed. Indeed, as Pathway gathers +more and more data, it can update its beliefs about the past. In this +showcase, Pathway updates measures of influence of users' Twitter +activity as it is picked up on by others across the globe. Pathway also +intelligently improves geolocation quality by flagging incorrect user +locations, and retrospectively updates previous results. + +### A bird's eye view + +Before we get to know how to write the real time Twitter analysis +application, let's understand how Pathway fits in the final picture.  + +![](/assets/content/showcases/twitter/pathway-twitter-app-architecture.png) + +Pathway is the key technology that performs all the heavy backend +computations for real-time data analysis.  + +To ingest data, our Python code sets Pathway up to connect via an input +connector with a tiny Python script ([tweepy](https://www.tweepy.org/)) +that fetches real-time tweets from the Twitter API.  + +To provide outputs, Pathway is sending updates to the postgres database +via a [connector](/developers/api-docs/pathway-io/postgres). +The database is further queried by a webserver +(here built with [FastAPI](https://fastapi.tiangolo.com/)) which serves +the requested results. The requests come from the viewer's browser, and +responses are visualized (here we wrote some code using popular +javascript libraries like react, materialUI, and +[deck.gl](https://deck.gl/)). + +### What happens inside the "Pathway" block? + +The code of Pathway apps is written in Python, and can call upon any +external services and external Machine Learning models of the +programmer's choice. In this way, Pathway apps fit nicely into the +Python ecosystem.  + +For example, look inside +[our app code](https://github.com/pathwaycom/pathway/blob/main/examples/projects/twitter/services/pathway-app/app/processing.py#:~:text=def%20_compute_sentiment) to see just how easy it is to call +the [TextBlob](https://textblob.readthedocs.io/en/dev/) library for +computing sentiment of the tweets. + +Our Twitter analysis app also connects to a lightweight external service +that provides some basic geocoding. The geocoding helps us obtain +important information on longitude and latitude, as free Twitter data +comes only in the form of a user-typed text, e.g. "Paris, France". As a +matter of fact, since Twitter users often put misleading text as their +location, basic geocoding is not enough to attain sufficient data +quality. We add a couple of filters inside Pathway to clean erroneous +locations at scale.   + +The Python code of Pathway apps provides for a mix of a data pipeline +feel (through annotated Python functions) and a query-service feel +(through annotated Python classes). In our Twitter case app, the input +data goes through a pipeline built in Pathway which covers four steps. + +### Pipeline step 1: Tweets preprocessing + +After connecting a real-time stream of tweets to Pathway, we have to +clean them up a little bit. We do this entirely in Python, in Pathway's +dataframe-flavored programming framework. 
We are interested only in +retweets and replies to other tweets as we want to measure both local +and global impact. We also want to retain tweets that have location +data. For this, we fetch user data along with a tweet (it's already done +by tweepy) and lookup the location field of the user. The location +string is not useful by itself, as we need coordinates to measure the +distances between the retweeters and the authors of the original tweets. +We obtain users' coordinates by using  +[placeholder](https://github.com/pelias/placeholder)) - the free coarse +geocoder, which doesn't require tons of data and can be set up with a +single line of code. Then, we can leverage Pathway to correct +imperfections in the resulting geolocations from the placeholder +geocoder.  + +### Pipeline step 2: Iterative geolocation improvement with Pathway + +Some Twitter users put weird locations, like instead of a place name +they would put the text "*turn on notifications*" which gets [geolocated +to an incorrect +place](https://placeholder.demo.geocode.earth/parser/search?text=Turn+on+notifications) +(the geocoding service does not provide any measures of confidence). To +filter these out, we use an iterative process, in which we keep users +that have a sufficiently large fraction of retweets that are near the +tweeting user (within a radius of 200km). This can be expressed by the +following pseudocode: + +``` +REPEAT UNTIL CONVERGENCE: + For each tweeting user: + close_fraction = fraction of nearby retweets + if close_fraction < CUTOFF: + * make sure there is no other user in the same location + with its "close_fraction" > CUTOFF + * filter out all tweets and retweets with this location +``` + +Note that this main loop may take more than one iteration. In the end, +we take most of the broken locations off the map (the tweets stay in the +data and are still used for computing statistics.) + +Indeed, the key differentiator of Pathway is that it allows for writing +such iterative workflows! + +### Pipeline step 3: Sentiment analysis with Pathway + +The last step is to classify users by the general sentiment with which +their tweets are received. To do that, for each retweet/reply, we find +out the number in range \[-1,1\] representing the sentiment of the text +("\<0" is negative, 0 neutral, "\>0" is positive) and take the mean. +Each of the numbers in question is computed via a one-liner by using the +Pathway map function [apply](/developers/user-guide/data-transformation/table-operations/#operations) +and calling the [TextBlob](https://textblob.readthedocs.io/en/dev/) library. +The aggregation takes place in another line with a call to Pathway's +["group-by"](/developers/user-guide/data-transformation/groupby-reduce-manual/).  + +We don't have to worry about updating aggregates as new data appears - +Pathway will do that for us.  + +That's really it! + +### Pipeline step 4: Computing influence with Pathway + +Now, let's turn to the importance of users' activity, as the voice of +some of them carries more weight than that of others. In network +science, we call it influence. The simplest way to measure the influence +of a user would be to count the number of retweets. This would be a +one-line "group by" aggregation in Pathway!  + +For this app, we used instead a slightly more involved formula, taking +into account the number of followers and the overall activity of the +retweeting users. 
Our influence measure is actually a pretty good +"predictor" for the number of upcoming retweets - we can typically say +which tweets are likely to create a significant buzz, before this +actually happens.  + +Of course, you could come up with even better predictors of who's going +to create a stir, for example using +[formulas](http://kupavskii.com/wp-content/uploads/2016/07/2012-Kupavskii-et-al-Prediction-of-retweet-cascades.pdf) +which take into account parameters such as the length of the tweet, or +[diving deeper](https://www.youtube.com/watch?v=lbCmFZpMNxA) into the +structure of the network.  + +With Pathway it is straightforward to formulate any rule or iterative +algorithm we want - give it a try!  + +### Wrapping up + +As we have seen, Pathway allowed us to express complicated dataflows +(even with iteration!) with a simple and intuitive syntax. It seamlessly +integrates with any library within the Python ecosystem and is easy to +connect and set up. Moreover, it can handle large quantities of data +coming from different sources. Twitter is only one of many examples - +we highly encourage you to explore Pathway's capabilities on your own +and when you find your own applications, please share them with us at +! + +### Run it on your own + +All the code used for this showcase is +[available on the pathway-examples github repository](https://github.com/pathwaycom/pathway/tree/main/examples/projects/twitter). +You can easily run it on your own on the topics that are interesting to you. + +We are very curious to see what you will come up with! + +In the meantime, please reach out, take care, and avoid caressing manuls.  diff --git a/docs/2.developers/7.showcases/20.event_stream_processing_time_between_occurrences.md b/docs/2.developers/7.showcases/20.event_stream_processing_time_between_occurrences.md new file mode 120000 index 00000000..4db8016d --- /dev/null +++ b/docs/2.developers/7.showcases/20.event_stream_processing_time_between_occurrences.md @@ -0,0 +1 @@ +.event_stream_processing_time_between_occurrences/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/3.llm-alert-pathway.md b/docs/2.developers/7.showcases/3.llm-alert-pathway.md new file mode 100644 index 00000000..e410f286 --- /dev/null +++ b/docs/2.developers/7.showcases/3.llm-alert-pathway.md @@ -0,0 +1,187 @@ +--- +title: 'Use LLMs for notifications' +description: 'Use LLMs for notifications: crafting a RAG app with real-time alerting' +author: 'pathway' +aside: true +article: + date: '2023-11-17' + thumbnail: '/assets/content/showcases/llm-app/vector-index-th.png' + tags: ['showcase', 'llm'] +keywords: ['LLM', 'RAG', 'GPT', 'OpenAI', 'Google Docs', 'Slack', 'alert', 'KNN', 'HTTP connector', 'Streamlit', 'unstructured'] +--- + +# Use LLMs for notifications: crafting a RAG app with real-time alerting + +How to get real-time alerts with changes in Google Docs using [Pathway](https://pathway.com/). +## + +::article-img +--- +src: 'https://github.com/pathwaycom/llm-app/raw/main/examples/pipelines/drive_alert/drive_alert_demo.gif' +alt: 'Alerting in action' +class: 'mx-auto' +zoomable: false +--- +:: + + +Real-time alerting with Large Language Models (LLMs) like GPT-4 can be useful in many areas such as progress tracking for projects (e.g. notify me when coworkers change requirements), regulations monitoring, or customer support (notify when a resolution is present). In a corporate setting, teams often collaborate on documents using [Google Docs](https://www.google.com/docs/about/). 
These documents can range from project plans and reports to policy documents and proposals.

This guide shows you how to build a Large Language Model (LLM) application that provides real-time [Slack](https://slack.com/) alerts about changes to Google documents that you or your team care about.

The program that we will create answers questions based on a set of documents. However, after an initial response is provided, the program keeps on monitoring the document sources. It efficiently determines which questions may be affected by a source document change, and alerts the user when a revision - or a new document - significantly changes a previously given answer.

The basic technique of feeding chunks of information from external documents into an LLM and asking it to provide answers based on this information is called RAG - Retrieval Augmented Generation. So, what we are doing here is **real-time RAG with alerting** 🔔.

Worried that project deadlines change and you are not in the loop?

You set the alert once and don’t need to worry about data synchronization ever again!


## Architecture of our alerting application

Our alerting app will have a Streamlit UI used to define new alerts. It will synchronize documents from a Google Drive data source, and send alerts to Slack. For the document processing and analysis, we rely on the LLM xpack provided as part of the Pathway library, which then allows us to run our alerting app in a Docker container. This is a standalone application, except that it needs to call into a Large Language Model (LLM) to understand whether your document changes are relevant to the alert. To keep the launch simple, we do not host our own open-source LLM but rely on the OpenAI API integration instead.


::article-img
---
src: '/assets/content/showcases/drive_alert/architecture_alert.svg'
alt: 'Illustration of LLMs in Pathway'
class: 'mx-auto'
zoomable: true
---
::


Let’s break down the architectural diagram above and understand the role of each component:

**Making an always up-to-date vector index of Google Drive documents**: The system accesses documents stored in Google Drive and monitors them for changes using the [Pathway connector for Google Drive](/developers/api-docs/pathway-io/gdrive/#pathway.io.gdrive.read). Next, all documents inside a chosen folder are parsed (we support native Google Docs formats, Microsoft’s docx and many others) and split into short, mostly self-contained chunks that are embedded using the OpenAI API and indexed in real time using the [Pathway KNN index](/developers/user-guide/data-transformation/indexes-in-pathway).

**Answering queries and defining alerts**: Our application running on Pathway LLM-App exposes an HTTP REST API endpoint to send queries and receive real-time responses. It is used by the Streamlit UI app. Queries are answered by looking up relevant documents in the index, as in the Retrieval-Augmented Generation (RAG) [implementation](/developers/user-guide/llm-xpack/llm-app-pathway/). Next, queries are categorized for intent: an LLM probes them for natural language commands synonymous with _notify or send an alert_.

**Alert Generation and Deduplication**: Pathway LLM-App automatically keeps the document index up to date and can efficiently update answers whenever significant documents change! To learn more, please see our [indexing tutorial](/developers/user-guide/data-transformation/indexes-in-pathway).
However, sometimes a change in a source document is non-consequential, a colleague might for example fix some typos. To prevent the system from sending spurious alerts, we use [`pw.stateful.deduplicate`](/developers/showcases/alerting-significant-changes). The deduplicator uses an LLM “acceptor function” to check if the new answer is significantly different. + + Finally, relevant alerts are sent to Slack using a Python callback registered using [`pw.io.subscribe`](/developers/api-docs/pathway-io#pathway.io.subscribe). + + +## Our goal today: alerts for marketing campaigns + +We focus on an example where we would like to have real-time alerts for significant changes or updates in marketing campaigns. This system can monitor various aspects such as content changes, campaign performance metrics, audience engagement, and budget alterations. Real-time alerts enable marketing teams to respond quickly to changes, ensuring campaigns remain on track and are optimized for performance. + +After successfully running the **Google Drive Alerts with the LLM** app, + +Either go to Streamlit and try typing in “_When does the Magic Cola campaign start? Please notify me about any changes._” + + +## + +::article-img +--- +src: 'https://github.com/pathwaycom/llm-app/raw/main/examples/pipelines/drive_alert/drive_alert_demo.gif' +alt: 'Alerting in action' +class: 'mx-auto' +zoomable: false +--- +:: + + +Or send a curl to the endpoint with + +The response we will get is something like “_The campaign for Magic Cola starts on December 12, 2023_” based on the document you have in your Google Drive folder. The app also activates an alert for future changes. + +Then you go to the folder called “**Product Marketing**” and open the document called “**campaign-cola”** in Google Drive, modify the line with the “_Campaign Launch_” and set the date to “January 1st, 2024”. You should receive a Slack notification immediately “_Change Alert: The campaign for Magic Cola starts on July 1st, 2024”_. + +Depending on captured changes in real-time and predefined thresholds (like a certain percentage drop in click-through rate or a significant budget overrun), the system triggers an alert. + +You can also try setting up a new document with revised information about the campaign date, and see how the system picks up on pieces of information from different source files. As we will see later, we can adjust how the system reacts to different pieces of information through a technique called “prompting”. + + +For example, you can explain to the LLM, in natural language, how it should best reply if it sees a conflict between information seen in two different places. + +The same solution can be applied for monitoring the marketing campaign across different platforms including content management systems, social media tools, and email marketing software. + + +## Tutorial - let’s write the application together + +The app development consists of two parts: backend code and frontend UI. The [full source code](https://github.com/pathwaycom/llm-app/tree/main/examples/pipelines/drive_alert) can be found on the GitHub repo. + +If you would like to run the working example immediately and test in action first, please move on directly to the next section - [running the application](/developers/showcases/llm-alert-pathway/#how-to-run-our-application). + + +### Part 1: Build the app + +Let’s first develop the logic for the backend part. The app ingests Google Docs in real-time, detects changes, creates indexes, responds to user queries, and sends alerts. 
See the full source code in the [app.py](https://github.com/pathwaycom/llm-app/blob/main/examples/pipelines/drive_alert/app.py) file. These are the parameters of its main `run` function: + +Everything that happens in the main `run()` function accepts several parameters, many of which have default values. These include paths OpenAI API keys (`api_key`), server configuration (`host`, `port`), model identifiers (`embedder_locator`, `model_locator`), and Slack channel ID where alerts are sent (`slack_alert_channel_id`) and Slack token (`slack_alert_token`) to secure authenticate with the Slack. + +**Building an Index** + +Next, the app reads the Google Docs files from the path specified in the `data_dir` and processes them into documents. These documents are then chunked (cut up into smaller fragments that fit more snugly into natural language processing toolboxes), flattened, and then enriched with OpenAI embeddings. A K-Nearest Neighbors [(KNN) index](/developers/api-docs/ml/#pathway.stdlib.ml.index.KNNIndex) is created using these embeddings. + +**Query Processing** + +Next, we add a function to set up an HTTP connector to receive queries. Queries are processed to detect intent using the OpenAI Chat completion endpoint and prepare them for response generation. This includes splitting answers and embedding the query text. + +**Responding to Queries** + +The processed user queries are used to find the nearest items in the KNN index we built. A prompt is built using the query and the documents retrieved from the index. The OpenAI model generates responses based on these prompts. Finally, the responses are formatted and sent back to the UI using the `response_writer`. + +**Sending Alerts** + +The below code filters responses that require alerts. A custom logic (`acceptor`) is used to determine if an alert should be sent based on the content of the response. Alerts are constructed and sent to a specified Slack channel. + + +### **Optional: Streamlit UI** + +We will start with constructing Streamlit UI and create a simple web application with Streamlit. It interacts with an LLM App over REST API and displays a chat-like interface for the user sending prompts and notifying the user when an alert is activated. See the full source code in the[ server.py](https://github.com/pathwaycom/llm-app/blob/main/examples/pipelines/drive_alert/ui/server.py) file. + +**Execution** + +This is a place where all magic happens. The function ends with a call to `pw.run`, indicating that this is part of a data pipeline that runs continuously. Optionally, we also enable a real-time monitoring feature. + + +## **How to run our application** + +**Step 0. ☑️Your checklist: what we need to get started** + + +* A running Python environment on MacOS or Linux +* A Google account for connecting to your own Drive + * Before running the app, you will need to give the app access to Google Drive folder, please follow the steps provided in the[ Readme](https://github.com/pathwaycom/llm-app/tree/main/examples/pipelines/drive_alert). +* (Optional) A slack channel and API token + * For this demo, Slack notification is optional and notifications will be printed if no Slack API keys are provided. See:[ Slack Apps](https://api.slack.com/apps) and[ Getting a token](https://api.slack.com/tutorials/tracks/getting-a-token) + * If no Slack token is provided, notifications will be printed. + +**Step 1. Get started with LLM-App and test out the ready example** + +Next, navigate to the repository: + +Almost there! + +**Step 2. 
Get the app running** + + + +* Edit the `.env` file with the instructions provided in[ the Readme](https://github.com/pathwaycom/llm-app/tree/main/examples/pipelines/drive_alert/README.md). +* We need to execute `python [app.py](<http://app.py>)` , follow the instructions in[ Running the project](https://github.com/pathwaycom/llm-app/tree/main/examples/pipelines/drive_alert/README.md#run-the-project) to get the app up and ready! + + +## **What is next** + +As we have seen in the example of the marketing campaign demo, real-time alerts with LLMs keep the entire team updated on critical changes and help teams stay agile, adjusting strategies as needed.[ LLM App](https://github.com/pathwaycom/llm-app)’s alerting feature can also be used for **monitoring model performance** when LLMs can occasionally produce unexpected or undesirable outputs. In cases where LLMs are used for processing sensitive data, real-time alerting can be useful for **security and compliance** too. + +Consider also visiting another blog post on[ How to build a real-time LLM app without vector databases](https://blog.streamlit.io/build-a-real-time-llm-app-without-vector-databases-using-pathway/). You will see a few examples showcasing different possibilities with the LLM App in the GitHub Repo. Follow the instructions in[ Get Started with Pathway](https://github.com/pathwaycom/llm-app#get-started) to try out different demos. + +::shoutout-banner +--- +href: "https://discord.gg/pathway" +icon: "ic:baseline-discord" +--- +#title +Discuss tricks & tips for RAG +#description +Join our Discord community and dive into discussions on tricks and tips for mastering Retrieval Augmented Generation +:: diff --git a/docs/2.developers/7.showcases/3.lsh/.lsh_chapter1/__init__.py b/docs/2.developers/7.showcases/3.lsh/.lsh_chapter1/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docs/2.developers/7.showcases/3.lsh/.lsh_chapter1/article.py b/docs/2.developers/7.showcases/3.lsh/.lsh_chapter1/article.py new file mode 100644 index 00000000..9839f268 --- /dev/null +++ b/docs/2.developers/7.showcases/3.lsh/.lsh_chapter1/article.py @@ -0,0 +1,178 @@ +# --- +# title: 'Realtime Classification with Nearest Neighbors (1/2)' +# description: 'Pathway Showcase: kNN+LSH classifier' +# notebook_export_path: notebooks/showcases/lsh.ipynb +# author: 'olivier' +# aside: true +# article: +# date: '2022-10-25' +# thumbnail: '/assets/content/blog/th-realtime-classification.png' +# tags: ['tutorial', 'machine-learning'] +# keywords: ['Classification', 'KNN', 'LSH', 'index', 'Locality Sensitive Hashing', 'MNIST', 'euclidean distance', 'unstructured'] +# jupyter: +# jupytext: +# formats: py:percent,ipynb +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.15.2 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Realtime classification in Pathway +# +# ## Part 1: Using the Nearest-Neighbors Classifier +# +# ![Classification with Pathway](https://pathway.com/assets/content/showcases/lsh/KNN.gif) +# +# +# ## Classification - what is it all about? +# +# What is this handwritten digit? Is this new movie an action movie or a drama movie? Is this email a spam or an email from a real [foreign prince reaching out for help](https://en.wikipedia.org/wiki/Advance-fee_scam)? +# +# All these questions have one thing is common: they are forms of **classification problems**. 
+# +# ![Classification of e-mails](https://pathway.com/assets/content/showcases/lsh/classification_2.png) +# +# Classification is the process of giving a label to an unknown element. +# It is used to label or tag new content and has various areas of applications such as marketing personalization. +# For example, tagging new content on an e-commerce website or a streaming service will ease browsing and allow for better content recommendations. +# +# This showcase explains how to achieve high quality classification using Pathway. +# +# ## Why use Pathway for classification? +# +# In a traditional streaming system, the classification of a query is done based on the available data at the time of the query. +# However, as time goes on, the available data grows, providing a better insight on the classification tasks done on previous queries. +# The prediction which made sense with a partial view of a data can be seen as wrong with more complete data. +# +# Pathway guarantees classifications with the most up-to-date-model. Under the hood, the system does this by automatically revisiting the classifications of past queries in the stream as new training data becomes available. + +# %% [markdown] +# ![Pathway performance](https://pathway.com/assets/content/showcases/lsh/lsh_mnist_streaming.png) + +# %% [markdown] +# Unlike a classic streaming system, Pathway updates the previous query as data arrive, resulting in a lower error rate. +# +# The source code in this article is completely self-contained. With Pathway installed, you can run it directly! + +# %% [markdown] +# ## How does Pathway perform classification? +# +# Classifiers are just regular Pathway library functions. In Pathway' standard library, we provide you with a choice of neat classifiers for your convenience - but if you prefer to create your own, be our guest: the logic will only take a couple dozen lines of Python in our framework. As with all code written in Pathway, Pathway takes care of making sure classifiers work correctly on top of data streams. +# +# In this showcase, we will show you how to use Pathway to make your own classification app. We will be using the kNN+LSH classifier from Pathway's standard library: if you are interested about how it works, you can find out more about those topics in our [article](/developers/showcases/lsh/lsh_chapter2) about it. + + +# %% [markdown] +# ## And here comes: Your real-time classification Data App in Pathway + +# %% [markdown] +# In Pathway, everything you need to perform efficient real-time classification is already implemented for you. +# +# All you need is to load your data, and use our provided classifier functions to train the model and label your queries. +# +# Let's take a look how Pathway performs on a real-time classification problem. The kNN+LSH classifier we will use in this case is available with several metrics, such as cosine or Euclidean distance - we stick to the defaults. +# + +# %% [markdown] +# ### Connecting to streaming data + +# %% [markdown] +# To illustrate how Pathway performs on real-time classification, we use Pathway to classify handwritten images fed into the system in streaming mode. +# +# We will use the well-known [MNIST](http://yann.lecun.com/exdb/mnist/) as an example. MNIST is composed of 70,000 handwritten digits, each image has a 28x28 resolution and is labeled. +# +# However, we work here with a **streaming data set**: we suppose that the MNIST data and the queries are arriving in a streaming fashion. 
In streaming, the data is incomplete and the stream progresses over time, until the full data is received. +# +# As MNIST is so standard, we provide a standard loader which simulates just such a data stream. Both the data and the queries are fed in at the same rate, with a 6:1 ratio of data to queries. (For production deployment of your application, you would normally use Pathway's input connectors instead of the simulator.) + +# %% +import pathway as pw + +# _MD_COMMENT_START_ +( + X_train_table, + y_train_table, + X_test_table, + y_test_table, +) = pw.ml.datasets.classification.load_mnist_stream(100) +# _MD_COMMENT_END_ +# _MD_SHOW_X_train_table, y_train_table, X_test_table, y_test_table = pw.ml.datasets.classification.load_mnist_stream() +# %% [markdown] +# ## Setting up classification + +# %% [markdown] +# Here comes the actual training and classification source code, in Pathway. + +# %% + +lsh_index = pw.ml.classifiers.knn_lsh_train( + X_train_table, d=28 * 28, L=10, M=10, A=0.5, metric="euclidean" +) +predicted_labels = pw.ml.classifiers.knn_lsh_classify( + lsh_index, y_train_table, X_test_table, k=3 +) + +# %% [markdown] +# What does this code do? +# We first show how to prepare the classifier with the function `knn_lsh_classifier_train`. (Under the hood, this logic computes the LSH index, i.e. the LSH projector and the buckets of the training data.) +# +# Then `lsh_index` is used to predict the label for the queries (`X_test_table`) using the function `knn_lsh_classify`. +# +# What about parameter choices? The dimension $d=28*28$ is simply the pixel size of the classified images - something specific to MNIST. The other parameter choices $(M=10,L=10,A=0.5)$ are set to provide the sweet spot between quality and efficiency: the number of comparisons LSH does for computing the kNN of a query over the full dataset is 830 on average instead of 60,000! Don't hesitate to play with the settings, you'll get the hang of the right "strings to pull" in a couple of minutes. +# +# Now, the resulting labels are then compared to the real ones to measure the accuracy of our approach with our accuracy function `classifier_accuracy`. + +# %% +accuracy = pw.ml.utils.classifier_accuracy(predicted_labels, y_test_table) + +# %% [markdown] +# That's it! You can now preview the `accuracy` table during stream execution, and see how the classification outcomes improve as the stream progresses: + +# %% +# _MD_SHOW_pw.debug.compute_and_print(accuracy) +# _MD_COMMENT_START_ +from pathway.tests.utils import T + +results = T( + """ + | cnt | value + 829666364173114985 | 547 | False + 447971229764681380 | 9453 | True + """ +) +pw.debug.compute_and_print(results) +# _MD_COMMENT_END_ + +# %% [markdown] +# We obtain 9453 correct classifications out of the 10000: an error rate of 5.5%. + +# %% [markdown] +# ### Results + +# %% [markdown] jp-MarkdownHeadingCollapsed=true +# ![Results image](https://pathway.com/assets/content/showcases/lsh/lsh_mnist_streaming.png) + +# %% [markdown] +# As we can see, a normal streaming system exhibits poor performances at first due to an incomplete data set. +# +# Pathway, on the other hand, improves the accuracy of those previous queries by revisiting its predictions at each update. As the data grows, its error rate decreases until it converges to an error close to $5\% $. Here is a sample of outcomes we get at the end of the stream. 
+# +# ![Classification results](https://pathway.com/assets/content/showcases/lsh/lsh_example_classification.png) +# + +# %% [markdown] +# +# ## Conclusion +# +# When doing classification on a stream in Pathway, the model is kept up to date automatically. As the model improves over time, results get better and previous classification decisions are also updated to the most up-to-date-model, without having to worry about it. +# +# In many streaming scenarios, the kNN+LSH approach we used provides a sweet-spot between speed and quality. If you want to know more about how all of this works, you can read our [article](/developers/showcases/lsh/lsh_chapter2) about it. +# +# In our next articles, we will show you how to use Pathway to build streaming recommender systems and real-time anomaly detection. (Coming soon.) diff --git a/docs/2.developers/7.showcases/3.lsh/1.lsh_chapter1.md b/docs/2.developers/7.showcases/3.lsh/1.lsh_chapter1.md new file mode 120000 index 00000000..f0b4e71c --- /dev/null +++ b/docs/2.developers/7.showcases/3.lsh/1.lsh_chapter1.md @@ -0,0 +1 @@ +.lsh_chapter1/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/3.lsh/2.lsh_chapter2.md b/docs/2.developers/7.showcases/3.lsh/2.lsh_chapter2.md new file mode 100644 index 00000000..55133d8b --- /dev/null +++ b/docs/2.developers/7.showcases/3.lsh/2.lsh_chapter2.md @@ -0,0 +1,139 @@ +--- +title: 'Realtime Classification with Nearest Neighbors (2/2)' +description: 'Pathway Showcase: technical explanation of our kNN+LSH classifier' +author: 'olivier' +article: + date: '2022-10-26' + thumbnail: '' + tags: ['tutorial', 'machine-learning'] +keywords: ['Classification', 'regression', 'KNN', 'LSH', 'index', 'Locality Sensitive Hashing', 'MNIST', 'euclidean distance'] +--- + +# Realtime classification in Pathway + +## Part 2: The Nearest-Neighbors classifier under the hood: classification using kNN and LSH + +![Dimensional segmentations of space made by four different LSH projections](/assets/content/showcases/lsh/LSH.png) + +Figure: Dimensional segmentations of space made by four different LSH projections. + +Today, we will explain *how* our classifier works and present to you the two main concepts behind it: **kNN** and **LSH**. + +In this article, we are not going to explain what classification is and how easy it is to create a classifier with Pathway, as we already have an awesome [article](/developers/showcases/lsh/lsh_chapter1) about it. + +## Deep dive: how we wrote the kNN+LSH classifier + +### kNN explained + +The **k-Nearest-Neighbors (kNN)** classifier relies on the following assumption: if some datapoints have a given label, and your query is similar to those datapoints, then your query is likely to have the same label as them. + + +The kNN classifier assumes that a pool of already labeled data is available. +The kNN approach connects each query to its k closest counterparts in the dataset, called 'neighbors'. +In a nutshell, each query is connected to the k other data points of the dataset which are the most similar to it. +The assumption is that similar data points are likely to share the same characteristics: the query is likely to share the same label as those of its neighbors. + +> "Friends are like mirrors. You can see yourself just by looking at them." + +For classification, the label is chosen by a majority vote among the labels of neighbors of the query. 
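To make the voting rule concrete, here is a minimal brute-force sketch in plain Python with NumPy. It is only meant to illustrate the idea, not the Pathway operator used in practice; the MNIST example below shows the same rule with k=3.

```python
import numpy as np
from collections import Counter

def knn_classify(query, train_points, train_labels, k=3):
    # Brute force: Euclidean distance from the query to every labeled point.
    distances = np.linalg.norm(train_points - query, axis=1)
    # Indices of the k closest training points.
    neighbors = np.argsort(distances)[:k]
    # Majority vote among the labels of those neighbors.
    return Counter(train_labels[i] for i in neighbors).most_common(1)[0][0]
```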
+ +![KNN MNIST example image](/assets/content/showcases/lsh/lsh_knn_example_MNIST.png) + +In this example, the image we want to label is connected to its k=3 nearest neighbors. Given that those images are labeled as '0', we can also label the image as a '0' too. + +One reason of the success of the kNN approach is its **simplicity**: its vanilla version can be implemented very easily and is **highly accurate**. +Furthermore, contrary to many of its competitors, the kNN approach is not a black box: the decisions are straightforwardly **explainable**. +Explainability highly increases the trust of users in the system. + +>The kNN approach is: +>* simple +>* highly accurate +>* explainable + +Those are the reasons the kNN approach is widely used for classification or regression, in many fields such as computer vision or item recommendation. + +### Making sure kNN is fast enough + +The 'vanilla' kNN approach relies on a brute force approach to provide the exact k closest data points for each query: a query is compared to all the data points in the dataset. +The k closest datapoints, k being a user-defined constant, are returned for each query point. + +In Pathway, we are working with large datasets with high-dimensional data. +On such datasets, this naive approach suffers from the following issues: + +- Time complexity is large: + - Computing distance between every pair of points is $ O(dn_tn_q) $, where $ d $ is the number of dimensions, $ n_t, n_q $ number of training, query points, respectively. + - That can get costly pretty easily. **The naive approach is unusable for large data sets**. +- Handling updates is expensive/non trivial: + - When a new batch of data points arrive then distances to all the queries need to be updated. This can be quite a waste of resources. + - When a batch of data points is deleted or updated then answers to all the queries need to be recomputed. + +This vanilla approach is likely to be too slow when a lot of labeled data is available. Fortunately, we can trade quality slightly in exchange for a big speed increase. + +The key intuition is to lower the number of potential candidates to be neighbors to limit the number of distance computation. Reducing this pool of candidates speeds up the process: for example, by considering only half the dataset via random sampling, we can reduce the query time by half. + +This process comes with a loss in quality: by taking the risk of missing the 'real' neighbors, we take the risk of misclassifying the queries. + +The major challenge is then **how to select the best candidates** to compute the distances from? + +To hit a sweet spot, we use a technique called Locality Sensitive Hashing (LSH), to get a kNN+LSH classifier. + + +### Introducing Locality Sensitive Hashing (LSH) + +**Locality Sensitive Hashing (LSH)** is one of the most widely used techniques for speeding up kNN computation. LSH clusters the data into buckets and the distances are only computed between the query and the data points in the same buckets. +LSH refers to both hashing functions used for the clustering and the kNN algorithm relying on such functions. +LSH functions cluster data points so that **the closer the data, the more likely they will be clustered in the same buckets**. +Such a function highly depends on the targeted distance: the choice of the LSH function is generally imposed by the considered distance. +We recommend this [great explanation of LSH](https://randorithms.com/2019/09/19/Visual-LSH.html). 
LSH is entirely different from typical hash functions, e.g. for cryptographic purposes, which are designed so that similar objects are hashed to very dissimilar buckets.

LSH can be described for different distance metrics. When we want to consider the Euclidean distance between data points, LSH partitions the space by doing random projections.
A random vector $ v $ is chosen and a random bias $ b $ is used to offset the vector.
All the data points are projected onto the resulting line and are assigned to contiguous buckets of width $ A $.

More formally, each data point $ p $ is assigned by the function $ h_{v,b,A} $ to its 'bucket' $ h_{v,b,A}(p) $: $ h_{v,b,A}(p) = \lfloor \frac{p \cdot v + b}{A} \rfloor $

![LSH explanation first step image](/assets/content/showcases/lsh/lsh_projection_1.png)

However, the resulting clustering can be quite coarse. In order to limit the size of the clusters, those are split again by repeating the same process $ M $ times: for two data points to be in the same bucket, they must have landed in the same 'sub-bucket' in all $ M $ projections.

![LSH explanation second step image](/assets/content/showcases/lsh/lsh_projection_3.png)

This whole process is repeated $ L $ times in order to increase the probability that two close data points end up in the same bucket at least once.

> LSH clustering scheme:
> 1. Consider a line using a random vector and partition this line into buckets of width $A$.
> 2. Project all the points on the line, and put the points in the associated buckets.
> 3. Repeat steps 1-2 $M$ times and merge the intersecting buckets.
> 4. Repeat steps 1-2-3 $L$ times.

The LSH index is now ready for computing kNN queries!

The kNN of a query $ q $ is obtained by gathering all the data points which are in the same buckets as $ q $.
Then a standard kNN algorithm is performed on this subset of data points.

> LSH query scheme:
> 1. Find the buckets associated with the query
> 2. Compute the distance between the query and all the points in those buckets
> 3. Return the k closest data points

## kNN+LSH classifier, Pathway style:

Depending on your classification problem, you may need different distance metrics and thus different projection schemes.

Don't worry, Pathway has you covered, and already provides several such classifiers.

Here is an example of the results Pathway provides:

![Results image](/assets/content/showcases/lsh/lsh_mnist_streaming.png)

If you haven't done it yet, you can read this [article](/developers/showcases/lsh/lsh_chapter1) to know how we got this graph, and why Pathway outperforms standard streaming systems.

## Conclusion

You now have a good insight into how to build a classifier using kNN queries and how to use LSH to make it scalable.

Pathway already provides ready-to-use classifiers, but the best classifier is one made specifically for your problem: you can easily create your own classifier using Pathway, and this is exactly what Pathway is made for!
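If you want to prototype the bucketing scheme described above before writing it with Pathway, a minimal NumPy sketch could look like the following (the parameter values and names are illustrative only):

```python
import numpy as np

def lsh_bucket_keys(points, M=10, L=10, A=0.5, seed=0):
    # points: (n, d) array. For each of the L repetitions, each point gets a key
    # made of M bucket indices (the "sub-buckets" described above).
    rng = np.random.default_rng(seed)
    n, d = points.shape
    all_keys = []
    for _ in range(L):
        v = rng.normal(size=(d, M))      # M random projection directions
        b = rng.uniform(0, A, size=M)    # random offsets
        buckets = np.floor((points @ v + b) / A).astype(int)  # h(p) = floor((p.v + b) / A)
        all_keys.append([tuple(row) for row in buckets])
    return all_keys
```

Two points collide in repetition `l` when their tuples in `all_keys[l]` are equal, and the union of these collisions over the $L$ repetitions gives the candidate set on which the exact kNN step is then run.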
diff --git a/docs/2.developers/7.showcases/3.lsh/_dir.yml b/docs/2.developers/7.showcases/3.lsh/_dir.yml new file mode 100644 index 00000000..42c8e49e --- /dev/null +++ b/docs/2.developers/7.showcases/3.lsh/_dir.yml @@ -0,0 +1 @@ +title: 'Realtime Classification' diff --git a/docs/2.developers/7.showcases/30.user_pairs_fuzzy_join.md b/docs/2.developers/7.showcases/30.user_pairs_fuzzy_join.md new file mode 120000 index 00000000..99188237 --- /dev/null +++ b/docs/2.developers/7.showcases/30.user_pairs_fuzzy_join.md @@ -0,0 +1 @@ +.user_pairs_fuzzy_join/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/4.logistics.md b/docs/2.developers/7.showcases/4.logistics.md new file mode 100644 index 00000000..e63b4405 --- /dev/null +++ b/docs/2.developers/7.showcases/4.logistics.md @@ -0,0 +1,27 @@ +--- +title: Pathway Logistics Application +description: Pathway Logistics Showcase +aside: true +article: + date: '2022-10-31' + thumbnail: '/assets/content/blog/th-logictics-app.png' + tags: ['tutorial', 'machine-learning'] +keywords: ['Logistics', 'IoT', 'Dashboard'] +author: 'pathway' +--- + +# Pathway Logistics Application + + + +The Pathway Logistics application is a one-stop-shop cloud-based application to provide immediately actionable insights on top of data for logistics assets, including IoT data and status data. Its operation remains under full control of the owner of the data. + +Thanks to the data model and real-time streaming pipelines we used, dozens of logistics use cases are now reduced to "adding yet another dashboard" in Pathway Logistics. You may want to take a look at some of [our clients' success stories](/success-stories/db-schenker/). + +Stay tuned for more information! + diff --git a/docs/2.developers/7.showcases/40.bellman_ford.md b/docs/2.developers/7.showcases/40.bellman_ford.md new file mode 120000 index 00000000..f4c47765 --- /dev/null +++ b/docs/2.developers/7.showcases/40.bellman_ford.md @@ -0,0 +1 @@ +.bellman_ford/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/5.linear_regression_with_kafka.md b/docs/2.developers/7.showcases/5.linear_regression_with_kafka.md new file mode 100644 index 00000000..7854a33b --- /dev/null +++ b/docs/2.developers/7.showcases/5.linear_regression_with_kafka.md @@ -0,0 +1,319 @@ +--- +title: 'Linear regression on a Kafka stream' +description: '' +author: 'olivier' +aside: true +article: + date: '2022-12-23' + thumbnail: '' + tags: ['tutorial', 'data-pipeline'] +keywords: ['linear regression', 'Kafka', 'upstash'] +--- + + +# Doing a linear regression on data from Kafka +Start doing ML on a streaming data source with Pathway. + +In this article, we are going to see how to do a simple linear regression on streaming data from Kafka. +This article can be seen as an extension of our [realtime sum using CSV connectors](/developers/user-guide/introduction/first_realtime_app_with_pathway/). + + + +We have a data stream of data points $(x_i,y_i)$, and we want to compute a [simple linear regression](https://en.wikipedia.org/wiki/Simple_linear_regression) on those points: we want to compute the two parameters $(a,b)$ so that, for each point $(x_i,y_i)$, $y_i$ can be approximated by $y_i \approx a + b \times x_i$. + +We are not going to explain the mathematical details here, but you can find all the details in the [Wikipedia article](https://en.wikipedia.org/wiki/Simple_linear_regression). 
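For reference, these are the standard least-squares estimates that the code below computes incrementally from running sums (with $n$ the number of points received so far):

$$
b = \frac{n\sum x_i y_i - \sum x_i \sum y_i}{n\sum x_i^2 - \left(\sum x_i\right)^2},
\qquad
a = \frac{\sum y_i \sum x_i^2 - \sum x_i \sum x_i y_i}{n\sum x_i^2 - \left(\sum x_i\right)^2}.
$$

This is why the whole regression reduces to maintaining five aggregates: $n$, $\sum x_i$, $\sum y_i$, $\sum x_i^2$ and $\sum x_i y_i$.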
+ +## Connectors + +First, we need a connector to connect to our input data streams and receive values on which the regression will be computed. +In this article, we will set up a Kafka connector. + +To be able to reproduce this example, you may want to use [upstash](https://upstash.com/) which provides a free Kafka instance. + +To use a Kafka connector, we need to set all the Kafka parameters by using a dictionary, following the format of [librdkafka](https://github.com/confluentinc/librdkafka/blob/master/CONFIGURATION.md). +We also need to define a Kafka topic on which to connect onto: we will go with `"linear-regression"`. +Here is an example of settings to connect to Kafka using SASL-SSL authentication over SCRAM-SHA-256 mechanism: + +```python +rdkafka_settings = { + "bootstrap.servers": "server-address:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "group.id": "$GROUP_NAME", + "session.timeout.ms": "6000", + "sasl.username": "username", + "sasl.password": "********", +} +``` + +You need, of course, to replace the server address and the associated credentials. + +With this, setting the connector is straightforward, you just need to specify the topic and the schema for your table: + +```python +class InputSchema(pw.Schema): + x: float + y: float + + +t = pw.io.kafka.read( + rdkafka_settings, + topic="linear-regression", + schema=InputSchema, + format="csv", + autocommit_duration_ms=1000 +) +``` + +We used the `csv` format, but there are two other ways to read from Kafka: `raw` which reads a table with only one column `data` in which the whole message is dumped and `json` which reads JSON messages. You can see more about this connector in its dedicated [tutorial](/developers/user-guide/connectors/kafka_connectors/). In our case we expect CSV messages. + +💡 If you only want to test the linear regression, without generating a data stream through Kafka, you can directly use our stream generator: +```python +t = pw.demo.noisy_linear_stream() +``` + +For the output, we use a CSV connector, which is set up as follows: + +```python +pw.io.csv.write(t, "regression_output_stream.csv") +``` + +For more details on how this connector works, checkout our [example](/developers/user-guide/introduction/first_realtime_app_with_pathway/) or the [tutorial](/developers/user-guide/connectors/csv_connectors/) about it. + +## Doing a linear regression +To do the regression, we need to compute the sum of the $x_i$, of the $x_i^2$, of the $y_i$ and of the $x_i\times y_i$ and the total number of data points received until then. 
+This is done as follows: + +```python +t = t.select( + *pw.this, + x_square=t.x * t.x, + x_y=t.x * t.y +) +statistics_table = t.reduce( + count=pw.reducers.count(), + sum_x=pw.reducers.sum(t.x), + sum_y=pw.reducers.sum(t.y), + sum_x_y=pw.reducers.sum(t.x_y), + sum_x_square=pw.reducers.sum(t.x_square), +) +``` + +Then we can compute the estimation of $a$ and $b$: + +```python +def compute_a(sum_x, sum_y, sum_x_square, sum_x_y, count): + d = count * sum_x_square - sum_x * sum_x + if d == 0: + return 0 + else: + return (sum_y * sum_x_square - sum_x * sum_x_y) / d + +def compute_b(sum_x, sum_y, sum_x_square, sum_x_y, count): + d = count * sum_x_square - sum_x * sum_x + if d == 0: + return 0 + else: + return (count * sum_x_y - sum_x * sum_y) / d + +results_table = statistics_table.select( + a=pw.apply(compute_a, **statistics_table), + b=pw.apply(compute_b, **statistics_table), +) +``` + +## Creating the input stream + +> You can skip this section if you use our stream generator `pw.demo.noisy_linear_stream()` + +To use the Kafka connector, we have to follow a few rules. +First, the Kafka connector expects the first message to contain the names of the columns. +The connector will not properly work without this message, however it must be sent only once: if sent twice, the second message will be treated like a normal row. + +We can use the KafkaProducer API provided by Kafka to send message using Python: + +```python + +producer = KafkaProducer( + bootstrap_servers=["server-address:9092"], + sasl_mechanism="SCRAM-SHA-256", + security_protocol="SASL_SSL", + sasl_plain_username="username", + sasl_plain_password="********", +) +producer.send(topic, ("x,y").encode("utf-8"), partition=0) +producer.send( + "linear-regression", ("0,0").encode("utf-8"), partition=0 +) +producer.send( + "linear-regression", ("1,1").encode("utf-8"), partition=0 +) +producer.close() +``` + +This code sample sends $(0,0)$ and $(1,1)$ and then closes the Kafka Producer. +For our example, we are going to send more messages containing different pairs $(x,y)$ which are samples from the line $y=x$. +However, for the example not to be too simple, we are going to add a small random error to each $y$. + +> Note that, depending on your version of Kafka, you may need to specify the API version to make this code work: `api_version=(0,10,2)`. + +## Gathering everything into one piece +The final version of our project contains two files: `realtime_regression.py` which processes the stream using Pathway and `generating_kafka_stream.py` which generates the streams. 
+ +Here is `realtime_regression.py`: + +```python [realtime_regression.py] +import pathway as pw + +rdkafka_settings = { + "bootstrap.servers": "server-address:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "group.id": "$GROUP_NAME", + "session.timeout.ms": "6000", + "sasl.username": "username", + "sasl.password": "********", +} + +class InputSchema(pw.Schema): + x: float + y: float + + +t = pw.io.kafka.read( + rdkafka_settings, + topic="linear-regression", + schema=InputSchema, + format="csv", + autocommit_duration_ms=1000, +) +pw.io.csv.write(t, "regression_input.csv") + +t += t.select( + x_square=t.x * t.x, + x_y=t.x * t.y, +) +statistics_table = t.reduce( + count=pw.reducers.count(), + sum_x=pw.reducers.sum(t.x), + sum_y=pw.reducers.sum(t.y), + sum_x_y=pw.reducers.sum(t.x_y), + sum_x_square=pw.reducers.sum(t.x_square), +) + +def compute_a(sum_x, sum_y, sum_x_square, sum_x_y, count): + d = count * sum_x_square - sum_x * sum_x + if d == 0: + return 0 + else: + return (sum_y * sum_x_square - sum_x * sum_x_y) / d + +def compute_b(sum_x, sum_y, sum_x_square, sum_x_y, count): + d = count * sum_x_square - sum_x * sum_x + if d == 0: + return 0 + else: + return (count * sum_x_y - sum_x * sum_y) / d + +results_table = statistics_table.select( + a=pw.apply(compute_a, **statistics_table), + b=pw.apply(compute_b, **statistics_table), +) + +pw.io.csv.write(results_table, "regression_output_stream.csv") +pw.run() +``` + +Don't forget the `pw.run()` otherwise no computation will be done! +Once `pw.run()` is called, the computation will be run forever until it gets killed. + +And the `generating_kafka_stream.py`: + + +```python [generating_kafka_stream.py] +from kafka import KafkaProducer +import time +import random + +topic = "linear-regression" + +random.seed(0) +def get_value(i): + return i + (2 * random.random() - 1)/10 + +producer = KafkaProducer( + bootstrap_servers=["server-address:9092"], + sasl_mechanism="SCRAM-SHA-256", + security_protocol="SASL_SSL", + sasl_plain_username="username", + sasl_plain_password="********", +) +producer.send(topic, ("x,y").encode("utf-8"), partition=0) + +time.sleep(5) +for i in range(10): + time.sleep(1) + producer.send( + topic, (str(i) + "," + str(get_value(i))).encode("utf-8"), partition=0 + ) + +producer.close() +``` + +## Output + +There are two outputs in this project: the CSV file `regression_input.csv` which keeps all the updates received from Kafka and the CSV file `output_stream.csv` in which all the successive updates of the sum values are displayed. + +As in our [previous example](/developers/user-guide/introduction/first_realtime_app_with_pathway/), the outputs are tables of changes. +Each new message of Kafka triggers a new computation and the new values are output in the CSV files! + +First, we can check that the generated values are correct: + +```shellscript +x,y,time,diff +"0","0.06888437030500963",0,1 +"1","1.0515908805880605",1,1 +"2","1.984114316166169",2,1 +"3","2.9517833500585926",3,1 +"4","4.002254944273722",4,1 +"5","4.980986827490083",5,1 +"6","6.056759717806955",6,1 +"7","6.9606625452157855",7,1 +"8","7.995319390830471",8,1 +"9","9.016676407891007",9,1 +``` + +We obtain ten values which are sampled around the $y=x$ line. 
Let's check the regression we obtain: + +```shellscript +a,b,time,diff +0,0,0,1 +0,0,1,-1 +0.06888437030500971,0.9827065102830508,1,1 +0.06888437030500971,0.9827065102830508,2,-1 +0.07724821608916699,0.9576149729305795,2,1 +0.0769101730536299,0.9581220374838857,3,1 +0.07724821608916699,0.9576149729305795,3,-1 +0.05833884879671927,0.9766933617407955,4,1 +0.0769101730536299,0.9581220374838857,4,-1 +0.05087576879874134,0.9822906717392795,5,1 +0.05833884879671927,0.9766933617407955,5,-1 +0.03085078333935821,0.9943056630149089,6,1 +0.05087576879874134,0.9822906717392795,6,-1 +0.03085078333935821,0.9943056630149089,7,-1 +0.03590542987734715,0.9917783397459139,7,1 +0.03198741430177742,0.9934574892783012,8,1 +0.03590542987734715,0.9917783397459139,8,-1 +0.025649728471303895,0.9958341214647295,9,1 +0.03198741430177742,0.9934574892783012,9,-1 +``` + +We obtain close values to what we expect ($a=0$ and $b=1$). +You can play the values (number of samples, error, linear function to approximate etc.) to see how the algorithm reacts. + + +## To go further +Congrats, you are now able to use Pathway with Kafka and do some non-trivial computation! + +Why not try to do some more advanced computation such as linear regression with several explanatory variables? +Or you may want to do some [classification](/developers/showcases/lsh/lsh_chapter1)? diff --git a/docs/2.developers/7.showcases/5.unstructured-to-structured.md b/docs/2.developers/7.showcases/5.unstructured-to-structured.md new file mode 100644 index 00000000..b5a06b0d --- /dev/null +++ b/docs/2.developers/7.showcases/5.unstructured-to-structured.md @@ -0,0 +1,195 @@ +--- +title: 'Use LLMs to Ingest Raw Text into DB' +description: 'Deploy LLMs to parse raw text from streams and insert into a DB' +author: 'berke' +aside: true +article: + date: '2023-11-28' + thumbnail: '/assets/content/showcases/llm-app/architecture_unst_to_st.png' + tags: ['showcase', 'llm'] +keywords: ['LLM', 'SQL', 'unstructured', 'PostgreSQL', 'GPT', 'Entity extraction', 'Document parsing', 'JSON', 'unstructured'] +--- + +# Use LLMs for creating structured data on the fly and insert them to PostgreSQL + +This showcase demonstrates a data pipeline that calls into LLMs for document processing. In the showcase, you will see how Pathway can extract information from documents and keep the results up to date when documents change. + +## About the project +Unformatted raw text is everywhere: PDFs, Word documents, websites, emails, etc. +Extracting meaningful information from this unformatted raw text is extremely valuable. +In the past, the solution was to label data by hand and train custom models specializing in a specific task. This approach requires too many resources. However, with the advent of LLMs, we can now set up a project to structure unformatted raw text in minutes. Creating POC and verifying use cases is now particularly easy without spending too much time. + +The following video shows document structurization in action as implemented in the [`unstructured_to_sql` example](https://github.com/pathwaycom/llm-app/blob/main/examples/pipelines/unstructured_to_sql_on_the_fly/app.py) of [Pathway LLM-App](https://github.com/pathwaycom/llm-app). +::article-img +--- +src: '/assets/content/showcases/llm-app/unstructured_to_sql_demo.gif' +alt: 'example workflow' +class: 'mx-auto' +zoomable: false +--- +:: + +The `unstructured_to_sql` app consists of two parts: +1. Ingesting data from documents into PostgreSQL. +2. Retrieving information from PostgreSQL with natural language. 
+ +![App Architecture diagram](/assets/content/showcases/llm-app/u_to_s_arch_split.png) + +This showcase will focus on the ingestion of PDF content (part 1), which is more relevant for streaming data. +In a nutshell, the ingestion process can be broken down into three separate steps: +1. Reading text from PDFs. +2. Extracting information desired by the user and creating a Pathway table. +3. Inserting the Pathway table into PostgreSQL. + +## 1. Reading text from PDFs +The first step is to read the PDFs and store the unformatted text in a Pathway table. + +With Pathway, you can take input from any arbitrary connector. For this example, we will read PDFs from a local directory. +This could also be: +- a Google Drive folder, +- a Dropbox folder, +- a Sharepoint folder, +- email attachments, +- documents incoming over a Kafka topic, +- or any other stream. + +We first read the directory that stores PDFs and then extract the unformatted text from the PDFs using the callable class `ParseUnstructured` provided in Pathway LLM xpack. +To read the PDFs, we use the `pw.io.fs` connector, which is used to read documents from the local file system. We need to pass the path and the format, which is `binary` in our case, since the text is stored in PDFs and Word documents. An additional `mode` argument exists for whether we use the static or streaming mode. This parameter is not set here as `streaming` is the default value. + +Once the PDFs are loaded, we extract the text using `ParseUnstructured()`. Since it returns a list of tuples with text and metadata, we filter the metadata out. This is done using `strip_metadata`, a Pathway user-defined function (UDF). In Pathway, you can easily define a UDF using the `@pw.udf` decorator. + +```python +@pw.udf +def strip_metadata(docs: list[tuple[str, dict]]) -> list[str]: + return [doc[0] for doc in docs] + + +files = pw.io.fs.read( + data_dir, + format="binary", + ) + +parser = ParseUnstructured() +unstructured_documents = files.select(texts=parser(pw.this.data)) +unstructured_documents = unstructured_documents.select(texts=strip_metadata(pw.this.texts)) +``` +## 2. Extracting information from the text +Now that we have the text stored in a Pathway table, we use `structure_on_the_fly` to extract the relevant information from the text. +Let's see how `structure_on_the_fly` works. + +First, we need to build the prompt sent to the LLM model. This is done using `build_prompt_structure`, a Pathway user-defined function (UDF). +The prompt is a long string that gives the LLM all the information it needs to extract from the raw text. +For the LLM, we use OpenAI's GPT-3.5 Turbo model. Pathway supports other LLM models such as HuggingFace, LiteLLM or similar. +In particular, this is where the relevant data to be extracted is defined: in our example, we displayed the PostgreSQL table schema. 
+ + +``` +columns are from {postresql_table} table whose schema is: + + +| Column Name | Data Type | +|-------------------|------------| +| company_symbol | str | +| year | int | +| quarter | str | +| revenue_md | float | +| eps | float | +| net_income_md | float | +``` + +The associated Pathway table has the following schema: + +```python +class FinancialStatementSchema(pw.Schema): + company_symbol: str + year: int + quarter: str + revenue_md: float + eps: float + net_income_md: float +``` + +The llm-app already does all of this in `build_prompt_structure`, so to obtain the prompt, we simply have to call it: + +```python +prompt = documents.select(prompt=build_prompt_structure(pw.this.texts)) +``` +`pw.this.texts` refers to the `texts` field in the `documents` table that stores the PDFs. + +We need to call the LLM to extract the data with the prompt we prepared. +Pathway is compatible with several models, including OpenAI and any open-source models hosted on the HuggingFace and LiteLLM. We will go with the OpenAI GPT-3.5 Turbo model for ease of use and accuracy. We initialize the model with `OpenAIChat` provided in the LLM xpack. To fit the prompt into format required by OpenAI API we use a function `prompt_chat_single_qa` from LLM xpack. +We call OpenAI API with our prompt, and LLM returns its answer as text. This result is stored in the `result` column of the `responses` table. + +```python +model = OpenAIChat( + api_key=api_key, + model=model_locator, + temperature=temperature, + max_tokens=max_tokens, + retry_strategy=pw.asynchronous.FixedDelayRetryStrategy(), + cache_strategy=pw.asynchronous.DefaultCache(), +) + +responses = prompt.select( + result=model(prompt_chat_single_qa(pw.this.prompt)), +) +``` + +The LLM returns raw text as output. If the result is correct, we can parse it as a JSON. After parsing items into a dictionary, we put the obtained values in a list. +This parsing step is done using the ```parse_str_to_list``` UDF provided by the llm-app. + +Now, we have a list in the `values` column of the `responses` table. +We need to unpack all these values into table columns so that instead of a single column of `values` with `[foo, bar, ...]`, we will have a table such as + +``` +col1 col2 ... +_____________ +foo bar ... +``` +This is easily doable in Pathway using the Pathway `unpack_col` method: +```python +result = unpack_col(responses.values, *sorted(FinancialStatementSchema.keys())) +``` +The column names are the keys of the schema `FinancialStatementSchema`. The column names are sorted to be sure to obtain a deterministic order. +Note: for those who are not experienced with Python, `*` in front unpacks the keys. + +Finally, we cast the numerical columns to their respective types, such as float: + +```python +result = result.with_columns( + eps=pw.apply(float, pw.this.eps), + net_income_md=pw.apply(float, pw.this.net_income_md), + revenue_md=pw.apply(float, pw.this.revenue_md), + ) +``` +`.with_columns()` is used to keep all the columns except `eps, net_income_md and revenue_md` without any modification. +The other columns are cast using `apply`, which applies a function and enforces a proper type. + +## 3. Inserting the data into PostgreSQL +Pathway comes with a connector to PostgreSQL out of the box. You can view the complete list of connectors in [our I/O API docs](/developers/api-docs/pathway-io/). 
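+The connector is configured with a dictionary of connection settings. As a minimal sketch (the host, port, database name, and credentials below are placeholders to adapt to your own deployment), it could look like this:
+
+```python
+postgreSQL_settings = {
+    "host": "localhost",
+    "port": "5432",
+    "dbname": "postgres",
+    "user": "postgres",
+    "password": "********",
+}
+```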
+Using the connector, inserting the table in PostgreSQL is very easy: +```python +pw.io.postgres.write(structured_table, postgreSQL_settings, postresql_table) +``` +`structured_table` is the resulting Pathway table we want to output to PostgreSQL; `postresql_table` is the table name in the PostgreSQL database; and `settings` is a dictionary with the PostgreSQL parameters such as host or the port. +You can learn more about the PostgreSQL output connector in [the documentation](/developers/api-docs/pathway-io/postgres/#pathway.io.postgres.write). + +## Running the project +Now that all the pipeline is ready, you have to run it! +Don't forget to add `pw.run()`; otherwise, your pipeline will be built, but no data will be ingested. + +To run it, a simple `python app.py` should do the trick. It is also possible to run with Docker, see the [llm-app repository](https://github.com/pathwaycom/llm-app/) for details. +This pipeline will automatically read your PDF files, extract the most relevant information, and store it in a PostgreSQL table. + +You can learn more about how this works or how to query the PostgreSQL table using natural language by looking at the source on our [GitHub repository](https://github.com/pathwaycom/llm-app/blob/main/examples/pipelines/unstructured_to_sql_on_the_fly/app.py) + +::shoutout-banner +--- +href: "https://discord.gg/pathway" +icon: "ic:baseline-discord" +--- +#title +Discuss tricks & tips for RAG +#description +Join our Discord community and dive into discussions on tricks and tips for mastering Retrieval Augmented Generation +:: diff --git a/docs/2.developers/7.showcases/50.pagerank.md b/docs/2.developers/7.showcases/50.pagerank.md new file mode 120000 index 00000000..6d05ed38 --- /dev/null +++ b/docs/2.developers/7.showcases/50.pagerank.md @@ -0,0 +1 @@ +.pagerank/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/6.llamaindex-pathway.md b/docs/2.developers/7.showcases/6.llamaindex-pathway.md new file mode 100644 index 00000000..fdff72fe --- /dev/null +++ b/docs/2.developers/7.showcases/6.llamaindex-pathway.md @@ -0,0 +1,189 @@ +--- +title: 'Launching Pathway + LlamaIndex' +description: 'Pathway is now available in LlamaIndex as Reader and Retriever' +author: 'pathway' +article: + date: '2024-01-12' + thumbnail: '/assets/content/showcases/vectorstore/llamaindexpathway.png' + tags: ['showcase', 'llm'] +keywords: ['LLM', 'RAG', 'GPT', 'OpenAI', 'LlamaIndex'] +--- + +# Pathway is now available in LlamaIndex, here is how to get started! + +You can now use Pathway in your RAG applications which enables always up-to-date knowledge from your documents to LLMs with LlamaIndex integration. + +Pathway is now available on [LlamaIndex](https://docs.llamaindex.ai/en/stable/), a data framework for LLM-based applications to ingest, structure, and access private or domain-specific data. +You can now query Pathway and access up-to-date documents for your RAG applications from LlamaIndex using Pathway [Reader](https://docs.llamaindex.ai/en/stable/examples/data_connectors/PathwayReaderDemo.html#pathway-reader) and [Retriever](https://docs.llamaindex.ai/en/stable/examples/retrievers/pathway_retriever.html#pathway-retriever). + +With this new integration, you will be able to use Pathway vector store natively in LlamaIndex, which opens up endless new possibilities! +In this article, you will have a quick dive into Pathway + LlamaIndex to explore how to create a simple, yet powerful RAG solution using PathwayRetriever. + + +## Why Pathway? 
+ +Pathway offers an indexing solution that is always up to date without the need for traditional ETL pipelines, which are needed in regular VectorDBs. It can monitor several data sources (files, S3 folders, cloud storage) and provide the latest information to your LLM application. + +## Learning outcomes +You will learn how to create a simple RAG solution using Pathway and LlamaIndex. + +This article consists of: +- Create data sources. Define data sources Pathway will read and keep the vector store updated. +- Creating a transformation pipeline (parsing, splitting, embedding) for loading documents into Vector store +- Querying your data and getting answers from LlamaIndex. + +## Prerequisites + +### Installing Pathway and LlamaIndex. +```bash +pip install pathway +pip install llama-index +pip install llama-index-retrievers-pathway +pip install llama-index-embeddings-openai +``` + +### Setting up a folder +To start, you need to create a folder Pathway will listen to. Feel free to skip this if you already have a folder on which you want to build your RAG application. You can also use Google Drive, Sharepoint, or any other source from [pathway-io](/developers/api-docs/pathway-io). +```bash +mkdir -p 'data/' +``` + +### Set up OpenAI API Key + +```python +import getpass +import os + +# omit if embedder of choice is not OpenAI +if "OPENAI_API_KEY" not in os.environ: + os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:") +``` + +### Define data sources + +Pathway can listen to many sources simultaneously, such as local files, S3 folders, cloud storage, and any data stream. + +See [pathway-io](/developers/api-docs/pathway-io) for more information. + +You can easily connect to the data inside the folder with the Pathway file system connector. The data will automatically be updated by Pathway whenever the content of the folder changes. + +```python +import pathway as pw + +data_sources = [] +data_sources.append( + pw.io.fs.read( + "./data", + format="binary", + mode="streaming", + with_metadata=True, + ) # This creates a `pathway` connector that tracks + # all the files in the ./data directory +) +``` + +### Create the document indexing pipeline + +Now that the data is ready, you must create the document indexing pipeline. The transformations should be a list of `TransformComponent`s ending with an Embedding transformation. + +First, split the text using `TokenTextSplitter`, then embed it with `OpenAIEmbedding`. + +Finally, you can run the server with `run_server`. + +```python +from pathway.xpacks.llm.vector_store import VectorStoreServer +from llama_index.embeddings.openai import OpenAIEmbedding +from llama_index.core.node_parser import TokenTextSplitter + +embed_model = OpenAIEmbedding(embed_batch_size=10) + +transformations_example = [ + TokenTextSplitter( + chunk_size=150, + chunk_overlap=10, + separator=" ", + ), + embed_model, +] + +processing_pipeline = VectorStoreServer.from_llamaindex_components( + *data_sources, + transformations=transformations_example, +) + +# Define the Host and port that Pathway will be on +PATHWAY_HOST = "127.0.0.1" +PATHWAY_PORT = 8754 + +# `threaded` runs pathway in detached mode, you have to set it to False when running from terminal or container +# for more information on `with_cache` check out /developers/api-docs/persistence-api +processing_pipeline.run_server( + host=PATHWAY_HOST, port=PATHWAY_PORT, with_cache=False, threaded=True +) +``` + +Awesome! The vector store is now active, you're set to start sending queries. 
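+
+Because `run_server` is called with `threaded=True`, the server starts in the background. If you script the whole flow, you may want a small readiness check before querying; here is a minimal sketch using only the Python standard library (no Pathway or LlamaIndex API involved):
+
+```python
+import socket
+import time
+
+
+def wait_for_server(host: str, port: int, timeout: float = 30.0) -> None:
+    # Poll the TCP port until the vector store server accepts connections.
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        try:
+            with socket.create_connection((host, port), timeout=1):
+                return
+        except OSError:
+            time.sleep(0.5)
+    raise TimeoutError(f"Vector store not reachable at {host}:{port}")
+
+
+wait_for_server(PATHWAY_HOST, PATHWAY_PORT)
+```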
+ +### Create LlamIndex Retriever and create Query Engine + +```python +from llama_index.retrievers.pathway import PathwayRetriever + +retriever = PathwayRetriever(host=PATHWAY_HOST, port=PATHWAY_PORT) +retriever.retrieve(str_or_query_bundle="what is pathway") + + +from llama_index.core.query_engine import RetrieverQueryEngine + +query_engine = RetrieverQueryEngine.from_args( + retriever, +) + +response = query_engine.query("What is Pathway?") +print(str(response)) +``` + +``` +Out[]: Empty Response +``` + +As you can see, the LLM cannot respond clearly as it lacks current knowledge, but this is where Pathway shines. Add new data to the folder Pathway is listening to, then ask our agent again to see how it responds. + +To do that, you can download the repo readme of Pathway into our `data` folder: + +```bash +wget 'https://raw.githubusercontent.com/pathwaycom/pathway/main/README.md' -O 'data/pathway_readme.md' +``` + +Try again to query with the new data: + +```python +response = query_engine.query("What is Pathway?") +print(str(response)) +``` + +``` +Out[]: Pathway is a Python framework that allows for high-throughput and low-latency real-time data processing... +``` + +As you can see, after downloading the document to the folder Pathway is listening to, changes are reflected to the query engine immediately. +LLM responses are up to date with the latest changes in the documents which would require extra ETL steps in regular Vector DBs. + +## Conclusion + +With the integration of Pathway within LlamaIndex, you can now access up-to-date documents for your RAG applications from LlamaIndex. +You should now be able to use Pathway Reader and Retriever to connect to your data sources and monitor for changes, providing always up-to-date documents for your LlamaIndex application. + +If you are interested in building RAG solutions with Pathway, don't hesitate to read [how the vector store pipeline is built with Pathway](/developers/user-guide/llm-xpack/vectorstore_pipeline/). +To learn more about the possibilities of combining the live indexing pipeline of Pathway and LLMs, check out [real-time RAG alerting with Pathway](/developers/showcases/llm-alert-pathway/) and [ingesting unstructured data to structured](/developers/showcases/unstructured-to-structured/). 
+ +::shoutout-banner +--- +href: "https://discord.gg/pathway" +icon: "ic:baseline-discord" +--- +#title +Discuss tricks & tips for RAG +#description +Join our Discord community and dive into discussions on tricks and tips for mastering Retrieval Augmented Generation +:: diff --git a/docs/2.developers/7.showcases/65.combining_time_series.md b/docs/2.developers/7.showcases/65.combining_time_series.md new file mode 120000 index 00000000..8c110931 --- /dev/null +++ b/docs/2.developers/7.showcases/65.combining_time_series.md @@ -0,0 +1 @@ +.merging-time-series/article.md \ No newline at end of file diff --git a/docs/2.developers/7.showcases/7.realtime-log-monitoring.md b/docs/2.developers/7.showcases/7.realtime-log-monitoring.md new file mode 100644 index 00000000..9888e3b9 --- /dev/null +++ b/docs/2.developers/7.showcases/7.realtime-log-monitoring.md @@ -0,0 +1,895 @@ +--- +title: Realtime Server Log Monitoring +description: Monitor your server logs in real time with Pathway +author: 'olivier' +aside: true +article: + date: '2023-02-27' + thumbnail: '/assets/content/tutorials/realtime_log_monitoring/meme.jpg' + tags: ['tutorial', 'data-pipeline'] +keywords: ['log monitoring', 'alert', 'Kafka', 'Filebeat', 'Slack', 'ElasticSearch', 'Docker'] +github_link: "https://github.com/pathwaycom/pathway/tree/main/examples/projects/realtime-log-monitoring/filebeat-pathway-slack" +--- + +# Realtime Server Log Monitoring with Pathway +Monitor your server logs with Pathway. +Process your logs and Pathway will automatically update the results for you. +You can do anomaly detection and trigger alerts whenever too many connections happen at once and send an alert directly to Slack. + +::article-img +--- +src: '/assets/content/tutorials/realtime_log_monitoring/meme.jpg' +alt: 'Meme' +width: '300' +class: 'mx-auto' +--- +:: + +Logs are an essential component of any system or application, providing valuable information about its performance, usage, and errors. +However, monitoring logs can be a time-consuming and tedious task, especially when dealing with large volumes of data. +Fortunately, Pathway can help you to process your logs and produce high value insights from those, such as doing [anomaly detections](/glossary/real-time-anomaly-detection). + +::article-img +--- +src: '/assets/content/tutorials/realtime_log_monitoring/diagram_realtime_statistics.svg' +alt: 'Diagram real time log processing' +style: 'width: 700px' +class: 'mx-auto' +--- +:: + +In this article, you'll explore how to monitor your server logs, such as Nginx logs, using Pathway. + + +## Short version +Let's consider a simple scenario: you want to monitor logs and want to send a trigger whenever more than 5 logs are received during the last second. 
+You can compute a sliding window on a table with a timestamp column `timestamp`, and output the alert to a dedicated Slack channel by doing the following:
+
+```python [alerts.py]
+import time
+
+import pathway as pw
+import requests
+from datetime import timedelta
+
+alert_threshold = 5
+sliding_window_duration = timedelta(seconds=1)
+
+
+SLACK_ALERT_CHANNEL_ID = "XXX"
+SLACK_ALERT_TOKEN = "XXX"
+
+rdkafka_settings = {
+    "bootstrap.servers": "kafka:9092",
+    "security.protocol": "plaintext",
+    "group.id": "0",
+    "session.timeout.ms": "6000",
+}
+
+inputSchema = pw.schema_builder(
+    columns={
+        "@timestamp": pw.column_definition(dtype=str),
+        "message": pw.column_definition(dtype=str),
+    }
+)
+
+
+# We use the Kafka connector to listen to the "logs" topic
+# We only need the timestamp and the message
+log_table = pw.io.kafka.read(
+    rdkafka_settings,
+    topic="logs",
+    format="json",
+    schema=inputSchema,
+    autocommit_duration_ms=100,
+)
+log_table = log_table.select(timestamp=pw.this["@timestamp"], log=pw.this.message)
+log_table = log_table.select(
+    pw.this.log,
+    timestamp=pw.this.timestamp.dt.strptime("%Y-%m-%dT%H:%M:%S.%fZ"),
+)
+
+# A sliding window is computed based on log_table using the timestamp
+t_sliding_window = log_table.windowby(
+    log_table.timestamp,
+    window=pw.temporal.sliding(
+        hop=timedelta(milliseconds=10), duration=sliding_window_duration
+    ),
+    behavior=pw.temporal.common_behavior(
+        cutoff=timedelta(seconds=0.1),
+        keep_results=False,
+    ),
+).reduce(timestamp=pw.this._pw_window_end, count=pw.reducers.count())
+
+# There is an alert if there are more than alert_threshold logs in the window
+t_alert = t_sliding_window.reduce(count=pw.reducers.max(pw.this.count)).select(
+    alert=pw.this.count >= alert_threshold
+)
+
+
+# on_alert_event defines the behavior of the Slack channel when the alert is received
+def on_alert_event(key, row, time, is_addition):
+    alert_message = "Alert '{}' changed state to {}".format(
+        row["alert"],
+        "ACTIVE" if is_addition else "INACTIVE",
+    )
+    requests.post(
+        "https://slack.com/api/chat.postMessage",
+        data="text={}&channel={}".format(alert_message, SLACK_ALERT_CHANNEL_ID),
+        headers={
+            "Authorization": "Bearer {}".format(SLACK_ALERT_TOKEN),
+            "Content-Type": "application/x-www-form-urlencoded",
+        },
+    ).raise_for_status()
+
+
+# The alert table is connected to the Slack channel with Pathway's output connector
+pw.io.subscribe(t_alert, on_alert_event)
+
+# Wait for Kafka to be ready.
+time.sleep(5)
+# Launching the computation.
+pw.run()
+```
+
+## How to monitor logs
+Let's consider the following scenario: you have an Nginx server and you want to monitor the associated logs.
+You have set up a standard ELK stack: you monitor your logs with Filebeat, forwarding the updates to ElasticSearch via Logstash.
+
+::article-img
+---
+src: '/assets/content/tutorials/realtime_log_monitoring/realtime_processing_diag_without_pathway.svg'
+alt: 'Realtime log monitoring without Pathway'
+style: 'width: 700px'
+class: 'mx-auto'
+---
+::
+
+You would like to use Pathway for more reactive monitoring: you want to trigger an alert every time the server has more than 5 connections within a 1-second window.
+You need to insert Pathway between Logstash and ElasticSearch.
+Let's call it scenario #1.
+
+::article-img
+---
+src: '/assets/content/tutorials/realtime_log_monitoring/realtime_processing_diag_with_pathway.svg'
+alt: 'Realtime log monitoring with Pathway'
+style: 'width: 700px'
+class: 'mx-auto'
+---
+::
+
+For even better reactivity, you want to get rid of intermediate steps.
You want to connect Pathway directly to Filebeat and you output the alerts from Pathway to your dedicated Slack channel. +Let's call it scenario #2. + +::article-img +--- +src: '/assets/content/tutorials/realtime_log_monitoring/realtime_processing_diag_short.svg' +alt: 'Realtime log monitoring with Pathway and without Logstash' +style: 'width: 700px' +class: 'mx-auto' +--- +:: + +In this article, you will learn how to set up both architectures. +The end results are available in our public repositories ([scenario #1](https://github.com/pathwaycom/pathway/tree/main/examples/projects/realtime-log-monitoring/logstash-pathway-elastic) and [scenario #2](https://github.com/pathwaycom/pathway/tree/main/examples/projects/realtime-log-monitoring/filebeat-pathway-slack)). + +Note that, for now, Pathway does not have a connector for Logstash nor Filebeat: the messages are forwarded to a Kafka instance. +More connectors will be available in the future, you can check our [connector page](/developers/user-guide/connecting-to-data/connectors/) to discover all the available connectors with Pathway. +If you are not using the ELK stack, connectors to Prometheus and Grafana are underway. +A unique container with Kafka and Pathway is also coming soon. + +### Docker containers + +Instead of installing all the required services (Filebeat, Logstash, etc.) you will use Docker containers. +Docker is a powerful tool that allows you to package your application and its dependencies into a portable container that can be easily deployed and run on any platform. +By using Docker, you have a consistent and reliable environment, regardless of your operating system or configuration. +Docker provides a pre-configured container with all the necessary dependencies, saving you time and frustration by eliminating the need to manually install and configure all the services yourself. + +Docker-compose allows you to use several containers simultaneously. +You can set up docker-compose by using a `docker-compose.yml` at the root of the project. +Such a file is organized as follows: + +```yml +version: "3.7" +services: + filebeat: + build: + environment: + volumes: + logstash: + build: + ... +``` + +Each application, called a "service" here, is defined with all its relevant parameters, for example, parameters in `build` define how to build the container (which image and so on). +Note that `filebeat` here is the name given to the service, not the name of the used image: the image is determined by the `build` parameter. + +## Plugging in Pathway to your server + + +### Setting up Filebeat and Logstash + +You need to set up both Filebeat and Logstash to monitor the logs and forward the logs to Pathway via a Kafka instance. +In the `docker-compose.yml`, you need to configure both services: + +```yml [./docker-compose.yml] +services: + filebeat: + build: + context: . + dockerfile: ./filebeat-src/Dockerfile + links: + - logstash:logstash + depends_on: + - logstash + logstash: + image: docker.elastic.co/logstash/logstash:8.6.2 + volumes: + - ./logstash-src/logstash.conf:/usr/share/logstash/pipeline/logstash.conf + ports: + - 5044:5044 +``` + +For Filebeat, you will use both a Dockerfile and a configuration file. +The Dockerfile will copy the configuration file in the container to configure the input/output of Filebeat. +It can also copy a script `generate_input_stream.sh` to generate an artificial log stream. 
+ +```Dockerfile [./filebeat-src/Dockerfile] +FROM docker.elastic.co/beats/filebeat:8.6.1 + +COPY ./filebeat-src/filebeat.docker.yml /usr/share/filebeat/filebeat.yml +COPY ./filebeat-src/generate_input_stream.sh /usr/share/filebeat/generate_input_stream.sh +USER root + +RUN mkdir /input_stream/ +RUN touch /input_stream/example.log + +RUN chown root:filebeat /usr/share/filebeat/filebeat.yml +RUN chmod go-w /usr/share/filebeat/filebeat.yml +``` + +The configuration file is used to set the directory to monitor (`/input_stream/*` in our example) and how to output the logs (to Logstash here): + +```yml [./filebeat-src/filebeat.docker.yml] +filebeat.inputs: +- type: filestream + id: my-logs + paths: + - /input_stream/* + +filebeat.config.modules: + path: /usr/share/filebeat/modules.d/ + reload.enable: false + +output.logstash: + enabled: true + hosts: ["logstash:5044"] +``` +For simplicity, this example has no authentication mechanism. +To learn how to configure the Logstash plugin of Filebeat, you can take a look at the [official documentation](https://www.elastic.co/guide/en/beats/filebeat/current/logstash-output.html). + +Logstash will send messages to Kafka: you need to use the Kafka plugin of Logstash. +You need to specify the topic and the format of the Logstash messages. +In this example the topic is "logs" and Filebeat formats the updates as JSON messages, so you want to keep the same format for the output: + +```yml [./logstash-src/logstash.conf] +input { + beats { + port => 5044 + } +} + +output { + kafka { + codec => json + topic_id => "logs" + bootstrap_servers => "kafka:9092" + key_serializer => "org.apache.kafka.common.serialization.StringSerializer" + value_serializer => "org.apache.kafka.common.serialization.StringSerializer" + } +} +``` + +Similarly to Filebeat, Logstash requires a configuration file to specify the input and output. +If you want to use one, you can read the [official documentation](https://www.elastic.co/guide/en/logstash/current/plugins-outputs-kafka.html) of the Kafka plugin of Logstash, or this [article](https://upstash.com/blog/logstash-kafka-upstash). +In our case, the input is Filebeat (which is an element of the Beats agents) on port 5044. +The port is specified in the output section of the configuration file of Filebeat and it has to be also opened in the docker-compose file. +To send the output to Pathway, you will use a Kafka instance as an intermediate. + + + +#### Generating the logs + +If you don't have a real server to monitor or simply don't have enough traffic to trigger warnings, you can generate an artificial dataset. +Here is a small script that creates logs in the directory monitored by Filebeat. +To simulate an important spike in traffic, it creates 1 log per second for 100 seconds and then creates 100 logs in 1 second. + +```shellscript [./filebeat-src/generate_input_stream.sh] +#!/bin/bash + +src="../../../input_stream/example.log" + +sleep 1 + +for LOOP_ID in {1..100} +do + printf "$LOOP_ID\n" >> $src + sleep 1 +done +for LOOP_ID in {101..200} +do + printf "$LOOP_ID\n" >> $src + sleep 0.01 +done +``` + +This script should be included in the `./filebeat-src/` directory and should be copied by the Dockerfile in the container of Filebeat. + +#### Nginx logs +Nginx is a popular web server and reverse proxy server used by many organizations to host their web applications. +Monitoring Nginx logs is a standard use case for Filebeat. +The previous configuration should work with Nginx logs: you only need to specify the directory of the Nginx logs. 
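+For instance, assuming the default log locations of a typical Nginx installation (adapt the paths to your setup), the `filebeat.inputs` section could point directly at them:
+
+```yml
+filebeat.inputs:
+- type: filestream
+  id: nginx-logs
+  paths:
+    - /var/log/nginx/access.log
+    - /var/log/nginx/error.log
+```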
+However, Filebeat also has a dedicated plugin for Nginx: if you want to learn more about it, take a look at the [official documentation](https://www.elastic.co/guide/en/beats/filebeat/current/filebeat-module-nginx.html). + +### Connecting Pathway + +Currently, there is no input connector to directly connect Pathway to Filebeat nor Logstash. +You need to set up a Kafka instance to relay the messages to Pathway. +Hopefully, this is easily done using a Docker container. + +#### Kafka + +You need a Kafka instance to be a gateway between Logstash and Pathway. +Kafka itself relies on ZooKeeper, so you will also need a container for ZooKeeper. +Hopefully, it is very easy by using Docker containers: + +```yaml + zookeeper: + image: confluentinc/cp-zookeeper:5.5.3 + environment: + ZOOKEEPER_CLIENT_PORT: 2181 + kafka: + image: confluentinc/cp-enterprise-kafka:5.5.3 + depends_on: [zookeeper] + environment: + KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181" + KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:9092 + KAFKA_BROKER_ID: 1 + KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 + KAFKA_JMX_PORT: 9991 + ports: + - 9092:9092 + command: sh -c "((sleep 15 && kafka-topics --create --zookeeper zookeeper:2181 --replication-factor 1 --partitions 1 --topic logs)&) && /etc/confluent/docker/run " +``` +The `command` line waits for the setup of the containers before creating the topic `logs` and then runs Kafka. +If you want to use another topic, modify this line by replacing `logs` with the topic you want to use. + +That's all you need to do for Kafka. + +#### Pathway + +Now let's set up the container for Pathway. +Pathway does not have its own docker image, so you are going to use a Dockerfile to configure our container. +In the docker-compose file, you need to specify the Dockerfile you will use: + +```yml + pathway: + build: + context: . + dockerfile: ./pathway-src/Dockerfile + depends_on: [kafka, logstash] +``` + +In our Dockerfile, you only need a Python image. +You can simply install Pathway with the `pip install` command: + +```DockerFile [./pathway-src/Dockerfile] +FROM --platform=linux/x86_64 python:3.10 + +RUN pip install -U pathway +RUN pip install python-dateutil +COPY ./pathway-src/alerts.py alerts.py + +CMD ["python", "-u", "alerts.py"] +``` + +⚠️ For compatibility reasons, a x86_64 Linux container is used. + +You also need to install the `dateutil` package. + +To obtain the logs, you now need to set the input Kafka connector. + +In addition to `rdkafka` settings, you need to connect to the Kafka instance and listen to the topic "logs" using the `topic` parameter. + +Then, you need to define the schema of the input data. +The JSON created by Filebeat and forwarded by Logstash to Kafka has two columns of interest: "@timestamp" and "message" which are self-explanatory. +The other entries contain meta-data created by Filebeat and can be discarded for our example. +So you need to declare a schema with only the two columns "@timestamp" and "message": the other columns will ignored. +Because of the special character in the column name "@timestamp", we opt for the [inline definition](/developers/user-guide/types-in-pathway/schema/#inline-schemas-definitions) to define the schema instead of the [class definition](/developers/user-guide/types-in-pathway/schema/#how-to-define-and-use-a-schema). + +If you want more information about the Kafka input connector, check out the [dedicated tutorial](/developers/user-guide/connectors/kafka_connectors). 
+ +```python [./pathway-src/alerts.py] +import pathway as pw + +rdkafka_settings = { + "bootstrap.servers": "kafka-server:9092", + "security.protocol": "sasl_ssl", + "sasl.mechanism": "SCRAM-SHA-256", + "group.id": "$GROUP_NAME", + "session.timeout.ms": "6000", + "sasl.username": "username", + "sasl.password": "password", +} + +inputSchema = pw.schema_builder( + columns={ + '@timestamp': pw.column_definition(dtype=str), + 'message': pw.column_definition(dtype=str) + } +) + +log_table = pw.io.kafka.read( + rdkafka_settings, + topic="logs", + format="json", + schema=inputSchema, + autocommit_duration_ms=100, +) +``` + +You now have a table with the wanted columns. +You should rename the `@timestamp` column to `timestamp` to be able to use the [dot notation](/developers/user-guide/data-transformation/table-operations/#select-and-notations) of Pathway. +For consistency, you also could want to rename the column `message` to `log`. +Also, Filebeat uses timestamps in the ISO 8601 format: you need to translate them into DateTime to facilitate the computation of the elapsed time between two different logs. +Pathway provides a [DateTime](/developers/api-docs/pathway/#pathway.internals.expressions.DateTimeNamespace) API to manipulate datetimes and timestamps. + +Those two transformations can be done easily in Pathway in a few lines: + +```python +log_table = log_table.select(timestamp=pw.this["@timestamp"], log=pw.this.message) +log_table = log_table.select( + pw.this.log, + timestamp=pw.this.timestamp.dt.strptime("%Y-%m-%dT%H:%M:%S.%fZ"), +) +``` + +Your `log_table` table is now ready to be processed! + + +### Connecting to Filebeat directly (scenario #2) + +If you don't need Logstash and want to connect directly Pathway (via Kafka) to Filebeat, you need to update the docker-compose file and the configuration file of Filebeat: + +```yml [docker-compose.yml] +services: + filebeat: + build: + context: . + dockerfile: ./filebeat-src/Dockerfile +``` + +```yml [./filebeat-src/filebeat.docker.yml] +filebeat.inputs: +- type: filestream + id: my-logs + paths: + - /input_stream/* + +filebeat.config.modules: + path: /usr/share/filebeat/modules.d/ + reload.enable: false + +output.kafka: + enabled: true + hosts: ["kafka:9092"] + topic: "logs" + group_id: 1 + ssl.enabled: false +``` + +As previously, no authentication mechanism is used. +If you want to use one, check out the documentation of the [Kafka plugin](https://www.elastic.co/guide/en/beats/filebeat/current/kafka-output.html). + +## Processing your logs in real time with sliding windows + +### Windows + +In order to do realtime statistical analysis in streaming systems, such as realtime server monitoring, only the most recent data is considered. +Only the most relevant data is kept, i.e. the last ten minutes, and processed in order to detect anomalies or changes in data patterns over time. +This portion of the data is called a window. +Windows techniques are widely used in stream data processing to analyze and summarize data over a fixed period of time. + +A window is a fixed-size buffer that moves through the data stream at regular intervals, typically in a sliding or overlapping manner. +As the window moves through the data stream, it collects and processes a fixed number of data items or a fixed duration of data. + +The two main windows techniques that are commonly used in stream data processing, are: + 1. Tumbling windows: A tumbling window divides the data stream into fixed-size, non-overlapping time intervals. 
Each window collects and processes a fixed number of data items or a fixed duration of data, after which the window is closed and a new window is opened. + 2. Sliding windows: A sliding window moves through the data stream in a fixed-size, overlapping manner. Each window collects and processes a fixed number of data items or a fixed duration of data, after which the window is moved forward by a fixed amount. + +::article-img +--- +src: '/assets/content/tutorials/realtime_log_monitoring/sliding_tumbling_windows.gif' +alt: 'Differences between tumbling and sliding windows' +style: 'width: 700px' +class: 'mx-auto' +--- +:: + +For realtime monitoring you would usually prefer a sliding window over tumbling ones as the latter cut the data in non-overlapping parts: a wrong cut could prevent it from detecting the pattern you are looking for. + +Sliding windows consider the most recent data at the time of the computation: its efficiency will be highly dependent on the timing of the computation. The timing of the start of the window is usually defined by the user, periodically. + +::article-img +--- +src: '/assets/content/tutorials/realtime_log_monitoring/sliding_window_standard.svg' +alt: 'Standard sliding windows diagram' +style: 'width: 700px' +class: 'mx-auto' +--- +:: + +### Sliding windows in Pathway + +With Pathway, you can easily make sliding windows using the [windowby](/developers/api-docs/temporal/#pathway.stdlib.temporal.windowby) function. Pathway will take care of maintaining the set of windows needed for your data and updating them whenever new data arrives. + +An additional advantage of using `windowby` is to use data forgetting mechanisms in Pathway. By specifying `behavior` you can inform Pathway to forget old windows and accompanying data, which gives two benefits in the log monitoring scenario: +1) The existence of an alert will be calculated only based on recent windows. +2) The program can run in constant memory, rather than keeping all data that arrived. +You can read more about temporal behaviors in [the user guide](/developers/user-guide/temporal-data/behaviors/). + +To use `windowby`, you must provide a `window` argument, defining the type of windows you want. For log monitoring, use [sliding windows](/developers/api-docs/temporal/#pathway.stdlib.temporal.sliding) created every 10 milliseconds, each lasting 1s. As the threshold for raising an alert is based only on the number of logs, use the `count` reducer for aggregating data grouped in windows. + +```python +t_sliding_window = log_table.windowby( + log_table.timestamp, + window=pw.temporal.sliding( + hop=timedelta(milliseconds=10), duration=sliding_window_duration + ), + behavior=pw.temporal.common_behavior( + cutoff=timedelta(seconds=0.1), + keep_results=False, + ), +).reduce(count=pw.reducers.count()) +``` + +Finally, you can create an alert table containing a single entry, whether there is an alert or not. To do that, find a maximum number of logs in any of your windows and if this is greater or equal to your threshold, set your alert to `True`. + +```python +t_alert = t_sliding_window.reduce(count=pw.reducers.max(pw.this.count)) +t_alert = t_alert.select(alert=pw.this.count >= alert_threshold) +``` + +Whenever a new data point is received, each table is updated: old windows are left out if they are older than the cutoff and whenever the number of kept entries is higher than the threshold the alert value is set to true. 
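+
+Before wiring up the real outputs (covered in the next section), you can sanity-check the alert logic during development by dumping the table to a file with Pathway's CSV output connector; the file name below is arbitrary:
+
+```python
+# Optional debugging output: every update to t_alert is appended to this CSV file.
+pw.io.csv.write(t_alert, "alerts_debug.csv")
+```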
+ + +## Sending the output + +Using Pathway’s output connectors, you can output the results to your favorite data storage. +You may want to send your statistics about your traffic to ElasticSearch or send alert messages directly to your Slack for better reactivity. + +::article-img +--- +src: '/assets/content/tutorials/realtime_log_monitoring/pathway_output.svg' +alt: 'Pathway output connectors' +style: 'width: 700px' +class: 'mx-auto' +--- +:: + + +### Scenario #1: sending the data to ElasticSearch + +If you want to use ElasticSearch, you need an instance. +You can easily set up a container for ElasticSearch: + +```yaml + elasticsearch: + image: docker.elastic.co/elasticsearch/elasticsearch:8.6.2 + ports: + - 9200:9200 + environment: + - discovery.type=single-node + - "ES_JAVA_OPTS=-Xms2g -Xmx2g" + - ELASTIC_PASSWORD=password + - xpack.security.enabled=false + restart: unless-stopped +``` + +The default username is `elastic`. +No need to create the index on which the data will be added: it will be automatically created when Pathway sends the data to ElasticSearch. +To access the data, you need to wait a few seconds for the container to be ready and then you can simply type: + +`curl localhost:9200/alerts/_search?pretty` + +This command will show you the messages received on the `alerts` index. You can update it if you want to see another index. + +Now that ElasticSearch is configured, you need to use an output connector to send the tables made by Pathway to ElasticSearch. + +```python +pw.io.elasticsearch.write( + t_alert, + "http://elasticsearch:9200", + auth=pw.io.elasticsearch.ElasticSearchAuth.basic("elastic", "password"), + index_name="alerts", +) +``` + +Note that here you cannot use the traditional `localhost` but the name of the service: `elasticsearch`. + +### Scenario #2: sending the alert to Slack + +For reactivity reasons, you may want to have alerts sent directly to a Slack channel. +To do it with Pathway, you need the ID and the token of the channel: + +```python +import requests + +def on_alert_event(key, row, time, is_addition): + alert_message = "Alert '{}' changed state to {}".format( + row["alert"], + "ACTIVE" if is_addition else "INACTIVE", + ) + requests.post( + "https://slack.com/api/chat.postMessage", + data="text={}&channel={}".format(alert_message, SLACK_ALERT_CHANNEL_ID), + headers={ + "Authorization": "Bearer {}".format(SLACK_ALERT_TOKEN), + "Content-Type": "application/x-www-form-urlencoded", + }, + ).raise_for_status() + + +pw.io.subscribe(t_alert, on_alert_event) +``` + +You need the package `requests`, which should be installed in the Dockerfile: + +```yml [./pathway-src/Dockerfile] +FROM --platform=linux/x86_64 python:3.10 + +RUN pip install -U pathway +RUN pip install requests +RUN pip install python-dateutil +COPY ./pathway-src/alerts.py alerts.py + +CMD ["python", "-u", "alerts.py"] +``` + +Your alerts will be sent directly to your Slack channel. + +::article-img +--- +src: '/assets/content/tutorials/realtime_log_monitoring/slack_alerts.png' +alt: 'Slack alerts' +style: 'width: 700px' +class: 'mx-auto' +--- +:: + +## Putting all the pieces together + +### Scenario #1 + +Here is the final architecture for our realtime logs monitoring using Logstash and ElasticSearch: + +``` +. 
+├── filebeat-src/ +│ ├── Dockerfile +│ ├── filebeat.docker.yml +│ └── generate_input_stream.sh +├── logstash-src/ +│ └── logstash.conf +├── pathway-src/ +│ ├── alerts.py +│ └── Dockerfile +├── docker-compose.yml +└── Makefile +``` + +The sources of this example are available in our [public repository](https://github.com/pathwaycom/pathway/tree/main/examples/projects/realtime-log-monitoring/logstash-pathway-elastic). + +Here is Pathway implementation: + +```python [./pathway-src/alerts.py] +import time +from datetime import timedelta +import pathway as pw + +alert_threshold = 5 +sliding_window_duration = timedelta(seconds=1) + +rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", +} + +inputSchema = pw.schema_builder( + columns={ + '@timestamp': pw.column_definition(dtype=str), + 'message': pw.column_definition(dtype=str) + } +) + +log_table = pw.io.kafka.read( + rdkafka_settings, + topic="logs", + format="json", + schema=inputSchema, + autocommit_duration_ms=100, +) +log_table = log_table.select(timestamp=pw.this["@timestamp"], log=pw.this.message) +log_table = log_table.select( + pw.this.log, + timestamp=pw.this.timestamp.dt.strptime("%Y-%m-%dT%H:%M:%S.%fZ").dt.timestamp(), +) + +t_sliding_window = log_table.windowby( + log_table.timestamp, + window=pw.temporal.sliding( + hop=timedelta(milliseconds=10), duration=sliding_window_duration + ), + behavior=pw.temporal.common_behavior( + cutoff=timedelta(seconds=0.1), + keep_results=False, + ), +).reduce(timestamp=pw.this._pw_window_end, count=pw.reducers.count()) + +t_alert = t_sliding_window.reduce(count=pw.reducers.max(pw.this.count)).select( + alert=pw.this.count >= alert_threshold +) + +pw.io.elasticsearch.write( + t_alert, + "http://elasticsearch:9200", + auth=pw.io.elasticsearch.ElasticSearchAuth.basic("elastic", "password"), + index_name="alerts_logs", +) + +time.sleep(5) + +pw.run() +``` + +You may have noticed that there is a `time.sleep(5)` before `pw.run()` as you need to wait for Kafka to be ready. +Without it, it would still work, but you will have some errors in the logs because Pathway couldn't connect to Kafka. + +Don't forget the `pw.run()`, without it no computations are done. + + +### Scenario #2 + +The final architecture for our realtime logs monitoring using only Filebeat and forwarding the alerts to Slack is as follows: + +``` +. +├── filebeat-src/ +│ ├── Dockerfile +│ ├── filebeat.docker.yml +│ └── generate_input_stream.sh +├── pathway-src/ +│ ├── alerts.py +│ └── Dockerfile +├── docker-compose.yml +└── Makefile +``` + +The sources of this example are also available in our [public repository](https://github.com/pathwaycom/pathway/tree/main/examples/projects/realtime-log-monitoring/filebeat-pathway-slack). 
+ +Here is the full Pathway implementation, from connecting to Kafka to forwarding the alert to Slack: + +```python [./pathway-src/alerts.py] +import time +from datetime import timedelta +import pathway as pw +import requests + +alert_threshold = 5 +sliding_window_duration = timedelta(seconds=1) + +SLACK_ALERT_CHANNEL_ID = "XXX" +SLACK_ALERT_TOKEN = "XXX" + +rdkafka_settings = { + "bootstrap.servers": "kafka:9092", + "security.protocol": "plaintext", + "group.id": "0", + "session.timeout.ms": "6000", +} + +inputSchema = pw.schema_builder( + columns={ + '@timestamp': pw.column_definition(dtype=str), + 'message': pw.column_definition(dtype=str) + } +) + + +log_table = pw.io.kafka.read( + rdkafka_settings, + topic="logs", + format="json", + schema=inputSchema, + autocommit_duration_ms=100, +) +log_table = log_table.select(timestamp=pw.this["@timestamp"], log=pw.this.message) +log_table = log_table.select( + pw.this.log, + timestamp=pw.this.timestamp.dt.strptime("%Y-%m-%dT%H:%M:%S.%fZ"), +) + +t_sliding_window = log_table.windowby( + log_table.timestamp, + window=pw.temporal.sliding( + hop=timedelta(milliseconds=10), duration=sliding_window_duration + ), + behavior=pw.temporal.common_behavior( + cutoff=timedelta(seconds=0.1), + keep_results=False, + ), +).reduce(timestamp=pw.this._pw_window_end, count=pw.reducers.count()) + +t_alert = t_sliding_window.reduce(count=pw.reducers.max(pw.this.count)).select( + alert=pw.this.count >= alert_threshold +) + +def on_alert_event(key, row, time, is_addition): + alert_message = "Alert '{}' changed state to {}".format( + row["alert"], + "ACTIVE" if is_addition else "INACTIVE", + ) + requests.post( + "https://slack.com/api/chat.postMessage", + data="text={}&channel={}".format(alert_message, SLACK_ALERT_CHANNEL_ID), + headers={ + "Authorization": "Bearer {}".format(SLACK_ALERT_TOKEN), + "Content-Type": "application/x-www-form-urlencoded", + }, + ).raise_for_status() + +pw.io.subscribe(t_alert, on_alert_event) + +time.sleep(5) + +pw.run() +``` + +### Makefile + +To start, stop, and access the different containers you can use the following Makefile: + +```Makefile [./Makefile] +build: + docker-compose up -d + +stop: + docker-compose down -v + +connect: + docker-compose exec filebeat bash +connect-pathway: + docker-compose exec pathway bash +``` + +To start the `generate_input_stream.sh` script, you need to connect to the Filebeat container using `make connect` and then type: +```bash +./generate_input_stream.sh +``` +This will create an artificial stream of logs. +You can see the impact of the stream in your ElasticSearch instance or on Slack depending on the scenario you have chosen. + +## Conclusion + +Congratulations! You can now monitor your logs in realtime. +Realtime statistical analysis is a key part of realtime processing of streaming data. +Such analysis is challenging and existing windows techniques are limited by the fact that they are triggered periodically, without any awareness of the events. With streaming data, there will be a time when the setting will lead to either a waste of resources or a loss in accuracy. +With Pathway, you can create event-based windows, which are updated, and not computed from scratch, whenever new events are incoming. You can define your pipeline and Pathway will handle the updates for you: your sliding window will always be computed on the latest data, without missing any data point nor computing twice the same window. 
Last but not least, you can drink your coffee calmly in front of your screen, and get all the alerts directly in your Slack. + +::article-img +--- +src: '/assets/content/tutorials/realtime_log_monitoring/manul_coffee.png' +alt: 'Slack alerts' +width: '200px' +class: 'mx-auto' +--- +::