diff --git a/pkg-py/docs/.gitignore b/pkg-py/docs/.gitignore index e0c5635d..6fb1e2c8 100644 --- a/pkg-py/docs/.gitignore +++ b/pkg-py/docs/.gitignore @@ -4,6 +4,7 @@ *.quarto_ipynb objects.txt objects.json +changelog.md # Ignore quartodoc artifacts, these are built in CI _sidebar-python.yml diff --git a/pkg-py/docs/CHANGELOG.md b/pkg-py/docs/CHANGELOG.md new file mode 100644 index 00000000..2c33cdf5 --- /dev/null +++ b/pkg-py/docs/CHANGELOG.md @@ -0,0 +1,56 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [UNRELEASED] + +### Changes + +* The entire functional API (i.e., `init()`, `sidebar()`, `server()`, etc) has been hard deprecated in favor of a simpler OOP-based API. Namely, the new `QueryChat()` class is now the main entry point (instead of `init()`) and has methods to replace old functions (e.g., `.sidebar()`, `.server()`, etc). (#101) + +### New features + +* New `QueryChat.app()` method enables quicker/easier chatting with a dataset. (#104) + +* Enabled bookmarking by default in both `.app()` and `.server()` methods. In latter case, you'll need to also specify the `bookmark_store` (either in `shiny.App()` or `shiny.express.app_opts()`) for it to take effect. (#104) + +* The current SQL query and title can now be programmatically set through the `.sql()` and `.title()` methods of `QueryChat()`. (#98, #101) + +* Added a `.generate_greeting()` method to help you create a greeting message for your querychat bot. (#87) + +* Added `querychat_reset_dashboard()` tool for easily resetting the dashboard filters when asked by the user. (#81) + +### Improvements + +* Added rich tool UI support using shinychat development version and chatlas >= 0.11.1. 
(#67) + +* querychat's system prompt and tool descriptions were rewritten for clarity and future extensibility. (#90) + +## [0.2.2] - 2025-09-04 + +* Fixed another issue with data sources that aren't already narwhals DataFrames (#83) + +## [0.2.1] - 2025-09-04 + +* Fixed an issue with the query tool when used with SQLAlchemy data sources. (@npelikan #79) + +## [0.2.0] - 2025-09-02 + +* `querychat.init()` now accepts a `client` argument, replacing the previous `create_chat_callback` argument. (#60) + + The `client` can be: + + * a `chatlas.Chat` object, + * a function that returns a `chatlas.Chat` object, + * or a provider-model string, e.g. `"openai/gpt-4.1"`, to be passed to `chatlas.ChatAuto()`. + + If `client` is not provided, querychat will use the `QUERYCHAT_CLIENT` environment variable, which should be a provider-model string. If the envvar is not set, querychat uses OpenAI with the default model from `chatlas.ChatOpenAI()`. + +* `querychat.ui()` now adds a `.querychat` class to the chat container and `querychat.sidebar()` adds a `.querychat-sidebar` class to the sidebar, allowing for easier customization via CSS. (#68) + +## [0.1.0] - 2025-05-24 + +This first release of the `querychat` package. 
diff --git a/pkg-py/docs/_brand.yml b/pkg-py/docs/_brand.yml deleted file mode 100644 index 0393f067..00000000 --- a/pkg-py/docs/_brand.yml +++ /dev/null @@ -1,48 +0,0 @@ - -color: - palette: - blue: "#007bc2" - indigo: "#4b00c1" - purple: "#74149c" - pink: "#bf007f" - red: "#c10000" - orange: "#f45100" - yellow: "#f9b928" - green: "#00891a" - teal: "#00bf7f" - cyan: "#03c7e8" - white: "#ffffff" - black: "#1D1F21" - - foreground: black - background: white - primary: blue - secondary: gray - success: green - info: cyan - warning: yellow - danger: red - light: "#f8f8f8" - dark: "#212529" - -typography: - fonts: - - family: Open Sans - source: bunny - - family: Source Code Pro - source: bunny - - headings: - family: Open Sans - weight: 400 - monospace: Source Code Pro - monospace-inline: - color: pink - background-color: transparent - size: 0.95em - -defaults: - bootstrap: - defaults: - navbar-bg: $brand-blue - code-color-dark: "#fa88d4" diff --git a/pkg-py/docs/_quarto.yml b/pkg-py/docs/_quarto.yml index ce82f054..d65d8205 100644 --- a/pkg-py/docs/_quarto.yml +++ b/pkg-py/docs/_quarto.yml @@ -1,11 +1,14 @@ project: type: website output-dir: ../../docs/py + pre-render: + cp ../CHANGELOG.md CHANGELOG.md website: title: "querychat" site-url: https://posit-dev.github.io/querychat/py - description: Chat with your data in Shiny apps + description: Explore data using natural language + page-navigation: true bread-crumbs: true open-graph: true @@ -21,43 +24,41 @@ website: [![](https://posit.co/wp-content/uploads/2024/06/Posit-Logos-2024_horiz-full-color.svg){fig-alt="Posit" width=65px}](https://posit.co) navbar: - left: - - text: Get Started - href: index.qmd - - text: "Examples" - href: examples/index.qmd + background: "#193D56" + search: true + title: 'QueryChat' + #title: 'querychat websiteQueryChat' + + right: - text: API Reference href: reference/index.qmd - - tools: + - text: Changelog + href: /changelog.html - icon: github - menu: - - text: Source code - href: 
https://github.com/posit-dev/querychat/tree/main/pkg-py - - text: Report a bug - href: https://github.com/posit-dev/querychat/issues/new - + href: https://github.com/posit-dev/querychat + aria-label: GitHub repository sidebar: - - id: examples - title: "Examples" - style: docked - type: light - background: light - foreground: dark + - id: get-started + title: Get Started + style: floating + align: left contents: - - href: examples/index.qmd - - section: "DataFrames" - contents: - - href: examples/pandas.qmd - - section: "Databases" - contents: - - href: examples/sqlite.qmd + - index.qmd + - section: "Overview" + contents: + - models.qmd + - data-sources.qmd + - context.qmd + - build.qmd + - greet.qmd + - tools.qmd + format: html: - theme: [brand] - highlight-style: github + theme: + - styles.scss toc: true lightbox: auto @@ -71,18 +72,38 @@ quartodoc: sidebar: reference/_sidebar.yml css: reference/_styles-quartodoc.css sections: - - title: Get Started - desc: The basic building blocks of Querychat + - title: The Querychat class + desc: The starting point for any QueryChat session contents: - - init - - sidebar - - server + - name: QueryChat + include_inherited: true + - name: express.QueryChat + include_inherited: true - - title: Customize - desc: Dive deeper into customizing Querychat + - title: Reactive values + desc: Session-specific reactive values representing the current query + contents: + - types.ServerValues + + - title: Data Sources + desc: The underlying logic for managing data sources + contents: + - name: types.DataSource + signature_name: short + - name: types.DataFrameSource + signature_name: short + - name: types.SQLAlchemySource + signature_name: short + + - title: Tools + desc: The underlying tools provided to the LLM contents: - - ui - - system_prompt + - name: tools.tool_query + signature_name: short + - name: tools.tool_update_dashboard + signature_name: short + - name: tools.tool_reset_dashboard + signature_name: short filters: - "interlinks" 
diff --git a/pkg-py/docs/build.qmd b/pkg-py/docs/build.qmd new file mode 100644 index 00000000..00c92290 --- /dev/null +++ b/pkg-py/docs/build.qmd @@ -0,0 +1,518 @@ +--- +title: Build an app +--- + +While the [`.app()` method](reference/QueryChat.qmd#querychat.QueryChat.app) provides a quick way to start exploring data, building bespoke Shiny apps with QueryChat unlocks the full power of integrating natural language data exploration with custom visualizations, layouts, and interactivity. This guide shows you how to integrate QueryChat into your own Shiny applications and leverage its reactive data outputs to create rich, interactive dashboards. + +## Starter template + +Integrating QueryChat into a Shiny app requires just three steps: + +1. Initialize a `QueryChat()` instance with your data +2. Add the QueryChat UI component (either `.sidebar()` or `.ui()`) +3. Use reactive values like `.df()`, `.sql()`, and `.title()` to build outputs that respond to user queries + +Here's a starter template demonstrating these steps: + +::: {.panel-tabset group="shiny-mode"} + +#### Express + +```python +{{< include /../examples/03-sidebar-express-app.py >}} +``` + +#### Core + +```python +{{< include /../examples/03-sidebar-core-app.py >}} +``` + + +::: + +::: callout-note +With Core, you'll need to call the `qc.server()` method within your server function to set up QueryChat's reactive behavior, and capture its return value to access reactive data. This is not necessary with Express, which handles it automatically and exposes reactive values directly on the `QueryChat` instance. +::: + +## Reactives + +There are three main reactive values provided by QueryChat for use in your app: + +### Filtered data {#filtered-data} + +The `.df()` method returns the current filtered and/or sorted data frame. This updates whenever the user prompts a filtering or sorting operation through the chat interface (see [Data updating](tools.qmd#data-updating) for details). 
+ + +::: {.panel-tabset group="shiny-mode"} + +#### Express + +```python +@render.data_frame +def table(): + return qc.df() # Returns filtered/sorted data +``` + +#### Core + +```python +qc_vals = qc.server() + +@render.data_frame +def table(): + return qc_vals.df() # Returns filtered/sorted data +``` + +::: + +You can use `.df()` to power any output in your app - visualizations, summary statistics, data tables, and more. When a user asks to "show only survivors" or "sort by age", `.df()` automatically updates, and any outputs that depend on it will re-render. + +### SQL query {#sql-query} + +The `.sql()` method returns the current SQL query as a string. This is useful for displaying the query to users for transparency and reproducibility: + +::: {.panel-tabset group="shiny-mode"} + +#### Express + +```python +@render.text +def current_query(): + return qc.sql() or "SELECT * FROM my_data" +``` + +#### Core + +```python +qc_vals = qc.server() + +@render.text +def current_query(): + return qc_vals.sql() or "SELECT * FROM my_data" +``` + +::: + +You can also use `.sql()` as a setter to programmatically update the query (see [Programmatic filtering](#programmatic-filtering) below). + +### Title {#title} + +The `.title()` method returns a short description of the current filter, provided by the LLM when it generates a query. For example, if a user asks to "show first-class survivors", the title might be "First-class survivors". + +::: {.panel-tabset group="shiny-mode"} + +##### Express + +```python +@render.text +def card_title(): + return qc.title() or "All Data" +``` + +#### Core + +```python +qc_vals = qc.server() + +@render.text +def card_title(): + return qc_vals.title() or "All Data" +``` + +::: + +Returns `None` when no filter is active. You can also use `.title()` as a setter to update the title programmatically. + +## Custom UI + +In the starter template above, we used the `.sidebar()` method for a simple sidebar layout. 
In some cases, you might want to place the chat UI somewhere else in your app layout, or just more fully customize what goes in the sidebar. The `.ui()` method is designed for this -- it returns the chat component without additional layout wrappers. + +For example here is how to place the chat in a sidebar with some additional controls: + +::: {.panel-tabset group="shiny-mode"} + +#### Express + +```python +from shiny.express import ui, reactive +from querychat.express import QueryChat + +qc = QueryChat(data, "my_data") + +with ui.sidebar(): + qc.ui() # Chat component + ui.hr() + ui.input_action_button("reset", "Reset Filters", class_="w-100") +``` + +#### Core + +```python +from shiny import ui, reactive +from querychat import QueryChat + +qc = QueryChat(data, "my_data") + +app_ui = ui.page_sidebar( + ui.sidebar( + qc.ui(), # Chat component + ui.hr(), + ui.input_action_button("reset", "Reset Filters", class_="w-100"), + ), + # Main content here +) +``` + +::: + +::: callout-tip +### Custom Shiny chat UIs + +Learn more about customizing Shiny chat UIs in the [Shiny Chat documentation](https://shiny.posit.co/py/docs/genai-chatbots.html#layout). +::: + + +## Data views + +Thanks to Shiny's support for [Jupyter Widgets](https://shiny.posit.co/py/docs/jupyter-widgets.html) like [Plotly](https://shiny.posit.co/py/components/outputs/plot-plotly/), it's straightforward to create rich data views that depend on QueryChat data. 
Here's an example of an app showing both the filtered data and a bar chart depending on that same data: + + +```python +import plotly.express as px + +from seaborn import load_dataset +from shiny.express import render, ui +from shinywidgets import render_plotly + +from querychat.express import QueryChat + +titanic = load_dataset("titanic") +qc = QueryChat(titanic, "titanic") +qc.sidebar() + +with ui.layout_columns(): + with ui.card(): + ui.card_header("Data Table") + + @render.data_frame + def table(): + return qc.df() + + with ui.card(): + ui.card_header("Survival by Class") + + @render_plotly + def survival_plot(): + d = qc.df() + summary = d.groupby('pclass')['survived'].mean().reset_index() + return px.bar(summary, x='pclass', y='survived') +``` + +Now when a user filters the data through natural language (e.g., "filter to only children"), both the table and the chart update automatically. + +![](/images/plotly-data-view.png){fig-alt="Screenshot of a querychat app showing both a data table and a bar chart of survival by class." class="lightbox shadow rounded mb-3"} + +A more useful, but slightly more involved example like the one below might incorporate other [Shiny components](https://shiny.posit.co/py/components/) like value boxes to summarize key statistics about the filtered data. + + +
+app.py + +```python +from shiny.express import render, ui +from shinywidgets import render_plotly +from querychat.express import QueryChat +from seaborn import load_dataset +from faicons import icon_svg +import plotly.express as px + +titanic = load_dataset("titanic") +qc = QueryChat(titanic, "titanic") +qc.sidebar() + +with ui.layout_column_wrap(fill=False): + with ui.value_box(showcase=icon_svg("users")): + "Passengers" + + @render.text + def count(): + return str(len(qc.df())) + + with ui.value_box(showcase=icon_svg("heart")): + "Survival Rate" + + @render.text + def survival(): + rate = qc.df()['survived'].mean() * 100 + return f"{rate:.1f}%" + + with ui.value_box(showcase=icon_svg("coins")): + "Avg Fare" + + @render.text + def fare(): + avg = qc.df()['fare'].mean() + return f"${avg:.2f}" + +with ui.layout_columns(): + with ui.card(): + with ui.card_header(): + "Data Table" + + @render.text + def table_title(): + return f" - {qc.title()}" if qc.title() else "" + + @render.data_frame + def data_table(): + return qc.df() + + with ui.card(): + ui.card_header("Survival by Class") + + @render_plotly + def survival_by_class(): + df = qc.df() + summary = df.groupby('pclass')['survived'].mean().reset_index() + return px.bar( + summary, + x='pclass', + y='survived', + labels={'pclass': 'Class', 'survived': 'Survival Rate'}, + ) + +with ui.layout_columns(): + with ui.card(): + ui.card_header("Age Distribution") + + @render_plotly + def age_dist(): + df = qc.df() + return px.histogram(df, x='age', nbins=30) + + with ui.card(): + ui.card_header("Fare by Class") + + @render_plotly + def fare_by_class(): + df = qc.df() + return px.box(df, x='pclass', y='fare', color='survived') + +ui.page_opts( + title="Titanic Survival Analysis", + fillable=True, + class_="bslib-page-dashboard", +) +``` + +
+ + +![](/images/rich-data-views.png){fig-alt="Screenshot of a querychat app showing value boxes, a data table, and multiple plots." class="lightbox shadow rounded mb-3"} + + +## Programmatic updates + +QueryChat's reactive state can be updated programmatically. For example, you might want to add a "Reset Filters" button that clears any active filters and returns the data table to its original state. You can do this by setting both the SQL query and title to their default values. This way you don't have to rely on both the user and LLM to send the right prompt. + +::: {.panel-tabset group="shiny-mode"} + +#### Express + +```python +ui.input_action_button("reset", "Reset Filters") + +@reactive.effect +@reactive.event(input.reset) +def _(): + qc.sql("") + qc.title(None) +``` + +#### Core + +```python +ui.input_action_button("reset", "Reset Filters") + +qc_vals = qc.server() + +@reactive.effect +@reactive.event(input.reset) +def _(): + qc_vals.sql.set("") + qc_vals.title.set(None) +``` + +::: + +This is equivalent to the user asking the LLM to "reset" or "show all data". + +## Multiple datasets + +You can use multiple QueryChat instances in a single app to explore different datasets. Just ensure each instance has a different table name (or `id` which derives the table name) to avoid conflicts. 
Here's an example with two datasets: + +```python +from seaborn import load_dataset +from shiny.express import render, ui +from querychat.express import QueryChat + +titanic = load_dataset("titanic") +penguins = load_dataset("penguins") + +qc_titanic = QueryChat(titanic, "titanic") +qc_penguins = QueryChat(penguins, "penguins") + +with ui.sidebar(): + with ui.panel_conditional("input.navbar == 'Titanic'"): + qc_titanic.ui() + with ui.panel_conditional("input.navbar == 'Penguins'"): + qc_penguins.ui() + +with ui.nav_panel("Titanic"): + @render.data_frame + def titanic_table(): + return qc_titanic.df() + +with ui.nav_panel("Penguins"): + @render.data_frame + def penguins_table(): + return qc_penguins.df() + +ui.page_opts( + id="navbar", + title="Multiple Datasets with QueryChat", + fillable=True, +) +``` + +![](/images/multiple-datasets.png){fig-alt="Screenshot of a querychat app with two datasets: titanic and penguins." class="lightbox shadow rounded mb-3"} + + +## Complete example + +Here's a complete example bringing together multiple concepts - a Titanic survival analysis dashboard with natural language exploration, coordinated visualizations, and custom controls: + +```python +from shiny.express import render, ui +from querychat.express import QueryChat +from seaborn import load_dataset +import plotly.express as px + +# Load data +titanic = load_dataset("titanic") + +# Create QueryChat +qc = QueryChat( + titanic, + "titanic", + data_description="Titanic passenger data with survival outcomes", +) + +# Page configuration +ui.page_opts( + title="Titanic Survival Analysis", + fillable=True, + class_="bslib-page-dashboard", +) + +# Create sidebar with chat +with ui.sidebar(width=400): + qc.ui() + ui.hr() + ui.input_action_button("reset", "Reset Filters", class_="w-100") + +# Summary cards +with ui.layout_columns(): + with ui.value_box(showcase=ui.icon("users")): + "Passengers" + + @render.text + def count(): + return str(len(qc.df())) + + with 
ui.value_box(showcase=ui.icon("heart")): + "Survival Rate" + + @render.text + def survival(): + rate = qc.df()['survived'].mean() * 100 + return f"{rate:.1f}%" + + with ui.value_box(showcase=ui.icon("coins")): + "Avg Fare" + + @render.text + def fare(): + avg = qc.df()['fare'].mean() + return f"${avg:.2f}" + +# Main content area with visualizations +with ui.layout_columns(): + with ui.card(): + with ui.card_header(): + "Data Table" + + @render.text + def table_title(): + return f" - {qc.title()}" if qc.title() else "" + + @render.data_frame + def data_table(): + return qc.df() + + with ui.card(): + ui.card_header("Survival by Class") + + @render.plot + def survival_by_class(): + df = qc.df() + summary = df.groupby('pclass')['survived'].mean().reset_index() + fig = px.bar( + summary, + x='pclass', + y='survived', + labels={'pclass': 'Class', 'survived': 'Survival Rate'}, + ) + return fig + +with ui.layout_columns(): + with ui.card(): + ui.card_header("Age Distribution") + + @render.plot + def age_dist(): + df = qc.df() + fig = px.histogram(df, x='age', nbins=30) + return fig + + with ui.card(): + ui.card_header("Fare by Class") + + @render.plot + def fare_by_class(): + df = qc.df() + fig = px.box(df, x='pclass', y='fare', color='survived') + return fig + +# Reset button handler +@reactive.effect +@reactive.event(input.reset) +def handle_reset(): + qc.sql("") + qc.title(None) + ui.notification_show("Filters cleared", type="message") +``` + +This dashboard demonstrates: +- Natural language filtering through chat +- Multiple coordinated views (cards, table, plots) +- Custom reset button alongside natural language +- Dynamic titles reflecting current state +- Responsive layout that updates together + +## See also + +- [Greet users](greet.qmd) - Create welcoming onboarding experiences +- [Provide context](context.qmd) - Help the LLM understand your data better +- [Tools](tools.qmd) - Understand what QueryChat can do under the hood diff --git a/pkg-py/docs/context.qmd 
b/pkg-py/docs/context.qmd new file mode 100644 index 00000000..24156ebf --- /dev/null +++ b/pkg-py/docs/context.qmd @@ -0,0 +1,100 @@ +--- +title: Provide context +--- + +To improve the LLM's ability to accurately translate natural language queries into SQL, it often helps to provide relevant metadata. Querychat automatically provides things like column names and data types to the LLM, but you can enhance this further with additional context like [data descriptions](#data-description). You can also provide [custom instructions](#extra-instructions) to add additional behaviors and even supply a fully [custom prompt template](#custom-template), if desired. + +All of this information is provided to the LLM as part of the **system prompt** -- a string of text containing instructions and context for the LLM to consider when responding to user queries. + +## Default prompt + +For full visibility into the system prompt that Querychat generates for the LLM, see the `system_prompt` property. This is useful for debugging and understanding exactly what context the LLM is using: + +```python +from querychat import QueryChat +from seaborn import load_dataset + +titanic = load_dataset("titanic") + +qc = QueryChat(titanic, "titanic") +print(qc.system_prompt) +``` + +By default, the system prompt contains the following components: + +1. The basic set of behaviors and guidelines the LLM must follow in order for querychat to work properly, including how to use [tools](tools.qmd) to execute queries and update the app. +2. The SQL schema of the data frame you provided. This includes: + - Column names + - Data types (integer, float, boolean, datetime, text) + - For text columns with less than 10 unique values, we assume they are categorical variables and include the list of values + - For integer and float columns, we include the range +3. A [data description](#data-description) (if provided via `data_description`) +4. 
[Additional instructions](#extra-instructions) you want to use to guide querychat's behavior (if provided via `extra_instructions`). + + +## Data description {#data-description} + +If your column names are descriptive, Querychat may already work well without additional context. However, if your columns are named `x`, `V1`, `value`, etc., you should provide a data description. Use the `data_description` parameter for this: + +```{.python filename="titanic-app.py"} +from pathlib import Path +from querychat import QueryChat + +qc = QueryChat( + titanic, + "titanic", + data_description=Path("data_description.md") +) +app = qc.app() +``` + +Querychat doesn't need this information in any particular format -- just provide what a human would find helpful: + +```{.markdown filename="data_description.md"} +This dataset contains information about Titanic passengers, collected for predicting survival. + +- survived: Survival (0 = No, 1 = Yes) +- pclass: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd) +- sex: Sex of passenger +- age: Age in years +- sibsp: Number of siblings/spouses aboard +- parch: Number of parents/children aboard +- fare: Passenger fare +- embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton) +``` + + + +## Additional instructions {#extra-instructions} + +You can add custom instructions to guide the LLM's behavior using the `extra_instructions` parameter: + +```python +qc = QueryChat( + titanic, + "titanic", + extra_instructions=Path("instructions.md") +) +``` + +Or as a string: + +```python +instructions = """ +- Use British spelling conventions +- Stay on topic and only discuss the data dashboard +- Refuse to answer unrelated questions +""" + +qc = QueryChat(titanic, "titanic", extra_instructions=instructions) +``` + +::: callout-warning + +LLMs may not always follow your instructions perfectly. Test extensively when changing instructions or models. 
+::: + + +## Custom template {#custom-template} + +If you want more control over the system prompt, you can provide a custom prompt template using the `prompt_template` parameter. This is for more advanced users who want to fully customize the LLM's behavior. See the [API reference](reference/QueryChat.qmd#attributes) for details on the available template variables. \ No newline at end of file diff --git a/pkg-py/docs/data-sources.qmd b/pkg-py/docs/data-sources.qmd new file mode 100644 index 00000000..636848ba --- /dev/null +++ b/pkg-py/docs/data-sources.qmd @@ -0,0 +1,179 @@ +--- +title: Data Sources +lightbox: true +--- + +`querychat` supports many types of data sources, including: + +1. Any [narwhals-compatible](https://narwhals-dev.github.io/narwhals/) data frame. +2. Any [SQLAlchemy](https://www.sqlalchemy.org/) database. +3. A custom [DataSource](reference/datasource.DataSource.qmd) interface/protocol. + +The sections below describe how to use each type of data source with `querychat`. + + +## Data frames + +You can use any [narwhals-compatible](https://narwhals-dev.github.io/narwhals/) data frame as a data source in `querychat`. This includes popular data frame libraries like [pandas](https://pandas.pydata.org/), [polars](https://www.pola.rs/), [pyarrow](https://arrow.apache.org/docs/python/), and many more. 
+ +::: {.panel-tabset .panel-pills} + +### Pandas + +```{.python filename="pandas-app.py"} +import pandas as pd +from querychat import QueryChat + +mtcars = pd.read_csv( + "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" +) + +qc = QueryChat(mtcars, "mtcars") +app = qc.app() +``` + +### Polars + +```{.python filename="polars-app.py"} +import polars as pl +from querychat import QueryChat + +mtcars = pl.read_csv( + "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" +) + +qc = QueryChat(mtcars, "mtcars") +app = qc.app() +``` + +### Pyarrow + +```{.python filename="pyarrow-app.py"} +import pyarrow as pa +import pyarrow.csv as pv +from querychat import QueryChat + +mtcars = pv.read_csv( + "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" +).to_table() + +qc = QueryChat(mtcars, "mtcars") +app = qc.app() +``` + +::: + +If you're [building an app](build.qmd), note you can read the queried data frame reactively using the `df()` method, which returns a `pandas.DataFrame` by default. + +## Databases + +You can also connect `querychat` directly to any database supported by [SQLAlchemy](https://www.sqlalchemy.org/). This includes popular databases like SQLite, DuckDB, PostgreSQL, MySQL, and many more. + +Assuming you have a database set up and accessible, you can pass a SQLAlchemy [database URL](https://docs.sqlalchemy.org/en/20/core/engines.html) to `create_engine()`, and then pass the resulting engine to `QueryChat`. Below are some examples for common databases. 
+ + +::: {.panel-tabset} + +### Duck DB + +```shell +pip install duckdb duckdb-engine +``` + +```{.python filename="duckdb-app.py"} +from pathlib import Path +from sqlalchemy import create_engine +from querychat import QueryChat + +# Assumes my_database.duckdb is in the same directory as this script +db_path = Path(__file__).parent / "my_database.duckdb" +engine = create_engine(f"duckdb:///{db_path}") + +qc = QueryChat(engine, "my_table") +app = qc.app() +``` + +### SQLite + +```{.python filename="sqlite-app.py"} +from pathlib import Path +from sqlalchemy import create_engine +from querychat import QueryChat + +# Assumes my_database.db is in the same directory as this script +db_path = Path(__file__).parent / "my_database.db" +engine = create_engine(f"sqlite:///{db_path}") + +qc = QueryChat(engine, "my_table") +app = qc.app() +``` + + +### PostgreSQL + +```shell +pip install psycopg2-binary +``` + +```{.python filename="postgresql-app.py"} +from sqlalchemy import create_engine +from querychat import QueryChat + +engine = create_engine("postgresql+psycopg2://user:password@localhost:5432/mydatabase") +qc = QueryChat(engine, "my_table") +app = qc.app() +``` + +### MySQL + +```shell +pip install pymysql +``` + +```{.python filename="mysql-app.py"} +from sqlalchemy import create_engine +from querychat import QueryChat + +engine = create_engine("mysql+pymysql://user:password@localhost:3306/mydatabase") +qc = QueryChat(engine, "my_table") +app = qc.app() +``` + +::: + + +If you don't have a database set up, you can easily create a local DuckDB database from a CSV file using the following code: + +```{.python filename="create-duckdb.py"} +import duckdb + +conn = duckdb.connect("my_database.duckdb") + +conn.execute(""" + CREATE TABLE my_table AS + SELECT * FROM read_csv_auto('path/to/your/file.csv') +""") +``` + +Or, if you have a pandas DataFrame, you can create the DuckDB database like so: + +```{.python filename="create-duckdb-from-pandas.py"} +import duckdb +import 
pandas as pd + +from seaborn import load_dataset +titanic = load_dataset("titanic") + +conn = duckdb.connect("my_database.duckdb") +conn.register('titanic_df', titanic) +conn.execute(""" + CREATE TABLE titanic AS + SELECT * FROM titanic_df +""") +``` + +Then you can connect to this database using the DuckDB example above (changing the table name as appropriate): + +## Custom sources + +If you have a custom data source that doesn't fit into the above categories, you can implement the [DataSource](reference/datasource.DataSource.qmd) interface/protocol. This requires implementing methods for getting schema information and executing queries. \ No newline at end of file diff --git a/pkg-py/docs/examples/index.qmd b/pkg-py/docs/examples/index.qmd deleted file mode 100644 index 06f82737..00000000 --- a/pkg-py/docs/examples/index.qmd +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: "Basic Example" ---- - -Here's the basic example that uses the `titanic` dataset. - -{{< include /includes/github_models-callout.qmd >}} - -```python -{{< include /../examples/app.py >}} -``` diff --git a/pkg-py/docs/examples/pandas.qmd b/pkg-py/docs/examples/pandas.qmd deleted file mode 100644 index 8e90f3ba..00000000 --- a/pkg-py/docs/examples/pandas.qmd +++ /dev/null @@ -1,43 +0,0 @@ ---- -title: Pandas ---- - -This example and walkthrough has the following features: - -- querychat interaction with a pandas dataframe -- Reads in a data description file -- Reads in a greeting file - -## Data - -This examples uses the `seaborn` library to load the `titanic` dataset. - -## Greeting file - -Save this file as `greeting.md`: - -```markdown -{{< include /../examples/greeting.md >}} -``` - -## Data description file - -Save this file as `data_description.md`: - -```markdown -{{< include /../examples/data_description.md >}} -``` - - -## The application - -Our application will read the the `greeting.md` and `data_description.md` files -and pass them along to the `querychat.init()` function. 
- -Here is our pandas example app, save the contents to `app.py`. - -{{< include /includes/github_models-callout.qmd >}} - -```python -{{< include /../examples/app-dataframe-pandas.py >}} -``` diff --git a/pkg-py/docs/examples/sqlite.qmd b/pkg-py/docs/examples/sqlite.qmd deleted file mode 100644 index 2d180831..00000000 --- a/pkg-py/docs/examples/sqlite.qmd +++ /dev/null @@ -1,49 +0,0 @@ ---- -title: "SQLite" ---- - -This example and walkthrough has the following features: - -- querychat interaction with a SQLite database using SQLAlchemy -- Reads in a data description file -- Reads in a greeting file - -## Data - -This example uses the `seaborn` library to load up the `titanic` dataset, -and then write the dataframe into a SQLite database, `titanic.db`. -It then uses SQLAlchemy to connect to the SQLite database. - -If the `titanic.db` file does not exist in the same directory as the `app.py` file, -it will create the SQLite database file. - - -## Greeting file - -Save this file as `greeting.md`: - -```markdown -{{< include /../examples/greeting.md >}} -``` - -## Data description file - -Save this file as `data_description.md`: - -```markdown -{{< include /../examples/data_description.md >}} -``` - -## The application - -Our application will read the the `greeting.md` and `data_description.md` files -and pass them along to the `querychat.init()` function. -Also, instead of passing in a dataframe object to the `data_source` parameter in `querychat.init()`, we pass in the database connection, along with the table in the database as `table_name`. - -Here is our SQLite example app, save the contents to `app.py`. 
- -{{< include /includes/github_models-callout.qmd >}} - -```python -{{< include /../examples/app-database-sqlite.py >}} -``` diff --git a/pkg-py/docs/greet.qmd b/pkg-py/docs/greet.qmd new file mode 100644 index 00000000..5750ae06 --- /dev/null +++ b/pkg-py/docs/greet.qmd @@ -0,0 +1,58 @@ + +--- +title: Greet users +--- + +### Provide a greeting + +When the querychat UI first appears, you will usually want it to greet the user with some basic instructions. By default, these instructions are auto-generated every time a user arrives; this is slow, wasteful, and unpredictable. Instead, you should create a greeting file and pass it when creating your `QueryChat` object: + +```python +from pathlib import Path +qc = QueryChat(titanic, "titanic", greeting=Path("greeting.md")) +``` + +You can provide suggestions to the user by using the ` ` tag: + +```markdown +* **Filter and sort the data:** + * Show only survivors + * Filter to first class passengers under 30 + * Sort by fare from highest to lowest + +* **Answer questions about the data:** + * What was the survival rate by gender? + * What's the average age of children who survived? + * How many passengers were traveling alone? +``` + +These suggestions appear in the greeting and automatically populate the chat text box when clicked. +You can see this behavior in our [`querychat template`](https://shiny.posit.co/py/templates/querychat/). + +### Generate a greeting + +If you need help coming up with a greeting, you can use the `.generate_greeting()` method: + +```python +from palmerpenguins import load_penguins +from querychat import QueryChat + +# Create QueryChat object with your dataset +penguins = load_penguins() +qc = QueryChat(penguins, "penguins") + +# Generate a greeting (this calls the LLM) +greeting_text = qc.generate_greeting() +#> Hello! I'm here to help you explore and analyze the penguins dataset. +#> Here are some example prompts you can try: +#> ... 
+ +# Save it for reuse +with open("penguins_greeting.md", "w") as f: + f.write(greeting_text) + +# Then use the saved greeting in your app +qc = QueryChat(penguins, "penguins", greeting=Path("penguins_greeting.md")) +``` + +This approach generates a greeting once and saves it for reuse, avoiding the latency and cost of generating it for every user. \ No newline at end of file diff --git a/pkg-py/docs/images/hex.png b/pkg-py/docs/images/hex.png new file mode 100644 index 00000000..a8c24b4e Binary files /dev/null and b/pkg-py/docs/images/hex.png differ diff --git a/pkg-py/docs/images/multiple-datasets.png b/pkg-py/docs/images/multiple-datasets.png new file mode 100644 index 00000000..df7353fa Binary files /dev/null and b/pkg-py/docs/images/multiple-datasets.png differ diff --git a/pkg-py/docs/images/plotly-data-view.png b/pkg-py/docs/images/plotly-data-view.png new file mode 100644 index 00000000..ef24eb7b Binary files /dev/null and b/pkg-py/docs/images/plotly-data-view.png differ diff --git a/pkg-py/docs/images/quickstart-filter.png b/pkg-py/docs/images/quickstart-filter.png new file mode 100644 index 00000000..1819dbae Binary files /dev/null and b/pkg-py/docs/images/quickstart-filter.png differ diff --git a/pkg-py/docs/images/quickstart-summary.png b/pkg-py/docs/images/quickstart-summary.png new file mode 100644 index 00000000..7cb0256d Binary files /dev/null and b/pkg-py/docs/images/quickstart-summary.png differ diff --git a/pkg-py/docs/images/quickstart.png b/pkg-py/docs/images/quickstart.png new file mode 100644 index 00000000..fd423d0d Binary files /dev/null and b/pkg-py/docs/images/quickstart.png differ diff --git a/pkg-py/docs/images/rich-data-views.png b/pkg-py/docs/images/rich-data-views.png new file mode 100644 index 00000000..09fc05fe Binary files /dev/null and b/pkg-py/docs/images/rich-data-views.png differ diff --git a/pkg-py/docs/images/sidebot.png b/pkg-py/docs/images/sidebot.png new file mode 100644 index 00000000..73a67616 Binary files /dev/null and 
b/pkg-py/docs/images/sidebot.png differ diff --git a/pkg-py/docs/includes/github_models-callout.qmd b/pkg-py/docs/includes/github_models-callout.qmd deleted file mode 100644 index 223cc1f0..00000000 --- a/pkg-py/docs/includes/github_models-callout.qmd +++ /dev/null @@ -1,5 +0,0 @@ -:::{.callout-note} -## GitHub Models and GitHub Personal Access Tokens - -{{< include /includes/github_models.qmd >}} -::: diff --git a/pkg-py/docs/includes/github_models.qmd b/pkg-py/docs/includes/github_models.qmd deleted file mode 100644 index 887ed82f..00000000 --- a/pkg-py/docs/includes/github_models.qmd +++ /dev/null @@ -1,11 +0,0 @@ -This example does not use the default OpenAI model directly from OpenAI, -which would require you to create an OpenAI API key and save it as an environment variable named `OPENAI_API_KEY`. -Instead we are using [GitHub Models](https://github.com/marketplace/models) -as a free way to access the latest LLMs, with a [rate-limit](https://docs.github.com/en/github-models/use-github-models/prototyping-with-ai-models#rate-limits). -You can follow the instructions on the -[GitHub Docs](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens#creating-a-personal-access-token-classic) -or -[Axure AI Demo](https://github.com/Azure-Samples/python-ai-agent-frameworks-demos/tree/main?tab=readme-ov-file#configuring-github-models) -on creating a PAT. - -We suggest you save your PAT into 2 environment variables: `GITHUB_TOKEN`, and `GITHUB_PAT`. 
diff --git a/pkg-py/docs/index.qmd b/pkg-py/docs/index.qmd index f4f47e03..97ba65d8 100644 --- a/pkg-py/docs/index.qmd +++ b/pkg-py/docs/index.qmd @@ -1,227 +1,113 @@ --- -pagetitle: "Get Started" -title: "querychat: Chat with Shiny apps (Python)" +title: Introduction +lightbox: true --- -Imagine typing questions like these directly into your Shiny dashboard, and seeing the results in realtime: - -* "Show only penguins that are not species Gentoo and have a bill length greater than 50mm." -* "Show only blue states with an incidence rate greater than 100 per 100,000 people." -* "What is the average mpg of cars with 6 cylinders?" - -querychat is a drop-in component for Shiny that allows users to query a data frame using natural language. The results are available as a reactive data frame, so they can be easily used from Shiny outputs, reactive expressions, downloads, etc. - -**This is not as terrible an idea as you might think!** We need to be very careful when bringing LLMs into data analysis, as we all know that they are prone to hallucinations and other classes of errors. querychat is designed to excel in reliability, transparency, and reproducibility by using this one technique: denying it raw access to the data, and forcing it to write SQL queries instead. See the section below on ["How it works"](#how-it-works) for more. - -## Installation - -```bash -pip install querychat +```{=html} + ``` -## How to use - -First, you'll need access to an LLM that supports tools/function calling. querychat uses [chatlas](https://github.com/posit-dev/chatlas) to interface with various providers. - -Here's a very minimal example that shows the three function calls you need to make: - -```python -{{< include /../examples/app.py >}} -``` - -{{< include /includes/github_models-callout.qmd >}} - -## How it works - -### Powered by LLMs - -querychat's natural language chat experience is powered by LLMs. 
You may use any model that [chatlas](https://github.com/posit-dev/chatlas) supports that has the ability to do tool calls, but we currently recommend (as of March 2025): - -* GPT-4o -* Claude 3.5 Sonnet -* Claude 3.7 Sonnet - -In our testing, we've found that those models strike a good balance between accuracy and latency. Smaller models like GPT-4o-mini are fine for simple queries but make surprising mistakes with moderately complex ones; and reasoning models like o3-mini slow down responses without providing meaningfully better results. +querychat website banner image -The small open source models (8B and below) we've tested have fared extremely poorly. Sorry. 🤷 +

+Explore data using natural language queries +

-### Powered by SQL +
+ +PyPI +MIT License +versions +Python Tests + +
-querychat does not have direct access to the raw data; it can _only_ read or filter the data by writing SQL `SELECT` statements. This is crucial for ensuring relability, transparency, and reproducibility: -- **Reliability:** Today's LLMs are excellent at writing SQL, but bad at direct calculation. -- **Transparency:** querychat always displays the SQL to the user, so it can be vetted instead of blindly trusted. -- **Reproducibility:** The SQL query can be easily copied and reused. +Querychat makes it easy to explore data with natural language through the power of [Shiny](https://shiny.posit.co/py) and large language models (LLMs). Start chatting with your data in just one line of code. Or, with a few more lines, design your own rich user experience around data exploration and analysis through natural language. -Currently, querychat uses DuckDB for its SQL engine. It's extremely fast and has a surprising number of [statistical functions](https://duckdb.org/docs/stable/sql/functions/aggregates.html#statistical-aggregates). - -## Customizing querychat - -### Provide a greeting (recommended) - -When the querychat UI first appears, you will usually want it to greet the user with some basic instructions. By default, these instructions are auto-generated every time a user arrives; this is slow, wasteful, and unpredictable. Instead, you should create a file called `greeting.md`, and when calling `querychat.init`, pass `greeting=Path("greeting.md")`. - -You can provide suggestions to the user by using the ` ` tag. - -For example: +## Installation -```markdown -* **Filter and sort the data:** - * Show only survivors - * Filter to first class passengers under 30 - * Sort by fare from highest to lowest +Install the latest stable release [from PyPI](https://pypi.org/project/querychat/): -* **Answer questions about the data:** - * What was the survival rate by gender? - * What's the average age of children who survived? - * How many passengers were traveling alone? 
+```bash +pip install querychat ``` -These suggestions appear in the greeting and automatically populate the chat text box when clicked. -This gives the user a few ideas to explore on their own. -You can see this behavior in our [`querychat template`](https://shiny.posit.co/py/templates/querychat/). - -### Generate a greeting +## Quick start -If you need help coming up with a greeting, you can use the `querychat.greeting()` function to generate one: +The main entry point is the [`QueryChat` class](reference/QueryChat.qmd). It requires a [data source](data-sources.qmd) (e.g., pandas, polars, etc) and a name for the data. It also accepts optional parameters to customize the behavior, such as the `client` [model](models.qmd). +The quickest way to start chatting is to call the `.app()` method, which returns a Shiny app object. -```python -import querychat -from palmerpenguins import load_penguins -# Create config with your dataset -penguins = load_penguins() -penguins_config = querychat.init(penguins, "penguins") +```{.python filename="titanic-app.py"} +from seaborn import load_dataset +from querychat import QueryChat -# Generate a greeting -querychat.greeting(penguins_config) -#> Hello! I'm here to help you explore and analyze the penguins dataset. -#> Here are some example prompts you can try: -#> ... - -# Update the config with the generated greeting -penguins_config = querychat.init( - penguins, - "penguins", - greeting="Hello! I'm here to help you explore and analyze the penguins dataset..." -) +titanic = load_dataset("titanic") +qc = QueryChat(titanic, "titanic", client="openai/gpt-4.1") +app = qc.app() ``` -This will use the LLM to generate a friendly greeting message with sample prompts. 
-In Shiny apps, you could also generate the greeting once when the app starts up so that it's shared among all users: +With the above code saved to `titanic-app.py` and an API key set[^api-key], you can [run the app](https://shiny.posit.co/py/get-started/create-run.html#run-your-shiny-application) from a terminal (or [VSCode](https://marketplace.visualstudio.com/items?itemName=Posit.shiny)): -```python -penguins_config = querychat.init(penguins, "penguins") -penguins_config.greeting = querychat.greeting(penguins_config, echo="none") +```bash +export OPENAI_API_KEY="your_api_key_here" +shiny run --reload titanic-app.py ``` -### Augment the system prompt (recommended) +[^api-key]: By default, Querychat uses OpenAI to power the chat experience. So, for this example to work, you'll need [an OpenAI API key](https://platform.openai.com/). See the [Models](models.qmd) page for details on how to set up credentials for other model providers. -In LLM parlance, the _system prompt_ is the set of instructions and specific knowledge you want the model to use during a conversation. querychat automatically creates a system prompt which is comprised of: +Once running, you'll notice 3 main views: -1. The basic set of behaviors the LLM must follow in order for querychat to work properly. (See `querychat/prompt/prompt.md` if you're curious what this looks like.) -2. The SQL schema of the data frame you provided. -3. (Optional) Any additional description of the data you choose to provide. -4. (Optional) Any additional instructions you want to use to guide querychat's behavior. +1. A sidebar chat with suggestions on where to start exploring. +2. A data table that updates to reflect filtering and sorting queries. +3. The SQL query behind the data table, for transparency and reproducibility. -#### Data description +![](/images/quickstart.png){fig-alt="Screenshot of querychat's app with the titanic dataset." 
class="lightbox shadow rounded mb-3"} -If you give querychat your dataset and nothing else, it will provide the LLM with the basic schema of your data: +Suppose we pick a suggestion like "Show me passengers who survived". Since this is a filtering operation, both the data table and SQL query update accordingly. -- Column names -- DuckDB data type (integer, float, boolean, datetime, text) -- For text columns with less than 10 unique values, we assume they are categorical variables and include the list of values. This threshold is configurable. -- For integer and float columns, we include the range +![](/images/quickstart-filter.png){fig-alt="Screenshot of the querychat's app with the titanic dataset filtered to passengers who survived." class="lightbox shadow rounded mb-3"} -And that's all the LLM will know about your data. -The actual data does not get passed into the LLM. -We calculate these values before we pass the schema information into the LLM. +Querychat can also handle more general questions about the data that require calculations and aggregations. For example, we can ask "What is the average age of passengers who survived?". In this case, querychat will generate/execute the SQL query to perform the relevant calculation, and return the result in the chat: -If the column names are usefully descriptive, it may be able to make a surprising amount of sense out of the data. But if your data frame's columns are `x`, `V1`, `value`, etc., then the model will need to be given more background info--just like a human would. +![](/images/quickstart-summary.png){fig-alt="Screenshot of the querychat's app with a summary statistic inlined in the chat." class="lightbox shadow rounded mb-3"} -To provide this information, use the `data_description` argument. 
For example, if you're using the `titanic` dataset, you might create a `data_description.md` like this: +As you'll learn later in [Build an app](build.qmd), you can also access the SQL query and filtered/sorted data frame programmatically for use elsewhere in your app. This makes it rather seemless to have natural language interaction with your data alongside other visualizations and analyses. -```markdown -This dataset contains information about Titanic passengers, collected for predicting survival. +Before we build though, let's take a moment to better understand how querychat works under the hood, and whether it's right for you. -- survived: Survival (0 = No, 1 = Yes) -- pclass: Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd) -- sex: Sex of passenger -- age: Age in years -- sibsp: Number of siblings/spouses aboard -- parch: Number of parents/children aboard -- fare: Passenger fare -- embarked: Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton) -- class: Same as pclass but as text -- who: Man, woman, or child -- adult_male: Boolean for adult males -- deck: Deck of the ship -- embark_town: Town of embarkation -- alive: Survival status as text -- alone: Whether the passenger was alone -``` -which you can then pass via: - -```python -qc_config = querychat.init( - titanic, - "titanic", - data_description=Path("data_description.md") -) -``` - -querychat doesn't need this information in any particular format; just put whatever information, in whatever format, you think a human would find helpful. +## How it works -#### Additional instructions +Querychat leverages LLMs incredible capability to translate natural language into SQL queries. Frontier models are shockingly good at this task, but even the best models still need to know the overall data structure to perform well. For this reason, querychat supplies a [system prompt](context.qmd) with the schema of the data (i.e., column names, types, ranges, etc), but never the raw data itself. 
-You can add additional instructions of your own to the end of the system prompt, by passing `extra_instructions` into `querychat.init`. +When the LLM generates a SQL query, querychat executes it against a SQL database (DuckDB[^duckdb] by default) to get results in a **safe**, **reliable**, and **verifiable** manner. In short, this execution is **safe** since only `SELECT` statements are allowed, **reliable** since the database engine handles all calculations, and **verifiable** since the user can always see the SQL query that was run. This makes querychat a trustworthy tool for data exploration, as every action taken by the LLM is transparent and independently reproducible. -```python -qc_config = querychat.init( - titanic, - "titanic", - extra_instructions=[ - "You're speaking to a British audience--please use appropriate spelling conventions.", - "Use lots of emojis! 😃 Emojis everywhere, 🌍 emojis forever. ♾️", - "Stay on topic, only talk about the data dashboard and refuse to answer other questions." - ] -) -``` -You can also put these instructions in a separate file and use `Path("instructions.md")` to load them, as we did for `data_description` above. +::: callout-important +### Data privacy -**Warning:** It is not 100% guaranteed that the LLM will always—or in many cases, ever—obey your instructions, and it can be difficult to predict which instructions will be a problem. So be sure to test extensively each time you change your instructions, and especially, if you change the model you use. +See the [Provide context](context.qmd) and [Tools](tools.qmd) articles to learn more about what information is provided to the LLM and what it's capable of doing with code execution. +:::: -### Use a different LLM provider +[^duckdb]: Duckdb is extremely fast and has a surprising number of [statistical functions](https://duckdb.org/docs/stable/sql/functions/aggregates.html#statistical-aggregates). -By default, querychat uses GPT-4o via the OpenAI API. 
If you want to use a different model, you can provide a `create_chat_callback` function that takes a `system_prompt` parameter, and returns a chatlas Chat object: +### Bespoke interfaces -```python -import querychat -import chatlas +While the quickstart app is a great way to get started, querychat is designed to be highly extensible. +You can not only customize the underlying model and data source, but also build fully custom Shiny apps around the core chat functionality. -qc_config = querychat.init( - titanic, - "titanic", - client=chatlas.ChatAnthropic(model="claude-3-7-sonnet-latest") -) -``` +For a motivating example, consider the following ([sidebot](https://shiny.posit.co/py/docs/genai-inspiration.html#sidebot)) app that leverages querychat's tooling to create reactive summaries and visualizations based on the user's natural language queries: -This would use Claude 3.7 Sonnet instead, which would require you to provide an API key. See the [chatlas documentation](https://github.com/posit-dev/chatlas) for more information on how to authenticate with different providers. +![](/images/sidebot.png){fig-alt="Screenshot of sidebot, a custom shiny app built with querychat." class="lightbox shadow rounded mb-3"} -## Complete example -For a complete working example, see the [examples/app-dataframe.py](examples/app-dataframe.py) file in the repository. -This example includes: +## Next steps -- Loading a dataset -- Reading greeting and data description from files -- Setting up the querychat configuration -- Creating a Shiny UI with the chat sidebar -- Displaying the filtered data in the main panel +From here, you might want to learn more about: -If you have Shiny installed, and want to get started right away, you can use our -[querychat template](https://shiny.posit.co/py/templates/querychat/) -or -[sidebot template](https://shiny.posit.co/py/templates/sidebot/). +- [Models](models.qmd): customize the LLM behind querychat. 
+- [Data sources](data-sources.qmd): different data sources you can use with querychat. +- [Provide context](context.qmd): provide the LLM with the context it needs to work well. +- [Build an app](build.qmd): design a custom Shiny app around querychat. diff --git a/pkg-py/docs/models.qmd b/pkg-py/docs/models.qmd new file mode 100644 index 00000000..3d19e133 --- /dev/null +++ b/pkg-py/docs/models.qmd @@ -0,0 +1,88 @@ +--- +title: Models +--- + +Under the hood, `querychat` is powered by [chatlas](https://posit-dev.github.io/chatlas/), a library for building chat-based applications with large language models (LLMs). `chatlas` supports a wide range of LLM providers -- [see here](https://posit-dev.github.io/chatlas/get-started/models.html) for a full list. + +### Specify a model + +To use a particular model, pass a `"{provider}/{model}"` string to the `client` parameter. Under the hood, this gets passed along to [`chatlas.ChatAuto`](https://posit-dev.github.io/chatlas/reference/ChatAuto.html) + +```python +from querychat import QueryChat +from seaborn import load_dataset +titanic = load_dataset("titanic") + +qc = QueryChat( + titanic, + "titanic", + client="anthropic/claude-sonnet-4-5" +) +``` + +And, if you'd like to effectively set a new default model, you can use the `QUERYCHAT_CLIENT` environment variable. + +```shell +export QUERYCHAT_CLIENT="anthropic/claude-sonnet-4-5" +``` + +Note that it can also be useful to pass a full `Chat` object to the `client` parameter for more advanced use cases (e.g., custom [parameters](https://posit-dev.github.io/chatlas/get-started/parameters.html), [tools](https://posit-dev.github.io/chatlas/get-started/tools.html), etc). It can also be useful for getting some [helpful autocomplete](https://posit-dev.github.io/chatlas/get-started/models.html#model-type-hints) of available models. 
+
+```python
+from chatlas import ChatAnthropic
+
+client = ChatAnthropic(model="claude-sonnet-4-5")
+```
+
+### Credentials
+
+Most models require an API key or some other form of authentication. See the reference page for the relevant [model provider](https://posit-dev.github.io/chatlas/get-started/models.html) (e.g., [ChatAnthropic](https://posit-dev.github.io/chatlas/reference/ChatAnthropic.html)) to learn more on how to set up credentials.
+
+::: callout-tip
+### GitHub model marketplace
+
+If you are already set up with GitHub credentials, the [GitHub model marketplace](https://github.com/marketplace/models) provides a free and easy way to get started. See [here](https://posit-dev.github.io/chatlas/reference/ChatGithub.html) for more details on how to get set up.
+
+```{.python filename="github-model.py"}
+from chatlas import ChatGithub
+
+# Just works if GITHUB_TOKEN is set in your environment
+client = ChatGithub(model="gpt-4.1")
+```
+:::
+
+In general, most providers will prefer credentials stored as environment variables, and common practice is to use a `.env` file to manage these variables. For example, for `ChatOpenAI()`, you might create a `.env` file like so:
+
+```{.shell filename=".env"}
+OPENAI_API_KEY="your_api_key_here"
+```
+
+Then, load the environment variables via the `dotenv` package:
+
+```shell
+pip install python-dotenv
+```
+
+```python
+from dotenv import load_dotenv
+load_dotenv()
+```
+
+
+### Recommended models
+
+In theory, you could use any model that has tool calling support, but we currently recommend (as of November 2025):
+
+- GPT-4.1 (the default)
+- Claude 4.5 Sonnet
+- Google Gemini 3.0
+
+In our testing, we've found that those models strike a good balance between accuracy and latency. Smaller/cheaper models like GPT-4o-mini are fine for simple queries but make surprising mistakes with more complex ones; and reasoning models like o3-mini slow down responses without providing meaningfully better results.
+ +We've also seen some decent results with frontier local models, but even if you have the compute to run the largest models, they still tend to lag behind the cloud-hosted options in terms of accuracy and speed. + +::: callout-tip +## Data privacy concerns? + +If you have data privacy concerns, consider that your org may provide access to private instances of these models with data residency guarantees. For example, Azure, AWS Bedrock, and Google Vertex AI all provide private instances of popular LLMs. You can interface with these enterprise providers by passing the right string (e.g., `"bedrock-anthropic"`) or `Chat` object (e.g., `ChatBedrockAnthropic()`) to the `client` parameter. See the [chatlas docs](https://posit-dev.github.io/chatlas/get-started/models.html) for more details. +::: diff --git a/pkg-py/docs/styles.scss b/pkg-py/docs/styles.scss new file mode 100644 index 00000000..44b3bbeb --- /dev/null +++ b/pkg-py/docs/styles.scss @@ -0,0 +1,70 @@ +/*-- scss:defaults --*/ + +$font-family-sans-serif: 'Public Sans', sans-serif; +$font-family-monospace: 'Fira Code', monospace; +$headings-font-family: 'Hubot Sans', sans-serif; +$display-font-family: 'Hubot Sans', sans-serif; +$headings-color: #193D56; + +/*-- scss:rules --*/ + +@import url('https://fonts.googleapis.com/css2?family=Public+Sans:ital,wght@0,100..900;1,100..900&display=swap'); +@import url('https://fonts.googleapis.com/css?family=Fira Code'); +@import url('https://fonts.googleapis.com/css?family=Hubot Sans'); + +.header { + font-family: $headings-font-family; + color: $headings-color; +} + +/* css styles */ + +.cell-output pre code { + white-space: pre-wrap; +} + +/* Undo somebody's aggressive CSS */ +pre { + font-family: var(--bs-font-monospace); +} + + +/* sidebar */ +.sidebar-item-container { + font-size: 1rem; + + .text-start { + font-weight: 600; + } +} + +.sidebar-item-section { + padding-top: 0.5rem; +} + +// make it even more noticable +.sidebar-link { + &:hover { + font-weight: 500; + } 
+
+  &.active {
+    position: relative;
+
+    &::before {
+      content: "\23F5";
+      position: absolute;
+      left: -0.9em;
+      font-size: 1em;
+      color: var(--bs-primary);
+    }
+  }
+}
+
+
+/* Get code output to look like a sourceCode block */
+pre:has(> code) {
+  background-color: rgba(233, 236, 239, 0.65);
+  border-radius: .25em;
+  padding: .4em;
+}
\ No newline at end of file
diff --git a/pkg-py/docs/tools.qmd b/pkg-py/docs/tools.qmd
new file mode 100644
index 00000000..cd348eba
--- /dev/null
+++ b/pkg-py/docs/tools.qmd
@@ -0,0 +1,71 @@
+---
+title: Tools
+---
+
+QueryChat combines [tool calling](https://posit-dev.github.io/chatlas/get-started/tools.html) with [reactivity](https://shiny.posit.co/py/docs/reactive-foundations.html) to not only execute SQL, but also reactively update dependent data views. Understanding how these tools work will help you better understand what QueryChat is capable of and how to customize/extend its behavior.
+
+One important thing to understand generally about Querychat's tools is that they are Python functions, and that execution happens on _your machine_, not on the LLM provider's side. In other words, the SQL queries generated by the LLM are executed locally in the Shiny app process, and only the results (if any) are sent back to the LLM.
+
+Querychat provides the LLM access to three tools, serving two primary purposes:
+
+1. **Data updating** - Filter and sort data (without sending results to the LLM).
+2. **Data analysis** - Calculate summaries and return results for interpretation by the LLM.
+
+## Data updating
+
+When a user asks to "Show me..." or "Filter to..." or "Sort by...", the LLM requests a call to the `update_dashboard` tool with an appropriate SQL query as input. An important constraint is that the query must return all original schema columns (typically using `SELECT *`). When called, Querychat will both set a reactive value holding [the current SQL query](build.qmd#sql-query) and execute the query to get the result.
+
+The result of the query is then used to set a reactive value holding the [filtered/sorted data frame](build.qmd#filtered-data). Thanks to reactivity, this will automatically update any views depending on this data frame, such as the data table displayed in the UI.
+
+This tool also takes a `title` parameter, which is a short description of the filter/sort operation (e.g., "First-class passengers"). This, also, is made available through [a reactive value](build.qmd#title) for display somewhere in your app.
+
+Here's a basic example of this tool in action with the `.app()` method. Notice how this pre-built app not only shows the data table, but also the SQL query and title generated by the LLM (for transparency):
+
+```{.python filename="titanic-app.py"}
+from querychat import QueryChat
+from seaborn import load_dataset
+
+titanic = load_dataset("titanic")
+qc = QueryChat(titanic, "titanic")
+app = qc.app()
+```
+
+![](/images/quickstart-filter.png){fig-alt="Screenshot of the querychat's app with the titanic dataset filtered to passengers who survived." class="lightbox shadow rounded mb-3"}
+
+The other data updating tool is `reset_dashboard`, which clears any active filters and returns the data table to its original unfiltered state. The LLM typically uses this when users say "reset", "start over", or "clear filters".
+
+## Data analysis
+
+When a user asks analytical questions like "What is the average...?", "How many...?", or "Which item has the highest...?", the LLM generates a SQL query and requests a call to the `query` tool. Unlike the data updating tools, this tool will not update any reactive values. Instead, it will:
+
+1. Execute the SQL query
+2. Display both the SQL query and results in the UI
+3. 
Return the results back to the LLM for interpretation + +Here's an example of it in action: + +```{.python filename="titanic-app.py"} +from querychat import QueryChat +from seaborn import load_dataset + +titanic = load_dataset("titanic") +qc = QueryChat(titanic, "titanic") +app = qc.app() +``` + +![](/images/quickstart-summary.png){fig-alt="Screenshot of the querychat's app with a summary statistic inlined in the chat." class="lightbox shadow rounded mb-3"} + + +## View the source + +If you'd like to better understand how the tools work and how the LLM is prompted to use them, check out the following resources: + +**Source code:** + +- [`tools.py`](https://github.com/posit-dev/querychat/blob/main/pkg-py/src/querychat/tools.py) + +**Prompts:** + +- [`prompts/tool-update-dashboard.md`](https://github.com/posit-dev/querychat/blob/main/pkg-py/src/querychat/prompts/tool-update-dashboard.md) +- [`prompts/tool-reset-dashboard.md`](https://github.com/posit-dev/querychat/blob/main/pkg-py/src/querychat/prompts/tool-reset-dashboard.md) +- [`prompts/tool-query.md`](https://github.com/posit-dev/querychat/blob/main/pkg-py/src/querychat/prompts/tool-query.md) diff --git a/pkg-py/examples/01-hello-app.py b/pkg-py/examples/01-hello-app.py new file mode 100644 index 00000000..856c93bd --- /dev/null +++ b/pkg-py/examples/01-hello-app.py @@ -0,0 +1,6 @@ +from seaborn import load_dataset +from querychat import QueryChat + +titanic = load_dataset("titanic") +qc = QueryChat(titanic, "titanic") +app = qc.app() diff --git a/pkg-py/examples/02-prompt-app.py b/pkg-py/examples/02-prompt-app.py new file mode 100644 index 00000000..832059f1 --- /dev/null +++ b/pkg-py/examples/02-prompt-app.py @@ -0,0 +1,18 @@ + +from pathlib import Path +from seaborn import load_dataset +from querychat import QueryChat + +titanic = load_dataset("titanic") + +greeting = Path(__file__).parent / "greeting.md" +data_desc = Path(__file__).parent / "data_description.md" + +qc = QueryChat( + titanic, + "titanic", + 
greeting=greeting, + data_description=data_desc, +) + +qc.app() diff --git a/pkg-py/examples/03-sidebar-core-app.py b/pkg-py/examples/03-sidebar-core-app.py new file mode 100644 index 00000000..b37dc750 --- /dev/null +++ b/pkg-py/examples/03-sidebar-core-app.py @@ -0,0 +1,36 @@ +from seaborn import load_dataset +from shiny import App, render, ui +from querychat import QueryChat + +titanic = load_dataset("titanic") + +# 1. Provide data source to QueryChat +qc = QueryChat(titanic, "titanic") + +app_ui = ui.page_sidebar( + # 2. Create sidebar chat control + qc.sidebar(), + ui.card( + ui.card_header(ui.output_text("title")), + ui.output_data_frame("data_table"), + fill=True, + ), + fillable=True +) + + +def server(input, output, session): + # 3. Add server logic (to get reactive data frame and title) + qc_vals = qc.server() + + # 4. Use the filtered/sorted data frame reactively + @render.data_frame + def data_table(): + return qc_vals.df() + + @render.text + def title(): + return qc_vals.title() or "Titanic Dataset" + + +app = App(app_ui, server) diff --git a/pkg-py/examples/03-sidebar-express-app.py b/pkg-py/examples/03-sidebar-express-app.py new file mode 100644 index 00000000..8e0b7cac --- /dev/null +++ b/pkg-py/examples/03-sidebar-express-app.py @@ -0,0 +1,28 @@ +from seaborn import load_dataset +from shiny.express import render, ui +from querychat.express import QueryChat + +titanic = load_dataset("titanic") + +# 1. Provide data source to QueryChat +qc = QueryChat(titanic, "titanic") + +# 2. Add sidebar chat control +qc.sidebar() + +# 3. Add a card with reactive title and data frame +with ui.card(): + with ui.card_header(): + @render.text + def title(): + return qc.title() or "Titanic Dataset" + + @render.data_frame + def data_table(): + return qc.df() + +# 4. 
Set some page options (optional) +ui.page_opts( + fillable=True, + title="Titanic Dataset Explorer" +) diff --git a/pkg-py/examples/app-database-sqlite.py b/pkg-py/examples/app-database-sqlite.py deleted file mode 100644 index 54327d5c..00000000 --- a/pkg-py/examples/app-database-sqlite.py +++ /dev/null @@ -1,68 +0,0 @@ -from pathlib import Path - -import chatlas -import querychat -from seaborn import load_dataset -from shiny import App, render, ui -from sqlalchemy import create_engine - -# Load titanic data and create SQLite database -db_path = Path(__file__).parent / "titanic.db" -engine = create_engine("sqlite:///" + str(db_path)) - -if not db_path.exists(): - # For example purposes, we'll create the database if it doesn't exist. Don't - # do this in your app! - titanic = load_dataset("titanic") - titanic.to_sql("titanic", engine, if_exists="replace", index=False) - -greeting = Path(__file__).parent / "greeting.md" -data_desc = Path(__file__).parent / "data_description.md" - -# 1. Configure querychat - -def use_github_models(system_prompt: str) -> chatlas.Chat: - # GitHub models give us free rate-limited access to the latest LLMs - # you will need to have GITHUB_PAT defined in your environment - return chatlas.ChatGithub( - model="gpt-4.1", - system_prompt=system_prompt, - ) - -qc_config = querychat.init( - engine, - "titanic", - greeting=greeting, - data_description=data_desc, - client=use_github_models, -) - -# Create UI -app_ui = ui.page_sidebar( - # 2. Place the chat component in the sidebar - querychat.sidebar("chat"), - # Main panel with data viewer - ui.card( - ui.output_data_frame("data_table"), - fill=True, - ), - title="querychat with Python (SQLite)", - fillable=True, - class_="bslib-page-dashboard", -) - - -# Define server logic -def server(input, output, session): - # 3. Initialize querychat server with the config from step 1 - qc = querychat.server("chat", qc_config) - - # 4. 
Display the filtered dataframe - @render.data_frame - def data_table(): - # Access filtered data via qc.df() reactive - return qc.df() - - -# Create Shiny app -app = App(app_ui, server) diff --git a/pkg-py/examples/app-dataframe-pandas.py b/pkg-py/examples/app-dataframe-pandas.py deleted file mode 100644 index 487095a5..00000000 --- a/pkg-py/examples/app-dataframe-pandas.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path - -import chatlas -import querychat -from seaborn import load_dataset -from shiny import App, render, ui - -titanic = load_dataset("titanic") - -greeting = Path(__file__).parent / "greeting.md" -data_desc = Path(__file__).parent / "data_description.md" - -# 1. Configure querychat - -def use_github_models(system_prompt: str) -> chatlas.Chat: - # GitHub models give us free rate-limited access to the latest LLMs - # you will need to have GITHUB_PAT defined in your environment - return chatlas.ChatGithub( - model="gpt-4.1", - system_prompt=system_prompt, - ) - -qc_config = querychat.init( - titanic, - "titanic", - greeting=greeting, - data_description=data_desc, - client=use_github_models, -) - -# Create UI -app_ui = ui.page_sidebar( - # 2. Place the chat component in the sidebar - querychat.sidebar("chat"), - # Main panel with data viewer - ui.card( - ui.output_data_frame("data_table"), - fill=True, - ), - title="querychat with Python", - fillable=True, - class_="bslib-page-dashboard", -) - - -# Define server logic -def server(input, output, session): - # 3. Initialize querychat server with the config from step 1 - qc = querychat.server("chat", qc_config) - - # 4. 
Display the filtered dataframe - @render.data_frame - def data_table(): - # Access filtered data via chat.df() reactive - return qc.df() - - -# Create Shiny app -app = App(app_ui, server) diff --git a/pkg-py/examples/app.py b/pkg-py/examples/app.py deleted file mode 100644 index 02614833..00000000 --- a/pkg-py/examples/app.py +++ /dev/null @@ -1,66 +0,0 @@ -import chatlas -from seaborn import load_dataset -from shiny import App, render, ui - -from dotenv import load_dotenv -load_dotenv() - -import querychat - -titanic = load_dataset("titanic") - -# 1. Configure querychat. -# This is where you specify the dataset and can also -# override options like the greeting message, system prompt, model, etc. - - -def use_github_models(system_prompt: str) -> chatlas.Chat: - # GitHub models give us free rate-limited access to the latest LLMs - # you will need to have GITHUB_PAT defined in your environment - return chatlas.ChatGithub( - model="gpt-4.1", - system_prompt=system_prompt, - ) - - -qc_config = querychat.init( - data_source=titanic, - table_name="titanic", - client=use_github_models, -) - -# Create UI -app_ui = ui.page_sidebar( - # 2. Use querychat.sidebar(id) in a ui.page_sidebar. - # Alternatively, use querychat.ui(id) elsewhere if you don't want your - # chat interface to live in a sidebar. - querychat.sidebar("chat"), - ui.card( - ui.card_header(ui.output_text("title")), - ui.output_data_frame("data_table"), - fill=True, - ), - fillable=True, - class_="bslib-page-dashboard" -) - - -# Define server logic -def server(input, output, session): - # 3. Create a querychat object using the config from step 1. - qc = querychat.server("chat", qc_config) - - # 4. Use the filtered/sorted data frame anywhere you wish, via the - # chat.df() reactive. - @render.data_frame - def data_table(): - return qc.df() - - @render.text - def title(): - title = qc.title() - return title if title else "Titanic Dataset" - - -# Create Shiny app -app = App(app_ui, server)