From c2d0e7571bb541404e88189b9f355085fa5fce74 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 11:06:06 -0400 Subject: [PATCH 1/8] Performance messy initial --- meltano.yml | 29 +++++++--- performance/1m_rows_generate.py | 28 ++++++++++ performance/README.md | 25 +++++++++ performance/meltano_import.sh | 2 + performance/native_import.sh | 53 +++++++++++++++++++ performance/prep.sh | 35 +++++++++++++ performance/progres_indicator.sh | 15 ++++++ performance/speed_compare.py | 59 +++++++++++++++++++++ performance/speed_compare.sh | 61 ++++++++++++++++++++++ performance/target_postgres_copy_branch.sh | 2 + 10 files changed, 302 insertions(+), 7 deletions(-) create mode 100644 performance/1m_rows_generate.py create mode 100644 performance/README.md create mode 100755 performance/meltano_import.sh create mode 100755 performance/native_import.sh create mode 100755 performance/prep.sh create mode 100755 performance/progres_indicator.sh create mode 100644 performance/speed_compare.py create mode 100755 performance/speed_compare.sh create mode 100755 performance/target_postgres_copy_branch.sh diff --git a/meltano.yml b/meltano.yml index 64fa82a0..ac83a3a2 100644 --- a/meltano.yml +++ b/meltano.yml @@ -11,9 +11,11 @@ plugins: config: streams: - stream_name: animals - input_filename: https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/animals-data.jsonl + input_filename: + https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/animals-data.jsonl - stream_name: page_views - input_filename: https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/pageviews-data.jsonl + input_filename: + https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/pageviews-data.jsonl stream_maps: animals: __key_properties__: [id] @@ -30,13 +32,22 @@ plugins: - commits.url - commits.sha - commits.commit_timestamp + - name: tap-csv + variant: meltanolabs + pip_url: git+https://github.com/MeltanoLabs/tap-csv.git + config: + files: + - entity: data_target_postgres + path: $MELTANO_PROJECT_ROOT/performance/data.csv + keys: [column_1] + add_metadata_columns: false loaders: - name: target-postgres namespace: target_postgres pip_url: -e . 
settings: - name: sqlalchemy_url - kind: password + kind: string sensitive: true - name: ssl_enable kind: boolean sensitive: true - name: ssl_client_certificate_enable kind: boolean sensitive: true - name: ssl_mode - name: ssl_certificate_authority - kind: password + kind: string sensitive: true - name: ssl_client_certificate - kind: password + kind: string sensitive: true - name: ssl_client_private_key - kind: password + kind: string sensitive: true - name: password - kind: password + kind: string sensitive: true - name: host - name: port @@ -72,6 +83,10 @@ plugins: password: postgres database: postgres target_schema: test + validate_records: false add_record_metadata: true + - name: target-postgres-copy-branch + inherit_from: target-postgres + pip_url: git+https://github.com/kinghuang/target-postgres@bulk-insert-copy environments: - name: dev diff --git a/performance/1m_rows_generate.py b/performance/1m_rows_generate.py new file mode 100644 index 00000000..465a5f5b --- /dev/null +++ b/performance/1m_rows_generate.py @@ -0,0 +1,28 @@ +import csv +import random +import string + +# Number of rows and columns +num_rows = 1_000_000 +num_columns = 10 + +# Generate random data for CSV +def random_string(length=10): + return ''.join(random.choices(string.ascii_letters + string.digits, k=length)) + +# Generate the CSV file +csv_filename = "data.csv" + +with open(csv_filename, mode='w', newline='') as csv_file: + writer = csv.writer(csv_file) + + # Write header + header = [f"column_{i+1}" for i in range(num_columns)] + writer.writerow(header) + + # Write data rows + for _ in range(num_rows): + row = [random_string() for _ in range(num_columns)] + writer.writerow(row) + +print(f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated.") diff --git a/performance/README.md b/performance/README.md new file mode 100644 index 00000000..12d658de --- /dev/null +++ b/performance/README.md @@ -0,0 +1,25 @@ +# target-postgres Performance Analysis + +Just a POC right now. + +Main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets. + +Main points: +1. We need something to compare to. For postgres we have native import commands that are well optimized. We will use this as a baseline. +1. Relative speed is the metric to focus on. If we focus on absolute speed then there are a bunch of hardware considerations that we are not trying to solve here (we would need to consider how parallelization fits into the mix here if we go there) + +# Why do this work? +1. Without it we are guessing at what can help improve performance; this gives us a more objective way to pick what we should focus on + +# How to run +1. `./prep.sh` gets the data together for you in the right place +2. `./speed_compare.sh meltano_import.sh native_import.sh` runs each and gives you a nice time comparison + +# Results on my slow machine + + + +# Other questions / concerns +1. `COPY` is single-threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well +1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records
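The native baseline the README describes boils down to a `COPY` into a staging table followed by a keyed merge, which is what `native_import.sh` below drives through psql. As a hedged reference, here is a minimal sketch of the same pattern in Python with psycopg2; the `test.data_target_postgres` table name, the staging-table approach, and the connection details are assumptions based on the meltano.yml config, while the `ON CONFLICT (column_1)` key matches the script's merge clause.

```python
import psycopg2  # assumed client library; native_import.sh does the same via psql

cols = ", ".join(f"column_{i}" for i in range(1, 11))
updates = ", ".join(f"column_{i} = EXCLUDED.column_{i}" for i in range(2, 11))

conn = psycopg2.connect(host="localhost", port=5432, user="postgres",
                        password="postgres", dbname="postgres")
with conn, conn.cursor() as cur:
    # Stage the CSV with COPY, the fastest single-threaded path into Postgres.
    cur.execute("CREATE TEMP TABLE staging (LIKE test.data_target_postgres INCLUDING ALL)")
    with open("data.csv") as f:
        cur.copy_expert(f"COPY staging ({cols}) FROM STDIN WITH (FORMAT csv, HEADER true)", f)
    # Merge into the real table so the baseline matches the target's upsert semantics.
    cur.execute(f"INSERT INTO test.data_target_postgres ({cols}) "
                f"SELECT {cols} FROM staging "
                f"ON CONFLICT (column_1) DO UPDATE SET {updates}")
conn.close()
```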
diff --git a/performance/meltano_import.sh b/performance/meltano_import.sh new file mode 100755 index 00000000..9ed32d10 --- /dev/null +++ b/performance/meltano_import.sh @@ -0,0 +1,2 @@ +#!/bin/bash +meltano invoke target-postgres < data.singer diff --git a/performance/native_import.sh b/performance/native_import.sh new file mode 100755 index 00000000..58c56726 --- /dev/null +++ b/performance/native_import.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Variables +CSV_FILE="data.csv" +DB_NAME="postgres" +DB_USER="postgres" +DB_PASSWORD="postgres" +DB_HOST="localhost" +DB_PORT="5432" + +# Export the password to avoid being prompted +export PGPASSWORD=$DB_PASSWORD + +# Execute COPY command to import the CSV into PostgreSQL +#psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c "\COPY large_data FROM '$CSV_FILE' CSV HEADER;" +# Begin transaction +psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME < performance/data.singer + +# Create initial table in postgres + +#Spin up postgres instance +#podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres + +#Vars We'd definitely want this as a meltano utility, just as POC right now +DB_NAME="postgres" +DB_USER="postgres" +DB_PASSWORD="postgres" +DB_HOST="localhost" +DB_PORT="5432" +export PGPASSWORD=$DB_PASSWORD + +psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME < time2: + diff = time1 - time2 + ratio = time1 / time2 if time2 != 0 else float('inf') + print(f"{script2} is faster than {script1} by {diff:.6f} seconds.") + print(f"{script2} is {ratio:.2f} times faster than {script1}.") + else: + print(f"{script1} and {script2} have the same execution time.") + +if __name__ == "__main__": + main() diff --git a/performance/speed_compare.sh b/performance/speed_compare.sh new file mode 100755 index 00000000..b9e70281 --- /dev/null +++ b/performance/speed_compare.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Check if two arguments are given +if [ $# -ne 2 ]; then + echo "Usage: $0 script1 script2" + exit 1 +fi + +script1="$1" +script2="$2" + +# Check if the scripts exist and are executable +if [ ! -x "$script1" ]; then + echo "Error: $script1 does not exist or is not executable." + exit 1 +fi + +if [ ! -x "$script2" ]; then + echo "Error: $script2 does not exist or is not executable." + exit 1 +fi + +# Function to measure execution time +measure_time() { + local script="$1" + local start_time end_time duration + start_time=$(date +%s.%N) + ./"$script" + end_time=$(date +%s.%N) + duration=$(awk "BEGIN {print $end_time - $start_time}") + echo "$duration" +} + +# Measure execution time for script1 +echo "Measuring execution time for $script1..." +time1=$(measure_time "$script1") +echo "Execution time for $script1: $time1 seconds" + +# Measure execution time for script2 +echo "Measuring execution time for $script2..." +time2=$(measure_time "$script2") +echo "Execution time for $script2: $time2 seconds" + +# Compare the execution times using awk +comparison=$(awk -v t1="$time1" -v t2="$time2" 'BEGIN { if (t1 < t2) print "script1_faster"; else if (t1 > t2) print "script2_faster"; else print "equal" }') + +if [ "$comparison" = "script1_faster" ]; then + diff=$(awk "BEGIN {print $time2 - $time1}") + ratio=$(awk "BEGIN {print $time2 / $time1}") + ratio_formatted=$(printf "%.2f" "$ratio") + echo "$script1 is faster than $script2 by $diff seconds."
+ echo "$script1 is $ratio_formatted times faster than $script2." +elif [ "$comparison" = "script2_faster" ]; then + diff=$(awk "BEGIN {print $time1 - $time2}") + ratio=$(awk "BEGIN {print $time1 / $time2}") + ratio_formatted=$(printf "%.2f" "$ratio") + echo "$script2 is faster than $script1 by $diff seconds." + echo "$script2 is $ratio_formatted times faster than $script1." +else + echo "$script1 and $script2 have the same execution time." +fi \ No newline at end of file diff --git a/performance/target_postgres_copy_branch.sh b/performance/target_postgres_copy_branch.sh new file mode 100755 index 00000000..1b2b8b53 --- /dev/null +++ b/performance/target_postgres_copy_branch.sh @@ -0,0 +1,2 @@ +#!/bin/bash +meltano invoke target-postgres-copy-branch < data.singer From 81f14c2a9f1f327e193950d774e429244df0ec40 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:41:13 -0400 Subject: [PATCH 2/8] Performance refactored with table added --- performance/speed_compare.py | 59 ------ performance/speed_compare.sh | 61 ------ podman.sh | 3 + scripts/performance/.gitignore | 2 + .../performance}/1m_rows_generate.py | 3 +- .../performance}/README.md | 21 ++- scripts/performance/meltano.yml | 67 +++++++ .../performance/perf_tests/pg_copy_upsert.sh | 0 .../target_postgres_copy_branch.sh | 0 ...target_postgres_copy_branch_no_validate.sh | 3 + .../target_postgres_current_branch.sh | 0 ...get_postgres_current_branch_no_validate.sh | 3 + .../extractors/tap-csv--meltanolabs.lock | 83 ++++++++ .../extractors/tap-github--meltanolabs.lock | 177 ++++++++++++++++++ {performance => scripts/performance}/prep.sh | 7 +- .../performance}/progres_indicator.sh | 0 scripts/performance/speed_compare.py | 39 ++++ 17 files changed, 393 insertions(+), 135 deletions(-) delete mode 100644 performance/speed_compare.py delete mode 100755 performance/speed_compare.sh create mode 100755 podman.sh create mode 100644 scripts/performance/.gitignore rename {performance => scripts/performance}/1m_rows_generate.py (91%) rename {performance => scripts/performance}/README.md (51%) create mode 100644 scripts/performance/meltano.yml rename performance/native_import.sh => scripts/performance/perf_tests/pg_copy_upsert.sh (100%) rename {performance => scripts/performance/perf_tests}/target_postgres_copy_branch.sh (100%) create mode 100755 scripts/performance/perf_tests/target_postgres_copy_branch_no_validate.sh rename performance/meltano_import.sh => scripts/performance/perf_tests/target_postgres_current_branch.sh (100%) create mode 100755 scripts/performance/perf_tests/target_postgres_current_branch_no_validate.sh create mode 100644 scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock create mode 100644 scripts/performance/plugins/extractors/tap-github--meltanolabs.lock rename {performance => scripts/performance}/prep.sh (71%) rename {performance => scripts/performance}/progres_indicator.sh (100%) create mode 100644 scripts/performance/speed_compare.py diff --git a/performance/speed_compare.py b/performance/speed_compare.py deleted file mode 100644 index 159e3f02..00000000 --- a/performance/speed_compare.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import subprocess -import time -import os - -def measure_time(script_path): - # Check if the script exists and is executable - if not os.path.isfile(script_path): - print(f"Error: {script_path} does not exist.") - sys.exit(1) - if not os.access(script_path, os.X_OK): - print(f"Error: {script_path} is not executable.") - sys.exit(1) - - # 
Measure execution time - start_time = time.perf_counter() - try: - subprocess.run([script_path], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - print(f"Error: {script_path} exited with a non-zero status.") - sys.exit(1) - end_time = time.perf_counter() - duration = end_time - start_time - return duration - -def main(): - if len(sys.argv) != 3: - print(f"Usage: {sys.argv[0]} script1 script2") - sys.exit(1) - - script1 = sys.argv[1] - script2 = sys.argv[2] - - print(f"Measuring execution time for {script1}...") - time1 = measure_time(script1) - print(f"Execution time for {script1}: {time1:.6f} seconds\n") - - print(f"Measuring execution time for {script2}...") - time2 = measure_time(script2) - print(f"Execution time for {script2}: {time2:.6f} seconds\n") - - # Compare execution times - if time1 < time2: - diff = time2 - time1 - ratio = time2 / time1 if time1 != 0 else float('inf') - print(f"{script1} is faster than {script2} by {diff:.6f} seconds.") - print(f"{script1} is {ratio:.2f} times faster than {script2}.") - elif time1 > time2: - diff = time1 - time2 - ratio = time1 / time2 if time2 != 0 else float('inf') - print(f"{script2} is faster than {script1} by {diff:.6f} seconds.") - print(f"{script2} is {ratio:.2f} times faster than {script1}.") - else: - print(f"{script1} and {script2} have the same execution time.") - -if __name__ == "__main__": - main() diff --git a/performance/speed_compare.sh b/performance/speed_compare.sh deleted file mode 100755 index b9e70281..00000000 --- a/performance/speed_compare.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Check if two arguments are given -if [ $# -ne 2 ]; then - echo "Usage: $0 script1 script2" - exit 1 -fi - -script1="$1" -script2="$2" - -# Check if the scripts exist and are executable -if [ ! -x "$script1" ]; then - echo "Error: $script1 does not exist or is not executable." - exit 1 -fi - -if [ ! -x "$script2" ]; then - echo "Error: $script2 does not exist or is not executable." - exit 1 -fi - -# Function to measure execution time -measure_time() { - local script="$1" - local start_time end_time duration - start_time=$(date +%s.%N) - ./"$script" - end_time=$(date +%s.%N) - duration=$(awk "BEGIN {print $end_time - $start_time}") - echo "$duration" -} - -# Measure execution time for script1 -echo "Measuring execution time for $script1..." -time1=$(measure_time "$script1") -echo "Execution time for $script1: $time1 seconds" - -# Measure execution time for script2 -echo "Measuring execution time for $script2..." -time2=$(measure_time "$script2") -echo "Execution time for $script2: $time2 seconds" - -# Compare the execution times using awk -comparison=$(awk -v t1="$time1" -v t2="$time2" 'BEGIN { if (t1 < t2) print "script1_faster"; else if (t1 > t2) print "script2_faster"; else print "equal" }') - -if [ "$comparison" = "script1_faster" ]; then - diff=$(awk "BEGIN {print $time2 - $time1}") - ratio=$(awk "BEGIN {print $time2 / $time1}") - ratio_formatted=$(printf "%.2f" "$ratio") - echo "$script1 is faster than $script2 by $diff seconds." - echo "$script1 is $ratio_formatted times faster than $script2." -elif [ "$comparison" = "script2_faster" ]; then - diff=$(awk "BEGIN {print $time1 - $time2}") - ratio=$(awk "BEGIN {print $time1 / $time2}") - ratio_formatted=$(printf "%.2f" "$ratio") - echo "$script2 is faster than $script1 by $diff seconds." - echo "$script2 is $ratio_formatted times faster than $script1." -else - echo "$script1 and $script2 have the same execution time." 
-fi \ No newline at end of file diff --git a/podman.sh b/podman.sh new file mode 100755 index 00000000..e9666eee --- /dev/null +++ b/podman.sh @@ -0,0 +1,3 @@ +#!/bin/bash +#Username postgres password postgres +podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres diff --git a/scripts/performance/.gitignore b/scripts/performance/.gitignore new file mode 100644 index 00000000..6fe07289 --- /dev/null +++ b/scripts/performance/.gitignore @@ -0,0 +1,2 @@ +data.csv +data.singer \ No newline at end of file diff --git a/performance/1m_rows_generate.py b/scripts/performance/1m_rows_generate.py similarity index 91% rename from performance/1m_rows_generate.py rename to scripts/performance/1m_rows_generate.py index 465a5f5b..a71f1fed 100644 --- a/performance/1m_rows_generate.py +++ b/scripts/performance/1m_rows_generate.py @@ -2,7 +2,6 @@ import random import string -# Number of rows and columns num_rows = 1_000_000 num_columns = 10 @@ -25,4 +24,4 @@ def random_string(length=10): row = [random_string() for _ in range(num_columns)] writer.writerow(row) -print(f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated.") +print(f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated.") \ No newline at end of file diff --git a/performance/README.md b/scripts/performance/README.md similarity index 51% rename from performance/README.md rename to scripts/performance/README.md index 12d658de..4fdf71d7 100644 --- a/performance/README.md +++ b/scripts/performance/README.md @@ -1,7 +1,5 @@ # target-postgres Performance Analysis -Just a POC right now. - Main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets. Main points: @@ -13,13 +11,18 @@ Main points: # How to run 1. `./prep.sh` gets the data together for you in the right place -2. `./speed_compare.sh meltano_import.sh native_import.sh` runs each and gives you a nice time comparison - -# Results on my slow machine - - +2. `python speed_compare.py ./meltano_import.sh ./pg_copy_upsert.sh` runs each and gives you a nice time comparison +3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` + +# Results on my machine +| **Test Name** | **Total Run Time (s)** | **x Slower Than Native Copy** | +|-------------------------------------------------------------|------------------------|-------------------------------| +| `./perf_tests/pg_copy_upsert.sh` | 13.64 | 1.0000 | +| `./perf_tests/target_postgres_copy_branch_no_validate.sh` | 100.50 | 7.3697 | +| `./perf_tests/target_postgres_current_branch_no_validate.sh`| 141.48 | 10.3749 | +| `./perf_tests/target_postgres_copy_branch.sh` | 265.53 | 19.4719 | +| `./perf_tests/target_postgres_current_branch.sh` | 298.37 | 21.8799 | # Other questions / concerns 1. `COPY` is single-threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well -1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records +1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records \ No newline at end of file
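Some quick arithmetic on the results table above, purely from the measured run times for one million rows, to give a sense of absolute throughput and of what record validation alone costs:

```python
rows = 1_000_000
print(rows / 13.64)     # native COPY baseline: ~73,300 rows/s
print(rows / 298.37)    # current branch, default settings: ~3,350 rows/s
print(265.53 / 100.50)  # copy branch: validation makes the run ~2.64x slower
print(298.37 / 141.48)  # current branch: validation makes the run ~2.11x slower
```

In other words, turning validation off alone roughly halves the gap to the native baseline (21.9x down to 10.4x), which is why the `_no_validate` variants are measured separately.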
diff --git a/scripts/performance/meltano.yml b/scripts/performance/meltano.yml new file mode 100644 index 00000000..c25b2b5c --- /dev/null +++ b/scripts/performance/meltano.yml @@ -0,0 +1,67 @@ +version: 1 +send_anonymous_usage_stats: true +default_environment: dev +project_id: target-postgres +plugins: + extractors: + - name: tap-csv + variant: meltanolabs + pip_url: git+https://github.com/MeltanoLabs/tap-csv.git + config: + files: + - entity: data_target_postgres + path: $MELTANO_PROJECT_ROOT/data.csv + keys: [column_1] + add_metadata_columns: false + loaders: + - name: target-postgres + namespace: target_postgres + pip_url: -e ../../. + settings: + - name: sqlalchemy_url + kind: string + sensitive: true + - name: ssl_enable + kind: boolean + sensitive: true + - name: ssl_client_certificate_enable + kind: boolean + sensitive: true + - name: ssl_mode + - name: ssl_certificate_authority + kind: string + sensitive: true + - name: ssl_client_certificate + kind: string + sensitive: true + - name: ssl_client_private_key + kind: string + sensitive: true + - name: password + kind: string + sensitive: true + - name: host + - name: port + kind: integer + - name: user + - name: database + - name: target_schema + - name: add_record_metadata + kind: boolean + - name: validate_records + kind: boolean + - name: batch_size_rows + kind: integer + config: + host: localhost + port: 5432 + user: postgres + password: postgres + database: postgres + target_schema: test + add_record_metadata: true + - name: target-postgres-copy-branch + inherit_from: target-postgres + pip_url: git+https://github.com/kinghuang/target-postgres@bulk-insert-copy +environments: +- name: dev diff --git a/performance/native_import.sh b/scripts/performance/perf_tests/pg_copy_upsert.sh similarity index 100% rename from performance/native_import.sh rename to scripts/performance/perf_tests/pg_copy_upsert.sh diff --git a/performance/target_postgres_copy_branch.sh b/scripts/performance/perf_tests/target_postgres_copy_branch.sh similarity index 100% rename from performance/target_postgres_copy_branch.sh rename to scripts/performance/perf_tests/target_postgres_copy_branch.sh diff --git a/scripts/performance/perf_tests/target_postgres_copy_branch_no_validate.sh b/scripts/performance/perf_tests/target_postgres_copy_branch_no_validate.sh new file mode 100755 index 00000000..e27475c0 --- /dev/null +++ b/scripts/performance/perf_tests/target_postgres_copy_branch_no_validate.sh @@ -0,0 +1,3 @@ +#!/bin/bash +export TARGET_POSTGRES_VALIDATE_RECORDS="false" +meltano invoke target-postgres-copy-branch < data.singer diff --git a/performance/meltano_import.sh b/scripts/performance/perf_tests/target_postgres_current_branch.sh similarity index 100% rename from performance/meltano_import.sh rename to scripts/performance/perf_tests/target_postgres_current_branch.sh diff --git a/scripts/performance/perf_tests/target_postgres_current_branch_no_validate.sh b/scripts/performance/perf_tests/target_postgres_current_branch_no_validate.sh new file mode 100755 index 00000000..181613ef --- /dev/null +++ b/scripts/performance/perf_tests/target_postgres_current_branch_no_validate.sh @@ -0,0 +1,3 @@ +#!/bin/bash +export TARGET_POSTGRES_VALIDATE_RECORDS="false" +meltano invoke target-postgres < data.singer diff --git a/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock
b/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock new file mode 100644 index 00000000..e804040f --- /dev/null +++ b/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock @@ -0,0 +1,83 @@ +{ + "plugin_type": "extractors", + "name": "tap-csv", + "namespace": "tap_csv", + "variant": "meltanolabs", + "label": "Comma Separated Values (CSV)", + "docs": "https://hub.meltano.com/extractors/tap-csv--meltanolabs", + "repo": "https://github.com/MeltanoLabs/tap-csv", + "pip_url": "git+https://github.com/MeltanoLabs/tap-csv.git", + "description": "Generic data extractor of CSV (comma separated value) files", + "logo_url": "https://hub.meltano.com/assets/logos/extractors/csv.png", + "capabilities": [ + "catalog", + "discover" + ], + "settings_group_validation": [ + [ + "files" + ], + [ + "csv_files_definition" + ] + ], + "settings": [ + { + "name": "add_metadata_columns", + "kind": "boolean", + "value": false, + "label": "Add Metadata Columns", + "description": "When True, add the metadata columns (`_sdc_source_file`, `_sdc_source_file_mtime`, `_sdc_source_lineno`) to output." + }, + { + "name": "csv_files_definition", + "kind": "string", + "label": "Csv Files Definition", + "documentation": "https://github.com/MeltanoLabs/tap-csv#settings", + "description": "Project-relative path to JSON file holding array of objects as described under [Files](#files) - with `entity`, `path`, `keys`, and other optional keys:\n\n```json\n[\n {\n \"entity\": \"\",\n \"path\": \"\",\n \"keys\": [\"\"],\n },\n // ...\n]\n```\n", + "placeholder": "Ex. files-def.json" + }, + { + "name": "faker_config.locale", + "kind": "array", + "label": "Faker Config Locale", + "description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization" + }, + { + "name": "faker_config.seed", + "kind": "string", + "label": "Faker Config Seed", + "description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator" + }, + { + "name": "files", + "kind": "array", + "label": "Files", + "description": "Array of objects with `entity`, `path`, `keys`, and `encoding` [Optional] keys:\n\n* `entity`: The entity name, used as the table name for the data loaded from that CSV.\n* `path`: Local path (relative to the project's root) to the file to be ingested. Note that this may be a directory, in which case all files in that directory and any of its subdirectories will be recursively processed\n* `keys`: The names of the columns that constitute the unique keys for that entity.\n* `encoding`: [Optional] The file encoding to use when reading the file (i.e. \"latin1\", \"UTF-8\"). Use this setting when you get a UnicodeDecodeError error.\n Each input CSV file must be a traditionally-delimited CSV (comma separated columns, newlines indicate new rows, double quoted values).\n\nThe following entries are passed through in an internal CSV dialect that then is used to configure the CSV reader:\n\n* `delimiter`: A one-character string used to separate fields. It defaults to ','.\n* `doublequote`: Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. It defaults to True.\n* `escapechar`: A one-character string used by the reader, where the escapechar removes any special meaning from the following character. 
It defaults to None, which disables escaping.\n* `quotechar`: A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '\"'.\n* `skipinitialspace`: When True, spaces immediately following the delimiter are ignored. The default is False.\n* `strict`: When True, raise exception Error on bad CSV input. The default is False.\n\nThe first row is the header defining the attribute name for that column and will result to a column of the same name in the database. It must have a valid format with no spaces or special characters (like for example `!` or `@`, etc).\n" + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Flattening Enabled", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Flattening Max Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config", + "description": "User-defined config values to be used within map expressions." + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps", + "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." + } + ] +} \ No newline at end of file diff --git a/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock new file mode 100644 index 00000000..f2dda680 --- /dev/null +++ b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock @@ -0,0 +1,177 @@ +{ + "plugin_type": "extractors", + "name": "tap-github", + "namespace": "tap_github", + "variant": "meltanolabs", + "label": "GitHub", + "docs": "https://hub.meltano.com/extractors/tap-github--meltanolabs", + "repo": "https://github.com/MeltanoLabs/tap-github", + "pip_url": "git+https://github.com/MeltanoLabs/tap-github.git", + "description": "Code hosting platform", + "logo_url": "https://hub.meltano.com/assets/logos/extractors/github.png", + "capabilities": [ + "about", + "batch", + "catalog", + "discover", + "schema-flattening", + "state", + "stream-maps" + ], + "settings_group_validation": [ + [ + "repositories" + ], + [ + "organizations" + ], + [ + "searches" + ], + [ + "user_usernames" + ], + [ + "user_ids" + ] + ], + "settings": [ + { + "name": "additional_auth_tokens", + "kind": "array", + "label": "Additional Auth Tokens", + "description": "List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits." 
+ }, + { + "name": "auth_token", + "kind": "string", + "label": "Auth Token", + "description": "GitHub token to authenticate with.", + "sensitive": true + }, + { + "name": "batch_config.encoding.compression", + "kind": "options", + "label": "Batch Config Encoding Compression", + "description": "Compression format to use for batch files.", + "options": [ + { + "label": "Gzip", + "value": "gzip" + }, + { + "label": "None", + "value": "none" + } + ] + }, + { + "name": "batch_config.encoding.format", + "kind": "options", + "label": "Batch Config Encoding Format", + "description": "Format to use for batch files.", + "options": [ + { + "label": "Jsonl", + "value": "jsonl" + } + ] + }, + { + "name": "batch_config.storage.prefix", + "kind": "string", + "label": "Batch Config Storage Prefix", + "description": "Prefix to use when writing batch files." + }, + { + "name": "batch_config.storage.root", + "kind": "string", + "label": "Batch Config Storage Root", + "description": "Root path to use when writing batch files." + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Flattening Enabled", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Flattening Max Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "metrics_log_level", + "kind": "string", + "label": "Metrics Log Level", + "description": "The log level of the API response metrics." + }, + { + "name": "organizations", + "kind": "array", + "label": "Organizations", + "description": "An array of strings containing the github organizations to be included" + }, + { + "name": "rate_limit_buffer", + "kind": "integer", + "label": "Rate Limit Buffer", + "description": "Add a buffer to avoid consuming all query points for the token at hand. Defaults to 1000." + }, + { + "name": "repositories", + "kind": "array", + "label": "Repositories", + "description": "An array of strings containing the github repos to be included" + }, + { + "name": "searches", + "kind": "array", + "label": "Searches", + "description": "An array of search descriptor objects with the following properties. \"name\" - a human readable name for the search query. \"query\" - a github search string (generally the same as would come after ?q= in the URL)" + }, + { + "name": "skip_parent_streams", + "kind": "boolean", + "label": "Skip Parent Streams", + "description": "Set to true to skip API calls for the parent streams (such as repositories) if it is not selected but children are" + }, + { + "name": "start_date", + "kind": "date_iso8601", + "label": "Start Date" + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config" + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps" + }, + { + "name": "user_agent", + "kind": "string", + "label": "User Agent" + }, + { + "name": "user_ids", + "kind": "array", + "label": "User IDs", + "description": "A list of GitHub user ids." + }, + { + "name": "user_usernames", + "kind": "array", + "label": "User Usernames", + "description": "A list of GithHub usernames." 
+ } + ], + "select": [ + "*.*", + "!traffic_*.*" + ] +} \ No newline at end of file diff --git a/performance/prep.sh b/scripts/performance/prep.sh similarity index 71% rename from performance/prep.sh rename to scripts/performance/prep.sh index 2061d15d..b725b81d 100755 --- a/performance/prep.sh +++ b/scripts/performance/prep.sh @@ -1,12 +1,11 @@ #!/bin/bash -# time python 1m_rows_generate.py -# Not trying to test the time here but this takes a very long time -#time meltano invoke tap-csv > performance/data.singer +time python 1m_rows_generate.py +time meltano invoke tap-csv > data.singer # Create initial table in postgres #Spin up postgres instance -#podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres +podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres #Vars We'd definietly want this as a meltano utility, just as POC right now DB_NAME="postgres" diff --git a/performance/progres_indicator.sh b/scripts/performance/progres_indicator.sh similarity index 100% rename from performance/progres_indicator.sh rename to scripts/performance/progres_indicator.sh diff --git a/scripts/performance/speed_compare.py b/scripts/performance/speed_compare.py new file mode 100644 index 00000000..2fa8b9cd --- /dev/null +++ b/scripts/performance/speed_compare.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import os +import subprocess +import sys +import time + + +def measure_time(script_path): + # Check if the script exists and is executable + if not os.path.isfile(script_path): + print(f"Error: {script_path} does not exist.") + sys.exit(1) + if not os.access(script_path, os.X_OK): + print(f"Error: {script_path} is not executable.") + sys.exit(1) + + # Measure execution time + print(f"Measuring execution time for {script_path}...") + start_time = time.perf_counter() + try: + subprocess.run([script_path], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print(f"Error: {script_path} exited with a non-zero status.") + sys.exit(1) + end_time = time.perf_counter() + duration = end_time - start_time + print(f"Execution time for {script_path}: {duration:.6f} seconds\n") + return duration + +def main(): + perf_tests_dir = "./perf_tests" + scripts = [test for test in os.listdir(perf_tests_dir) if os.path.isfile(os.path.join(perf_tests_dir, test))] + for script in scripts: + script_path = os.path.join("./perf_tests", script) + measure_time(script_path) + +if __name__ == "__main__": + main() From e6dd402fff59257535e4da071531653a3570f85c Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:42:27 -0400 Subject: [PATCH 3/8] Linter --- scripts/performance/.gitignore | 2 +- scripts/performance/1m_rows_generate.py | 14 +++++++++----- scripts/performance/README.md | 10 +++++----- .../performance/perf_tests/pg_copy_upsert.sh | 2 +- .../extractors/tap-csv--meltanolabs.lock | 2 +- .../extractors/tap-github--meltanolabs.lock | 2 +- scripts/performance/prep.sh | 2 +- scripts/performance/speed_compare.py | 17 ++++++++++++++--- 8 files changed, 33 insertions(+), 18 deletions(-) diff --git a/scripts/performance/.gitignore b/scripts/performance/.gitignore index 6fe07289..10f39b4d 100644 --- a/scripts/performance/.gitignore +++ b/scripts/performance/.gitignore @@ -1,2 +1,2 @@ data.csv -data.singer \ No newline at end of file +data.singer diff --git a/scripts/performance/1m_rows_generate.py b/scripts/performance/1m_rows_generate.py index a71f1fed..280aa3bf 100644 --- a/scripts/performance/1m_rows_generate.py +++ 
From e6dd402fff59257535e4da071531653a3570f85c Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:42:27 -0400 Subject: [PATCH 3/8] Linter --- scripts/performance/.gitignore | 2 +- scripts/performance/1m_rows_generate.py | 14 +++++++++----- scripts/performance/README.md | 10 +++++----- .../performance/perf_tests/pg_copy_upsert.sh | 2 +- .../extractors/tap-csv--meltanolabs.lock | 2 +- .../extractors/tap-github--meltanolabs.lock | 2 +- scripts/performance/prep.sh | 2 +- scripts/performance/speed_compare.py | 17 ++++++++++++++--- 8 files changed, 33 insertions(+), 18 deletions(-) diff --git a/scripts/performance/.gitignore b/scripts/performance/.gitignore index 6fe07289..10f39b4d 100644 --- a/scripts/performance/.gitignore +++ b/scripts/performance/.gitignore @@ -1,2 +1,2 @@ data.csv -data.singer \ No newline at end of file +data.singer diff --git a/scripts/performance/1m_rows_generate.py b/scripts/performance/1m_rows_generate.py index a71f1fed..280aa3bf 100644 --- a/scripts/performance/1m_rows_generate.py +++ b/scripts/performance/1m_rows_generate.py @@ -5,23 +5,27 @@ num_rows = 1_000_000 num_columns = 10 + # Generate random data for CSV def random_string(length=10): - return ''.join(random.choices(string.ascii_letters + string.digits, k=length)) + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + # Generate the CSV file csv_filename = "data.csv" -with open(csv_filename, mode='w', newline='') as csv_file: +with open(csv_filename, mode="w", newline="") as csv_file: writer = csv.writer(csv_file) - + # Write header header = [f"column_{i+1}" for i in range(num_columns)] writer.writerow(header) - + # Write data rows for _ in range(num_rows): row = [random_string() for _ in range(num_columns)] writer.writerow(row) -print(f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated.") \ No newline at end of file +print( + f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated." +) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 4fdf71d7..63fde7ce 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -1,18 +1,18 @@ # target-postgres Performance Analysis -Main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets. +Main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets. Main points: -1. We need something to compare to. For postgres we have native import commands that are well optimized. We will use this as a baseline. +1. We need something to compare to. For postgres we have native import commands that are well optimized. We will use this as a baseline. 1. Relative speed is the metric to focus on. If we focus on absolute speed then there are a bunch of hardware considerations that we are not trying to solve here (we would need to consider how parallelization fits into the mix here if we go there) # Why do this work? -1. Without it we are guessing at what can help improve performance; this gives us a more objective way to pick what we should focus on +1. Without it we are guessing at what can help improve performance; this gives us a more objective way to pick what we should focus on # How to run 1. `./prep.sh` gets the data together for you in the right place 2. `python speed_compare.py ./meltano_import.sh ./pg_copy_upsert.sh` runs each and gives you a nice time comparison -3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` +3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` # Results on my machine | **Test Name** | **Total Run Time (s)** | **x Slower Than Native Copy** | @@ -25,4 +25,4 @@ Main points: # Other questions / concerns 1. `COPY` is single-threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well -1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records \ No newline at end of file +1. 
`prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records diff --git a/scripts/performance/perf_tests/pg_copy_upsert.sh b/scripts/performance/perf_tests/pg_copy_upsert.sh index 58c56726..a45d8cf5 100755 --- a/scripts/performance/perf_tests/pg_copy_upsert.sh +++ b/scripts/performance/perf_tests/pg_copy_upsert.sh @@ -50,4 +50,4 @@ ON CONFLICT (column_1) DO UPDATE SET EOF -echo "CSV file has been imported into the database with merge handling." \ No newline at end of file +echo "CSV file has been imported into the database with merge handling." diff --git a/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock b/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock index e804040f..0f842733 100644 --- a/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock +++ b/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock @@ -80,4 +80,4 @@ "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." } ] -} \ No newline at end of file +} diff --git a/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock index f2dda680..da622e8f 100644 --- a/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock +++ b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock @@ -174,4 +174,4 @@ "*.*", "!traffic_*.*" ] -} \ No newline at end of file +} diff --git a/scripts/performance/prep.sh b/scripts/performance/prep.sh index b725b81d..9d6303c1 100755 --- a/scripts/performance/prep.sh +++ b/scripts/performance/prep.sh @@ -1,6 +1,6 @@ #!/bin/bash time python 1m_rows_generate.py -time meltano invoke tap-csv > data.singer +time meltano invoke tap-csv > data.singer # Create initial table in postgres diff --git a/scripts/performance/speed_compare.py b/scripts/performance/speed_compare.py index 2fa8b9cd..81be8940 100644 --- a/scripts/performance/speed_compare.py +++ b/scripts/performance/speed_compare.py @@ -19,8 +19,13 @@ def measure_time(script_path): print(f"Measuring execution time for {script_path}...") start_time = time.perf_counter() try: - subprocess.run([script_path], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: + subprocess.run( + [script_path], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + except subprocess.CalledProcessError: print(f"Error: {script_path} exited with a non-zero status.") sys.exit(1) end_time = time.perf_counter() @@ -28,12 +33,18 @@ def measure_time(script_path): print(f"Execution time for {script_path}: {duration:.6f} seconds\n") return duration + def main(): perf_tests_dir = "./perf_tests" - scripts = [test for test in os.listdir(perf_tests_dir) if os.path.isfile(os.path.join(perf_tests_dir, test))] + scripts = [ + test + for test in os.listdir(perf_tests_dir) + if os.path.isfile(os.path.join(perf_tests_dir, test)) + ] for script in scripts: script_path = os.path.join("./perf_tests", script) measure_time(script_path) + if __name__ == "__main__": main() From c0386def8f583eccbdf6dab0666eb022c026aa5a Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:45:14 -0400 Subject: [PATCH 4/8] remove unneeded script --- scripts/performance/progres_indicator.sh | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100755 
scripts/performance/progres_indicator.sh diff --git a/scripts/performance/progres_indicator.sh b/scripts/performance/progres_indicator.sh deleted file mode 100755 index b22edfa4..00000000 --- a/scripts/performance/progres_indicator.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TOTAL_LINES=1000000 -FILE="data.singer" - -while true; do - CURRENT_LINES=$(wc -l < "$FILE") - PERCENT=$(( CURRENT_LINES * 100 / TOTAL_LINES )) - echo -ne "Progress: $PERCENT% ($CURRENT_LINES/$TOTAL_LINES lines)\r" - if [ "$CURRENT_LINES" -ge "$TOTAL_LINES" ]; then - echo -e "\nDone!" - break - fi - sleep 1 # Update every 5 seconds -done From 2723bbf89674aa75ca214fc7452d4b07ce8b21b5 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:45:51 -0400 Subject: [PATCH 5/8] Slight heading tweak --- scripts/performance/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 63fde7ce..bc348a66 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -14,7 +14,7 @@ Main points: 2. `python speed_compare.py ./meltano_import.sh ./pg_copy_upsert.sh` runs each and gives you a nice time comparison 3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` -# Results on my machine +# Results for 1 million records | **Test Name** | **Total Run Time (s)** | **x Slower Than Native Copy** | |-------------------------------------------------------------|------------------------|-------------------------------| | `./perf_tests/pg_copy_upsert.sh` | 13.64 | 1.0000 | From f5e3754b2f4dc78a13a6f20e249091f4a158ae48 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:50:06 -0400 Subject: [PATCH 6/8] Next steps --- scripts/performance/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index bc348a66..60885b8a 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -26,3 +26,12 @@ Main points: # Other questions / concerns 1. `COPY` is single-threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well 1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records + +# Next steps to improve performance Next steps to improve performance: - [ ] Split the current [Bulk Insert Speed PR](https://github.com/MeltanoLabs/target-postgres/pull/370) to be a separate sink that can be turned on with a configuration setting - [ ] Test the new sink with the same tests as the main sink and add failures for the ones we know do not pass - [ ] Note to folks in the main README about performance: the way to get the best performance right now is to turn on COPY mode and turn off record validation. - [ ] Evaluate why we're not closer to native copy speeds. Within 50% of native speeds seems reasonable but that's just a guess - [ ] Add pg_table with multiple threads, no reason we couldn't do something similar in targets - [ ] Add a CI job that calculates the performance implications of a PR for every run \ No newline at end of file
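On the pgloader item in the list above: the reason pgloader beats a single `COPY` is that it runs several workers, and nothing stops a loader from doing the same. A rough sketch of chunked, parallel COPY over multiple connections, assuming the plain ten-column table and that rows carry no embedded newlines (true for the generated `data.csv`):

```python
from concurrent.futures import ThreadPoolExecutor
from io import StringIO
import psycopg2

WORKERS = 4

def load_chunk(lines):
    # One connection per worker; Postgres accepts concurrent COPYs into one table.
    conn = psycopg2.connect(host="localhost", port=5432, user="postgres",
                            password="postgres", dbname="postgres")
    with conn, conn.cursor() as cur:
        cur.copy_expert("COPY test.data_target_postgres FROM STDIN WITH (FORMAT csv)",
                        StringIO("".join(lines)))
    conn.close()

with open("data.csv") as f:
    next(f)  # skip the header row
    lines = f.readlines()

# Stride-split the rows into WORKERS chunks and load them concurrently.
chunks = [lines[i::WORKERS] for i in range(WORKERS)]
with ThreadPoolExecutor(max_workers=WORKERS) as pool:
    list(pool.map(load_chunk, chunks))
```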
From d184bb1a64b6bba5d15beca34e1de45c42023ae7 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:59:19 -0400 Subject: [PATCH 7/8] pg_loader not pg_table --- scripts/performance/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 60885b8a..20f6032e 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -33,5 +33,5 @@ Next steps to improve performance: - [ ] Test the new sink with the same tests as the main sink and add failures for the ones we know do not pass - [ ] Note to folks in the main README about performance: the way to get the best performance right now is to turn on COPY mode and turn off record validation. - [ ] Evaluate why we're not closer to native copy speeds. Within 50% of native speeds seems reasonable but that's just a guess -- [ ] Add pg_table with multiple threads, no reason we couldn't do something similar in targets +- [ ] Add [pg_loader](https://github.com/dimitri/pgloader) with multiple threads, no reason we couldn't do something similar in targets - [ ] Add a CI job that calculates the performance implications of a PR for every run \ No newline at end of file
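For the CI item in the list above, one hedged starting point is a threshold check built on `measure_time()` from `speed_compare.py`; the 25x budget here is an arbitrary placeholder for illustration, not a number from this repo:

```python
import sys
from speed_compare import measure_time

baseline = measure_time("./perf_tests/pg_copy_upsert.sh")
current = measure_time("./perf_tests/target_postgres_current_branch.sh")
ratio = current / baseline
print(f"current branch is {ratio:.2f}x the native COPY baseline")
if ratio > 25.0:  # assumed regression budget
    sys.exit(f"performance regression: {ratio:.2f}x exceeds the 25x budget")
```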
From 6872180b1047db45cc661b8fec1c5a06b49ec35a Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 15:05:06 -0400 Subject: [PATCH 8/8] how to section was wrong --- scripts/performance/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 20f6032e..349bdaa0 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -11,8 +11,7 @@ Main points: # How to run 1. `./prep.sh` gets the data together for you in the right place -2. `python speed_compare.py ./meltano_import.sh ./pg_copy_upsert.sh` runs each and gives you a nice time comparison -3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` +2. `python speed_compare.py` runs all the tests and gives you the times for each test # Results for 1 million records | **Test Name** | **Total Run Time (s)** | **x Slower Than Native Copy** | |-------------------------------------------------------------|------------------------|-------------------------------| | `./perf_tests/pg_copy_upsert.sh` | 13.64 | 1.0000 | @@ -31,7 +30,7 @@ Next steps to improve performance: - [ ] Split the current [Bulk Insert Speed PR](https://github.com/MeltanoLabs/target-postgres/pull/370) to be a separate sink that can be turned on with a configuration setting - [ ] Test the new sink with the same tests as the main sink and add failures for the ones we know do not pass -- [ ] Note to folks in the main README about performance: the way to get the best performance right now is to turn on COPY mode and turn off record validation. +- [ ] Note to folks in the main README about performance: the way to get the best performance right now is to turn on COPY mode and turn off record validation. - [ ] Evaluate why we're not closer to native copy speeds. Within 50% of native speeds seems reasonable but that's just a guess - [ ] Add [pg_loader](https://github.com/dimitri/pgloader) with multiple threads, no reason we couldn't do something similar in targets - [ ] Add a CI job that calculates the performance implications of a PR for every run
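On the README's standing concern that piping a million rows through tap-csv just to produce `data.singer` is extremely slow: since the rows are synthetic anyway, the Singer messages could be emitted directly while generating them. A sketch under that assumption, with the stream name and key mirroring the tap-csv entry in `meltano.yml` (the schema details are illustrative, not the tap's exact output):

```python
import json
import random
import string

def random_string(length=10):  # same helper as 1m_rows_generate.py
    return "".join(random.choices(string.ascii_letters + string.digits, k=length))

columns = [f"column_{i + 1}" for i in range(10)]

with open("data.singer", "w") as out:
    # One SCHEMA message up front, then one RECORD message per generated row.
    schema = {"type": "SCHEMA", "stream": "data_target_postgres",
              "schema": {"type": "object",
                         "properties": {c: {"type": ["string", "null"]} for c in columns}},
              "key_properties": ["column_1"]}
    out.write(json.dumps(schema) + "\n")
    for _ in range(1_000_000):
        record = {c: random_string() for c in columns}
        out.write(json.dumps({"type": "RECORD", "stream": "data_target_postgres",
                              "record": record}) + "\n")
```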