From c2d0e7571bb541404e88189b9f355085fa5fce74 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 11:06:06 -0400 Subject: [PATCH 1/8] Performance messy initial --- meltano.yml | 29 +++++++--- performance/1m_rows_generate.py | 28 ++++++++++ performance/README.md | 25 +++++++++ performance/meltano_import.sh | 2 + performance/native_import.sh | 53 +++++++++++++++++++ performance/prep.sh | 35 +++++++++++++ performance/progres_indicator.sh | 15 ++++++ performance/speed_compare.py | 59 +++++++++++++++++++++ performance/speed_compare.sh | 61 ++++++++++++++++++++++ performance/target_postgres_copy_branch.sh | 2 + 10 files changed, 302 insertions(+), 7 deletions(-) create mode 100644 performance/1m_rows_generate.py create mode 100644 performance/README.md create mode 100755 performance/meltano_import.sh create mode 100755 performance/native_import.sh create mode 100755 performance/prep.sh create mode 100755 performance/progres_indicator.sh create mode 100644 performance/speed_compare.py create mode 100755 performance/speed_compare.sh create mode 100755 performance/target_postgres_copy_branch.sh diff --git a/meltano.yml b/meltano.yml index 64fa82a0..ac83a3a2 100644 --- a/meltano.yml +++ b/meltano.yml @@ -11,9 +11,11 @@ plugins: config: streams: - stream_name: animals - input_filename: https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/animals-data.jsonl + input_filename: + https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/animals-data.jsonl - stream_name: page_views - input_filename: https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/pageviews-data.jsonl + input_filename: + https://gitlab.com/meltano/tap-smoke-test/-/raw/main/demo-data/pageviews-data.jsonl stream_maps: animals: __key_properties__: [id] @@ -30,13 +32,22 @@ plugins: - commits.url - commits.sha - commits.commit_timestamp + - name: tap-csv + variant: meltanolabs + pip_url: git+https://github.com/MeltanoLabs/tap-csv.git + config: + files: + - entity: data_target_postgres + path: $MELTANO_PROJECT_ROOT/performance/data.csv + keys: [column_1] + add_metadata_columns: false loaders: - name: target-postgres namespace: target_postgres pip_url: -e . 
settings: - name: sqlalchemy_url - kind: password + kind: string sensitive: true - name: ssl_enable kind: boolean sensitive: true - name: ssl_client_certificate_enable kind: boolean sensitive: true - name: ssl_mode - name: ssl_certificate_authority - kind: password + kind: string sensitive: true - name: ssl_client_certificate - kind: password + kind: string sensitive: true - name: ssl_client_private_key - kind: password + kind: string sensitive: true - name: password - kind: password + kind: string sensitive: true - name: host - name: port @@ -72,6 +83,10 @@ plugins: password: postgres database: postgres target_schema: test + validate_records: false add_record_metadata: true + - name: target-postgres-copy-branch + inherit_from: target-postgres + pip_url: git+https://github.com/kinghuang/target-postgres@bulk-insert-copy environments: - name: dev diff --git a/performance/1m_rows_generate.py b/performance/1m_rows_generate.py new file mode 100644 index 00000000..465a5f5b --- /dev/null +++ b/performance/1m_rows_generate.py @@ -0,0 +1,28 @@ +import csv +import random +import string + +# Number of rows and columns +num_rows = 1_000_000 +num_columns = 10 + +# Generate random data for CSV +def random_string(length=10): + return ''.join(random.choices(string.ascii_letters + string.digits, k=length)) + +# Generate the CSV file +csv_filename = "data.csv" + +with open(csv_filename, mode='w', newline='') as csv_file: + writer = csv.writer(csv_file) + + # Write header + header = [f"column_{i+1}" for i in range(num_columns)] + writer.writerow(header) + + # Write data rows + for _ in range(num_rows): + row = [random_string() for _ in range(num_columns)] + writer.writerow(row) + +print(f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated.") diff --git a/performance/README.md b/performance/README.md new file mode 100644 index 00000000..12d658de --- /dev/null +++ b/performance/README.md @@ -0,0 +1,25 @@ +# target-postgres Performance Analysis + +Just a POC right now. + +Main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets. + +Main points: +1. We need something to compare to. For postgres we have native import commands that are well optimized. We will use this as a baseline. +1. Relative speed is the metric to focus on. If we focus on absolute speed then there are a bunch of hardware considerations that we are not trying to solve here (we would need to consider how parallelization fits into the mix here if we go there) + +# Why do this work? +1. Without it we are guessing at what can help improve performance; this gives us a more objective way to pick what we should focus on + +# How to run +1. `./prep.sh` gets the data together for you in the right place +2. `./speed_compare.sh meltano_import.sh native_import.sh` runs each and gives you a nice time comparison + +# Results on my slow machine + + + +# Other questions / concerns +1. `COPY` is single-threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well +1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records
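The native baseline the README describes boils down to a `COPY` into a staging table followed by a keyed merge, which is what `native_import.sh` below drives through psql. As a hedged reference, here is a minimal sketch of the same pattern in Python with psycopg2; the `test.data_target_postgres` table name, the staging-table approach, and the connection details are assumptions based on the meltano.yml config, while the `ON CONFLICT (column_1)` key matches the script's merge clause.

```python
import psycopg2  # assumed client library; native_import.sh does the same via psql

cols = ", ".join(f"column_{i}" for i in range(1, 11))
updates = ", ".join(f"column_{i} = EXCLUDED.column_{i}" for i in range(2, 11))

conn = psycopg2.connect(host="localhost", port=5432, user="postgres",
                        password="postgres", dbname="postgres")
with conn, conn.cursor() as cur:
    # Stage the CSV with COPY, the fastest single-threaded path into Postgres.
    cur.execute("CREATE TEMP TABLE staging (LIKE test.data_target_postgres INCLUDING ALL)")
    with open("data.csv") as f:
        cur.copy_expert(f"COPY staging ({cols}) FROM STDIN WITH (FORMAT csv, HEADER true)", f)
    # Merge into the real table so the baseline matches the target's upsert semantics.
    cur.execute(f"INSERT INTO test.data_target_postgres ({cols}) "
                f"SELECT {cols} FROM staging "
                f"ON CONFLICT (column_1) DO UPDATE SET {updates}")
conn.close()
```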
diff --git a/performance/meltano_import.sh b/performance/meltano_import.sh new file mode 100755 index 00000000..9ed32d10 --- /dev/null +++ b/performance/meltano_import.sh @@ -0,0 +1,2 @@ +#!/bin/bash +meltano invoke target-postgres < data.singer diff --git a/performance/native_import.sh b/performance/native_import.sh new file mode 100755 index 00000000..58c56726 --- /dev/null +++ b/performance/native_import.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Variables +CSV_FILE="data.csv" +DB_NAME="postgres" +DB_USER="postgres" +DB_PASSWORD="postgres" +DB_HOST="localhost" +DB_PORT="5432" + +# Export the password to avoid being prompted +export PGPASSWORD=$DB_PASSWORD + +# Execute COPY command to import the CSV into PostgreSQL +#psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME -c "\COPY large_data FROM '$CSV_FILE' CSV HEADER;" +# Begin transaction +psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME < performance/data.singer + +# Create initial table in postgres + +#Spin up postgres instance +#podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres + +#Vars We'd definitely want this as a meltano utility, just as POC right now +DB_NAME="postgres" +DB_USER="postgres" +DB_PASSWORD="postgres" +DB_HOST="localhost" +DB_PORT="5432" +export PGPASSWORD=$DB_PASSWORD + +psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME < time2: + diff = time1 - time2 + ratio = time1 / time2 if time2 != 0 else float('inf') + print(f"{script2} is faster than {script1} by {diff:.6f} seconds.") + print(f"{script2} is {ratio:.2f} times faster than {script1}.") + else: + print(f"{script1} and {script2} have the same execution time.") + +if __name__ == "__main__": + main() diff --git a/performance/speed_compare.sh b/performance/speed_compare.sh new file mode 100755 index 00000000..b9e70281 --- /dev/null +++ b/performance/speed_compare.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Check if two arguments are given +if [ $# -ne 2 ]; then + echo "Usage: $0 script1 script2" + exit 1 +fi + +script1="$1" +script2="$2" + +# Check if the scripts exist and are executable +if [ ! -x "$script1" ]; then + echo "Error: $script1 does not exist or is not executable." + exit 1 +fi + +if [ ! -x "$script2" ]; then + echo "Error: $script2 does not exist or is not executable." + exit 1 +fi + +# Function to measure execution time +measure_time() { + local script="$1" + local start_time end_time duration + start_time=$(date +%s.%N) + ./"$script" + end_time=$(date +%s.%N) + duration=$(awk "BEGIN {print $end_time - $start_time}") + echo "$duration" +} + +# Measure execution time for script1 +echo "Measuring execution time for $script1..." +time1=$(measure_time "$script1") +echo "Execution time for $script1: $time1 seconds" + +# Measure execution time for script2 +echo "Measuring execution time for $script2..." +time2=$(measure_time "$script2") +echo "Execution time for $script2: $time2 seconds" + +# Compare the execution times using awk +comparison=$(awk -v t1="$time1" -v t2="$time2" 'BEGIN { if (t1 < t2) print "script1_faster"; else if (t1 > t2) print "script2_faster"; else print "equal" }') + +if [ "$comparison" = "script1_faster" ]; then + diff=$(awk "BEGIN {print $time2 - $time1}") + ratio=$(awk "BEGIN {print $time2 / $time1}") + ratio_formatted=$(printf "%.2f" "$ratio") + echo "$script1 is faster than $script2 by $diff seconds."
+ echo "$script1 is $ratio_formatted times faster than $script2." +elif [ "$comparison" = "script2_faster" ]; then + diff=$(awk "BEGIN {print $time1 - $time2}") + ratio=$(awk "BEGIN {print $time1 / $time2}") + ratio_formatted=$(printf "%.2f" "$ratio") + echo "$script2 is faster than $script1 by $diff seconds." + echo "$script2 is $ratio_formatted times faster than $script1." +else + echo "$script1 and $script2 have the same execution time." +fi \ No newline at end of file diff --git a/performance/target_postgres_copy_branch.sh b/performance/target_postgres_copy_branch.sh new file mode 100755 index 00000000..1b2b8b53 --- /dev/null +++ b/performance/target_postgres_copy_branch.sh @@ -0,0 +1,2 @@ +#!/bin/bash +meltano invoke target-postgres-copy-branch < data.singer From 81f14c2a9f1f327e193950d774e429244df0ec40 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:41:13 -0400 Subject: [PATCH 2/8] Performance refactored with table added --- performance/speed_compare.py | 59 ------ performance/speed_compare.sh | 61 ------ podman.sh | 3 + scripts/performance/.gitignore | 2 + .../performance}/1m_rows_generate.py | 3 +- .../performance}/README.md | 21 ++- scripts/performance/meltano.yml | 67 +++++++ .../performance/perf_tests/pg_copy_upsert.sh | 0 .../target_postgres_copy_branch.sh | 0 ...target_postgres_copy_branch_no_validate.sh | 3 + .../target_postgres_current_branch.sh | 0 ...get_postgres_current_branch_no_validate.sh | 3 + .../extractors/tap-csv--meltanolabs.lock | 83 ++++++++ .../extractors/tap-github--meltanolabs.lock | 177 ++++++++++++++++++ {performance => scripts/performance}/prep.sh | 7 +- .../performance}/progres_indicator.sh | 0 scripts/performance/speed_compare.py | 39 ++++ 17 files changed, 393 insertions(+), 135 deletions(-) delete mode 100644 performance/speed_compare.py delete mode 100755 performance/speed_compare.sh create mode 100755 podman.sh create mode 100644 scripts/performance/.gitignore rename {performance => scripts/performance}/1m_rows_generate.py (91%) rename {performance => scripts/performance}/README.md (51%) create mode 100644 scripts/performance/meltano.yml rename performance/native_import.sh => scripts/performance/perf_tests/pg_copy_upsert.sh (100%) rename {performance => scripts/performance/perf_tests}/target_postgres_copy_branch.sh (100%) create mode 100755 scripts/performance/perf_tests/target_postgres_copy_branch_no_validate.sh rename performance/meltano_import.sh => scripts/performance/perf_tests/target_postgres_current_branch.sh (100%) create mode 100755 scripts/performance/perf_tests/target_postgres_current_branch_no_validate.sh create mode 100644 scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock create mode 100644 scripts/performance/plugins/extractors/tap-github--meltanolabs.lock rename {performance => scripts/performance}/prep.sh (71%) rename {performance => scripts/performance}/progres_indicator.sh (100%) create mode 100644 scripts/performance/speed_compare.py diff --git a/performance/speed_compare.py b/performance/speed_compare.py deleted file mode 100644 index 159e3f02..00000000 --- a/performance/speed_compare.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import subprocess -import time -import os - -def measure_time(script_path): - # Check if the script exists and is executable - if not os.path.isfile(script_path): - print(f"Error: {script_path} does not exist.") - sys.exit(1) - if not os.access(script_path, os.X_OK): - print(f"Error: {script_path} is not executable.") - sys.exit(1) - - # 
Measure execution time - start_time = time.perf_counter() - try: - subprocess.run([script_path], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: - print(f"Error: {script_path} exited with a non-zero status.") - sys.exit(1) - end_time = time.perf_counter() - duration = end_time - start_time - return duration - -def main(): - if len(sys.argv) != 3: - print(f"Usage: {sys.argv[0]} script1 script2") - sys.exit(1) - - script1 = sys.argv[1] - script2 = sys.argv[2] - - print(f"Measuring execution time for {script1}...") - time1 = measure_time(script1) - print(f"Execution time for {script1}: {time1:.6f} seconds\n") - - print(f"Measuring execution time for {script2}...") - time2 = measure_time(script2) - print(f"Execution time for {script2}: {time2:.6f} seconds\n") - - # Compare execution times - if time1 < time2: - diff = time2 - time1 - ratio = time2 / time1 if time1 != 0 else float('inf') - print(f"{script1} is faster than {script2} by {diff:.6f} seconds.") - print(f"{script1} is {ratio:.2f} times faster than {script2}.") - elif time1 > time2: - diff = time1 - time2 - ratio = time1 / time2 if time2 != 0 else float('inf') - print(f"{script2} is faster than {script1} by {diff:.6f} seconds.") - print(f"{script2} is {ratio:.2f} times faster than {script1}.") - else: - print(f"{script1} and {script2} have the same execution time.") - -if __name__ == "__main__": - main() diff --git a/performance/speed_compare.sh b/performance/speed_compare.sh deleted file mode 100755 index b9e70281..00000000 --- a/performance/speed_compare.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash - -# Check if two arguments are given -if [ $# -ne 2 ]; then - echo "Usage: $0 script1 script2" - exit 1 -fi - -script1="$1" -script2="$2" - -# Check if the scripts exist and are executable -if [ ! -x "$script1" ]; then - echo "Error: $script1 does not exist or is not executable." - exit 1 -fi - -if [ ! -x "$script2" ]; then - echo "Error: $script2 does not exist or is not executable." - exit 1 -fi - -# Function to measure execution time -measure_time() { - local script="$1" - local start_time end_time duration - start_time=$(date +%s.%N) - ./"$script" - end_time=$(date +%s.%N) - duration=$(awk "BEGIN {print $end_time - $start_time}") - echo "$duration" -} - -# Measure execution time for script1 -echo "Measuring execution time for $script1..." -time1=$(measure_time "$script1") -echo "Execution time for $script1: $time1 seconds" - -# Measure execution time for script2 -echo "Measuring execution time for $script2..." -time2=$(measure_time "$script2") -echo "Execution time for $script2: $time2 seconds" - -# Compare the execution times using awk -comparison=$(awk -v t1="$time1" -v t2="$time2" 'BEGIN { if (t1 < t2) print "script1_faster"; else if (t1 > t2) print "script2_faster"; else print "equal" }') - -if [ "$comparison" = "script1_faster" ]; then - diff=$(awk "BEGIN {print $time2 - $time1}") - ratio=$(awk "BEGIN {print $time2 / $time1}") - ratio_formatted=$(printf "%.2f" "$ratio") - echo "$script1 is faster than $script2 by $diff seconds." - echo "$script1 is $ratio_formatted times faster than $script2." -elif [ "$comparison" = "script2_faster" ]; then - diff=$(awk "BEGIN {print $time1 - $time2}") - ratio=$(awk "BEGIN {print $time1 / $time2}") - ratio_formatted=$(printf "%.2f" "$ratio") - echo "$script2 is faster than $script1 by $diff seconds." - echo "$script2 is $ratio_formatted times faster than $script1." -else - echo "$script1 and $script2 have the same execution time." 
-fi \ No newline at end of file diff --git a/podman.sh b/podman.sh new file mode 100755 index 00000000..e9666eee --- /dev/null +++ b/podman.sh @@ -0,0 +1,3 @@ +#!/bin/bash +#Username postgres password postgres +podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres diff --git a/scripts/performance/.gitignore b/scripts/performance/.gitignore new file mode 100644 index 00000000..6fe07289 --- /dev/null +++ b/scripts/performance/.gitignore @@ -0,0 +1,2 @@ +data.csv +data.singer \ No newline at end of file diff --git a/performance/1m_rows_generate.py b/scripts/performance/1m_rows_generate.py similarity index 91% rename from performance/1m_rows_generate.py rename to scripts/performance/1m_rows_generate.py index 465a5f5b..a71f1fed 100644 --- a/performance/1m_rows_generate.py +++ b/scripts/performance/1m_rows_generate.py @@ -2,7 +2,6 @@ import random import string -# Number of rows and columns num_rows = 1_000_000 num_columns = 10 @@ -25,4 +24,4 @@ def random_string(length=10): row = [random_string() for _ in range(num_columns)] writer.writerow(row) -print(f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated.") +print(f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated.") \ No newline at end of file diff --git a/performance/README.md b/scripts/performance/README.md similarity index 51% rename from performance/README.md rename to scripts/performance/README.md index 12d658de..4fdf71d7 100644 --- a/performance/README.md +++ b/scripts/performance/README.md @@ -1,7 +1,5 @@ # target-postgres Performance Analysis -Just a POC right now. - Main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets. Main points: @@ -13,13 +11,18 @@ Main points: # How to run 1. `./prep.sh` gets the data together for you in the right place -2. `./speed_compare.sh meltano_import.sh native_import.sh` runs each and gives you a nice time comparison - -# Results on my slow machine - - +2. `python speed_compare.py ./meltano_import.sh ./pg_copy_upsert.sh` runs each and gives you a nice time comparison +3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` + +# Results on my machine +| **Test Name** | **Total Run Time (s)** | **x Slower Than Native Copy** | +|-------------------------------------------------------------|------------------------|-------------------------------| +| `./perf_tests/pg_copy_upsert.sh` | 13.64 | 1.0000 | +| `./perf_tests/target_postgres_copy_branch_no_validate.sh` | 100.50 | 7.3697 | +| `./perf_tests/target_postgres_current_branch_no_validate.sh`| 141.48 | 10.3749 | +| `./perf_tests/target_postgres_copy_branch.sh` | 265.53 | 19.4719 | +| `./perf_tests/target_postgres_current_branch.sh` | 298.37 | 21.8799 | # Other questions / concerns 1. `COPY` is single-threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well -1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records +1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records \ No newline at end of file
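Some quick arithmetic on the results table above, purely from the measured run times for one million rows, to give a sense of absolute throughput and of what record validation alone costs:

```python
rows = 1_000_000
print(rows / 13.64)     # native COPY baseline: ~73,300 rows/s
print(rows / 298.37)    # current branch, default settings: ~3,350 rows/s
print(265.53 / 100.50)  # copy branch: validation makes the run ~2.64x slower
print(298.37 / 141.48)  # current branch: validation makes the run ~2.11x slower
```

In other words, turning validation off alone roughly halves the gap to the native baseline (21.9x down to 10.4x), which is why the `_no_validate` variants are measured separately.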
diff --git a/scripts/performance/meltano.yml b/scripts/performance/meltano.yml new file mode 100644 index 00000000..c25b2b5c --- /dev/null +++ b/scripts/performance/meltano.yml @@ -0,0 +1,67 @@ +version: 1 +send_anonymous_usage_stats: true +default_environment: dev +project_id: target-postgres +plugins: + extractors: + - name: tap-csv + variant: meltanolabs + pip_url: git+https://github.com/MeltanoLabs/tap-csv.git + config: + files: + - entity: data_target_postgres + path: $MELTANO_PROJECT_ROOT/data.csv + keys: [column_1] + add_metadata_columns: false + loaders: + - name: target-postgres + namespace: target_postgres + pip_url: -e ../../. + settings: + - name: sqlalchemy_url + kind: string + sensitive: true + - name: ssl_enable + kind: boolean + sensitive: true + - name: ssl_client_certificate_enable + kind: boolean + sensitive: true + - name: ssl_mode + - name: ssl_certificate_authority + kind: string + sensitive: true + - name: ssl_client_certificate + kind: string + sensitive: true + - name: ssl_client_private_key + kind: string + sensitive: true + - name: password + kind: string + sensitive: true + - name: host + - name: port + kind: integer + - name: user + - name: database + - name: target_schema + - name: add_record_metadata + kind: boolean + - name: validate_records + kind: boolean + - name: batch_size_rows + kind: integer + config: + host: localhost + port: 5432 + user: postgres + password: postgres + database: postgres + target_schema: test + add_record_metadata: true + - name: target-postgres-copy-branch + inherit_from: target-postgres + pip_url: git+https://github.com/kinghuang/target-postgres@bulk-insert-copy +environments: +- name: dev diff --git a/performance/native_import.sh b/scripts/performance/perf_tests/pg_copy_upsert.sh similarity index 100% rename from performance/native_import.sh rename to scripts/performance/perf_tests/pg_copy_upsert.sh diff --git a/performance/target_postgres_copy_branch.sh b/scripts/performance/perf_tests/target_postgres_copy_branch.sh similarity index 100% rename from performance/target_postgres_copy_branch.sh rename to scripts/performance/perf_tests/target_postgres_copy_branch.sh diff --git a/scripts/performance/perf_tests/target_postgres_copy_branch_no_validate.sh b/scripts/performance/perf_tests/target_postgres_copy_branch_no_validate.sh new file mode 100755 index 00000000..e27475c0 --- /dev/null +++ b/scripts/performance/perf_tests/target_postgres_copy_branch_no_validate.sh @@ -0,0 +1,3 @@ +#!/bin/bash +export TARGET_POSTGRES_VALIDATE_RECORDS="false" +meltano invoke target-postgres-copy-branch < data.singer diff --git a/performance/meltano_import.sh b/scripts/performance/perf_tests/target_postgres_current_branch.sh similarity index 100% rename from performance/meltano_import.sh rename to scripts/performance/perf_tests/target_postgres_current_branch.sh diff --git a/scripts/performance/perf_tests/target_postgres_current_branch_no_validate.sh b/scripts/performance/perf_tests/target_postgres_current_branch_no_validate.sh new file mode 100755 index 00000000..181613ef --- /dev/null +++ b/scripts/performance/perf_tests/target_postgres_current_branch_no_validate.sh @@ -0,0 +1,3 @@ +#!/bin/bash +export TARGET_POSTGRES_VALIDATE_RECORDS="false" +meltano invoke target-postgres < data.singer diff --git a/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock
b/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock new file mode 100644 index 00000000..e804040f --- /dev/null +++ b/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock @@ -0,0 +1,83 @@ +{ + "plugin_type": "extractors", + "name": "tap-csv", + "namespace": "tap_csv", + "variant": "meltanolabs", + "label": "Comma Separated Values (CSV)", + "docs": "https://hub.meltano.com/extractors/tap-csv--meltanolabs", + "repo": "https://github.com/MeltanoLabs/tap-csv", + "pip_url": "git+https://github.com/MeltanoLabs/tap-csv.git", + "description": "Generic data extractor of CSV (comma separated value) files", + "logo_url": "https://hub.meltano.com/assets/logos/extractors/csv.png", + "capabilities": [ + "catalog", + "discover" + ], + "settings_group_validation": [ + [ + "files" + ], + [ + "csv_files_definition" + ] + ], + "settings": [ + { + "name": "add_metadata_columns", + "kind": "boolean", + "value": false, + "label": "Add Metadata Columns", + "description": "When True, add the metadata columns (`_sdc_source_file`, `_sdc_source_file_mtime`, `_sdc_source_lineno`) to output." + }, + { + "name": "csv_files_definition", + "kind": "string", + "label": "Csv Files Definition", + "documentation": "https://github.com/MeltanoLabs/tap-csv#settings", + "description": "Project-relative path to JSON file holding array of objects as described under [Files](#files) - with `entity`, `path`, `keys`, and other optional keys:\n\n```json\n[\n {\n \"entity\": \"\",\n \"path\": \"\",\n \"keys\": [\"\"],\n },\n // ...\n]\n```\n", + "placeholder": "Ex. files-def.json" + }, + { + "name": "faker_config.locale", + "kind": "array", + "label": "Faker Config Locale", + "description": "One or more LCID locale strings to produce localized output for: https://faker.readthedocs.io/en/master/#localization" + }, + { + "name": "faker_config.seed", + "kind": "string", + "label": "Faker Config Seed", + "description": "Value to seed the Faker generator for deterministic output: https://faker.readthedocs.io/en/master/#seeding-the-generator" + }, + { + "name": "files", + "kind": "array", + "label": "Files", + "description": "Array of objects with `entity`, `path`, `keys`, and `encoding` [Optional] keys:\n\n* `entity`: The entity name, used as the table name for the data loaded from that CSV.\n* `path`: Local path (relative to the project's root) to the file to be ingested. Note that this may be a directory, in which case all files in that directory and any of its subdirectories will be recursively processed\n* `keys`: The names of the columns that constitute the unique keys for that entity.\n* `encoding`: [Optional] The file encoding to use when reading the file (i.e. \"latin1\", \"UTF-8\"). Use this setting when you get a UnicodeDecodeError error.\n Each input CSV file must be a traditionally-delimited CSV (comma separated columns, newlines indicate new rows, double quoted values).\n\nThe following entries are passed through in an internal CSV dialect that then is used to configure the CSV reader:\n\n* `delimiter`: A one-character string used to separate fields. It defaults to ','.\n* `doublequote`: Controls how instances of quotechar appearing inside a field should themselves be quoted. When True, the character is doubled. When False, the escapechar is used as a prefix to the quotechar. It defaults to True.\n* `escapechar`: A one-character string used by the reader, where the escapechar removes any special meaning from the following character. 
It defaults to None, which disables escaping.\n* `quotechar`: A one-character string used to quote fields containing special characters, such as the delimiter or quotechar, or which contain new-line characters. It defaults to '\"'.\n* `skipinitialspace`: When True, spaces immediately following the delimiter are ignored. The default is False.\n* `strict`: When True, raise exception Error on bad CSV input. The default is False.\n\nThe first row is the header defining the attribute name for that column and will result to a column of the same name in the database. It must have a valid format with no spaces or special characters (like for example `!` or `@`, etc).\n" + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Flattening Enabled", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Flattening Max Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config", + "description": "User-defined config values to be used within map expressions." + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps", + "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." + } + ] +} \ No newline at end of file diff --git a/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock new file mode 100644 index 00000000..f2dda680 --- /dev/null +++ b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock @@ -0,0 +1,177 @@ +{ + "plugin_type": "extractors", + "name": "tap-github", + "namespace": "tap_github", + "variant": "meltanolabs", + "label": "GitHub", + "docs": "https://hub.meltano.com/extractors/tap-github--meltanolabs", + "repo": "https://github.com/MeltanoLabs/tap-github", + "pip_url": "git+https://github.com/MeltanoLabs/tap-github.git", + "description": "Code hosting platform", + "logo_url": "https://hub.meltano.com/assets/logos/extractors/github.png", + "capabilities": [ + "about", + "batch", + "catalog", + "discover", + "schema-flattening", + "state", + "stream-maps" + ], + "settings_group_validation": [ + [ + "repositories" + ], + [ + "organizations" + ], + [ + "searches" + ], + [ + "user_usernames" + ], + [ + "user_ids" + ] + ], + "settings": [ + { + "name": "additional_auth_tokens", + "kind": "array", + "label": "Additional Auth Tokens", + "description": "List of GitHub tokens to authenticate with. Streams will loop through them when hitting rate limits." 
+ }, + { + "name": "auth_token", + "kind": "string", + "label": "Auth Token", + "description": "GitHub token to authenticate with.", + "sensitive": true + }, + { + "name": "batch_config.encoding.compression", + "kind": "options", + "label": "Batch Config Encoding Compression", + "description": "Compression format to use for batch files.", + "options": [ + { + "label": "Gzip", + "value": "gzip" + }, + { + "label": "None", + "value": "none" + } + ] + }, + { + "name": "batch_config.encoding.format", + "kind": "options", + "label": "Batch Config Encoding Format", + "description": "Format to use for batch files.", + "options": [ + { + "label": "Jsonl", + "value": "jsonl" + } + ] + }, + { + "name": "batch_config.storage.prefix", + "kind": "string", + "label": "Batch Config Storage Prefix", + "description": "Prefix to use when writing batch files." + }, + { + "name": "batch_config.storage.root", + "kind": "string", + "label": "Batch Config Storage Root", + "description": "Root path to use when writing batch files." + }, + { + "name": "flattening_enabled", + "kind": "boolean", + "label": "Flattening Enabled", + "description": "'True' to enable schema flattening and automatically expand nested properties." + }, + { + "name": "flattening_max_depth", + "kind": "integer", + "label": "Flattening Max Depth", + "description": "The max depth to flatten schemas." + }, + { + "name": "metrics_log_level", + "kind": "string", + "label": "Metrics Log Level", + "description": "The log level of the API response metrics." + }, + { + "name": "organizations", + "kind": "array", + "label": "Organizations", + "description": "An array of strings containing the github organizations to be included" + }, + { + "name": "rate_limit_buffer", + "kind": "integer", + "label": "Rate Limit Buffer", + "description": "Add a buffer to avoid consuming all query points for the token at hand. Defaults to 1000." + }, + { + "name": "repositories", + "kind": "array", + "label": "Repositories", + "description": "An array of strings containing the github repos to be included" + }, + { + "name": "searches", + "kind": "array", + "label": "Searches", + "description": "An array of search descriptor objects with the following properties. \"name\" - a human readable name for the search query. \"query\" - a github search string (generally the same as would come after ?q= in the URL)" + }, + { + "name": "skip_parent_streams", + "kind": "boolean", + "label": "Skip Parent Streams", + "description": "Set to true to skip API calls for the parent streams (such as repositories) if it is not selected but children are" + }, + { + "name": "start_date", + "kind": "date_iso8601", + "label": "Start Date" + }, + { + "name": "stream_map_config", + "kind": "object", + "label": "Stream Map Config" + }, + { + "name": "stream_maps", + "kind": "object", + "label": "Stream Maps" + }, + { + "name": "user_agent", + "kind": "string", + "label": "User Agent" + }, + { + "name": "user_ids", + "kind": "array", + "label": "User IDs", + "description": "A list of GitHub user ids." + }, + { + "name": "user_usernames", + "kind": "array", + "label": "User Usernames", + "description": "A list of GithHub usernames." 
+ } + ], + "select": [ + "*.*", + "!traffic_*.*" + ] +} \ No newline at end of file diff --git a/performance/prep.sh b/scripts/performance/prep.sh similarity index 71% rename from performance/prep.sh rename to scripts/performance/prep.sh index 2061d15d..b725b81d 100755 --- a/performance/prep.sh +++ b/scripts/performance/prep.sh @@ -1,12 +1,11 @@ #!/bin/bash -# time python 1m_rows_generate.py -# Not trying to test the time here but this takes a very long time -#time meltano invoke tap-csv > performance/data.singer +time python 1m_rows_generate.py +time meltano invoke tap-csv > data.singer # Create initial table in postgres #Spin up postgres instance -#podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres +podman run -e POSTGRES_PASSWORD=postgres -p 5432:5432 -h postgres -d postgres #Vars We'd definietly want this as a meltano utility, just as POC right now DB_NAME="postgres" diff --git a/performance/progres_indicator.sh b/scripts/performance/progres_indicator.sh similarity index 100% rename from performance/progres_indicator.sh rename to scripts/performance/progres_indicator.sh diff --git a/scripts/performance/speed_compare.py b/scripts/performance/speed_compare.py new file mode 100644 index 00000000..2fa8b9cd --- /dev/null +++ b/scripts/performance/speed_compare.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import os +import subprocess +import sys +import time + + +def measure_time(script_path): + # Check if the script exists and is executable + if not os.path.isfile(script_path): + print(f"Error: {script_path} does not exist.") + sys.exit(1) + if not os.access(script_path, os.X_OK): + print(f"Error: {script_path} is not executable.") + sys.exit(1) + + # Measure execution time + print(f"Measuring execution time for {script_path}...") + start_time = time.perf_counter() + try: + subprocess.run([script_path], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print(f"Error: {script_path} exited with a non-zero status.") + sys.exit(1) + end_time = time.perf_counter() + duration = end_time - start_time + print(f"Execution time for {script_path}: {duration:.6f} seconds\n") + return duration + +def main(): + perf_tests_dir = "./perf_tests" + scripts = [test for test in os.listdir(perf_tests_dir) if os.path.isfile(os.path.join(perf_tests_dir, test))] + for script in scripts: + script_path = os.path.join("./perf_tests", script) + measure_time(script_path) + +if __name__ == "__main__": + main() From e6dd402fff59257535e4da071531653a3570f85c Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:42:27 -0400 Subject: [PATCH 3/8] Linter --- scripts/performance/.gitignore | 2 +- scripts/performance/1m_rows_generate.py | 14 +++++++++----- scripts/performance/README.md | 10 +++++----- .../performance/perf_tests/pg_copy_upsert.sh | 2 +- .../extractors/tap-csv--meltanolabs.lock | 2 +- .../extractors/tap-github--meltanolabs.lock | 2 +- scripts/performance/prep.sh | 2 +- scripts/performance/speed_compare.py | 17 ++++++++++++++--- 8 files changed, 33 insertions(+), 18 deletions(-) diff --git a/scripts/performance/.gitignore b/scripts/performance/.gitignore index 6fe07289..10f39b4d 100644 --- a/scripts/performance/.gitignore +++ b/scripts/performance/.gitignore @@ -1,2 +1,2 @@ data.csv -data.singer \ No newline at end of file +data.singer diff --git a/scripts/performance/1m_rows_generate.py b/scripts/performance/1m_rows_generate.py index a71f1fed..280aa3bf 100644 --- a/scripts/performance/1m_rows_generate.py +++ 
From e6dd402fff59257535e4da071531653a3570f85c Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:42:27 -0400 Subject: [PATCH 3/8] Linter --- scripts/performance/.gitignore | 2 +- scripts/performance/1m_rows_generate.py | 14 +++++++++----- scripts/performance/README.md | 10 +++++----- .../performance/perf_tests/pg_copy_upsert.sh | 2 +- .../extractors/tap-csv--meltanolabs.lock | 2 +- .../extractors/tap-github--meltanolabs.lock | 2 +- scripts/performance/prep.sh | 2 +- scripts/performance/speed_compare.py | 17 ++++++++++++++--- 8 files changed, 33 insertions(+), 18 deletions(-) diff --git a/scripts/performance/.gitignore b/scripts/performance/.gitignore index 6fe07289..10f39b4d 100644 --- a/scripts/performance/.gitignore +++ b/scripts/performance/.gitignore @@ -1,2 +1,2 @@ data.csv -data.singer \ No newline at end of file +data.singer diff --git a/scripts/performance/1m_rows_generate.py b/scripts/performance/1m_rows_generate.py index a71f1fed..280aa3bf 100644 --- a/scripts/performance/1m_rows_generate.py +++ b/scripts/performance/1m_rows_generate.py @@ -5,23 +5,27 @@ num_rows = 1_000_000 num_columns = 10 + # Generate random data for CSV def random_string(length=10): - return ''.join(random.choices(string.ascii_letters + string.digits, k=length)) + return "".join(random.choices(string.ascii_letters + string.digits, k=length)) + # Generate the CSV file csv_filename = "data.csv" -with open(csv_filename, mode='w', newline='') as csv_file: +with open(csv_filename, mode="w", newline="") as csv_file: writer = csv.writer(csv_file) - + # Write header header = [f"column_{i+1}" for i in range(num_columns)] writer.writerow(header) - + # Write data rows for _ in range(num_rows): row = [random_string() for _ in range(num_columns)] writer.writerow(row) -print(f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated.") \ No newline at end of file +print( + f"CSV file '{csv_filename}' with {num_rows} rows and {num_columns} columns has been generated." +) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 4fdf71d7..63fde7ce 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -1,18 +1,18 @@ # target-postgres Performance Analysis -Main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets. +Main goal is to lay out an objective way to do performance analysis with target-postgres, and hopefully the groundwork for others if they want to do analysis with their targets. Main points: -1. We need something to compare to. For postgres we have native import commands that are well optimized. We will use this as a baseline. +1. We need something to compare to. For postgres we have native import commands that are well optimized. We will use this as a baseline. 1. Relative speed is the metric to focus on. If we focus on absolute speed then there are a bunch of hardware considerations that we are not trying to solve here (we would need to consider how parallelization fits into the mix here if we go there) # Why do this work? -1. Without it we are guessing at what can help improve performance; this gives us a more objective way to pick what we should focus on +1. Without it we are guessing at what can help improve performance; this gives us a more objective way to pick what we should focus on # How to run 1. `./prep.sh` gets the data together for you in the right place 2. `python speed_compare.py ./meltano_import.sh ./pg_copy_upsert.sh` runs each and gives you a nice time comparison -3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` +3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` # Results on my machine | **Test Name** | **Total Run Time (s)** | **x Slower Than Native Copy** | @@ -25,4 +25,4 @@ Main points: # Other questions / concerns 1. `COPY` is single-threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well -1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records \ No newline at end of file +1. 
`prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records diff --git a/scripts/performance/perf_tests/pg_copy_upsert.sh b/scripts/performance/perf_tests/pg_copy_upsert.sh index 58c56726..a45d8cf5 100755 --- a/scripts/performance/perf_tests/pg_copy_upsert.sh +++ b/scripts/performance/perf_tests/pg_copy_upsert.sh @@ -50,4 +50,4 @@ ON CONFLICT (column_1) DO UPDATE SET EOF -echo "CSV file has been imported into the database with merge handling." \ No newline at end of file +echo "CSV file has been imported into the database with merge handling." diff --git a/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock b/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock index e804040f..0f842733 100644 --- a/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock +++ b/scripts/performance/plugins/extractors/tap-csv--meltanolabs.lock @@ -80,4 +80,4 @@ "description": "Config object for stream maps capability. For more information check out [Stream Maps](https://sdk.meltano.com/en/latest/stream_maps.html)." } ] -} \ No newline at end of file +} diff --git a/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock index f2dda680..da622e8f 100644 --- a/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock +++ b/scripts/performance/plugins/extractors/tap-github--meltanolabs.lock @@ -174,4 +174,4 @@ "*.*", "!traffic_*.*" ] -} \ No newline at end of file +} diff --git a/scripts/performance/prep.sh b/scripts/performance/prep.sh index b725b81d..9d6303c1 100755 --- a/scripts/performance/prep.sh +++ b/scripts/performance/prep.sh @@ -1,6 +1,6 @@ #!/bin/bash time python 1m_rows_generate.py -time meltano invoke tap-csv > data.singer +time meltano invoke tap-csv > data.singer # Create initial table in postgres diff --git a/scripts/performance/speed_compare.py b/scripts/performance/speed_compare.py index 2fa8b9cd..81be8940 100644 --- a/scripts/performance/speed_compare.py +++ b/scripts/performance/speed_compare.py @@ -19,8 +19,13 @@ def measure_time(script_path): print(f"Measuring execution time for {script_path}...") start_time = time.perf_counter() try: - subprocess.run([script_path], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT) - except subprocess.CalledProcessError as e: + subprocess.run( + [script_path], + check=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + ) + except subprocess.CalledProcessError: print(f"Error: {script_path} exited with a non-zero status.") sys.exit(1) end_time = time.perf_counter() @@ -28,12 +33,18 @@ def measure_time(script_path): print(f"Execution time for {script_path}: {duration:.6f} seconds\n") return duration + def main(): perf_tests_dir = "./perf_tests" - scripts = [test for test in os.listdir(perf_tests_dir) if os.path.isfile(os.path.join(perf_tests_dir, test))] + scripts = [ + test + for test in os.listdir(perf_tests_dir) + if os.path.isfile(os.path.join(perf_tests_dir, test)) + ] for script in scripts: script_path = os.path.join("./perf_tests", script) measure_time(script_path) + if __name__ == "__main__": main() From c0386def8f583eccbdf6dab0666eb022c026aa5a Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:45:14 -0400 Subject: [PATCH 4/8] remove unneeded script --- scripts/performance/progres_indicator.sh | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100755 
scripts/performance/progres_indicator.sh diff --git a/scripts/performance/progres_indicator.sh b/scripts/performance/progres_indicator.sh deleted file mode 100755 index b22edfa4..00000000 --- a/scripts/performance/progres_indicator.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -TOTAL_LINES=1000000 -FILE="data.singer" - -while true; do - CURRENT_LINES=$(wc -l < "$FILE") - PERCENT=$(( CURRENT_LINES * 100 / TOTAL_LINES )) - echo -ne "Progress: $PERCENT% ($CURRENT_LINES/$TOTAL_LINES lines)\r" - if [ "$CURRENT_LINES" -ge "$TOTAL_LINES" ]; then - echo -e "\nDone!" - break - fi - sleep 1 # Update every 5 seconds -done From 2723bbf89674aa75ca214fc7452d4b07ce8b21b5 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:45:51 -0400 Subject: [PATCH 5/8] Slight heading tweak --- scripts/performance/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 63fde7ce..bc348a66 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -14,7 +14,7 @@ Main points: 2. `python speed_compare.py ./meltano_import.sh ./pg_copy_upsert.sh` runs each and gives you a nice time comparison 3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` -# Results on my machine +# Results for 1 million records | **Test Name** | **Total Run Time (s)** | **x Slower Than Native Copy** | |-------------------------------------------------------------|------------------------|-------------------------------| | `./perf_tests/pg_copy_upsert.sh` | 13.64 | 1.0000 | From f5e3754b2f4dc78a13a6f20e249091f4a158ae48 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:50:06 -0400 Subject: [PATCH 6/8] Next steps --- scripts/performance/README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index bc348a66..60885b8a 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -26,3 +26,12 @@ Main points: # Other questions / concerns 1. `COPY` is single-threaded; there's no reason we need to stick to a single thread. https://github.com/dimitri/pgloader is much faster. We should try this out as well 1. `prep.sh`'s tap-csv step runs to give us a data.singer file (jsonl output from the tap) this takes an extremely long time to run for one million records + +# Next steps to improve performance Next steps to improve performance: - [ ] Split the current [Bulk Insert Speed PR](https://github.com/MeltanoLabs/target-postgres/pull/370) to be a separate sink that can be turned on with a configuration setting - [ ] Test the new sink with the same tests as the main sink and add failures for the ones we know do not pass - [ ] Note to folks in the main README about performance: the way to get the best performance right now is to turn on COPY mode and turn off record validation. - [ ] Evaluate why we're not closer to native copy speeds. Within 50% of native speeds seems reasonable but that's just a guess - [ ] Add pg_table with multiple threads, no reason we couldn't do something similar in targets - [ ] Add a CI job that calculates the performance implications of a PR for every run \ No newline at end of file
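On the pgloader item in the list above: the reason pgloader beats a single `COPY` is that it runs several workers, and nothing stops a loader from doing the same. A rough sketch of chunked, parallel COPY over multiple connections, assuming the plain ten-column table and that rows carry no embedded newlines (true for the generated `data.csv`):

```python
from concurrent.futures import ThreadPoolExecutor
from io import StringIO
import psycopg2

WORKERS = 4

def load_chunk(lines):
    # One connection per worker; Postgres accepts concurrent COPYs into one table.
    conn = psycopg2.connect(host="localhost", port=5432, user="postgres",
                            password="postgres", dbname="postgres")
    with conn, conn.cursor() as cur:
        cur.copy_expert("COPY test.data_target_postgres FROM STDIN WITH (FORMAT csv)",
                        StringIO("".join(lines)))
    conn.close()

with open("data.csv") as f:
    next(f)  # skip the header row
    lines = f.readlines()

# Stride-split the rows into WORKERS chunks and load them concurrently.
chunks = [lines[i::WORKERS] for i in range(WORKERS)]
with ThreadPoolExecutor(max_workers=WORKERS) as pool:
    list(pool.map(load_chunk, chunks))
```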
From d184bb1a64b6bba5d15beca34e1de45c42023ae7 Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 14:59:19 -0400 Subject: [PATCH 7/8] pg_loader not pg_table --- scripts/performance/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 60885b8a..20f6032e 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -33,5 +33,5 @@ Next steps to improve performance: - [ ] Test the new sink with the same tests as the main sink and add failures for the ones we know do not pass - [ ] Note to folks in the main README about performance: the way to get the best performance right now is to turn on COPY mode and turn off record validation. - [ ] Evaluate why we're not closer to native copy speeds. Within 50% of native speeds seems reasonable but that's just a guess -- [ ] Add pg_table with multiple threads, no reason we couldn't do something similar in targets +- [ ] Add [pg_loader](https://github.com/dimitri/pgloader) with multiple threads, no reason we couldn't do something similar in targets - [ ] Add a CI job that calculates the performance implications of a PR for every run \ No newline at end of file
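For the CI item in the list above, one hedged starting point is a threshold check built on `measure_time()` from `speed_compare.py`; the 25x budget here is an arbitrary placeholder for illustration, not a number from this repo:

```python
import sys
from speed_compare import measure_time

baseline = measure_time("./perf_tests/pg_copy_upsert.sh")
current = measure_time("./perf_tests/target_postgres_current_branch.sh")
ratio = current / baseline
print(f"current branch is {ratio:.2f}x the native COPY baseline")
if ratio > 25.0:  # assumed regression budget
    sys.exit(f"performance regression: {ratio:.2f}x exceeds the 25x budget")
```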
From 6872180b1047db45cc661b8fec1c5a06b49ec35a Mon Sep 17 00:00:00 2001 From: Derek Visch Date: Mon, 16 Sep 2024 15:05:06 -0400 Subject: [PATCH 8/8] how to section was wrong --- scripts/performance/README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 20f6032e..349bdaa0 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -11,8 +11,7 @@ Main points: # How to run 1. `./prep.sh` gets the data together for you in the right place -2. `python speed_compare.py ./meltano_import.sh ./pg_copy_upsert.sh` runs each and gives you a nice time comparison -3. `python speed_compare.py ./target_postgres_copy_branch.sh ./target_postgres_copy_branch_no_validate.sh` +2. `python speed_compare.py` runs all the tests and gives you the times for each test # Results for 1 million records | **Test Name** | **Total Run Time (s)** | **x Slower Than Native Copy** | |-------------------------------------------------------------|------------------------|-------------------------------| | `./perf_tests/pg_copy_upsert.sh` | 13.64 | 1.0000 | @@ -31,7 +30,7 @@ Next steps to improve performance: - [ ] Split the current [Bulk Insert Speed PR](https://github.com/MeltanoLabs/target-postgres/pull/370) to be a separate sink that can be turned on with a configuration setting - [ ] Test the new sink with the same tests as the main sink and add failures for the ones we know do not pass -- [ ] Note to folks in the main README about performance: the way to get the best performance right now is to turn on COPY mode and turn off record validation. +- [ ] Note to folks in the main README about performance: the way to get the best performance right now is to turn on COPY mode and turn off record validation. - [ ] Evaluate why we're not closer to native copy speeds. Within 50% of native speeds seems reasonable but that's just a guess - [ ] Add [pg_loader](https://github.com/dimitri/pgloader) with multiple threads, no reason we couldn't do something similar in targets - [ ] Add a CI job that calculates the performance implications of a PR for every run
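On the README's standing concern that piping a million rows through tap-csv just to produce `data.singer` is extremely slow: since the rows are synthetic anyway, the Singer messages could be emitted directly while generating them. A sketch under that assumption, with the stream name and key mirroring the tap-csv entry in `meltano.yml` (the schema details are illustrative, not the tap's exact output):

```python
import json
import random
import string

def random_string(length=10):  # same helper as 1m_rows_generate.py
    return "".join(random.choices(string.ascii_letters + string.digits, k=length))

columns = [f"column_{i + 1}" for i in range(10)]

with open("data.singer", "w") as out:
    # One SCHEMA message up front, then one RECORD message per generated row.
    schema = {"type": "SCHEMA", "stream": "data_target_postgres",
              "schema": {"type": "object",
                         "properties": {c: {"type": ["string", "null"]} for c in columns}},
              "key_properties": ["column_1"]}
    out.write(json.dumps(schema) + "\n")
    for _ in range(1_000_000):
        record = {c: random_string() for c in columns}
        out.write(json.dumps({"type": "RECORD", "stream": "data_target_postgres",
                              "record": record}) + "\n")
```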