-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcardinality-based-q-error-per-node-using-yaml.py
92 lines (75 loc) · 2.8 KB
/
cardinality-based-q-error-per-node-using-yaml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# pip install psycopg2
# pip install pyyaml
import psycopg2
import yaml
# Database connection parameters
conn_params = {
'database': 'imdb',
'user': 'postgres',
'password': '5trathm0re',
'host': 'localhost',
'port': '5433'
}
# Connect to the PostgreSQL database
conn = psycopg2.connect(**conn_params)
cur = conn.cursor()
# Set the schema
cur.execute("SET search_path TO imdb_schema;")
# Disable query optimizer options (if necessary)
# cur.execute("SET enable_hashjoin = OFF;")
def execute_query_and_calculate_qerror(query):
# Get the EXPLAIN ANALYZE output in YAML format
cur.execute("EXPLAIN (BUFFERS, VERBOSE, ANALYZE, FORMAT YAML) " + query)
analyze_results = cur.fetchall()
analyze_data = yaml.safe_load(analyze_results[0][0])
# Get the EXPLAIN output in YAML format
cur.execute("EXPLAIN (FORMAT YAML) " + query)
explain_results = cur.fetchall()
explain_data = yaml.safe_load(explain_results[0][0])
# Process YAML data to extract actual and estimated rows and maintain node order
def extract_rows(data, key_actual, key_estimated):
q_error_results = []
def recurse_nodes(node, path=""):
node_type = node.get('Node Type', 'Unknown')
new_path = f"{path}/{node_type}" if path else node_type
if 'Plans' in node:
for subnode in node['Plans']:
recurse_nodes(subnode, new_path)
actual_rows = node.get(key_actual, 0)
estimated_rows = node.get(key_estimated, 0)
if actual_rows > 0:
q_error = max(estimated_rows / actual_rows, actual_rows / estimated_rows)
else:
q_error = float('inf') # Handle cases where actual rows are zero
q_error_results.append((new_path, actual_rows, estimated_rows, q_error))
recurse_nodes(data[0]['Plan'])
return q_error_results
actual_key = 'Actual Rows'
estimated_key = 'Plan Rows'
q_error = extract_rows(analyze_data, actual_key, estimated_key)
return q_error
# Example usage
# query = "SELECT title FROM title WHERE kind_id = 7;"
query = """
SELECT
t.title
FROM
title t
JOIN kind_type kt ON
(t.kind_id = kt.id)
WHERE
kind_id = 7;
"""
q_error = execute_query_and_calculate_qerror(query)
print("\nCalculation:")
print("Q-Error = max(Estimated Rows / Actual Rows, Actual Rows / Estimated Rows)\n")
print("\nInterpretation:")
print(
"* Q-error = 1 implies a perfect estimation.",
"\n* Q-error > 1 indicates how many times the estimate was off",
"compared to the actual execution.\n")
print("\nResults:")
for node, actual, estimated, error in q_error:
print(f"Node: {node}, Actual Rows: {actual}, Estimated Rows: {estimated}, Q-Error: {error}")
# Enable query optimizer options (if necessary)
# cur.execute("SET enable_hashjoin = ON;")