Skip to content

Commit c99d8ed

Browse files
committed
Add flat files stock trades tutorial
1 parent dfec732 commit c99d8ed

15 files changed

+460
-126
lines changed

examples/rest/demo_correlation_matrix.py

+1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
essential to do your own research or consult a financial advisor for
4141
personalized advice when investing.
4242
"""
43+
4344
import pandas as pd # type: ignore
4445
import numpy as np # type: ignore
4546
import seaborn as sns # type: ignore
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
# We can use a Python script that aggregates trades by exchange into 30-minute
# chunks, setting the stage for a visual analysis. This approach will highlight
# trade flows, including opening hours and peak activity times, across the
# exchanges. Please see https://polygon.io/blog/insights-from-trade-level-data
#
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pytz

# Replace '2024-04-05.csv' with the path to your actual file
file_path = "2024-04-05.csv"

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Convert 'participant_timestamp' to datetime (assuming nanoseconds Unix timestamp)
df["participant_timestamp"] = pd.to_datetime(
    df["participant_timestamp"], unit="ns", utc=True
)

# Convert to Eastern Time (ET), accounting for both EST and EDT
df["participant_timestamp"] = df["participant_timestamp"].dt.tz_convert(
    "America/New_York"
)

# Create a new column for 30-minute time intervals, now in ET.
# NOTE: "30min" replaces the "30T" alias, which is deprecated in pandas 2.2+.
df["time_interval"] = df["participant_timestamp"].dt.floor("30min").dt.time

# Ensure full 24-hour coverage by generating all possible 30-minute intervals
all_intervals = pd.date_range(start="00:00", end="23:59", freq="30min").time
all_exchanges = df["exchange"].unique()
full_index = pd.MultiIndex.from_product(
    [all_exchanges, all_intervals], names=["exchange", "time_interval"]
)

# Group by 'exchange' and 'time_interval', count trades, and reindex against the
# full (exchange, interval) grid so intervals with no trades appear as 0
grouped = (
    df.groupby(["exchange", "time_interval"])
    .size()
    .reindex(full_index, fill_value=0)
    .reset_index(name="trade_count")
)

# Pivot the DataFrame for the heatmap, ensuring all intervals and exchanges are
# represented. FIX: DataFrame.pivot requires keyword arguments — passing them
# positionally was removed in pandas 2.0 and raises a TypeError.
pivot_table = grouped.pivot(
    index="exchange", columns="time_interval", values="trade_count"
).fillna(0)

# Apply a log1p (log(1 + x)) transformation so intervals with zero trades map
# to 0 instead of -inf, keeping the heatmap colormap well-defined
log_scale_data = np.log1p(pivot_table.values)

# Plotting the heatmap using the log scale data
plt.figure(figsize=(20, 10))
sns.heatmap(
    log_scale_data,
    annot=False,
    cmap="Reds",
    linewidths=0.5,
    cbar=False,
    xticklabels=[t.strftime("%H:%M") for t in all_intervals],
    yticklabels=pivot_table.index,
)
plt.title("Trade Count Heatmap by Exchange and Time Interval (Log Scale, ET)")
plt.ylabel("Exchange")
plt.xlabel("Time Interval (ET)")
plt.xticks(rotation=45)
plt.tight_layout()  # Adjust layout to not cut off labels
plt.show()
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
# Here's a Python script for analyzing the dataset that identifies the
# distribution of trades across different exchanges and calculates their
# respective percentages of the total trades. Please see
# https://polygon.io/blog/insights-from-trade-level-data
#
import pandas as pd

# Replace '2024-04-05.csv' with the path to your actual file
file_path = "2024-04-05.csv"

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Count the number of trades for each exchange (sorted most-active first)
exchange_counts = df["exchange"].value_counts()

# Calculate the total number of trades
total_trades = exchange_counts.sum()

# Print out all exchanges and their percentage of total trades.
# FIX: counts are comma-grouped ({count:,}) to match the sample output
# documented in the README (e.g. "Exchange 4: 25,570,324 trades, ...").
for exchange, count in exchange_counts.items():
    percentage = (count / total_trades) * 100
    print(f"Exchange {exchange}: {count:,} trades, {percentage:.2f}% of total trades")
Loading
Loading
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
# Polygon.io Flat Files Stock Trades Analysis Scripts
2+
3+
This repository contains Python scripts for analyzing stock market trading data using Flat Files from Polygon.io. These scripts demonstrate various ways to dissect and visualize trade data for comprehensive market analysis.
4+
5+
Please see the tutorial: [Deep Dive into Trade-Level Data with Flat Files](https://polygon.io/blog/insights-from-trade-level-data)
6+
7+
## Scripts Overview
8+
9+
### **exchange-heatmap.py**
10+
This script aggregates trades by exchange into 30-minute chunks and creates a heatmap visualization. It highlights the flow of trades and peak activity times across different exchanges, providing insights into how different exchanges operate throughout the day.
11+
12+
![Heatmap Visualization](./heatmap.png)
13+
14+
### **exchanges-seen.py**
15+
Analyzes the distribution of trades across different exchanges and calculates their respective percentages of total trades. This script helps identify which exchanges handle the most trading volume, offering a perspective on market structure.
16+
17+
```
18+
Exchange 4: 25,570,324 trades, 36.32% of total trades
19+
Exchange 12: 15,147,689 trades, 21.52% of total trades
20+
Exchange 11: 6,877,306 trades, 9.77% of total trades
21+
Exchange 19: 5,098,852 trades, 7.24% of total trades
22+
Exchange 10: 4,006,611 trades, 5.69% of total trades
23+
Exchange 8: 3,686,168 trades, 5.24% of total trades
24+
Exchange 15: 2,446,340 trades, 3.47% of total trades
25+
Exchange 21: 2,173,744 trades, 3.09% of total trades
26+
Exchange 7: 1,509,083 trades, 2.14% of total trades
27+
Exchange 20: 1,296,811 trades, 1.84% of total trades
28+
Exchange 18: 674,553 trades, 0.96% of total trades
29+
Exchange 13: 527,767 trades, 0.75% of total trades
30+
Exchange 2: 417,295 trades, 0.59% of total trades
31+
Exchange 3: 393,919 trades, 0.56% of total trades
32+
Exchange 17: 230,210 trades, 0.33% of total trades
33+
Exchange 1: 183,010 trades, 0.26% of total trades
34+
Exchange 9: 159,020 trades, 0.23% of total trades
35+
Exchange 14: 1,211 trades, 0.00% of total trades
36+
```
37+
38+
### **top-10-tickers.py**
39+
Identifies the top 10 most traded stocks and calculates their respective percentages of the total trades. This script provides a clear view of the market's most active stocks, highlighting where the most trading activity is concentrated.
40+
41+
```
42+
TSLA: 1,549,605 trades, 2.20% of total trades
43+
NVDA: 788,331 trades, 1.12% of total trades
44+
SPY: 669,762 trades, 0.95% of total trades
45+
AMD: 587,140 trades, 0.83% of total trades
46+
MDIA: 561,698 trades, 0.80% of total trades
47+
AAPL: 540,870 trades, 0.77% of total trades
48+
SOXL: 533,511 trades, 0.76% of total trades
49+
QQQ: 508,822 trades, 0.72% of total trades
50+
CADL: 466,604 trades, 0.66% of total trades
51+
AMZN: 465,526 trades, 0.66% of total trades
52+
```
53+
54+
### **trades-histogram.py**
55+
Creates a histogram that aggregates trades into 30-minute intervals throughout the day. This visualization helps understand the distribution of trading volume across different times, including pre-market, regular trading hours, and after-hours.
56+
57+
![Histogram Visualization](./histogram.png)
58+
59+
## Download the Data
60+
61+
First, let's download an actual file and explore the data to see what we can learn. We start by downloading the trades for 2024-04-05 via the [File Browser](https://polygon.io/flat-files/stocks-trades/2024/04). The `us_stocks_sip/trades_v1/2024/04/2024-04-05.csv.gz` file is about 1.35GB and is in a compressed gzip format.
62+
63+
```
64+
gunzip 2024-04-05.csv.gz
65+
```
66+
67+
## Getting Started
68+
69+
To run these scripts, you will need Python 3 and several dependencies installed, including pandas, matplotlib, seaborn, and pytz. Ensure that you have the trading data file available and modify the `file_path` variable in each script to point to your data file location.
70+
71+
```
72+
pip install pandas matplotlib seaborn pytz
73+
```
74+
75+
## Usage
76+
77+
Each script is designed to be run independently:
78+
79+
```bash
80+
python exchange-heatmap.py
81+
python exchanges-seen.py
82+
python top-10-tickers.py
83+
python trades-histogram.py
84+
```
85+
86+
Adjust the script parameters as necessary to fit your specific analysis needs or to accommodate different datasets.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# Here's a Python script for analyzing the dataset that identifies the top 10
# most traded stocks and calculates their respective percentages of the total
# trades. Please see https://polygon.io/blog/insights-from-trade-level-data
#
import pandas as pd

# Replace '2024-04-05.csv' with the path to your actual file
file_path = "2024-04-05.csv"

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Count the number of trades for each ticker (sorted most-active first)
trade_counts = df["ticker"].value_counts()

# Calculate the total number of trades
total_trades = trade_counts.sum()

# Get the top 10 traded stocks
top_10_traded = trade_counts.head(10)

# Print out the top 10 traded stocks and their percentage of total trades.
# FIX: counts are comma-grouped ({count:,}) to match the sample output
# documented in the README (e.g. "TSLA: 1,549,605 trades, ...").
for ticker, count in top_10_traded.items():
    percentage = (count / total_trades) * 100
    print(f"{ticker}: {count:,} trades, {percentage:.2f}% of total trades")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
# To visualize these dynamics, we can use a Python script to create a histogram
# aggregating trades into 30-minute intervals, providing a clear view of when
# trading activity concentrates during the day. This analysis aims to highlight
# the distribution of trading volume across the day, from pre-market to after-
# hours. Please see https://polygon.io/blog/insights-from-trade-level-data
#
import pandas as pd
import matplotlib.pyplot as plt

# Replace '2024-04-05.csv' with the path to your actual file
file_path = "2024-04-05.csv"

# Load the CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

# Convert 'participant_timestamp' to datetime (assuming nanoseconds Unix timestamp)
df["participant_timestamp"] = pd.to_datetime(
    df["participant_timestamp"], unit="ns", utc=True
)

# Convert to Eastern Time (ET), accounting for both EST and EDT
df["participant_timestamp"] = df["participant_timestamp"].dt.tz_convert(
    "America/New_York"
)

# Create a new column for 30-minute time intervals, now in ET.
# NOTE: "30min" replaces the "30T" alias, which is deprecated in pandas 2.2+.
df["time_interval"] = df["participant_timestamp"].dt.floor("30min")

# Aggregate trades into 30-minute intervals for the entire dataset
trade_counts_per_interval = df.groupby("time_interval").size()

# Prepare the plot
plt.figure(figsize=(15, 7))

# Plotting the histogram/bar chart
bars = plt.bar(
    trade_counts_per_interval.index, trade_counts_per_interval.values, width=0.02
)

# Adding trade count annotations on each bar
for bar in bars:
    height = bar.get_height()
    plt.annotate(
        f"{int(height)}",
        xy=(bar.get_x() + bar.get_width() / 2, height),
        xytext=(0, 3),  # 3 points vertical offset
        textcoords="offset points",
        ha="center",
        va="bottom",
    )

plt.title("Trade Counts Aggregated by 30-Minute Intervals (ET)")
plt.xlabel("Time Interval (ET)")
plt.ylabel("Number of Trades")

# Ensure that every 30-minute interval is represented on the x-axis.
# FIX: tick locations must be set before the labels — calling set_xticklabels
# alone with a different number of labels than existing ticks fails in modern
# matplotlib (and previously overrode the earlier rotation silently).
ax = plt.gca()
ax.set_xticks(trade_counts_per_interval.index)
ax.set_xticklabels(
    [t.strftime("%Y-%m-%d %H:%M") for t in trade_counts_per_interval.index],
    rotation=90,
)

plt.tight_layout()
plt.show()

polygon/rest/models/conditions.py

+20-12
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,16 @@ class UpdateRules:
4747
@staticmethod
def from_dict(d):
    """Build an UpdateRules instance from a raw response dict.

    A key that is absent from ``d`` maps to ``None``; a present key is
    parsed via the corresponding model's ``from_dict``.
    """
    consolidated = (
        Consolidated.from_dict(d["consolidated"]) if "consolidated" in d else None
    )
    market_center = (
        MarketCenter.from_dict(d["market_center"]) if "market_center" in d else None
    )
    return UpdateRules(
        consolidated=consolidated,
        market_center=market_center,
    )
5761

5862

@@ -82,11 +86,15 @@ def from_dict(d):
8286
id=d.get("id", None),
8387
legacy=d.get("legacy", None),
8488
name=d.get("name", None),
85-
sip_mapping=None
86-
if "sip_mapping" not in d
87-
else SipMapping.from_dict(d["sip_mapping"]),
89+
sip_mapping=(
90+
None
91+
if "sip_mapping" not in d
92+
else SipMapping.from_dict(d["sip_mapping"])
93+
),
8894
type=d.get("type", None),
89-
update_rules=None
90-
if "update_rules" not in d
91-
else UpdateRules.from_dict(d["update_rules"]),
95+
update_rules=(
96+
None
97+
if "update_rules" not in d
98+
else UpdateRules.from_dict(d["update_rules"])
99+
),
92100
)

polygon/rest/models/contracts.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,11 @@ class OptionsContract:
3232
@staticmethod
3333
def from_dict(d):
3434
return OptionsContract(
35-
additional_underlyings=None
36-
if "additional_underlyings" not in d
37-
else [Underlying.from_dict(u) for u in d["additional_underlyings"]],
35+
additional_underlyings=(
36+
None
37+
if "additional_underlyings" not in d
38+
else [Underlying.from_dict(u) for u in d["additional_underlyings"]]
39+
),
3840
cfi=d.get("cfi", None),
3941
contract_type=d.get("contract_type", None),
4042
correction=d.get("correction", None),

0 commit comments

Comments
 (0)