switch from APIs to processing files

amor71 · Sep 23, 2022 · 90803a7 · 90803a7
1 parent e95a48e
commit 90803a7
Show file tree

Hide file tree

Showing 6 changed files with 549 additions and 198 deletions.
diff --git a/README.md b/README.md
@@ -4,13 +4,8 @@
 [![codecov](https://codecov.io/gh/amor71/FINRAShortData/branch/main/graph/badge.svg?token=Gy7JKcpOqh)](https://codecov.io/gh/amor71/FINRAShortData)
 
 # FINRAShortData
-Process FINRA Short Daily Data [feeds](https://developer.finra.org/docs#query_api-equity-equity_short_interest_standardized)
 
-## Prerequisite
-
-* FINRA Developer Credentials are required. If you do not yet have an account, [create one here](https://developer.finra.org/create-account?Forward_URL=https://gateway.finra.org/app/dfo-console?rcpRedirNum=1).
-
-* Once you have access, you will need to create an API key. Daily Short Data feeds are free. [click here](https://gateway.finra.org/app/api-console/add-credential) to create API credential and follow the instructions.
+Process FINRA Short Daily Data [feeds](https://www.finra.org/finra-data/browse-catalog/short-sale-volume-data/daily-short-sale-volume-files)
 
 ## Install
 
@@ -20,29 +15,30 @@ To install the package type:
 
 ## Quick start
 
-### Authenticate
+### Example 1: Daily Short Volumes for the past 2 trading days (inclusive)
 
 ```python
-from finrashortdata import auth
-token = auth(client_id=<your api client id>, secret=<your api secret>)
+import asyncio
+from finrashortdata import daily_shorts
+import pandas as pd
+
+df : pd.DataFrame = asyncio.run(daily_shorts(offset=2))
 ```
 
-### Example 1: Basic data loading & processing
+### Example 2: Daily Short Volumes for a date range
 
 ```python
+import asyncio
 from finrashortdata import daily_shorts
+from datetime import date
 import pandas as pd
-df : pd.DataFrame = daily_shorts(token)
-```
 
-### Example 2: load latest data
-```python
-from finrashortdata import daily_shorts_chunk_and_size, daily_shorts
-
-chunk, max_data = daily_shorts_chunk_and_size(token)
-df : pd.DataFrame = daily_shorts(token=token, offset=max_data-10*chunk)
+df : pd.DataFrame = asyncio.run(daily_shorts(
+    start_date=date(year=2022, month=9, day=1), 
+    end_date=date(year=2022, month=9, day=10)))
 ```
 
+*Scripts work as-is*
 
 ## Licensing
 
@@ -55,6 +51,3 @@ Use the [Issues](https://github.com/amor71/FINRAShortData/issues) section
 ## Contributing
 
 If you'd like to contribute to the project, drop me a line at mailto:[email protected]
-
-
-
diff --git a/finrashortdata/__init__.py b/finrashortdata/__init__.py
@@ -1,4 +1,3 @@
-__version__ = "0.0.13"
+__version__ = "0.1.0"
 
-from .auth import auth
-from .daily import daily_shorts, daily_shorts_chunk_and_size
+from .daily import daily_shorts
diff --git a/finrashortdata/daily.py b/finrashortdata/daily.py
@@ -1,103 +1,94 @@
-import asyncio
-import concurrent.futures
-import time
-from typing import Optional, Tuple
+import io
+from datetime import date, datetime, timezone
+from typing import List, Optional
 
 import pandas as pd
+import pandas_market_calendars
 import requests
 
 from .decorators import timeit
 
-url: str = "https://api.finra.org/data/group/OTCMarket/name/regShoDaily"
 
-
-def _requests_get(token: str, chunk_size: int, offset: int) -> pd.DataFrame:
-    r = requests.get(
-        url=url,
-        headers={
-            "Authorization": f"Bearer {token}",
-            "Accept": "application/json",
-        },
-        params={"limit": chunk_size, "offset": offset},
-    )
-    r.raise_for_status()
-
-    if r.status_code in (429, 502):
-        print(f"{url} return {r.status_code}, waiting and re-trying")
-        time.sleep(10)
-        return _requests_get(token, chunk_size, offset)
-
-    x = r.json()
-    df = pd.DataFrame(x)
-    df.rename(
-        columns={
-            "securitiesInformationProcessorSymbolIdentifier": "symbol",
-            "totalParQuantity": "volume",
-            "shortParQuantity": "shorts",
-            "shortExemptParQuantity": "exempt",
-        },
-        inplace=True,
+def _short_by_date(d: datetime) -> pd.DataFrame:
+    """Download and parse FINRA's daily short-sale volume file for one day.
+
+    Input Arguments:
+        d -> day to load; formatted as YYYYMMDD to build the file URL.
+    Returns: DataFrame indexed by ("Symbol", "date") with the added
+        "ShortPercent" and "ShortExemptPercent" columns, or an empty
+        DataFrame when there is no data for that day.
+    Raises: requests.HTTPError for an unsuccessful response other than
+        403/404.
+    """
+    base_url = f'https://cdn.finra.org/equity/regsho/daily/CNMSshvol{d.strftime("%Y%m%d")}.txt'
+    response = requests.get(base_url)
+    if response.status_code in (403, 404):
+        # Treat a missing file as "no data" so _short_iterator() can skip
+        # the day, instead of parsing an error page as pipe-delimited data.
+        # NOTE(review): assumes the CDN answers 403/404 for a day that has
+        # no published file -- confirm.
+        return pd.DataFrame()
+    response.raise_for_status()
+    df = pd.read_csv(
+        io.StringIO(response.content.decode("utf-8")),
+        sep="|",
+        engine="python",
+        # drop the last line of the file (presumably a trailer record --
+        # verify against a sample file)
+        skipfooter=1,
+        # do not interpret strings such as "NA" as missing values
+        keep_default_na=False,
     )
-    df.drop(["reportingFacilityCode", "marketCode"], axis=1, inplace=True)
+    df["date"] = d.date()
+
+    if not df.empty:
+        # the file's own "Date" column is superseded by "date" set above
+        del df["Date"]
+        df["ShortPercent"] = round(
+            100.0 * df["ShortVolume"] / df["TotalVolume"], 2
+        )
+        df["ShortExemptPercent"] = round(
+            100.0 * df["ShortExemptVolume"] / df["TotalVolume"], 2
+        )
+        # dropna() discards rows with missing values, e.g. a 0/0 percentage
+        return df.set_index(["Symbol", "date"]).sort_index().dropna()
+
     return df
 
 
-def daily_shorts_chunk_and_size(token: str) -> Tuple[int, int]:
-    """Return the optimal chunk size and total number of data-points,
+def _get_trading_holidays(
+    mcal: pandas_market_calendars.MarketCalendar,
+) -> List[str]:
+    """Return the holidays held by the calendar's holiday offset object."""
+    # NOTE(review): annotated as List[str]; verify the actual element type
+    # the calendar returns.
+    holiday_offset = mcal.holidays()
+    return holiday_offset.holidays
 
-    Chunk size is used internally, by the daily_shorts() function
-    to reduce the number of calls to the FINRA end-point,
-    it is also used as the 'offset' step when calling daily_shorts() directly with restrictions.
 
-    Input Arguments: token obtained from the auth() function.
-    Returns: tuple with chunk size followed by number of data-points to be loaded from FINRA end-point.
-    """
-    r = requests.get(
-        url=url,
-        headers={
-            "Authorization": f"Bearer {token}",
-            "Accept": "application/json",
-        },
-        params={"limit": 1},
+def _calc_start_date_from_offset(
+    mcal: pandas_market_calendars.MarketCalendar, end_date: date, offset: int
+) -> date:
+    """Return the start date lying `offset` business days back from end_date.
+
+    Input Arguments:
+        mcal -> market calendar supplying the holidays to skip.
+        end_date -> last day of the window; it counts as one of the days.
+        offset -> number of business days in the window.
+    Returns: first date of the window.
+    """
+    # n is offset - 1 because end_date itself is part of the window.
+    cbd_offset = pd.tseries.offsets.CustomBusinessDay(
+        n=offset - 1, holidays=_get_trading_holidays(mcal)
     )
-    r.raise_for_status()
-    return int(r.headers["Record-Max-Limit"]), int(r.headers["Record-Total"])
+    # Anchor on end_date rather than the current time, so that an explicit
+    # end_date passed by the caller is honoured.
+    return (pd.Timestamp(end_date) - cbd_offset).date()
+
+
+def _short_iterator(days: List) -> pd.DataFrame:
+    """Load every day in `days` and stack the non-empty results.
+
+    Input Arguments:
+        days -> days to load, each passed to _short_by_date().
+    Returns: the per-day DataFrames concatenated row-wise and sorted by
+        index, or an empty DataFrame when no day produced data.
+    """
+    frames = [
+        day_df for day_df in map(_short_by_date, days) if not day_df.empty
+    ]
+    if not frames:
+        return pd.DataFrame()
+
+    # Concatenate and sort once, rather than re-copying and re-sorting the
+    # accumulated frame for every additional day.
+    return pd.concat(frames, axis=0).sort_index()
 
 
 @timeit
 async def daily_shorts(
-    token: str, offset: int = 0, limit: Optional[int] = None
+    start_date: Optional[date] = None,
+    end_date: Optional[date] = None,
+    offset: Optional[int] = None,
 ) -> pd.DataFrame:
     """Download Daily Short details
 
     Input Arguments:
-        token -> obtained from the auth() function.
-        offset -> starting point (default 0).
-        limit -> end point (default not limit).
+        start_date -> Optional, first date (inclusive) for pulling short data.
+        end_date -> last date (inclusive) for pulling short data; defaults to today, resolved on each call.
+        offset -> If start_date not provided, calculate start date as offset (>= 1) from end_date.
     Returns: If successful returns DataFrame with all details
     """
-    chunk_size, max_records = daily_shorts_chunk_and_size(token)
-
-    if limit:
-        max_records = min(max_records, limit)
-
-    print(
-        f"loading data (chunk_size={chunk_size}, offset={offset}, max_records={max_records-offset})..."
-    )
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        loop = asyncio.get_event_loop()
-        futures = [
-            loop.run_in_executor(
-                executor, _requests_get, token, chunk_size, offset
-            )
-            for offset in range(offset, max_records, chunk_size)
-        ]
-        df = (
-            pd.concat(await asyncio.gather(*futures))
-            .groupby(["tradeReportDate", "symbol"])
-            .sum()
+    if end_date is None:
+        # Resolve "today" at call time: a date.today() default in the
+        # signature is evaluated only once, when the module is imported.
+        end_date = date.today()
+
+    # Either an explicit start_date or a positive offset is required.
+    if not start_date and not offset:
+        raise ValueError(
+            "daily_shorts(): must have either start_date or offset"
         )
+    elif not start_date and offset < 1:  # type: ignore
+        raise ValueError("daily_shorts(): offset >= 1")
 
-    df["short_percent"] = round(100.0 * df.shorts / df.volume, 1)
+    nyse = pandas_market_calendars.get_calendar("NYSE")
+    if not start_date:
+        start_date = _calc_start_date_from_offset(nyse, end_date, offset)  # type: ignore
 
-    return df
+    # The NYSE schedule index supplies the days that are actually loaded.
+    schedule = nyse.schedule(start_date=start_date, end_date=end_date)
+    days = schedule.index.to_list()
+
+    return _short_iterator(days) if len(days) else pd.DataFrame()