Skip to content

Commit

Permalink
Replaced industry codes with industry names
Browse files Browse the repository at this point in the history
Replaced industry codes with readable industry names in the backtesting data frame.
The industry mapping dataframe (dataloader.load_industry_mapping) now has 6 columns: ['Primary Industry', 'Secondary Industry', '一级行业', '二级行业', 'pri_indus_code', 'secon_indus_code'].
You can choose among industry codes, industry names in Chinese and industry names in English by modifying INDUSTRY_COLS in constants.py
  • Loading branch information
polo2444172276 committed Mar 1, 2022
1 parent 30b325a commit b59c628
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 24 deletions.
Binary file added Data/raw_data/industry_code_to_names.xlsx
Binary file not shown.
Binary file added Data/raw_data/~$industry_code_to_names.xlsx
Binary file not shown.
15 changes: 2 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,13 @@ Whenever you change the folder structure, please update the following diagram.
│   │   ├── is_st.h5
│   │   ├── is_suspended.h5
│   │   ├── listed_dates.h5
│   │   └── stock_names.h5
│   │   ├── stock_names.h5
│   │   └── industry_code_to_names.xlsx
│   ├── stock_data
│   │   ├── sh600000.csv
│   │   ...
│   │   └── sz301039.csv
├── README.md
├── __pycache__
│   ├── Dataloader_ricequant.cpython-37.pyc
│   ├── constants.cpython-37.pyc
│   ├── preprocess.cpython-37.pyc
│   └── utils.cpython-37.pyc
├── environment.yml
├── makefiles
│   ├── makefile_mac_notebook_to_py.sh
Expand All @@ -132,13 +128,6 @@ Whenever you change the folder structure, please update the following diagram.
│   └── single_factor_analysis.py
└── src
├── __init__.py
├── __pycache__
│   ├── __init__.cpython-37.pyc
│   ├── constants.cpython-37.pyc
│   ├── dataloader.cpython-37.pyc
│   ├── factor_combinator.cpython-37.pyc
│   ├── preprocess.cpython-37.pyc
│   └── utils.cpython-37.pyc
├── constants.py
├── dataloader.py
├── factor_combinator.py
Expand Down
6 changes: 2 additions & 4 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,5 @@
rebalancing_dates = pd.date_range(start=START_DATE, end=END_DATE, freq='BM')

INDEX_COLS = ['date', 'stock']
FACTORS = {
'value': ['pb_ratio_ttm', 'pe_ratio_ttm', 'pcf_ratio_ttm']
}
NECESSARY_COLS = ['market_value', 'open', 'close', 'next_period_return', 'secon_indus_code', 'pri_indus_code']
INDUSTRY_COLS = ['一级行业', '二级行业']
NECESSARY_COLS = ['market_value', 'open', 'close', 'next_period_return', ] + INDUSTRY_COLS
30 changes: 29 additions & 1 deletion src/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pathos
from src.constants import *
from concurrent.futures import ThreadPoolExecutor
from src.utils import *

# Use rq_crendential.json to fill out Ricequant credentials
# WARNING: MAKE SURE rq_crendential.json ARE NOT COMMITTED TO GITHUB
Expand Down Expand Up @@ -96,6 +97,7 @@ def get_df(name):
# stock_info_list = executor.map(get_df, csv_names)
return list(stock_info_list)

@timer
def load_basic_info():
"""
Returns:
Expand All @@ -119,6 +121,9 @@ def load_basic_info():

def load_industry_mapping():
if not os.path.exists("./Data/raw_data/industry_mapping.h5"):
        # Extract industry mapping data from ricequant if it's not on the local computer.
        # Extracting from ricequant is quite time consuming. Alternatively, you can download the data
        # from the cloud folder.
indus_to_stock = {industry: rq.industry(industry) for industry in industry_codes}
stock_to_indus = {}
for indus, stock_names in indus_to_stock.items():
Expand All @@ -128,7 +133,30 @@ def load_industry_mapping():
df_indus_mapping = pd.Series(stock_to_indus, name='secon_indus_code').to_frame()
df_indus_mapping['pri_indus_code'] = df_indus_mapping['secon_indus_code'].str[0]
df_indus_mapping.to_hdf("./Data/raw_data/industry_mapping.h5", key='industry_mapping')
df_indus_mapping = pd.read_hdf("./Data/raw_data/industry_mapping.h5", key='industry_mapping')

    # Load the full industry mapping containing the industry codes (A to S), industry names in Chinese, and industry names in English of each stock for both primary and secondary industries.
    # The full industry mapping dataframe is obtained by first loading a main dataframe mapping stocks to their industry codes, and then merging the other two dataframes, which map industry codes to industry
    # names, onto this dataframe.
    # 'industry_code_to_names.xlsx' is manually created based on information at https://www.ricequant.com/doc/rqdata/python/stock-mod.html#industry-获取某行业股票列表
df_pri_indus_names = pd.read_excel(os.path.join(DATAPATH, 'raw_data', 'industry_code_to_names.xlsx'), 'Primary Industries')
df_secon_indus_names = pd.read_excel(os.path.join(DATAPATH, 'raw_data', 'industry_code_to_names.xlsx'), 'Secondary Industries')
df_indus_mapping = pd.read_hdf("./Data/raw_data/industry_mapping.h5", key='industry_mapping').reset_index().rename(columns={'index': 'stock'})
df_indus_mapping = df_indus_mapping.merge(df_pri_indus_names, how='left', left_on='pri_indus_code', right_on='pri_indus_code' )
df_indus_mapping = df_indus_mapping.merge(df_secon_indus_names, how='left', left_on='secon_indus_code', right_on='secon_indus_code' )
df_indus_mapping = df_indus_mapping.set_index('stock')
assert(set(df_indus_mapping.columns).issuperset(
set(['Primary Industry', 'Secondary Industry', '一级行业', '二级行业', 'pri_indus_code', 'secon_indus_code'])
)
)
    # Depending on user input, choose which set of columns to use as industry names.
# if form == 'english':
# indus_cols = ['Primary Industry', 'Secondary Industry']
# elif form == '中文':
# indus_cols = ['一级行业', '二级行业']
# elif form == 'code':
# indus_cols = ['pri_indus_code', 'secon_indus_code']
# else:
# raise Exception(f"'{form}' is not a valid input for form!")
return df_indus_mapping

def load_st_data(stock_names, dates) -> pd.DataFrame:
Expand Down
6 changes: 3 additions & 3 deletions src/factor_combinator.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def combine_factors(self, df_factor_weights: pd.DataFrame):
Args:
df_factor_weights (pd.DataFrame): This dataframe gives the factor weights
Its index should be a subset of the rebalancing dates in self.df_backtest
It must contain the weight columns in self.weight_cols
It must contain all columns in self.weight_cols
The factor weights in each row should be non-negative and add up to 1.
"""
#the factor weights must be non-negative
Expand Down Expand Up @@ -97,7 +97,7 @@ def get_factor_weights(self, ):

class FactorCombinatorByIC(FactorCombinator):
"""Combines factor exposures according to the weights that maximizes IC value
See Huatai MultiFactor Report #10
See Huatai MultiFactor Report #10 华泰金工多因子系列研报-10
"""
def __init__(self, hist_periods:int=12, *args, **kwargs):

Expand Down Expand Up @@ -155,7 +155,7 @@ def get_factor_weights(self, ) -> pd.DataFrame:
2. Solves a convex optimization problem to determine which set of factor weights gives the highest expected IC value
for the combined factor. Here IC values are assumed to be linearly addable and scalable.
Returns:
pd.DataFrame: This dataframe gives the optimal factor weights
pd.DataFrame: A dataframe giving the optimal factor weights
Its index should be a subset of the rebalancing dates in self.df_backtest
It must contain the weight columns in self.weight_cols
The factor weights in each row should be non-negative and add up to 1.
Expand Down
9 changes: 6 additions & 3 deletions src/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import src.dataloader as dl
import matplotlib.pyplot as plt
import numpy as np

class TimeAndStockFilter:
"""
Expand Down Expand Up @@ -92,7 +93,7 @@ def postprocess(self):
# filter out unnecessary columns
self.df_backtest = self.df_backtest.loc[:, self.df_backtest.columns.isin(NECESSARY_COLS)]
#add primary and secondary industry codes to the dataframe
self.df_backtest = self.df_backtest.merge(dl.load_industry_mapping(), how='left', left_on='stock', right_index=True, )
self.df_backtest = self.df_backtest.merge(dl.load_industry_mapping()[INDUSTRY_COLS], how='left', left_on='stock', right_index=True, )

def run(self):
self.preprocess()
Expand Down Expand Up @@ -123,7 +124,7 @@ def get_factor_path(type, factor):
# all_factor_paths = [path for path in all_factor_paths if path not in df_backtest.columns]
print(all_factor_paths)

def get_factor_data(file_path):
def get_factor_data(file_path): #each call takes around 3 to 4 seconds
df_factor = pd.read_hdf(file_path)
df_factor = df_factor.reset_index().rename(columns={'order_book_id': 'stock'})
df_factor = df_factor[df_factor['date'].isin(rebalancing_dates)].set_index(INDEX_COLS).sort_index()
Expand All @@ -132,7 +133,9 @@ def get_factor_data(file_path):
with pathos.multiprocessing.ProcessPool(pathos.helpers.cpu_count()) as pool:
# with ThreadPoolExecutor() as pool:
factor_results = pool.map(get_factor_data, all_factor_paths)
df_factor = pd.concat(factor_results, axis=1)
df_factor = pd.concat(factor_results, axis=1)

df_factor = df_factor.replace([np.inf, -np.inf], np.nan)
df_backtest = df_backtest.merge(df_factor, how='left', left_index=True, right_index=True)
return df_backtest

Expand Down

0 comments on commit b59c628

Please sign in to comment.