Skip to content

Commit

Permalink
Replaced industry codes with industry names
Browse files Browse the repository at this point in the history
Replaced industry codes with readable industry names in the backtesting data frame.
The industry mapping dataframe (dataloader.load_industry_mapping) now has 6 columns: ['Primary Industry', 'Secondary Industry', '一级行业', '二级行业', 'pri_indus_code', 'secon_indus_code'].
You can choose among industry codes, industry names in Chinese and industry names in English by modifying INDUSTRY_COLS in constants.py
  • Loading branch information
polo2444172276 committed Mar 1, 2022
1 parent 30b325a commit b59c628
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 24 deletions.
Binary file added Data/raw_data/industry_code_to_names.xlsx
Binary file not shown.
Binary file added Data/raw_data/~$industry_code_to_names.xlsx
Binary file not shown.
15 changes: 2 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,13 @@ Whenever you change the folder structure, please update the following diagram.
│   │   ├── is_st.h5
│   │   ├── is_suspended.h5
│   │   ├── listed_dates.h5
│   │   └── stock_names.h5
│   │   ├── stock_names.h5
│   │   └── industry_code_to_names.xlsx
│   ├── stock_data
│   │   ├── sh600000.csv
│   │   ...
│   │   └── sz301039.csv
├── README.md
├── __pycache__
│   ├── Dataloader_ricequant.cpython-37.pyc
│   ├── constants.cpython-37.pyc
│   ├── preprocess.cpython-37.pyc
│   └── utils.cpython-37.pyc
├── environment.yml
├── makefiles
│   ├── makefile_mac_notebook_to_py.sh
Expand All @@ -132,13 +128,6 @@ Whenever you change the folder structure, please update the following diagram.
│   └── single_factor_analysis.py
└── src
├── __init__.py
├── __pycache__
│   ├── __init__.cpython-37.pyc
│   ├── constants.cpython-37.pyc
│   ├── dataloader.cpython-37.pyc
│   ├── factor_combinator.cpython-37.pyc
│   ├── preprocess.cpython-37.pyc
│   └── utils.cpython-37.pyc
├── constants.py
├── dataloader.py
├── factor_combinator.py
Expand Down
6 changes: 2 additions & 4 deletions src/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,5 @@
rebalancing_dates = pd.date_range(start=START_DATE, end=END_DATE, freq='BM')

INDEX_COLS = ['date', 'stock']
FACTORS = {
'value': ['pb_ratio_ttm', 'pe_ratio_ttm', 'pcf_ratio_ttm']
}
NECESSARY_COLS = ['market_value', 'open', 'close', 'next_period_return', 'secon_indus_code', 'pri_indus_code']
INDUSTRY_COLS = ['一级行业', '二级行业']
NECESSARY_COLS = ['market_value', 'open', 'close', 'next_period_return', ] + INDUSTRY_COLS
30 changes: 29 additions & 1 deletion src/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pathos
from src.constants import *
from concurrent.futures import ThreadPoolExecutor
from src.utils import *

# Use rq_crendential.json to fill out Ricequant credentials
# WARNING: MAKE SURE rq_crendential.json ARE NOT COMMITTED TO GITHUB
Expand Down Expand Up @@ -96,6 +97,7 @@ def get_df(name):
# stock_info_list = executor.map(get_df, csv_names)
return list(stock_info_list)

@timer
def load_basic_info():
"""
Returns:
Expand All @@ -119,6 +121,9 @@ def load_basic_info():

def load_industry_mapping():
if not os.path.exists("./Data/raw_data/industry_mapping.h5"):
        # Extract industry mapping data from ricequant if it's not on the local computer.
        # Extracting from ricequant is quite time consuming. Alternatively, you can download the data
        # from the cloud folder.
indus_to_stock = {industry: rq.industry(industry) for industry in industry_codes}
stock_to_indus = {}
for indus, stock_names in indus_to_stock.items():
Expand All @@ -128,7 +133,30 @@ def load_industry_mapping():
df_indus_mapping = pd.Series(stock_to_indus, name='secon_indus_code').to_frame()
df_indus_mapping['pri_indus_code'] = df_indus_mapping['secon_indus_code'].str[0]
df_indus_mapping.to_hdf("./Data/raw_data/industry_mapping.h5", key='industry_mapping')
df_indus_mapping = pd.read_hdf("./Data/raw_data/industry_mapping.h5", key='industry_mapping')

    # Load the full industry mapping containing the industry codes (A to S), industry names in Chinese, and industry names in English of each stock for both primary and secondary industries.
    # The full industry mapping dataframe is obtained by first loading a main dataframe mapping stocks to their industry codes, and then merging the other two dataframes, which map industry codes to industry
    # names, onto this dataframe.
    # 'industry_code_to_names.xlsx' is manually created based on information at https://www.ricequant.com/doc/rqdata/python/stock-mod.html#industry-获取某行业股票列表
df_pri_indus_names = pd.read_excel(os.path.join(DATAPATH, 'raw_data', 'industry_code_to_names.xlsx'), 'Primary Industries')
df_secon_indus_names = pd.read_excel(os.path.join(DATAPATH, 'raw_data', 'industry_code_to_names.xlsx'), 'Secondary Industries')
df_indus_mapping = pd.read_hdf("./Data/raw_data/industry_mapping.h5", key='industry_mapping').reset_index().rename(columns={'index': 'stock'})
df_indus_mapping = df_indus_mapping.merge(df_pri_indus_names, how='left', left_on='pri_indus_code', right_on='pri_indus_code' )
df_indus_mapping = df_indus_mapping.merge(df_secon_indus_names, how='left', left_on='secon_indus_code', right_on='secon_indus_code' )
df_indus_mapping = df_indus_mapping.set_index('stock')
assert(set(df_indus_mapping.columns).issuperset(
set(['Primary Industry', 'Secondary Industry', '一级行业', '二级行业', 'pri_indus_code', 'secon_indus_code'])
)
)
    # Depending on user input, choose which set of columns to use as industry names.
# if form == 'english':
# indus_cols = ['Primary Industry', 'Secondary Industry']
# elif form == '中文':
# indus_cols = ['一级行业', '二级行业']
# elif form == 'code':
# indus_cols = ['pri_indus_code', 'secon_indus_code']
# else:
# raise Exception(f"'{form}' is not a valid input for form!")
return df_indus_mapping

def load_st_data(stock_names, dates) -> pd.DataFrame:
Expand Down
6 changes: 3 additions & 3 deletions src/factor_combinator.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def combine_factors(self, df_factor_weights: pd.DataFrame):
Args:
df_factor_weights (pd.DataFrame): This dataframe gives the factor weights
Its index should be a subset of the rebalancing dates in self.df_backtest
It must contain the weight columns in self.weight_cols
It must contain all columns in self.weight_cols
The factor weights in each row should be non-negative and add up to 1.
"""
#the factor weights must be non-negative
Expand Down Expand Up @@ -97,7 +97,7 @@ def get_factor_weights(self, ):

class FactorCombinatorByIC(FactorCombinator):
"""Combines factor exposures according to the weights that maximizes IC value
See Huatai MultiFactor Report #10
See Huatai MultiFactor Report #10 华泰金工多因子系列研报-10
"""
def __init__(self, hist_periods:int=12, *args, **kwargs):

Expand Down Expand Up @@ -155,7 +155,7 @@ def get_factor_weights(self, ) -> pd.DataFrame:
2. Solves a convex optimization problem to determine which set of factor weights gives the highest expected IC value
for the combined factor. Here IC values are assumed to be linearly addable and scalable.
Returns:
pd.DataFrame: This dataframe gives the optimal factor weights
pd.DataFrame: A dataframe giving the optimal factor weights
Its index should be a subset of the rebalancing dates in self.df_backtest
It must contain the weight columns in self.weight_cols
The factor weights in each row should be non-negative and add up to 1.
Expand Down
9 changes: 6 additions & 3 deletions src/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pandas as pd
import src.dataloader as dl
import matplotlib.pyplot as plt
import numpy as np

class TimeAndStockFilter:
"""
Expand Down Expand Up @@ -92,7 +93,7 @@ def postprocess(self):
# filter out unnecessary columns
self.df_backtest = self.df_backtest.loc[:, self.df_backtest.columns.isin(NECESSARY_COLS)]
#add primary and secondary industry codes to the dataframe
self.df_backtest = self.df_backtest.merge(dl.load_industry_mapping(), how='left', left_on='stock', right_index=True, )
self.df_backtest = self.df_backtest.merge(dl.load_industry_mapping()[INDUSTRY_COLS], how='left', left_on='stock', right_index=True, )

def run(self):
self.preprocess()
Expand Down Expand Up @@ -123,7 +124,7 @@ def get_factor_path(type, factor):
# all_factor_paths = [path for path in all_factor_paths if path not in df_backtest.columns]
print(all_factor_paths)

def get_factor_data(file_path):
def get_factor_data(file_path): #each call takes around 3 to 4 seconds
df_factor = pd.read_hdf(file_path)
df_factor = df_factor.reset_index().rename(columns={'order_book_id': 'stock'})
df_factor = df_factor[df_factor['date'].isin(rebalancing_dates)].set_index(INDEX_COLS).sort_index()
Expand All @@ -132,7 +133,9 @@ def get_factor_data(file_path):
with pathos.multiprocessing.ProcessPool(pathos.helpers.cpu_count()) as pool:
# with ThreadPoolExecutor() as pool:
factor_results = pool.map(get_factor_data, all_factor_paths)
df_factor = pd.concat(factor_results, axis=1)
df_factor = pd.concat(factor_results, axis=1)

df_factor = df_factor.replace([np.inf, -np.inf], np.nan)
df_backtest = df_backtest.merge(df_factor, how='left', left_index=True, right_index=True)
return df_backtest

Expand Down

0 comments on commit b59c628

Please sign in to comment.