update sensitive word filtering module

Alannikos · Nov 19, 2024 · 50abb86 · 50abb86
1 parent a83e1f1
commit 50abb86
Show file tree

Hide file tree

Showing 6 changed files with 61 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -29,7 +29,8 @@ _____________________________________________________________________
 <details open>
 <summary><b>2024</b></summary>
 
-- [2024/11/18] 🌟🔧 Completed **Quick Start** section documentation updates and resolved environment setup bugs.
+- [2024/11/19] 🔒🛡️ Incorporated a sensitive-word filtering module to prevent the model from outputting harmful viewpoints.
+- [2024/11/18] 🔧🌟 Completed **Quick Start** section documentation updates and resolved environment setup bugs.
 - [2024/11/14] 🎉✨ Successfully quantized four models using the LMDeploy tool. The FunGPT family welcomes new members! Models are now available on [HuggingFace](https://huggingface.co/Alannikos768).
 - [2024/11/13] 🎉✨ Released two new 1.8B models, expanding the FunGPT family! The models are [BanterBot_1_8b-chat](https://huggingface.co/Alannikos768/BanterBot_1_8b-chat) and [BoostBot_1_8b-chat](https://huggingface.co/Alannikos768/BoostBot_1_8b-chat).
 - [2024/11/10] 🎉✨ Launched two brand-new 7B models, adding new members to the FunGPT family! The models are [BanterBot-7b-chat](https://huggingface.co/Alannikos768/BanterBot-7b-chat) and [BoostBot-7b-chat](https://huggingface.co/Alannikos768/BoostBot-7b-chat).

diff --git a/README_en.md b/README_en.md
@@ -29,7 +29,8 @@ _____________________________________________________________________
 <details open>
 <summary><b>2024</b></summary>
 
-- [2024/11/18] 🌟🔧 Completed **Quick Start** section documentation updates and resolved environment setup bugs.
+- [2024/11/19] 🔒🛡️ Incorporated a sensitive-word filtering module to prevent the model from outputting harmful viewpoints.
+- [2024/11/18] 🔧🌟 Completed **Quick Start** section documentation updates and resolved environment setup bugs.
 - [2024/11/14] 🎉✨ Successfully quantized four models using the LMDeploy tool. The FunGPT family welcomes new members! Models are now available on [HuggingFace](https://huggingface.co/Alannikos768).
 - [2024/11/13] 🎉✨ Released two new 1.8B models, expanding the FunGPT family! The models are [BanterBot_1_8b-chat](https://huggingface.co/Alannikos768/BanterBot_1_8b-chat) and [BoostBot_1_8b-chat](https://huggingface.co/Alannikos768/BoostBot_1_8b-chat).
 - [2024/11/10] 🎉✨ Launched two brand-new 7B models, adding new members to the FunGPT family! The models are [BanterBot-7b-chat](https://huggingface.co/Alannikos768/BanterBot-7b-chat) and [BoostBot-7b-chat](https://huggingface.co/Alannikos768/BoostBot-7b-chat).

diff --git a/README_zh.md b/README_zh.md
@@ -28,7 +28,8 @@ _____________________________________________________________________
 <details open>
 <summary><b>2024</b></summary>
 
-- \[2024/11/18\] 🌟🔧 完成**快速使用**部分的文档更新，解决关于环境搭建的bug。
+- \[2024/11/19\] 🔒🛡️ 加入敏感词处理模块，防止模型输出有害观点。
+- \[2024/11/18\] 🔧🌟 完成**快速使用**部分的文档更新，解决关于环境搭建的bug。
 - \[2024/11/14\] 🎉✨ 利用LMDeploy工具完成对四个模型的量化工作，FunGPT家族喜添新成员！模型已发布在[HuggingFace](https://huggingface.co/Alannikos768)。
 - \[2024/11/13\] 🎉✨ 项目推出两款1.8B模型，FunGPT家族喜添新成员！模型分别为[BanterBot_1_8b-chat](https://huggingface.co/Alannikos768/BanterBot_1_8b-chat)，以及[BoostBot_1_8b-chat](https://huggingface.co/Alannikos768/BoostBot_1_8b-chat)。
 - \[2024/11/10\] 🎉✨ 重磅推出两款全新7B模型，FunGPT家族喜添新成员！模型分别为[BanterBot-7b-chat](https://huggingface.co/Alannikos768/BanterBot-7b-chat)，以及[BoostBot-7b-chat](https://huggingface.co/Alannikos768/BoostBot-7b-chat)。

diff --git a/Utils/common_utils.py b/Utils/common_utils.py
@@ -1,6 +1,9 @@
 import sys
 import requests
 import streamlit as st
+import jieba
+import pickle
+from flashtext import KeywordProcessor
 
 sys.path.append("/root/Project_FunGPT/FunGPT/")
 
@@ -12,7 +15,47 @@
 cur_query_prompt = '<|im_start|>user\n{user}<|im_end|>\n\
     <|im_start|>assistant\n'
 
+
+class SecureSystem:
+    """
+    Sensitive-word detector: loads a pickled word list and checks text against it with flashtext.
+    """
+
+    def __init__(self):
+        self.sensitive_path = Config.PROJECT_PATH / "Data/BanterBot/sensitive_words/sensitive_words.pkl"
+
+        # Load the word list from self.sensitive_path into self.sensitive_words
+        self.load_words()
+
+    def load_words(self):
+        with open(self.sensitive_path, 'rb') as f:
+            sensitive_words = pickle.load(f)  # NOTE(review): pickle.load can execute arbitrary code; the file must be trusted
+
+        self.sensitive_words = sensitive_words
+
+    def tokenize(self, text):
+        """
+        Segment the input with jieba and return the tokens joined by single spaces.
+        """
+        tokenized_text = " ".join(jieba.cut(text))
+        return tokenized_text
+
+    def check(self, text):
+        """
+        Return True if flashtext extracts at least one sensitive word from the tokenized text, else False.
+        """
+
+        tokenized_text = self.tokenize(text)
+
+        processor = KeywordProcessor()  # NOTE(review): rebuilt from the full word list on every call
+        processor.add_keywords_from_list(self.sensitive_words)
+        isContain = len(processor.extract_keywords(tokenized_text)) > 0
+        return isContain
+
 def initialize_session_state():
+    if "SecureSystem" not in st.session_state:
+        st.session_state.SecureSystem = SecureSystem()
+
     if "LLM_Model" not in st.session_state:
         st.session_state.LLM_Model = None
     if "ASR_Model" not in st.session_state:
@@ -54,6 +97,9 @@ def combine_history(prompt):
     return total_prompt
 
 def initialize_session_state_p2():
+    if "SecureSystem" not in st.session_state:
+        st.session_state.SecureSystem = SecureSystem()
+
     if "LLM_Model_p2" not in st.session_state:
         st.session_state.LLM_Model_p2 = None
     if "ASR_Model_p2" not in st.session_state:
@@ -102,4 +148,4 @@ def load_lottieurl(url: str):
     r = requests.get(url)
     if r.status_code != 200:
         return None
-    return r.json()
+    return r.json()
diff --git a/Utils/data_utils.py b/Utils/data_utils.py
@@ -9,7 +9,6 @@
 sys.path.append("/root/Project_FunGPT/FunGPT")
 from Utils.common_utils import get_avatar, combine_history, combine_history_p2
 
-
 def get_audio_input():
     with st.sidebar:
         st.markdown("---")
@@ -68,9 +67,15 @@ def get_audio_input():
     return None
 
 def show_dialog_interface(user_input, mode=1):
-    
+
     # 如果有用户输入,处理它
     if user_input:
+        # 进行敏感词检测
+        if st.session_state.SecureSystem.check(user_input):
+            st.chat_message('user', avatar=get_avatar("User_v1")).markdown(user_input)
+            st.chat_message('robot', avatar=get_avatar("BoostBot" if mode==1 else "BanterBot")).markdown("⚠️ 您输入的内容含有敏感词，请修改后再试。")
+            return
+
         with st.chat_message('user', avatar=get_avatar("User_v1")):
             st.markdown(user_input)
         if mode == 1:

diff --git a/requirements.txt b/requirements.txt
@@ -268,3 +268,4 @@ wheel==0.44.0
 wrapt==1.16.0
 xformers==0.0.23.post1
 zhipuai==2.1.5.20230904
+flashtext==2.7