11_10 update

Alannikos · Nov 10, 2024 · 1f5a551 · 1f5a551
1 parent bc10b7b
commit 1f5a551
Show file tree

Hide file tree

Showing 44 changed files with 743 additions and 190 deletions.
diff --git a/ASR/__pycache__/__init__.cpython-310.pyc b/ASR/__pycache__/__init__.cpython-310.pyc
diff --git a/ASR/models/__pycache__/sensevoice.cpython-310.pyc b/ASR/models/__pycache__/sensevoice.cpython-310.pyc
diff --git a/ASR/models/sensevoice.py b/ASR/models/sensevoice.py
@@ -15,12 +15,18 @@ def __init__(self, model_path):
         self.load_model(model_path)
 
     def load_model(self, model_path):
+        """
+        Load the SenseVoice model.
+
+        Builds an ``AutoModel`` from ``model_path`` with the "fsmn-vad" VAD
+        model on device "cuda:0" and stores it on ``self.model``.
+        """
         self.model = AutoModel(model=model_path,
                         vad_model="fsmn-vad",
                         vad_kwargs={"max_single_segment_time": 30000},
                         trust_remote_code=True, device="cuda:0")
 
     def save_wavs(self, wav_bytes):
+        """
+        保存语音（用户输入）
+        """
         save_file = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".wav"
         wav_save_path = str(Path("./Work_dirs/ASR").joinpath(save_file).absolute())
 
@@ -43,8 +49,6 @@ def generate(self, wav_path):
         new_text = re.sub("<.*?>", "", texts[0]["text"])
         return new_text
 
-    # def record_voice(self):
-
 if __name__ == "__main__":
     model_path = "/root/Project_FunGPT/Developing/ASR/weights/SenseVoiceSmall"
     model = Sensevoice(model_path=model_path)

diff --git a/ASR/readme.md b/ASR/readme.md
@@ -0,0 +1 @@
+ASR模型的相关组件
diff --git a/Data/BanterBot/feasible_data/readme.md b/Data/BanterBot/feasible_data/readme.md
@@ -0,0 +1 @@
+存放格式正确的微调数据集
diff --git a/Data/BanterBot/raw/readme.md b/Data/BanterBot/raw/readme.md
@@ -0,0 +1 @@
+存放关于主题或者关键字的数据
diff --git a/Data/Dui_LLM/raw/readme.md → Data/BanterBot/readme.md b/Data/Dui_LLM/raw/readme.md → Data/BanterBot/readme.md
diff --git a/Data/BanterBot/sample/readme.md b/Data/BanterBot/sample/readme.md
@@ -0,0 +1 @@
+存放初步生成的数据集
diff --git a/Data/BanterBot/scripts/filter_bad_from_conv_data.py b/Data/BanterBot/scripts/filter_bad_from_conv_data.py
@@ -0,0 +1,63 @@
+import json
+import os
+from tqdm import tqdm
+
+
+def filter_bad_conversations(data):
+    """Drop conversation turns whose input or output is missing or empty.
+
+    Args:
+        data: Iterable of dicts, each holding a ``'conversation'`` list of
+            turn dicts.
+
+    Returns:
+        A new list of shallow-copied items whose ``'conversation'`` keeps the
+        first entry unconditionally plus every later turn that has non-empty
+        ``'input'`` and ``'output'`` values. Rejected turns are printed.
+        Items left with no turns are omitted.
+    """
+    filtered_data = []
+
+    for item in tqdm(data, desc="处理对话数据"):
+        filtered_conversations = []
+
+        for idx, conv in enumerate(item['conversation']):
+            # The first entry is treated as the system message and is always kept.
+            if idx == 0:
+                filtered_conversations.append(conv)
+                continue
+
+            # .get() makes a turn with a missing field count as bad data
+            # instead of aborting the whole run with a KeyError.
+            if conv.get('input') and conv.get('output'):
+                filtered_conversations.append(conv)
+            else:
+                print(conv)
+                print("=======================")
+
+        if filtered_conversations:
+            filtered_item = item.copy()
+            filtered_item['conversation'] = filtered_conversations
+            filtered_data.append(filtered_item)
+
+    return filtered_data
+
+def main():
+    """Filter empty turns out of a conversation dataset and save the result.
+
+    Reads the JSON file at the hard-coded input path, passes it through
+    ``filter_bad_conversations`` and writes the outcome to the hard-coded
+    output path. Problems are reported on stdout; the function returns early
+    when the input file is missing or is not valid JSON.
+    """
+    # Locations of the source dataset and of the filtered copy.
+    input_file = "/FunGPT/Data/Kua_LLM/feasible_data/ft_data_all.json"
+    output_file = "/FunGPT/Data/Dui_LLM/feasible_data/ft_data_all_filtered.json"
+
+    # Read the whole dataset; give up early if it is missing or malformed.
+    try:
+        with open(input_file, 'r', encoding='utf-8') as source:
+            records = json.load(source)
+    except json.JSONDecodeError as e:
+        print(e)
+        return
+    except FileNotFoundError:
+        print(f"找不到输入文件: {input_file}")
+        return
+
+    kept_records = filter_bad_conversations(records)
+
+    # Persist the result; any failure here is reported rather than raised.
+    try:
+        with open(output_file, 'w', encoding='utf-8') as target:
+            json.dump(kept_records, target, ensure_ascii=False, indent=2)
+        print(f"\n已将过滤后的数据保存到: {output_file}")
+    except Exception as e:
+        print(f"保存文件时出错: {str(e)}")
+
+if __name__ == "__main__":
+    main()
diff --git a/Data/BanterBot/scripts/filter_sensitive_words_from_conv_data.py b/Data/BanterBot/scripts/filter_sensitive_words_from_conv_data.py
@@ -0,0 +1,108 @@
+import json
+import os
+from tqdm import tqdm
+
+def load_sensitive_words(folder_path):
+    """Load sensitive words from every ``.txt`` file in a folder.
+
+    Each file is read as one word per line; surrounding whitespace is
+    stripped and blank lines are ignored.
+
+    Args:
+        folder_path: Directory containing the word-list files.
+
+    Returns:
+        A sorted list of the unique words found across all readable files.
+
+    Raises:
+        FileNotFoundError: If ``folder_path`` does not exist.
+    """
+    if not os.path.exists(folder_path):
+        raise FileNotFoundError(f"敏感词文件夹不存在: {folder_path}")
+
+    sensitive_words = set()  # a set de-duplicates words shared between files
+
+    # A sorted listing keeps the progress output in a stable order.
+    for filename in sorted(os.listdir(folder_path)):
+        if not filename.endswith('.txt'):
+            continue
+
+        file_path = os.path.join(folder_path, filename)
+        try:
+            # utf-8-sig also reads plain UTF-8 but drops a leading BOM, which
+            # would otherwise stay glued to the first word of the file.
+            with open(file_path, 'r', encoding='utf-8-sig') as f:
+                words = {line.strip() for line in f if line.strip()}
+            sensitive_words.update(words)
+            print(f"已加载敏感词文件: {filename}")
+        except (OSError, UnicodeDecodeError) as e:
+            # Best effort: report the unreadable file and keep loading the rest.
+            print(f"加载文件 {filename} 时出错: {str(e)}")
+
+    # Sorting makes the returned list reproducible across runs.
+    return sorted(sensitive_words)
+
+def filter_sensitive_conversations(data, sensitive_words):
+    """Drop conversation turns that contain any sensitive word.
+
+    Matching is a case-insensitive substring test against the turn's input
+    and output joined by a single space.
+
+    Args:
+        data: Iterable of dicts, each holding a ``'conversation'`` list of
+            turn dicts.
+        sensitive_words: Iterable of words to reject.
+
+    Returns:
+        A new list of shallow-copied items whose ``'conversation'`` keeps the
+        first entry unconditionally plus every later turn free of sensitive
+        words. Turns lacking ``'input'`` or ``'output'`` are dropped, and
+        items left with no turns are omitted.
+    """
+    # Lower-case the words once: the text is lower-cased before matching, so a
+    # word containing upper-case letters could otherwise never match. Empty
+    # strings are skipped because "" is a substring of every text.
+    lowered_words = [word.lower() for word in sensitive_words if word]
+    filtered_data = []
+
+    for item in tqdm(data, desc="处理对话数据"):
+        filtered_conversations = []
+
+        for idx, conv in enumerate(item['conversation']):
+            # The first entry is treated as the system message and is always kept.
+            if idx == 0:
+                filtered_conversations.append(conv)
+                continue
+
+            try:
+                text_to_check = f"{conv['input']} {conv['output']}".lower()
+            except (KeyError, TypeError):
+                # Malformed turn (missing field or not a mapping): drop it.
+                continue
+
+            if any(word in text_to_check for word in lowered_words):
+                print("=======================")
+            else:
+                filtered_conversations.append(conv)
+
+        if filtered_conversations:
+            filtered_item = item.copy()
+            filtered_item['conversation'] = filtered_conversations
+            filtered_data.append(filtered_item)
+
+    return filtered_data
+
+def main():
+    """Remove turns containing sensitive words from a dataset and save it.
+
+    Loads the word lists from the hard-coded folder, reads the dataset at the
+    hard-coded input path, passes both to ``filter_sensitive_conversations``
+    and writes the outcome to the hard-coded output path. Problems are
+    reported on stdout; the function returns early when the word lists or the
+    input data cannot be loaded.
+    """
+    # Locations of the word lists, the source dataset and the filtered copy.
+    sensitive_words_folder = "/FunGPT/Data/Dui_LLM/sensitive_words/words"
+    input_file = "/FunGPT/Data/Kua_LLM/sample/multi_conversation_1.jsonl"
+    output_file = "/FunGPT/Data/Dui_LLM/feasible_data/ft_data_1_filtered.json"
+
+    # Step 1: collect the sensitive words; nothing can be filtered without them.
+    try:
+        sensitive_words = load_sensitive_words(sensitive_words_folder)
+        print(f"总共加载了 {len(sensitive_words)} 个敏感词")
+    except Exception as e:
+        print(f"加载敏感词失败: {str(e)}")
+        return
+
+    # Step 2: read the dataset as a single JSON document.
+    # NOTE(review): the input is named .jsonl but is parsed with one
+    # json.load call - confirm the file really holds one JSON value.
+    try:
+        with open(input_file, 'r', encoding='utf-8') as source:
+            records = json.load(source)
+    except json.JSONDecodeError as e:
+        print(e)
+        return
+    except FileNotFoundError:
+        print(f"找不到输入文件: {input_file}")
+        return
+
+    # Step 3: drop every turn that mentions a sensitive word.
+    kept_records = filter_sensitive_conversations(records, sensitive_words)
+
+    # Step 4: persist the result; any failure here is reported, not raised.
+    try:
+        with open(output_file, 'w', encoding='utf-8') as target:
+            json.dump(kept_records, target, ensure_ascii=False, indent=2)
+        print(f"\n已将过滤后的数据保存到: {output_file}")
+    except Exception as e:
+        print(f"保存文件时出错: {str(e)}")
+
+if __name__ == "__main__":
+    main()
diff --git a/...LM/scripts/generate_mutil_conv_chatglm.py → ...ot/scripts/generate_mutil_conv_chatglm.py b/...LM/scripts/generate_mutil_conv_chatglm.py → ...ot/scripts/generate_mutil_conv_chatglm.py
@@ -6,7 +6,7 @@
 import logging  
 from tqdm import tqdm
 from zhipuai import ZhipuAI
-sys.path.append("/root/Project_FunGPT/Developing/")
+sys.path.append("/FunGPT")
 
 from LLM.templates.template import Template
 
@@ -38,14 +38,49 @@ def __init__(self, api_key):
 
     def generate(self, question="我最近有点不开心，别人说我长得有点丑"):
         style_features = [
+                    "断章取义：专挑话语中的细节进行抬杠。",
+                    "逻辑跳跃：无视上下文，跳到不相关的话题。",
+                    "过度细节：过分纠结无关紧要的小问题。",
+                    "否定一切：持绝对否定态度，无论观点如何。",
+                    "夸大其词：把对方的观点极端化进行反驳。",
+                    "自我中心：忽视他人观点，强调自身正确。",
+                    "挑错纠正：找语法或用词上的毛病。",
+                    "反问激将：用问题质疑对方逻辑。",
+                    "话里有话：含蓄地表达不满或批评。",
+                    "转移焦点：引导讨论偏离原本主题。",
+                    "顾左右而言他：不正面回应问题或观点。",
+                    "绝对化表述：用“总是”“从来”等极端词语。",
+                    "喜欢对比：把不相关的事物进行比较。",
+                    "追根究底：任何问题都要深究到底。",
+                    "口是心非：表面附和，实则内含讽刺。",
+                    "吊书袋：引用过多复杂理论来证明观点。",
+                    "夸大所知：表现出博学，但不相关。",
+                    "兜圈子：绕很长的弯子以不达要害。",
+                    "滥用类比：用不适当的类比来反驳。",
+                    "不停问难：用连续的问题让对方无法回答。"
                 ]
 
         random_feature = random.choice(style_features)
         response = self.model.chat.completions.create(
             model="glm-4-plus",
             messages=[
                 {"role": "system", "content": Template.DUI_GENERATE_DATA_TEMPLATE},
-                {"role": "user", "content": f''''''},
+                {"role": "user", "content": f'''
+                目标: 1. 请生成""{question}""为场景的连续多轮对话记录
+                      2. 你是场景里的杠精，和你对话的是你杠的对象。
+                      3. 使用更加口语化和不规则的表达。
+                      4. 提出的问题要多样化，有强烈的反对意味，有脾气。
+                      5. 要符合人类的说话习惯，不讲礼貌。
+                      6. 注意回答要按照你扮演的角色进行回答，可以适当加入emoji。
+                      7. 注意回答者的语气要真实，可以适当浮夸，可以引经据典来回答。
+                      8. 最后你的语言风格和特点是""{random_feature}""
+                      9. 严格遵循规则: 请以如下格式返回生成的数据, 只返回JSON格式，json模板:  
+                            [
+                                {{
+                                    "input":"AAA","output":"BBBB" 
+                                }}
+                            ]
+                         其中input字段表示你怼的对象的话语, output字段表示怼人专家的话语'''},
             ]
         )
         return response
@@ -117,8 +152,8 @@ def postprocessing(conversation_data):
 
 def save_dialogue_to_jsonl(data):
     # 将数据保存为JSONL文件
-    with open('/root/Project_FunGPT/Developing/Data/Dui_LLM/sample/dui_multi_conversation_1.jsonl', 'w', encoding='utf-8') as f:
-        
+    with open('/FunGPT/Data/Dui_LLM/sample/dui_multi_conversation_1.jsonl', 'w', encoding='utf-8') as f:
+
         # for item in data:
         #     json_str = json.dumps(item, ensure_ascii=False)
         #     f.write(json_str + '\n')
@@ -135,7 +170,7 @@ def main():
 
 
     # 2. 遍历获取到的每个问题
-    with open("/root/Project_FunGPT/Developing/Data/Kua_LLM/raw/question_1000.json", 'r', encoding='utf-8') as file:
+    with open("/FunGPT/Data/Dui_LLM/raw/scene.json", 'r', encoding='utf-8') as file:
         data = json.load(file)
 
         # 依次读取每一个数据

diff --git a/...M/scripts/generate_mutil_topic_chatglm.py → ...t/scripts/generate_mutil_topic_chatglm.py b/...M/scripts/generate_mutil_topic_chatglm.py → ...t/scripts/generate_mutil_topic_chatglm.py
@@ -6,7 +6,7 @@
 import logging
 from tqdm import tqdm
 from zhipuai import ZhipuAI
-sys.path.append("/root/Project_FunGPT/Developing/")
+sys.path.append("/FunGPT")
 
 from LLM.templates.template import Template
 
@@ -16,14 +16,23 @@ def __init__(self, api_key):
         self.model_name = 'GLM4'
         self.model = ZhipuAI(api_key=api_key)  # 请填写自己的APIKey
 
-
     def generate(self, key_word="讽刺"):
         response = self.model.chat.completions.create(
             # model="glm-4",
             model="glm-4-plus",
             messages=[
                 {"role": "system", "content": Template.Dui_TOPIC_GENERATE_DATA_TEMPLATE},
-                {"role": "user", "content": f''''''},
+                {"role": "user", "content": f'''
+                目标: 1. 请生成一个和""{key_word}""这句话最符合的场景，只生成场景。
+                    2. 你需要结合这句话思考最合适的场景，同时确保生成的场景多样化。
+                    3. 请保证这个场景的质量，只生成一个场景。
+                    4. 具有强烈反对意图，带有故意贬低或讽刺的意味。
+                    5. 请保证场景简洁且真实，无需出现特定地点限制。
+                    6. 请勿使用明显的人物代名词。
+                    7. 请注意是你发现了某件事，然后你使用上面这句话讽刺他的场景。
+                    8. 严格遵循规则: 请严格以如下格式返回生成的数据, 确保只返回JSON格式，json模板:
+                    {{"场景": "AAA"}}
+                    其中AAA代表对应关键字的场景'''},
 
             ]
         )
@@ -41,7 +50,7 @@ def postprocessing(topic_data):
 
 def save_question_to_json(data):
     # 将数据保存为JSONL文件
-    with open('/root/Project_FunGPT/Developing/Data/Dui_LLM/raw/scene.json', 'w', encoding='utf-8') as f:
+    with open('FunGPT/Data/Dui_LLM/raw/scene.json', 'w', encoding='utf-8') as f:
 
         # for item in data:
         #     json_str = json.dumps(item, ensure_ascii=False)
@@ -57,7 +66,7 @@ def main():
     # 场景数据集合
     res = []
 
-    with open('/root/Project_FunGPT/Developing/Data/Dui_LLM/raw/key_word.txt', 'r', encoding='utf-8') as file:
+    with open('FunGPT/Data/Dui_LLM/raw/key_word.txt', 'r', encoding='utf-8') as file:
         lines_list = file.readlines()
         # 去掉每行末尾的换行符
         keywords = [line.strip() for line in lines_list]