Commit 1f5a551

11_10 update
Alannikos committed Nov 10, 2024
1 parent bc10b7b commit 1f5a551
Showing 44 changed files with 743 additions and 190 deletions.
Binary file removed ASR/__pycache__/__init__.cpython-310.pyc
Binary file removed ASR/models/__pycache__/sensevoice.cpython-310.pyc
8 changes: 6 additions & 2 deletions ASR/models/sensevoice.py
@@ -15,12 +15,18 @@ def __init__(self, model_path):
self.load_model(model_path)

    def load_model(self, model_path):
        """
        Load the SenseVoice model.
        """
        self.model = AutoModel(model=model_path,
                               vad_model="fsmn-vad",
                               vad_kwargs={"max_single_segment_time": 30000},
                               trust_remote_code=True, device="cuda:0")
    def save_wavs(self, wav_bytes):
        """
        Save the user's input speech to a wav file.
        """
        save_file = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".wav"
        wav_save_path = str(Path("./Work_dirs/ASR").joinpath(save_file).absolute())

@@ -43,8 +49,6 @@ def generate(self, wav_path):
new_text = re.sub("<.*?>", "", texts[0]["text"])
return new_text

# def record_voice(self):

if __name__ == "__main__":
model_path = "/root/Project_FunGPT/Developing/ASR/weights/SenseVoiceSmall"
model = Sensevoice(model_path=model_path)
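Taken together, the hunks above give a load-once, transcribe-per-file flow. A minimal usage sketch under stated assumptions (the import path mirrors the repository layout, the wav path is a placeholder, and a CUDA device with locally downloaded SenseVoiceSmall weights is assumed):

from ASR.models.sensevoice import Sensevoice

# Load the FunASR-backed model once, then reuse it across requests
model = Sensevoice(model_path="/root/Project_FunGPT/Developing/ASR/weights/SenseVoiceSmall")

# generate() runs recognition and strips the <|...|> tag markup with the regex above
print(model.generate(wav_path="./Work_dirs/ASR/example.wav"))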
1 change: 1 addition & 0 deletions ASR/readme.md
@@ -0,0 +1 @@
Components related to the ASR model
1 change: 1 addition & 0 deletions Data/BanterBot/feasible_data/readme.md
@@ -0,0 +1 @@
Stores the correctly formatted fine-tuning datasets
1 change: 1 addition & 0 deletions Data/BanterBot/raw/readme.md
@@ -0,0 +1 @@
Stores data about topics or keywords
File renamed without changes.
1 change: 1 addition & 0 deletions Data/BanterBot/sample/readme.md
@@ -0,0 +1 @@
Stores the initially generated datasets
63 changes: 63 additions & 0 deletions Data/BanterBot/scripts/filter_bad_from_conv_data.py
@@ -0,0 +1,63 @@
import json
import os
from tqdm import tqdm


def filter_bad_conversations(data):
    filtered_data = []

    for item in tqdm(data, desc="Filtering conversation data"):
        filtered_conversations = []

        for idx, conv in enumerate(item['conversation']):
            if idx == 0:  # always keep the system turn; it has no input/output to check
                filtered_conversations.append(conv)
                continue

            # A turn is bad if either side of the exchange is empty
            bad_data = conv['input'] == "" or conv['output'] == ""

            if not bad_data:
                filtered_conversations.append(conv)
            else:
                print(conv)
                print("=======================")

        # Keep the item only if any turns survived
        if filtered_conversations:
            filtered_item = item.copy()
            filtered_item['conversation'] = filtered_conversations
            filtered_data.append(filtered_item)

    return filtered_data

def main():
    # 1. Configure paths
    input_file = "/FunGPT/Data/Kua_LLM/feasible_data/ft_data_all.json"
    output_file = "/FunGPT/Data/Dui_LLM/feasible_data/ft_data_all_filtered.json"

    # 2. Load the JSON data
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Input file not found: {input_file}")
        return
    except json.JSONDecodeError as e:
        print(e)
        return

    # 3. Run the filtering
    filtered_data = filter_bad_conversations(data)

    # 4. Save the filtered data
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(filtered_data, f, ensure_ascii=False, indent=2)
        print(f"\nFiltered data saved to: {output_file}")
    except Exception as e:
        print(f"Error while saving the file: {str(e)}")

if __name__ == "__main__":
main()
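For reference, the input this script expects would look roughly like the following (field names are inferred from the filtering code above; the values are made up):

# Hypothetical example of the expected structure of ft_data_all.json
sample = [
    {
        "conversation": [
            {"system": "...", "input": "...", "output": "..."},  # idx == 0: kept unconditionally
            {"input": "hi", "output": "hello"},                  # kept: both fields non-empty
            {"input": "", "output": "dropped"},                  # dropped: empty input
        ]
    }
]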
108 changes: 108 additions & 0 deletions Data/BanterBot/scripts/filter_sensitive_words_from_conv_data.py
@@ -0,0 +1,108 @@
import json
import os
from tqdm import tqdm

def load_sensitive_words(folder_path):
    """
    Load sensitive words from every .txt file in the given folder,
    one word per line.
    """
    sensitive_words = set()  # use a set to deduplicate

    # Make sure the folder exists
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Sensitive-word folder does not exist: {folder_path}")

    # Walk every .txt file in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    # Strip whitespace and lower-case each word so it can match
                    # the lower-cased text checked in filter_sensitive_conversations
                    words = {line.strip().lower() for line in f if line.strip()}
                    sensitive_words.update(words)
                    print(f"Loaded sensitive-word file: {filename}")
            except Exception as e:
                print(f"Error while loading {filename}: {str(e)}")

    return list(sensitive_words)  # convert back to a list

def filter_sensitive_conversations(data, sensitive_words):
    filtered_data = []

    for item in tqdm(data, desc="Filtering conversation data"):
        filtered_conversations = []

        for idx, conv in enumerate(item['conversation']):
            if idx == 0:  # always keep the system turn; it is not checked
                filtered_conversations.append(conv)
                continue

            try:
                text_to_check = f"{conv['input']} {conv['output']}"
            except KeyError:
                # Drop malformed turns that lack an input or output field
                continue

            contains_sensitive = False
            which_sensitive_word = ""

            for word in sensitive_words:
                if word in text_to_check.lower():
                    contains_sensitive = True
                    which_sensitive_word = word
                    break

            if not contains_sensitive:
                filtered_conversations.append(conv)
            else:
                # print(which_sensitive_word)
                # print(conv)
                print("=======================")

        # Keep the item only if any turns survived
        if filtered_conversations:
            filtered_item = item.copy()
            filtered_item['conversation'] = filtered_conversations
            filtered_data.append(filtered_item)

    return filtered_data

def main():
    # Configure paths
    sensitive_words_folder = "/FunGPT/Data/Dui_LLM/sensitive_words/words"
    input_file = "/FunGPT/Data/Kua_LLM/sample/multi_conversation_1.jsonl"
    output_file = "/FunGPT/Data/Dui_LLM/feasible_data/ft_data_1_filtered.json"

    # 1. Load the sensitive words
    try:
        sensitive_words = load_sensitive_words(sensitive_words_folder)
        print(f"Loaded {len(sensitive_words)} sensitive words in total")
    except Exception as e:
        print(f"Failed to load sensitive words: {str(e)}")
        return

    # 2. Load the JSON data
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Input file not found: {input_file}")
        return
    except json.JSONDecodeError as e:
        print(e)
        return

    # 3. Run the filtering
    filtered_data = filter_sensitive_conversations(data, sensitive_words)

    # 4. Save the filtered data
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(filtered_data, f, ensure_ascii=False, indent=2)
        print(f"\nFiltered data saved to: {output_file}")
    except Exception as e:
        print(f"Error while saving the file: {str(e)}")

if __name__ == "__main__":
main()
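A quick sanity check of the matching rule (the word list and conversation turn are made up, purely illustrative):

# Words are lower-cased at load time and text at check time,
# so matching is effectively case-insensitive.
sensitive_words = ["spoiler"]
conv = {"input": "No Spoilers please!", "output": "Fine."}
text_to_check = f"{conv['input']} {conv['output']}"
print(any(w in text_to_check.lower() for w in sensitive_words))  # True -> turn dropped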
@@ -6,7 +6,7 @@
import logging
from tqdm import tqdm
from zhipuai import ZhipuAI
sys.path.append("/root/Project_FunGPT/Developing/")
sys.path.append("/FunGPT")

from LLM.templates.template import Template

@@ -38,14 +38,49 @@ def __init__(self, api_key):

    def generate(self, question="I've been a bit down lately; people say I'm kind of ugly"):
        style_features = [
            "Quoting out of context: picking at minor details just to argue.",
            "Logical leaps: ignoring context and jumping to unrelated topics.",
            "Obsessing over details: fixating on trivial, irrelevant points.",
            "Negating everything: taking an absolutely negative stance no matter the argument.",
            "Exaggeration: pushing the other side's view to an extreme before rebutting it.",
            "Self-centeredness: ignoring others' views and insisting on being right.",
            "Nitpicking corrections: finding fault with grammar or word choice.",
            "Rhetorical goading: challenging the other side's logic with pointed questions.",
            "Veiled remarks: expressing dissatisfaction or criticism indirectly.",
            "Shifting focus: steering the discussion away from the original topic.",
            "Evasion: never responding to the question or point head-on.",
            "Absolute statements: using extreme words such as 'always' and 'never'.",
            "Forced comparisons: comparing things that have nothing to do with each other.",
            "Relentless probing: digging into every question to the bitter end.",
            "Insincere agreement: agreeing on the surface while actually being sarcastic.",
            "Pedantry: citing piles of complex theory to prove a point.",
            "Showing off: parading erudition that is beside the point.",
            "Beating around the bush: taking long detours and never reaching the point.",
            "Abusing analogies: rebutting with inappropriate analogies.",
            "Endless interrogation: firing off question after question so the other side cannot answer."
        ]

        random_feature = random.choice(style_features)
        response = self.model.chat.completions.create(
            model="glm-4-plus",
            messages=[
                {"role": "system", "content": Template.DUI_GENERATE_DATA_TEMPLATE},
                {"role": "user", "content": f''''''},
                {"role": "user", "content": f'''
                Goals: 1. Generate a continuous multi-turn conversation log for the scenario ""{question}"".
                2. You are the contrarian troll in the scenario, and the person you are talking to is your target.
                3. Use colloquial, irregular phrasing.
                4. The questions raised should be varied, strongly contrarian, and short-tempered.
                5. Speak the way real people do, without politeness.
                6. Answer in character; emoji may be sprinkled in.
                7. Keep the tone authentic; mild exaggeration and literary allusions are allowed.
                8. Finally, your language style and trait is ""{random_feature}""
                9. Strictly follow this rule: return the generated data in the format below, JSON only. JSON template:
                [
                    {{
                        "input":"AAA","output":"BBBB"
                    }}
                ]
                where input is what your target says and output is what the roast expert says'''},
            ]
        )
        return response
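generate() returns the raw SDK response, so the JSON list still has to be extracted downstream. A hedged sketch of that step (attribute access follows the zhipuai SDK already used above; the fence handling is an assumption about how glm-4-plus may wrap its output):

import json

reply = response.choices[0].message.content.strip()
if reply.startswith("```"):
    # strip a ```json ... ``` fence if the model added one
    reply = reply.strip("`").removeprefix("json").strip()
try:
    turns = json.loads(reply)  # expected shape: [{"input": "...", "output": "..."}, ...]
except json.JSONDecodeError:
    turns = []  # drop replies that are not valid JSON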
@@ -117,8 +152,8 @@ def postprocessing(conversation_data):

def save_dialogue_to_jsonl(data):
    # Save the data as a JSONL file
with open('/root/Project_FunGPT/Developing/Data/Dui_LLM/sample/dui_multi_conversation_1.jsonl', 'w', encoding='utf-8') as f:
with open('/FunGPT/Data/Dui_LLM/sample/dui_multi_conversation_1.jsonl', 'w', encoding='utf-8') as f:

# for item in data:
# json_str = json.dumps(item, ensure_ascii=False)
# f.write(json_str + '\n')
@@ -135,7 +170,7 @@ def main():


    # 2. Iterate over each retrieved question
with open("/root/Project_FunGPT/Developing/Data/Kua_LLM/raw/question_1000.json", 'r', encoding='utf-8') as file:
with open("/FunGPT/Data/Dui_LLM/raw/scene.json", 'r', encoding='utf-8') as file:
data = json.load(file)

    # Read each record in turn
@@ -6,7 +6,7 @@
import logging
from tqdm import tqdm
from zhipuai import ZhipuAI
sys.path.append("/root/Project_FunGPT/Developing/")
sys.path.append("/FunGPT")

from LLM.templates.template import Template

@@ -16,14 +16,23 @@ def __init__(self, api_key):
self.model_name = 'GLM4'
        self.model = ZhipuAI(api_key=api_key)  # fill in your own API key


    def generate(self, key_word="sarcasm"):
        response = self.model.chat.completions.create(
            # model="glm-4",
            model="glm-4-plus",
            messages=[
                {"role": "system", "content": Template.Dui_TOPIC_GENERATE_DATA_TEMPLATE},
                {"role": "user", "content": f''''''},
                {"role": "user", "content": f'''
                Goals: 1. Generate the single scenario that best fits the phrase ""{key_word}""; generate the scenario only.
                2. Think about the most fitting scenario for the phrase, and keep the generated scenarios diverse.
                3. Ensure the scenario is high quality; generate exactly one.
                4. It should carry strong opposition, with a deliberately belittling or sarcastic flavor.
                5. Keep the scenario concise and realistic, with no specific location constraints.
                6. Do not use obvious personal pronouns.
                7. The framing is that you noticed something and then used the phrase above to mock the person.
                8. Strictly follow this rule: return the generated data exactly in the format below, JSON only. JSON template:
                {{"场景": "AAA"}}
                where AAA is the scenario for the given keyword'''},

]
)
@@ -41,7 +50,7 @@ def postprocessing(topic_data):

def save_question_to_json(data):
    # Save the data to a JSON file
with open('/root/Project_FunGPT/Developing/Data/Dui_LLM/raw/scene.json', 'w', encoding='utf-8') as f:
    with open('/FunGPT/Data/Dui_LLM/raw/scene.json', 'w', encoding='utf-8') as f:

# for item in data:
# json_str = json.dumps(item, ensure_ascii=False)
Expand All @@ -57,7 +66,7 @@ def main():
    # Collected scenario data
res = []

with open('/root/Project_FunGPT/Developing/Data/Dui_LLM/raw/key_word.txt', 'r', encoding='utf-8') as file:
    with open('/FunGPT/Data/Dui_LLM/raw/key_word.txt', 'r', encoding='utf-8') as file:
lines_list = file.readlines()
        # Strip the trailing newline from each line
keywords = [line.strip() for line in lines_list]