Open
Description
import json
from pygtrans import Translate
# Load the original dataset: read the whole UTF-8 file, then parse it as JSON.
with open('CoT_data.json', encoding='utf-8') as source_file:
    original_data = json.loads(source_file.read())
# Initialise the translation client. Both HTTP and HTTPS traffic go through
# the same proxy endpoint.
# NOTE(review): the proxy address is a masked placeholder ("xx") and
# presumably has to be replaced with a real host before running.
proxy_url = 'http://172.xx.xx.xxx:7899'
client = Translate(
    domain='com.bo',
    proxies={'http': proxy_url, 'https': proxy_url},
)
# Create the container for the Tibetan-only dataset: one record per original
# item, with every field starting out as an empty string. Each record is its
# own dict so that filling one in never affects another.
translated_data = [
    dict.fromkeys(('instruction', 'input', 'output'), '')
    for _ in original_data
]
# Core logic: process the dataset one field at a time, translating in batches.
for field in ['instruction', 'input', 'output']:
    # Collect the non-empty values of this field and remember which record
    # each one came from, so translations can be written back in place.
    texts = []
    indexes = []
    for idx, item in enumerate(original_data):
        if field in item and item[field]:
            texts.append(item[field])
            indexes.append(idx)

    # Translate the current field in batches of 100 texts per request.
    results = []
    for i in range(0, len(texts), 100):
        batch = texts[i:i + 100]
        response = client.translate(batch, target='bo')
        # NOTE(review): pygtrans appears to return a non-iterable error
        # object (rather than raising) when a request fails -- confirm
        # against the library docs. Fail with a clear message instead of an
        # obscure TypeError from list.extend().
        try:
            translated = list(response)
        except TypeError as err:
            raise RuntimeError(
                f"Translation failed for field {field!r}, "
                f"batch starting at offset {i}: {response!r}"
            ) from err
        # A short or long result list would misalign translations with
        # their source records, so refuse to continue.
        if len(translated) != len(batch):
            raise RuntimeError(
                f"Translation of field {field!r}, batch starting at offset "
                f"{i}, returned {len(translated)} results for "
                f"{len(batch)} texts"
            )
        results.extend(translated)

    # Write each translation back into the record it originated from.
    for idx, result in zip(indexes, results):
        translated_data[idx][field] = result.translatedText
# Save the result: serialise to pretty-printed JSON, keeping non-ASCII
# characters unescaped, and write it out as UTF-8.
serialized = json.dumps(translated_data, ensure_ascii=False, indent=2)
with open('tibetan_instructions.json', 'w', encoding='utf-8') as output_file:
    output_file.write(serialized)
Metadata
Metadata
Assignees
Labels
No labels