JCC-CSScheduler/executor/internal/task/scripts/data_preprocess.py

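"""Data pre-processing script for dataset generation.

Pipeline:
  1. Walk the input directory and convert PDF files to plain text with PyMuPDF.
  2. Split long texts into semantically coherent chunks by comparing embeddings
     (bge-large-zh-v1.5) of adjacent fragments against a cosine-similarity threshold.
  3. Save the chunks under the saveChunk/ directory and ask a chat model (glm4-chat01)
     to generate instruction/input/output entries for each chunk.
  4. Collect the entries into a JSON array, one output file per input document.
"""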

import json
import re
from scipy.spatial.distance import cosine
import fitz
import shutil
import os
import logging
import backoff
from openai import OpenAI
import numpy as np

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

client = OpenAI(base_url="@base_url@", api_key="@api_key@")

# Input and output paths
input_file_path = "@input_file_path@"
output_dir = "saveChunk/"
output_file_path = "@output_file_path@"
# Maximum length of each text chunk (measured in characters, see split_text_by_semantic)
chunk_max_length = @chunk_max_length@
# Minimum text length before semantic chunking is applied
start_chunk_threshold = @start_chunk_threshold@
# Semantic similarity threshold for merging adjacent fragments
similarity_threshold = @similarity_threshold@
# Number of dataset entries to generate per chunk file
entries_per_file = @entries_per_file@
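
# NOTE: the @name@ tokens above (base_url, api_key, paths, thresholds) are
# placeholders; they are presumably substituted with concrete values by the
# scheduler before this script runs.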


def read_file(file_path: str) -> str:
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


@backoff.on_exception(backoff.expo, Exception, max_tries=3)
def generate_single_entry(text: str):
    prompt = f"""
Based on the following text, generate 1 high-quality entry for an instruction dataset. The entry must relate directly to the given text and pose a relevant question or task.
Make sure to generate diverse instruction types, for example:
- Analysis: "Analyze..."
- Comparison: "Compare..."
- Explanation: "Explain..."
- Evaluation: "Evaluate..."
- Question answering: "Why..."
Text content:
{text}
Generate the entry in the following format and make sure every field has appropriate content:
{{
"instruction": "Using one of the diverse instruction types above, pose a specific question or task related to the text",
"input": "Provide additional context here if needed; otherwise repeat the instruction above",
"output": "A detailed answer to the instruction, or the result of completing the task"
}}
Make sure everything you generate relates directly to the given text, that the output is complete and valid JSON, and that the content is high quality, accurate and detailed. When there are multiple JSON objects, separate them with a blank line.
"""
    try:
        resp = client.chat.completions.create(
            model="glm4-chat01",
            messages=[
                {"role": "system", "content": "You are an expert at generating instructions"},
                {"role": "user", "content": prompt},
            ],
        )
        response = resp.choices[0].message.content
        result = ""
        # The model may return several JSON objects separated by blank lines;
        # split on the closing brace and validate each fragment individually.
        json_str_list = response.split("}")
        for item in json_str_list:
            if not item.strip():
                continue
            json_str = item + "}"
            try:
                data = json.loads(json_str)
                # Check that all required keys are present
                required_keys = {"instruction", "input", "output"}
                if not required_keys.issubset(data.keys()):
                    logger.error(f"Generated entry is missing required fields: {required_keys - data.keys()}")
                    continue
                result = result + item + "},"
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse JSON string: {str(e)}, output: {json_str}")
        logger.info(f"output: {result}")
        return result
    except Exception as e:
        logger.error(f"Error while generating entry: {str(e)}")
        raise


def generate_dataset(folder_path: str, output_file_path, entries_per_file: int = 2):
    result = "[\n"
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            file_path = os.path.join(folder_path, filename)
            logger.info(f"Processing file: {filename}")
            text = read_file(file_path)
            for j in range(entries_per_file):
                logger.info(f"  Generating entry {j + 1}/{entries_per_file}")
                entry = generate_single_entry(text)
                if not entry:
                    logger.error("Failed to generate an entry, skipping it")
                    continue
                result = result + entry
    # Drop the trailing comma and close the JSON array
    result = result[:-1] + "\n]"
    # Write the result to the output file
    with open(output_file_path, "w", encoding="utf-8") as f:
        f.write(result)
    return result


def get_sentence_embedding(sentence, client):
    """
    Get the embedding vector of a sentence.

    Args:
        sentence (str): the input sentence
        client: an OpenAI client instance

    Returns:
        numpy.ndarray: the embedding vector of the sentence
    """
    # Call the embeddings API (bge-large-zh-v1.5, served via Xinference)
    try:
        response = client.embeddings.create(model="bge-large-zh-v1.5", input=sentence)
    except Exception as e:
        logger.error(e)
        raise
    return np.array(response.data[0].embedding)


def split_text_by_semantic(text, chunk_max_length, similarity_threshold=0.5):
    """
    Split text into chunks based on semantic similarity.

    Args:
        text (str): the input long text
        chunk_max_length (int): maximum length of each chunk, in characters
        similarity_threshold (float): semantic similarity threshold, default 0.5

    Returns:
        list: the list of text chunks
    """
    chunks = []
    # Split the text into fragments on blank lines
    # (an earlier version split on Chinese sentence punctuation instead:
    # sentences = re.split(r"(。|！|？|；)", text))
    sentences = re.split(r"\n\s*\n", text)
    # Drop empty fragments
    sentences = [s for s in sentences if s.strip()]
    if not sentences:
        return [text]
    current_chunk = sentences[0]
    # Embedding of the current chunk
    current_embedding = get_sentence_embedding(current_chunk, client)
    for sentence in sentences[1:]:
        # Skip empty fragments
        if not sentence.strip():
            continue
        # Remove blank lines inside the fragment
        sentence = re.sub(r"\n\s*\n", "", sentence)
        # Embedding of the current fragment
        sentence_embedding = get_sentence_embedding(sentence, client)
        # Cosine similarity between the current chunk and the fragment
        similarity = 1 - cosine(current_embedding, sentence_embedding)
        logger.info(f"similarity: {similarity}, and sentence: {sentence}")
        # Merge if the similarity is above the threshold and the merged chunk stays within the limit
        if similarity > similarity_threshold and len(current_chunk + sentence) <= chunk_max_length:
            current_chunk += sentence
            # Update the chunk embedding as the running average
            current_embedding = (current_embedding + sentence_embedding) / 2
        else:
            # Otherwise save the current chunk and start a new one
            chunks.append(current_chunk)
            current_chunk = sentence
            current_embedding = sentence_embedding
    # Append the last chunk
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


def read_text_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


def save_chunks_to_files(chunks, output_dir):
    """
    Save the text chunks to individual files.

    Args:
        chunks (list): the list of text chunks
        output_dir (str): the output directory path
    """
    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)
    # Save each chunk as a separate file
    for i, chunk in enumerate(chunks):
        chunk_file_path = os.path.join(output_dir, f"chunk_{i + 1}.txt")
        with open(chunk_file_path, "w", encoding="utf-8") as file:
            file.write(chunk)
        logger.info(f"Saved chunk {i + 1} to {chunk_file_path}")


def pdf_to_text(pdf_path, txt_path):
    # Extract the text of every page with PyMuPDF and write it to a .txt file
    pdf_document = fitz.open(pdf_path)
    with open(txt_path, "w", encoding="utf-8") as text_file:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            text = page.get_text()
            text_file.write(text)
    pdf_document.close()


def clean_dir(directory):
    try:
        shutil.rmtree(directory)
        logger.info(f"Deleted directory and its contents: {directory}")
    except FileNotFoundError:
        logger.info(f"Directory {directory} does not exist")
    except PermissionError:
        logger.info(f"No permission to delete directory {directory}")
    except Exception as e:
        logger.info(f"Failed to delete directory: {e}")
    try:
        os.makedirs(directory, exist_ok=True)
        logger.info(f"Created directory: {directory}")
    except OSError as e:
        logger.info(f"Failed to create directory: {e}")


def get_file_type(file_path):
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()
    if ext == ".txt":
        return "txt"
    elif ext == ".pdf":
        return "pdf"
    else:
        return "unknown"


if __name__ == "__main__":
    for root, dirs, files in os.walk(input_file_path):
        for file in files:
            # Start from a clean chunk directory for every input file
            clean_dir(output_dir)
            input_file = os.path.join(root, file)
            file_type = get_file_type(input_file)
            if file_type == "pdf":
                # Convert PDFs to plain text first
                pdf_to_text(input_file, input_file + ".txt")
                input_file = input_file + ".txt"
            elif file_type == "unknown":
                raise ValueError("Unsupported input file type, please provide a text or PDF file")
            # Read the full text of the input file
            long_text = read_text_file(input_file)
            text_chunks = [long_text]
            # Only split when the text exceeds the chunking threshold
            if len(long_text) > start_chunk_threshold:
                text_chunks = split_text_by_semantic(long_text, chunk_max_length, similarity_threshold)
            save_chunks_to_files(text_chunks, output_dir)
            logger.info("Generating the dataset")
            os.makedirs(output_file_path, exist_ok=True)
            output_file = os.path.join(output_file_path, file + ".json")
            generate_dataset(output_dir, output_file, entries_per_file)
            logger.info(f"Dataset generated and saved to {output_file}")