大模型相关目录
大模型,包括部署微调prompt/Agent应用开发、知识库增强、数据库增强、知识图谱增强、自然语言处理、多模态等大模型应用开发内容
从0起步,扬帆起航。
- 基于Dify的智能分类方案:大模型结合KNN算法(附代码)
- OpenCompass:大模型测评工具
- 一文读懂多模态大模型基础架构
- 大模型管理平台:one-api使用指南
- 大模型RAG、ROG、RCG概念科普
- RAGOnMedicalKG:大模型结合知识图谱的RAG实现
- DSPy:变革式大模型应用开发
- 最简明的Few-shot Prompt指南
- Semantic Kernel:微软大模型开发框架——LangChain 替代
- 对话大模型Prompt是否需要礼貌点?
- swift与Internvl下的多模态大模型分布式微调指南(附代码和数据)
- 多模态大模型Internvl-1.5-26B微调后部署及测试实录(附代码)
- 多模态大模型Internvl-2-26B的OCR赋能方案(附代码)
整体介绍
代码实现
模型部署:
# 拉去swfit项目后
CUDA_VISIBLE_DEVICES=0,1,2,3 swift deploy --host 0.0.0.0 --port 23333 --model_id_or_path /data/hfd/InternVL2-26B --model_type internvl2-26b --device_max_memory 15GB 15GB 15GB 15GB --dtype bf16 --max_length 4096
functions.py:
import os
import fitz # PyMuPDF
import json
import shutil
import re
from openai import OpenAI
import base64
from config import *
Conf = Config()
configs = Conf.get_config()
def error_log(content):
"""
Write content to a text file.
:param file_path: Absolute path of the text file to be written.
:param content: Content to be written to the file.
"""
with open('error_log.txt', 'a') as file:
file.write(content)
file.write('\n')
def create_directory(path):
"""Create a new directory at the given path."""
try:
os.makedirs(path, exist_ok=True)
return f"Directory created at {path}"
except Exception as e:
return f"An error occurred: {e}"
def list_files(directory):
"""List all files in the given directory."""
return [file for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
def list_files_with_absolute_paths(directory):
"""List all files in the given directory with their absolute paths."""
return [os.path.abspath(os.path.join(directory, file)) for file in os.listdir(directory) if os.path.isfile(os.path.join(directory, file))]
def extract_file_name_from_path(path):
"""
Extract the file name from the given absolute path.
:param path: Absolute path of the file.
:return: File name.
"""
return os.path.basename(path)
def pdf_to_jpg_pymupdf(pdf_path, output_folder):
"""
Convert a PDF file to a series of JPG images using PyMuPDF.
:param pdf_path: Absolute path to the PDF file.
:param output_folder: Folder where the JPG images will be saved.
"""
file_name = extract_file_name_from_path(pdf_path).split('.')[0]
# Open the PDF file
document = fitz.open(pdf_path)
# Ensure output folder exists
if not os.path.exists(output_folder):
os.makedirs(output_folder)
if len(document) != 1:
current_time_str = configs.now_time_str
print(pdf_path,'pdf format error!')
error_log('pdf format error!'+' '+pdf_path+' '+current_time_str)
else:
# Iterate through each page in the PDF
for page_number in range(len(document)):
# Get the page
page = document[page_number]
# Render the page to an image
pix = page.get_pixmap()
# Define the output image file path
output_path = os.path.join(output_folder, f'{file_name}.jpg')
# Save the image
pix.save(output_path)
# Close the document
document.close()
return f"PDF converted to JPG images and saved in {output_folder}"
def copy_non_pdf_files(src_directory, dest_directory):
"""
Copy non-PDF files from source directory to destination directory.
:param src_directory: Absolute path of the source directory.
:param dest_directory: Absolute path of the destination directory.
"""
# Ensure destination directory exists
if not os.path.exists(dest_directory):
os.makedirs(dest_directory)
# List all files in the source directory
for file_name in os.listdir(src_directory):
file_path = os.path.join(src_directory, file_name)
# print(file_path)
# Check if it's a file and not a directory
if "+" in file_path:
current_time_str = configs.now_time_str
print(file_path,'file name error!')
error_log('file name error!'+' '+file_path+' '+current_time_str)
if os.path.isfile(file_path):
# Check if the file is a PDF
if file_path.lower().endswith('.pdf'):
# If it's a PDF, pass
pdf_to_jpg_pymupdf(file_path,dest_directory)
else:
# If it's not a PDF, copy the file to the destination directory
shutil.copy(file_path, os.path.join(dest_directory, file_name))
#图片转base64函数
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
#原图片转base64
def get_response(input_file_name,input_image_path):
base64_image = encode_image(input_image_path)
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://172.20.32.127:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
model=model_name,
messages=[
{
"role": "system",
"content": "## 职位:你是一个OCR文字识别专家,具备图像文字提取、信息规整能力,会输出json格式化的识别结果."
},
{
"role": "user",
"content":[
{
"type": "text",
"text": f'学生名称可参考{input_file_name}'+'''
提取后,返回的json格式参考如下:
{
"学生姓名":"张三",
"性别":"男",
"出生日期":"2000年1月1日",
"学历":"本科",
"毕业院校":"中国大学",
"专业":"电子信息",
"毕业时间":"2024年6月30日",
"证书编号":"162391201905007614"
}
注意,出生日期和毕业时间要为阿拉伯数字。
注意,提取不到时将提取内容标注“null”。
注意,我不需要任何代码,请输出json格式结果,json严格按照上述格式。
'''
},
{
"type": "image_url",
"image_url":{
"url":f"data:image/jpeg;base64,{base64_image}"
}
},
]
}
],
temperature=0,
top_p=1)
return response.choices[0].message.content
def post_processing(input_data):
# 使用正则表达式匹配{}之间的内容
pattern = r'{(.*?)}'
match = re.search(pattern, input_data, re.DOTALL)
# 匹配后做数据后处理
if match:
match = '{' + match.group(1) + '}'
json_str = match.replace(': {\n ', ':null,').replace('\n', '').replace('},', ',').replace(':',':')
json_str = json_str.strip('"\"')
return json_str
else:
return str({'error':'llm out error!'})
def save_json_to_file(json_data, file_path):
"""
将JSON数据保存到文件。
参数:
json_data: 要保存的JSON数据。
file_path: 保存JSON数据的文件路径。
"""
try:
with open(file_path, 'w', encoding='utf-8') as file:
json.dump(json_data, file, ensure_ascii=False, indent=4)
return "JSON数据已成功保存到文件。"
except Exception as e:
current_time_str = configs.now_time_str
error_log('json save error!'+' '+file_path+' '+current_time_str)
print(f"保存JSON数据时发生错误: {e}")
config.py:
import os
import argparse
import datetime
class Config:
def __init__(self):
self.parser = argparse.ArgumentParser(description='Parser For Arguments')
current_path = os.getcwd()
# 路径参数设定
self.parser.add_argument('-first_path',type=str,default=r"C:\Users\12258\Desktop\3-1yg人员资料识别\资质证书",
help='初始文件夹存储路径')
self.parser.add_argument('-second_path', type=str, default=r'c:\Users\12258\Desktop\3-1yg人员资料识别\资质证书\mid',
help='批处理后图像数据文件夹的存储路径')
self.parser.add_argument('-third_path', type=str, default=os.path.join(current_path, "json1"),
help='批处理后json数据文件夹存储路径')
self.parser.add_argument('-excel_path', type=str,default=os.path.join(current_path, "result.xlsx"),
help='step3输出的excel文件存储路径')
# 时间参数设定
self.parser.add_argument('-now_time_str', type=str, default=datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") ,
help='存储当前时间的时间的字符串格式,例如:"2023-07-06 14:30:00"')
# 格式转换,获取实际的参数容器
self.parser = self.parser.parse_args()
def get_config(self):
return self.parser
数据预处理:
import functions as fc
from config import *
Conf = Config()
configs = Conf.get_config()
first_path = configs.first_path
fc.create_directory(first_path+'\\mid')
fc.copy_non_pdf_files(first_path,first_path+'\\mid')
预处理结果:
处理前数据格式众多:
处理后pdf转化为jpg图像,并存储与新的文件夹中。
OCR识别:
import functions as fc
import json
from config import *
Conf = Config()
configs = Conf.get_config()
first_path = configs.first_path
second_path = configs.second_path
file_paths = fc.list_files_with_absolute_paths(second_path)
third_path = configs.third_path
fc.create_directory(third_path)
for file_path in file_paths:
try:
vllm_response = fc.get_response(file_path.split('\\')[-1],file_path)
data_dict = json.loads(fc.post_processing(vllm_response).replace('\\n','').replace('\\',''))
fc.save_json_to_file(data_dict,third_path+'\\'+file_path.split('\\')[-1].split('.')[0]+'.json')
except Exception as e:
current_time_str = configs.now_time_str
fc.error_log('json save error!'+' '+file_path+' '+current_time_str)
print(f"保存JSON数据时发生错误: {e}")
识别后将每一图片信息提取为指定格式的json
示例(为保证隐私,信息已经改写):
{
"学生姓名": "太钟",
"性别": "男",
"出生日期": "2022年01月14日",
"学历": "本科",
"毕业院校": "广州小学",
"专业": "地理系统",
"毕业时间": "2024年01月10日",
"证书编号": "11079999 9999 9999 99"
}
合并json信息,其中考虑每个人最高学历进行处理:
import json
import functions as fc
from config import *
Conf = Config()
configs = Conf.get_config()
def get_elements_with_string(lst, target_string):
return [element for element in lst if target_string in element]
def read_json_file(file_path):
try:
with open(file_path, 'r', encoding='utf-8') as file:
data = json.load(file)
return data
except FileNotFoundError:
return "文件未找到。请检查文件路径是否正确。"
except json.JSONDecodeError:
return "文件内容不是有效的JSON。"
except Exception as e:
return f"读取文件时发生错误: {e}"
### 需修改
path = configs.third_path
file_absolute_paths = fc.list_files_with_absolute_paths(path)
file_paths = fc.list_files(path)
n = len(file_paths)
name_list = []
for index in range(n):
try:
raw_name = file_paths[index].split('-')[0]
if '-' in raw_name:
current_time_str = configs.now_time_str
print(file_paths[index],'file name error!')
fc.error_log('file name error!'+' '+file_paths[index]+' '+current_time_str)
name_list.append(raw_name)
name_set = [i for i in set(name_list)]
except:
current_time_str = configs.now_time_str
print(file_paths[index],'file name error!')
fc.error_log('file name error!'+' '+file_paths[index]+' '+current_time_str)
finall_json_ls = []
for name in name_set:
#对于每个学生,定位其名下所有文件
mid_file_paths = get_elements_with_string(file_absolute_paths,name)
zhuanke_files = get_elements_with_string(mid_file_paths,'专')
len_zhuanke_files = len(zhuanke_files)
benke_files = get_elements_with_string(mid_file_paths,'本科')
len_benke_files = len(benke_files)
shuoshi_files = get_elements_with_string(mid_file_paths,'硕士')
len_shuoshi_files = len(shuoshi_files)
yanjiusheng_files = get_elements_with_string(mid_file_paths,'研究生')
len_yanjiusheng_files = len(yanjiusheng_files)
boshi_files = get_elements_with_string(mid_file_paths,'博士')
len_boshi_files = len(boshi_files)
error_flag = len_zhuanke_files+len_benke_files+len_shuoshi_files+len_yanjiusheng_files+len_boshi_files
# print(name,error_flag)
if error_flag == 0:
current_time_str = configs.now_time_str
print(name,'file name error!')
fc.error_log('file name error!'+' '+name+' '+current_time_str)
else:
if len_boshi_files:
mid_file = get_elements_with_string(boshi_files,'验证')
if len(mid_file) == 0:
pass
else:
finall_json_ls.append(mid_file[0])
continue
if yanjiusheng_files+shuoshi_files:
mid_file = get_elements_with_string(yanjiusheng_files+shuoshi_files,'验证')
if len(mid_file) == 0:
finall_json_ls.append((yanjiusheng_files+shuoshi_files)[0])
continue
else:
finall_json_ls.append(mid_file[0])
continue
if len_benke_files:
mid_file = get_elements_with_string(benke_files,'验证')
if len(mid_file) == 0:
finall_json_ls.append((benke_files)[0])
continue
else:
finall_json_ls.append(mid_file[0])
continue
if len_zhuanke_files:
mid_file = get_elements_with_string(zhuanke_files,'验证')
if len(mid_file) == 0:
finall_json_ls.append((zhuanke_files)[0])
continue
else:
finall_json_ls.append(mid_file[0])
continue
name_ls = []
sex_ls = []
birthday_ls = []
qualification_ls = []
time_ls = []
school_ls = []
major_ls = []
code_ls = []
for finall_json in finall_json_ls:
extract_flag = 0
single_json_data = read_json_file(finall_json)
try:
name_ls.append(single_json_data['学生姓名'])
except:
extract_flag = 1
name_ls.append('null')
try:
sex_ls.append(single_json_data['性别'])
except:
extract_flag = 1
sex_ls.append('null')
try:
birthday_ls.append(single_json_data['出生日期'])
except:
extract_flag = 1
birthday_ls.append('null')
try:
qualification_ls.append(single_json_data['学历'])
except:
extract_flag = 1
qualification_ls.append('null')
try:
school_ls.append(single_json_data['毕业院校'])
except:
extract_flag = 1
school_ls.append('null')
try:
major_ls.append(single_json_data['专业'])
except:
extract_flag = 1
major_ls.append('null')
try:
time_ls.append(single_json_data['毕业时间'])
except:
extract_flag = 1
time_ls.append('null')
try:
code_ls.append(single_json_data['证书编号'].replace(' ',''))
except:
extract_flag = 1
code_ls.append('null')
if extract_flag:
current_time_str = configs.now_time_str
print(finall_json,'file extract format error!')
fc.error_log('file extract format error!'+' '+finall_json+' '+current_time_str)
import pandas as pd
result_dict = {
'学生姓名':name_ls,
'性别':sex_ls,
'出生年月':birthday_ls,
'学历':qualification_ls,
'毕业院校':school_ls,
'专业':major_ls,
'毕业时间':time_ls,
'证书编号':code_ls
}
pd.DataFrame(result_dict).to_excel('result.xlsx',index=False)
总结
开发测试数据132项。
对于文件命名、文件内容、抽取规范都做了异常error记录,在人工校验中重点关注;本次测试error项3个。
无error的文件因为本地模型性能问题现在生成内容也存在一定内容问题,其中明显未识别、识别误差的字段内容7项。
大模型是概率生成原理的人工智能手段,应用上人工校验、订正必不可少。后续会在这个应用上进行微调、细节设计等研究,增强精度和易用性。
error_log:
C:\Users\12258\Desktop\毕业信息批量处理\132人毕业证(部分无学信网截图)\yb-本科毕业证、学士学位证、硕士毕业证、硕士学位证.pdf pdf foamat error!
C:\Users\12258\Desktop\毕业信息批量处理\132人毕业证(部分无学信网截图)\zxt-学历证明验证-专科+本科.pdf pdf foamat error!
file name error! klt 2024-07-15 16:26:24