政务问答系统构建语义模型和构建索引库和召回评估

2024-06-13 12:22:03
开发
30

import os
import random
from functools import partial
import numpy as np
import paddle
from scipy import stats
from paddlenlp.data import Pad, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import AutoModel, AutoTokenizer
import paddle.nn as nn
import paddle.nn.functional as F
import utils

# ---- Experiment configuration -------------------------------------------
# Pretrained encoder and file locations.
model_name_or_path = "rocketqa-zh-dureader-query-encoder"
save_dir = "./checkpoints/zwqa/"
train_set_file = "./datasets/data/train.csv"

# Runtime settings.
device = "gpu"
seed = 1000

# Batching / schedule.
batch_size = 16
epochs = 5

# Sequence length limit passed to the tokenizer, and the embedding size
# handed to the model.
max_seq_length = 64
output_emb_size = 256

# Dropout probability used when loading the pretrained encoder.
dropout = 0.2

# Remaining hyper-parameters; presumably consumed by the SimCSE model and
# the training loop (their use is not visible in this part of the file).
dup_rate = 0.3
margin = 0.1
scale = 10.0

# Select the Paddle execution device named by the `device` setting.
paddle.set_device(device)

def set_seed(seed):
    """Seed Python's `random`, NumPy and Paddle with the same value.

    Args:
        seed: Value forwarded unchanged to each of the three seeding calls.
    """
    for seeder in (random.seed, np.random.seed, paddle.seed):
        seeder(seed)


# Apply the configured seed once, before any data or model is created.
set_seed(seed)

def read_simcse_text(data_path):
    """Yield unsupervised SimCSE training pairs from a UTF-8 text file.

    Each non-empty line becomes one example in which the same sentence is
    used for both sides of the pair.

    Args:
        data_path: Path of the text file to read, one sentence per line.

    Yields:
        dict: ``{"text_a": sentence, "text_b": sentence}`` with trailing
        whitespace (including the newline) removed from ``sentence``.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            data = line.rstrip()
            # Skip blank / whitespace-only lines (e.g. a trailing empty line):
            # they would otherwise become empty training examples.
            if not data:
                continue
            yield {"text_a": data, "text_b": data}

# Build the training dataset from the text file via the custom reader;
# lazy=False requests a non-lazy dataset.
train_ds = load_dataset(
    read_simcse_text,
    data_path=train_set_file,
    lazy=False,
)

# Notebook-style peek at the first five examples (the result is discarded
# when this runs as a plain script).
train_ds[:5]

# Load the pretrained encoder, applying the configured dropout probability
# to both the hidden layers and the attention probabilities.
pretrained_model = AutoModel.from_pretrained(
    model_name_or_path,
    hidden_dropout_prob=dropout,
    attention_probs_dropout_prob=dropout,
)

# Tokenizer loaded from the same model name as the encoder.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

def convert_example(example, tokenizer, max_seq_length=512, do_evalute=False):
    """Flatten one example dict into a flat list of model inputs.

    Fields are visited in the dict's own order. A field whose key contains
    ``"label"`` is appended unchanged; every other field is treated as text,
    tokenized, and contributes its ``input_ids`` followed by its
    ``token_type_ids``.

    Args:
        example: Mapping of field name to text (or to a label value).
        tokenizer: Callable invoked as
            ``tokenizer(text=..., max_length=..., truncation=True)`` that
            returns a mapping with ``"input_ids"`` and ``"token_type_ids"``.
        max_seq_length: Maximum length forwarded to the tokenizer.
        do_evalute: Unused. Kept, with its original spelling, so existing
            callers that pass it keep working.

    Returns:
        list: ``[input_ids, token_type_ids, ...]`` for the text fields, with
        label values interleaved at the positions of their fields.
    """
    result = []
    for key, value in example.items():
        if "label" in key:
            # Use the value of the field that actually matched. Looking up
            # the fixed key "label" raised KeyError for keys such as "labels".
            result.append(value)
        else:
            encoded_inputs = tokenizer(text=value, max_length=max_seq_length, truncation=True)
            result.append(encoded_inputs["input_ids"])
            result.append(encoded_inputs["token_type_ids"])
    return result

# Pre-bind the tokenizer and the length limit so the converter can be
# applied to a dataset with the example as its only argument.
trans_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=max_seq_length)

# Notebook-style check on the first example (the result is discarded in a
# script). Original note: the tokenizer adds start/end tokens around the text.
trans_func(train_ds[0])

# One Pad per field produced by the example converter, in the same order.
_pad_fields = Tuple(
    Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # query input ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # query segment ids
    Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # title input ids
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # title segment ids
)


def batchify_fn(samples, fn=_pad_fields):
    """Collate a list of converted samples by applying `fn` to it.

    Args:
        samples: The samples of one batch.
        fn: Collator to apply; defaults to the padding collator built once
            at definition time.

    Returns:
        Whatever `fn(samples)` returns.
    """
    return fn(samples)

def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
    """Wrap a dataset in a `paddle.io.DataLoader`.

    Args:
        dataset: Dataset to load from. Must provide ``map`` when `trans_fn`
            is given.
        mode: ``"train"`` selects a shuffling `DistributedBatchSampler`;
            any other value selects a non-shuffling `BatchSampler`.
        batch_size: Number of samples per batch.
        batchify_fn: Collate function passed to the loader as `collate_fn`.
        trans_fn: Optional per-example transform applied via ``dataset.map``.

    Returns:
        paddle.io.DataLoader: Loader built with ``return_list=True``.
    """
    if trans_fn:
        dataset = dataset.map(trans_fn)

    # Only the training loader shuffles.
    shuffle = mode == "train"
    if mode == "train":
        batch_sampler = paddle.io.DistributedBatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)
    else:
        batch_sampler = paddle.io.BatchSampler(dataset, batch_size=batch_size, shuffle=shuffle)

    # This return must live inside the function body; at column 0 it is a
    # SyntaxError ('return' outside function).
    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True,
    )

# Build the training DataLoader: examples are converted with `trans_func`
# and batches are assembled with `batchify_fn`.
train_data_loader = create_dataloader(
    train_ds,
    mode="train",
    batch_size=batch_size,
    batchify_fn=batchify_fn,
    trans_fn=trans_func,
)

# Print only the first batch as a quick sanity check of the data pipeline.
# The loop body has to be indented; without it the file fails to parse.
for i in train_data_loader:
    print(i)
    break

# Rebind the module-level dropout setting to zero.
dropout = 0.0

# Notebook-style demonstration whose result is discarded: 0.0 is falsy but
# still "is not None", so this evaluates to 0.0 rather than the 0.1 fallback.
dropout if dropout is not None else 0.1

class SimCSE(nn.Layer):
    def __init__(self, pretrained_model, dropout=None, margin=0.0, scale=20, output_emb_size=None):
        """Wrap a pretrained encoder with dropout and an optional projection.

        Args:
            pretrained_model: Encoder stored as ``self.ptm``.
            dropout: Dropout probability; ``None`` falls back to 0.1.
            margin: Stored as ``self.margin``.
            scale: Stored as ``self.scale``.
            output_emb_size: Output width of the linear projection. ``None``
                or a value <= 0 means no projection layer is created.
        """
        super().__init__()
        self.ptm = pretrained_model  # the pretrained encoder
        # Test `is not None` rather than truthiness: dropout=0.0 is falsy but
        # is a valid explicit setting that must not be replaced by 0.1.
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        # Normalise None to 0 so the default argument no longer raises
        # TypeError in the `> 0` comparison below.
        self.output_emb_size = 0 if output_emb_size is None else output_emb_size
        if self.output_emb_size > 0:
            # Linear projection of the embedding down to output_emb_size.
            # NOTE(review): the input width is hard-coded to 768 - assumes the
            # encoder's hidden size is 768; confirm for other models.
            weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
            self.emb_reduce_linear = paddle.nn.Linear(768, self.output_emb_size, weight_attr=weight_attr)
        self.margin = margin
        self.scale = scale

    @paddle.jit.to_static(
        input_spec=[
            paddle.static.InputSpec(shape=[None, None], dtype="int64"),
            paddle.static.InputSpec(shape=[None, None], dtype="int64"),
        ]
    )
    def get_pooled_embedding(
        self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None, with_pooler=True
    ):
        # Note: cls_embedding is poolerd embedding with act tanh
        sequence_output, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
        if with_pooler is False:#如果ptm不返回池化层,把［CLS］输出作为池化输出

原文地址:https://blog.csdn.net/LIjin_1006/article/details/139634979 本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：https://www.suanlizi.com/kf/1801107742400843776.html 如若内容造成侵权/违法违规/事实不符，请联系《酸梨子》网邮箱：1419361763@qq.com进行投诉反馈，一经查实，立即删除！

阅读全部

政务问答系统构建语义模型和构建索引库和召回评估

相关推荐

最近更新

热门阅读