
python - Huggingface GPT2 and T5 model APIs for sentence classification?


I have successfully used the Huggingface Transformers BERT model for sentence classification with the BertForSequenceClassification class and API. I have used it for single-sentence sentiment analysis and two-sentence NLI.
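
For reference, here is a minimal sketch of that kind of BertForSequenceClassification usage; the bert-base-uncased checkpoint, the num_labels value, and the example sentences are illustrative, and exact output handling may vary slightly across transformers versions:

import torch
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.eval()

# Single-sentence input (e.g. sentiment analysis)
inputs = tokenizer("I really enjoyed this film.", return_tensors="pt")
# Sentence-pair input (e.g. NLI): pass both sentences and the tokenizer
# inserts the separator token between them:
# inputs = tokenizer("A man plays guitar.", "Someone is making music.", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs)[0]  # shape: (1, num_labels)
predicted_class = logits.argmax(dim=-1).item()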

I can see that other models have similar classes, e.g. XLNetForSequenceClassification and RobertaForSequenceClassification. This type of sentence classification usually involves placing a classifier layer on top of a dense vector that represents the whole sentence.
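
Conceptually, that head is just a linear layer applied to the pooled sentence vector, as in this minimal sketch (hidden_size, num_classes, and pooled_vector are illustrative placeholders):

import torch
import torch.nn as nn

hidden_size, num_classes = 768, 3  # illustrative sizes
classifier = nn.Linear(hidden_size, num_classes)

# pooled_vector stands in for the model's dense sentence representation,
# e.g. a [CLS]-token embedding of shape (batch_size, hidden_size)
pooled_vector = torch.randn(4, hidden_size)
logits = classifier(pooled_vector)  # (batch_size, num_classes)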

Now I am trying to use the GPT2 and T5 models. However, when I look at the classes and APIs available for each, there is no equivalent "ForSequenceClassification" class. For example, for GPT2 there are the GPT2Model, GPT2LMHeadModel, and GPT2DoubleHeadsModel classes. Perhaps I am not familiar enough with the research on GPT2 and T5, but I am certain that both models are capable of sentence classification.

So my questions are:

  1. Which Huggingface classes for GPT2 and T5 should I use for single-sentence classification?

  2. Which classes should I use for two-sentence (sentence-pair) classification, such as natural language inference?

Thanks for your help.

Best answer

You need to use the GPT2Model class to generate sentence embeddings for the text. Once you have the embeddings, feed them to a linear NN and a softmax function to obtain the logits. Below is a component for text classification with GPT2 that I am working on (still a work in progress, so I am open to suggestions); it follows the logic just described:

from torch_model_base import TorchModelBase
import torch
import torch.nn as nn
import torch.utils.data
from transformers import GPT2Tokenizer, GPT2Model
import random
from spacy.util import minibatch, compounding
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
import pandas as pd
from typing import List, Tuple


def mean_across_all_tokens(hidden_states):
    return torch.mean(hidden_states[-1], dim=1)


def sum_all_tokens(hidden_states):
    return torch.sum(hidden_states[-1], dim=1)


def concat_all_tokens(hidden_states):
    batch_size, max_tokens, emb_dim = hidden_states[-1].shape
    return torch.reshape(hidden_states[-1], (batch_size, max_tokens * emb_dim))



class GPT2SequenceClassifierModel(nn.Module):
    def __init__(
        self,
        hidden_size: int,
        num_classes: int,
        gpt_model_name: str,
        max_seq_length: int = 280,
        embedding_func=mean_across_all_tokens,
        combine_sentence_tokens=True
    ):
        super(GPT2SequenceClassifierModel, self).__init__()
        self.hidden_size = hidden_size
        self.fc1 = nn.Linear(hidden_size, num_classes)
        self.model = GPT2Model.from_pretrained(
            gpt_model_name,
            output_hidden_states=True
        )
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt_model_name)
        self.combine_sentence_tokens = combine_sentence_tokens
        self.embedding_func = embedding_func
        self.model.eval()
        self.max_length = max_seq_length

    def _tokenize(self, text_list: List[str]) -> torch.Tensor:
        # GPT-2 has no padding token by default, so add one (plus a [CLS]
        # token) and resize the embedding matrix accordingly.
        # Alternative: self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        self.model.resize_token_embeddings(len(self.tokenizer))
        input_ids = self.tokenizer.batch_encode_plus(
            text_list,
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True
        )["input_ids"]
        return torch.LongTensor(input_ids)

    def _tokenize_and_predict(self, text_list: List[str]) -> torch.Tensor:
        input_ids_tensor = self._tokenize(text_list)
        out = self.model(input_ids=input_ids_tensor)
        hidden_states = out[2]  # all layers' hidden states (output_hidden_states=True)
        if self.combine_sentence_tokens:
            # Pool the per-token embeddings into one sentence embedding.
            return self.embedding_func(hidden_states)
        else:
            return hidden_states[-1]

    def forward(self, text_list: List[str]):
        """
        :param text_list: list of raw input texts of length batch_size
        :return: raw class logits of shape (batch_size, num_classes)
        """
        if isinstance(text_list, pd.Series):
            text_list = text_list.tolist()
        with torch.no_grad():
            # Fine-tuning the GPT2 model is too expensive, so we don't do it;
            # only the classification head fc1 receives gradients.
            gpt_out = self._tokenize_and_predict(text_list)
        batch_size = len(text_list)
        assert gpt_out.shape == (batch_size, self.hidden_size)
        # Return raw logits; CrossEntropyLoss in fit() applies log-softmax
        # internally, and predict_proba() applies softmax for probabilities.
        logits = self.fc1(gpt_out)  # (batch_size, num_classes)
        return logits


class GPT2Classifier(TorchModelBase):
    """GPT2 + NN head for classification problems.
    The network will work for any kind of classification task.

    Parameters
    ----------
    embed_dim: dimension of the byte-pair/token embeddings generated by the
        model; check the model card (n_embd property), since each model works
        with exactly one embedding dimension.
    max_seq_length: maximum number of tokens in a sequence (n_positions param
        in the Hugging Face model config); shorter sequences are padded.
    """
    def __init__(self,
                 model_name="distilgpt2",
                 embed_dim=768,
                 max_seq_length=1024,
                 **kwargs):
        self.model_name = model_name
        self.embed_dim = embed_dim
        self.max_seq_length = max_seq_length
        self.model = None      # call fit() to set this
        self.tokenizer = None  # call fit() to set this
        self.classes = None    # call fit() to set this
        super(GPT2Classifier, self).__init__(**kwargs)
        self.params += ['model_name']

    def fit(self, X, y):
        """Standard `fit` method.

        Parameters
        ----------
        X : np.array
        y : array-like

        Returns
        -------
        self
        """
        self.classes = list(set(y))
        self.model = GPT2SequenceClassifierModel(
            hidden_size=self.embed_dim,
            num_classes=len(self.classes),
            gpt_model_name=self.model_name,
            max_seq_length=self.max_seq_length
        )
        self.opt = self.optimizer(self.model.parameters())
        self.model.train()
        loss = nn.CrossEntropyLoss()
        print("Training... max iters: ", self.max_iter)
        for epoch in range(self.max_iter):
            print("epoch no: ", epoch)
            zipped_data = list(zip(X, y))
            random.shuffle(zipped_data)
            batches = minibatch(zipped_data, size=self.batch_size)
            for batch in batches:
                X_batch, y_batch = zip(*batch)
                batch_preds = self.model(list(X_batch))
                # Map labels to their indices in self.classes so that
                # predict() can invert the mapping later.
                y_idx = [self.classes.index(label) for label in y_batch]
                err = loss(batch_preds, torch.LongTensor(y_idx))
                # Backprop:
                self.opt.zero_grad()
                err.backward()
                self.opt.step()
        return self

    def predict_proba(self, X):
        """Predicted probabilities for the examples in `X`.

        Parameters
        ----------
        X : np.array

        Returns
        -------
        np.array with shape (len(X), len(self.classes))
        """
        self.model.eval()
        with torch.no_grad():
            logits = self.model(X)
            # Convert raw logits into class probabilities.
            preds = torch.softmax(logits, dim=1).numpy()
        return preds

    def predict(self, X):
        """Predicted labels for the examples in `X`. These are converted
        from the integers that PyTorch needs back to their original
        values in `self.classes`.

        Parameters
        ----------
        X : np.array

        Returns
        -------
        list of length len(X)
        """
        probs = self.predict_proba(X)
        return [self.classes[i] for i in probs.argmax(axis=1)]
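
A minimal usage sketch of the classifier above, assuming the imported TorchModelBase (the CS224u torch_model_base helper) supplies defaults such as optimizer, batch_size, and max_iter; the toy texts and labels are purely illustrative:

# Toy binary sentiment data; fit() maps the labels to indices and
# predict() maps predictions back to the original label values.
X_train = ["great movie, loved it", "terrible plot and acting",
           "what a fantastic film", "utterly boring"]
y_train = ["pos", "neg", "pos", "neg"]

clf = GPT2Classifier(model_name="distilgpt2", embed_dim=768, max_seq_length=1024)
clf.fit(X_train, y_train)

print(clf.predict(["an absolute delight to watch"]))        # e.g. ['pos']
print(clf.predict_proba(["an absolute delight to watch"]))  # shape (1, 2)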

Regarding "python - Huggingface GPT2 and T5 model APIs for sentence classification?", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/62561471/
