
python - Converting tf.CsvDataset rows to BERT input format with .map()


Note: this question started out differently, but I have removed all the previous (now unnecessary) information.

I have a CsvDataset consisting of labels (float) and texts (string). I want to transform each row so that I can feed it into a pre-trained BERT model. Unfortunately, I can't get the conversion to work through the .map() function:

import collections
import glob

import tensorflow as tf  # TF 1.x

# `tokenizer` (a BERT tokenizer) and `max_seq_length` are assumed to be
# defined elsewhere in the script.
files = glob.glob("example*.tsv")
d = tf.data.experimental.CsvDataset(files,
                                    [tf.float32, tf.string],
                                    select_cols=[3, 4],
                                    field_delim="\t",
                                    header=True)
parsed_dataset = d.map(
    lambda label, text: tf.py_func(_decode_record, [label, text],
                                   [tf.float32, tf.string]))

def _decode_record(label, text):
    """Decodes a row to a TensorFlow example."""
    label_list = [1, 2, 3, 4, 5]
    label_map = {}
    for i, lab in enumerate(label_list):  # distinct name: `label` is the function argument
        label_map[lab] = i
    tokens_a = tokenizer.tokenize(text)
    # Account for [CLS] and [SEP] with "- 2".
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[label]
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([label_id])
    features["is_real_example"] = create_int_feature([int(True)])
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    return tf_example

This breaks with:

tensorflow.python.framework.errors_impl.UnimplementedError: Unsupported object type Example [[{{node PyFunc}}]] [Op:IteratorGetNextSync]
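For context: tf.py_func can only return values that TensorFlow can convert to tensors (numpy arrays, numbers, strings/bytes), and its Tout argument must list the dtype of each returned value. Returning a protobuf object such as tf.train.Example is exactly what produces the "Unsupported object type Example" error. A minimal sketch of the difference (TF 1.x; bad_fn/good_fn are made-up names for illustration):

def bad_fn(x):
    # A protobuf object cannot be converted to a tensor ->
    # "Unsupported object type Example" when the dataset is consumed.
    return tf.train.Example()

def good_fn(x):
    # Bytes match the declared Tout=tf.string and work fine.
    return tf.train.Example().SerializeToString()

ds = tf.data.Dataset.from_tensor_slices([1.0, 2.0])
bad = ds.map(lambda x: tf.py_func(bad_fn, [x], tf.string))    # fails at iteration time
good = ds.map(lambda x: tf.py_func(good_fn, [x], tf.string))  # works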

Best Answer

I found the solution to my problem; the code below does the job. My mistake was misunderstanding the Tout argument of tf.py_func: it must declare the type(s) the wrapped Python function actually returns, so instead of returning a tf.train.Example object I serialize it to a string and declare Tout=tf.string.

def _convert(label, text):
    """Decodes a csv line to a TensorFlow Example, serialized as a string."""
    label_list = [1, 2, 3, 4, 5]
    label_map = {}
    for i, lab in enumerate(label_list):  # distinct name: `label` is the function argument
        label_map[lab] = i
    tokens_a = tokenizer.tokenize(text)
    # Account for [CLS] and [SEP] with "- 2".
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[label]
    print("types", type(label_id), type(input_ids))
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([label_id])
    features["is_real_example"] = create_int_feature([int(True)])
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    # We cannot return the Example itself: tf.py_func only accepts true tf datatypes.
    return tf_example.SerializeToString()
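# Note: create_int_feature is called above but never defined in the snippets.
# It is presumably the helper from the BERT reference implementation
# (run_classifier.py); a minimal equivalent:
def create_int_feature(values):
    """Wraps an iterable of Python ints in an int64 tf.train.Feature."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))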

name_to_features = {
    # The [128] shapes must match the max_seq_length used in _convert.
    'input_ids': tf.FixedLenFeature([128], tf.int64),
    'input_mask': tf.FixedLenFeature([128], tf.int64),
    'segment_ids': tf.FixedLenFeature([128], tf.int64),
    'label_ids': tf.FixedLenFeature([1], tf.int64),
    'is_real_example': tf.FixedLenFeature([1], tf.int64)
}

def _decode_record(record):
    """Decodes a record to a TensorFlow example."""
    example = tf.parse_single_example(record, name_to_features)
    # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
    # so cast all int64 to int32.
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.to_int32(t)
        example[name] = t
    print(example)
    return example

parsed_dataset = d.map(lambda label, text: tf.py_func(_convert, [label, text], tf.string))
parsed_dataset = parsed_dataset.map(_decode_record)
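The [Op:IteratorGetNextSync] in the traceback suggests eager execution is enabled; under that assumption, the finished pipeline can be sanity-checked by iterating it directly:

# Assumes tf.enable_eager_execution() was called at program start.
for example in parsed_dataset.take(1):
    print(example["input_ids"].shape)  # expected: (128,)
    print(example["label_ids"])        # the int32 label index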

Note that this solution uses tf.py_func and therefore cannot be used with accelerators such as GPUs or TPUs.
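If accelerator throughput matters, one common workaround (not from the original answer, just a sketch) is to run the Python-side conversion once, offline, and write the serialized examples to a TFRecord file; the training-time pipeline then contains only native TensorFlow ops. This assumes eager execution is enabled, d is the CsvDataset from the question, and "train.tfrecord" is a hypothetical output path:

# One-off preprocessing: serialize every row with _convert and persist it
# (hypothetical file name "train.tfrecord").
with tf.python_io.TFRecordWriter("train.tfrecord") as writer:
    for label, text in d:
        writer.write(_convert(label.numpy(), text.numpy()))

# Training-time pipeline: reads the records back without any py_func.
train_dataset = tf.data.TFRecordDataset("train.tfrecord").map(_decode_record)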

For python - Converting tf.CsvDataset rows to BERT input format with .map(), see the similar question on Stack Overflow: https://stackoverflow.com/questions/56227420/
