
python - Error: IndexError: index 6319 is out of bounds for axis 0 with size 0


The code below is taken from https://github.com/arunarn2/HierarchicalAttentionNetworks/blob/master/HierarchicalAttn.py with some minor adjustments. Although I understand what the error means, I can't figure out how it creeps into the code below or how to correct it. I've been stuck on this for quite a while and would really appreciate some help. Thanks!
(Here is the full code.)

# imports (omitted in the question) reconstructed from the symbols used below
import io
import os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk import tokenize

from keras import backend as K
from keras import initializers
from keras.layers import Layer, Input, Embedding, GRU, Bidirectional, TimeDistributed, Dense
from keras.models import Model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.utils import np_utils

maxlen = 100
max_sentences = 15
max_words = 20000
embedding_dim = 100
validation_split = 0.2
reviews = []
labels = []
texts = []
glove_dir = "./glove.6B"
embeddings_index = {}


# class defining the custom attention layer
class HierarchicalAttentionNetwork(Layer):
    def __init__(self, attention_dim):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = attention_dim
        super(HierarchicalAttentionNetwork, self).__init__()

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(HierarchicalAttentionNetwork, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        # size of x: [batch_size, sel_len, attention_dim]
        # size of u: [batch_size, attention_dim]
        # uit = tanh(xW + b)
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))

        ait = K.exp(K.squeeze(K.dot(uit, self.u), -1))

        if mask is not None:
            # cast the mask to floatX to avoid float64 upcasting
            ait *= K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * K.expand_dims(ait)
        output = K.sum(weighted_input, axis=1)

        return output

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]


def remove_html(str_a):
    p = re.compile(r'<.*?>')
    return p.sub('', str_a)


# strip all non-ASCII (outside \x00-\x7f) characters
def replace_non_ascii(str_a):
    return re.sub(r'[^\x00-\x7f]', r'', str_a)


# tokenization/string cleaning for the dataset
def clean_str(string):
    string = string.decode("utf-8")
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()



input_data = pd.read_csv(io.BytesIO(uploaded['labeledTrainData.tsv']), sep='\t')

for idx in range(input_data.review.shape[0]):
    text = BeautifulSoup(input_data.review[idx], features="html5lib")
    text = clean_str(text.get_text().encode('ascii', 'ignore'))
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)
    np.append(labels, input_data.sentiment[idx])

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

data = np.zeros((len(texts), max_sentences, maxlen), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j < max_sentences:
            wordTokens = text_to_word_sequence(sent)
            k = 0
            for _, word in enumerate(wordTokens):
                if k < maxlen and tokenizer.word_index[word] < max_words:
                    data[i, j, k] = tokenizer.word_index[word]
                    k = k + 1

word_index = tokenizer.word_index
print('Total %s unique tokens.' % len(word_index))

if np.any(np.array(labels)):
    labels = np_utils.to_categorical(np.array(labels))
    # labels = to_categorical(np.asarray(labels))

print('Shape of reviews (data) tensor:', data.shape)
print('Shape of sentiment (label) tensor:', np.shape(labels))

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = np.asarray(labels)[indices.astype(int)]
#labels = labels[indices]
nb_validation_samples = int(validation_split * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of positive and negative reviews in training and validation set')
print (y_train.sum(axis=0))
print (y_val.sum(axis=0))


f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

# building Hierachical Attention network
embedding_matrix = np.random.random((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in the embedding index keep their random initialization
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1, embedding_dim, weights=[embedding_matrix],
                            input_length=maxlen, trainable=True, mask_zero=True)

sentence_input = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
lstm_word = Bidirectional(GRU(100, return_sequences=True))(embedded_sequences)
attn_word = HierarchicalAttentionNetwork(100)(lstm_word)
sentenceEncoder = Model(sentence_input, attn_word)

review_input = Input(shape=(max_sentences, maxlen), dtype='int32')
review_encoder = TimeDistributed(sentenceEncoder)(review_input)
lstm_sentence = Bidirectional(GRU(100, return_sequences=True))(review_encoder)
attn_sentence = HierarchicalAttentionNetwork(100)(lstm_sentence)
preds = Dense(2, activation='softmax')(attn_sentence)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

print("model fitting - Hierachical attention network")
model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=10, batch_size=100)
The full error:
[Error stack trace]

Best Answer

Python is complaining because you are trying to access the labels array by index, but the array is empty, as the console output shows:

Shape of sentiment (label) tensor: (0,)
The problem is in this line:
np.append(labels, input_data.sentiment[idx])
In the original code you referenced, a new value is appended to the labels list; that change happens in place, i.e. the list itself is modified. By contrast, as the numpy documentation states when describing the value returned by np.append, what you get back is a copy of the original array arr:

A copy of arr with values appended to axis. Note that append does not occur in-place: a new array is allocated and filled.


That is, your original labels list (an empty array) is never modified, which is what later causes the error when the code tries to access it by index.
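Here is a minimal standalone sketch (not from the question; the values are illustrative) that reproduces the failure mode, i.e. the return value of np.append being silently discarded:

import numpy as np

labels = np.array([])   # empty, like the labels list in the question
np.append(labels, 1)    # returns a NEW array; here the result is thrown away
print(labels.shape)     # (0,) -- labels is still empty
# labels[6319]          # would raise IndexError: index 6319 is out of bounds
#                       # for axis 0 with size 0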
If you want equivalent behavior, you need to modify your code like this:
labels = np.append(labels, input_data.sentiment[idx])
Note that, for the reason just explained, this will be very inefficient: each call to np.append allocates a brand-new array and copies every existing value into it. It is better to append the sentiment value directly to the original labels list, as the original code does:
labels.append(input_data.sentiment[idx])
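For reference, a hedged sketch of how the question's loop would look with the list-based fix (variable names taken from the question; the one-time conversion to a numpy array happens after the loop):

labels = []                                    # plain Python list
for idx in range(input_data.review.shape[0]):
    # ... review cleaning and tokenization as in the question ...
    labels.append(input_data.sentiment[idx])   # in-place append, O(1) amortized

labels = np_utils.to_categorical(np.asarray(labels))   # convert once, at the end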
See this related SO question as well.

Regarding "python - Error: IndexError: index 6319 is out of bounds for axis 0 with size 0", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/69074018/
