
deep-learning - PyTorch: network not learning at all + weights too small


About the input: each sample is one row of the CSV. 18~20_ride is the label and is not part of the input. One sample is shown below as column/value pairs; the training set consists of 400,000 such rows.

bus_route_id     4270000
station_code     344
latitude         33.48990
longitude        126.49373
6~7_ride         (value lost in the wrapped printout)
7~8_ride         0.0
8~9_ride         1.0
9~10_ride        2.0
10~11_ride       5.0
11~12_ride       2.0
6~7_takeoff      6.0
7~8_takeoff      0.0
8~9_takeoff      0.0
9~10_takeoff     0.0
10~11_takeoff    0.0
11~12_takeoff    0.0
18~20_ride       0.0    (label, not part of the input)
weekday          6
dis_jejusi       2.954920
dis_seoquipo     26.256744

Sample weights, captured at epoch 4. After 20 epochs of training the values get much smaller still (e.g. -7e-44 or 1e-55):

 2.3937e-11, -2.6920e-12, -1.0445e-11,  ..., -1.0754e-11, 1.1128e-11, -1.4814e-11
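
For anyone reproducing this inspection, here is a minimal sketch (assuming `net` is the model defined below) that prints the mean magnitude of each weight tensor after an epoch:

# Mean absolute value per weight tensor, e.g. ~1e-11 at epoch 4 as shown above
for name, param in net.named_parameters():
    if name.endswith("weight"):
        print(f"{name}: {param.detach().abs().mean().item():.3e}")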

Model predictions vs. targets (note that the model outputs the same value for every input):

#Target
[2.],
[0.],
[0.]

#Prediction
[1.4187],
[1.4187],
[1.4187]

MyDataset.py

from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import torch
import os

class MyDataset(Dataset):
    def __init__(self, csv_filename):
        self.dataset = pd.read_csv(csv_filename, index_col=0)
        self.labels = self.dataset.pop("18~20_ride")
        self.dataset = self.dataset.values
        self.labels = np.reshape(self.labels.values, (-1, 1))

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        return self.dataset[idx], self.labels[idx]
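
A quick way to confirm the dataset yields what the network expects (a sketch, assuming train.csv holds the 19 feature columns plus the 18~20_ride label shown above):

ds = MyDataset("/content/drive/My Drive/root/bus/dataset/train.csv")
x, y = ds[0]
print(x.shape, y.shape)      # (19,) and (1,)
loader = DataLoader(ds, batch_size=32, shuffle=True)
xb, yb = next(iter(loader))
print(xb.shape, yb.shape)    # torch.Size([32, 19]) torch.Size([32, 1])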

The model

import torch
import torch.nn as nn

class Network(nn.Module):
    def __init__(self, input_num):
        super(Network, self).__init__()
        # Each hidden block is Linear -> BatchNorm -> GELU. nn.GELU is assumed
        # here; the original post did not show where its GELU came from.
        self.fc1 = nn.Sequential(
            nn.Linear(input_num, 64),
            nn.BatchNorm1d(64),
            nn.GELU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.GELU()
        )
        self.fc3 = nn.Sequential(
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.GELU()
        )
        self.fc4 = nn.Sequential(
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.GELU()
        )
        self.fc5 = nn.Sequential(
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.GELU()
        )
        self.fc6 = nn.Sequential(
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.GELU()
        )
        self.fc7 = nn.Sequential(
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.GELU()
        )
        self.fc8 = nn.Sequential(
            nn.Linear(64, 64),
            nn.BatchNorm1d(64),
            nn.GELU()
        )
        self.fc9 = nn.Linear(64, 1)

    # The original post omitted forward(); chaining the blocks in order is the
    # natural reading of the architecture.
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        x = self.fc4(x)
        x = self.fc5(x)
        x = self.fc6(x)
        x = self.fc7(x)
        x = self.fc8(x)
        return self.fc9(x)
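
As a quick sanity check (not in the original post), the repaired model can be run on a random batch to confirm the shapes line up; the 19 features and batch size 32 match the training config below:

# Dummy forward pass: 32 samples with 19 features each -> 32 predictions
net = Network(19)
dummy = torch.randn(32, 19)
print(net(dummy).shape)  # torch.Size([32, 1])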

Training and validation

def train(model, device, train_loader, optimizer, loss_fn, log_interval, epoch):
    print("Training")
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.float().to(device), target.float().to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = loss_fn(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, (batch_idx+1) * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

def validate(model, device, loader, loss_fn):
    print("\nValidating")
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(loader):
            data, target = data.float().to(device), target.float().to(device)
            output = model(data)
            test_loss += loss_fn(output, target).item()  # accumulate per-batch loss

    test_loss /= len(loader)  # average over batches

    print('Validation average loss: {:.4f}\n'.format(test_loss))
    return test_loss

The full training and validation procedure

import torch
from torch import optim

from MyDataset import MyDataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from datetime import datetime

train_dataset_path = "/content/drive/My Drive/root/bus/dataset/train.csv"
val_dataset_path = "/content/drive/My Drive/root/bus/dataset/val.csv"
model_base_path = "/content/drive/My Drive/root/bus/models/"

model_file = "/content/drive/My Drive/root/bus/models/checkpoints/1574427776.202017.pt"

"""
Training Config
"""
epochs = 10
batch_size = 32
learning_rate = 0.5

check_interval = 4

log_interval = int(40000/batch_size)
gamma = 0.1

load_model = False
save_model = True
make_checkpoint = True
"""
End of config
"""

# Read the training and validation sets
train_set = MyDataset(train_dataset_path)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_set = MyDataset(val_dataset_path)
val_loader = DataLoader(val_set, batch_size=1)
print("Data READY")

device = torch.device("cuda")
net = Network(19).float().to(device)
if load_model:
    net.load_state_dict(torch.load(model_file))
loss_fn = torch.nn.MSELoss()
optimizer = optim.AdamW(net.parameters(), lr=learning_rate)

best_loss = float('inf')
isAbort = False
for epoch in range(1, epochs+1):
    train(net, device, train_loader, optimizer, loss_fn, log_interval, epoch)
    val_loss = validate(net, device, val_loader, loss_fn)
    if epoch % check_interval == 0:
        if make_checkpoint:
            print("Saving new checkpoint")
            torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
    """
    if val_loss < best_loss and epoch%check_interval==0:
        best_loss = val_loss
        if make_checkpoint:
            print("Saving new checkpoint")
            torch.save(net.state_dict(), model_base_path+"checkpoints/"+str(datetime.today().timestamp())+".pt")
    else:
        print("Model overfit detected. Aborting training")
        isAbort = True
        break
    """
if save_model and not isAbort:
    torch.save(net.state_dict(), model_base_path+"finals/"+str(datetime.today().timestamp())+".pt")

So I am trying to train a fully connected model for a regression problem on Google Colab, but it does not train well: the loss does not decrease at all. Digging in, I found that the weights are really small. Any idea why this happens and how to avoid it? Thanks. I use MSE for the loss and the AdamW optimizer. Here is what I have tried:

  1. Tried other architectures (varying the number and size of layers, switching the activation between ReLU and GELU), but the loss did not decrease
  2. Tried learning rates from 3e-1 down to 1e-3, and even tried 1
  3. Tried other preprocessing of the data (day/month/year instead of weekday)
  4. Fed the label itself as part of the input, and the loss still did not decrease
  5. Tried different batch sizes (4, 10, 32, 64)
  6. Removed batch normalization
  7. Tried other optimizers such as SGD and Adam
  8. Trained for 20 epochs, but the loss did not decrease
  9. Verified that the weights do change after loss.backward() (see the sketch right after this list)
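
For point 9, a minimal version of that check (assuming `net`, `optimizer`, `loss_fn`, and one float batch `data`, `target` on the same device as `net`) looks like:

before = net.fc9.weight.detach().clone()
optimizer.zero_grad()
loss = loss_fn(net(data), target)
loss.backward()
print("fc9 grad norm:", net.fc9.weight.grad.norm().item())  # non-zero => gradients flow
optimizer.step()
print("fc9 weight changed:", not torch.equal(before, net.fc9.weight.detach()))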

Best answer

TL;DR: Invalid input data!! Check for NaN or NULL

Well, it has been a while since I asked this question. I had tried almost everything and thought I might have messed up the project setup, so I deleted the project and started over: same result. Deleted it again and migrated to TF2: same result! So the setup was not the problem, and I looked elsewhere. In the end I found the cause: I had modified the input columns myself (to drop some highly correlated features), so they were no longer the originals, and in the process I mangled some floating-point values and ended up with NaNs. So check whether your dataset contains invalid values.
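
Given that the culprit was invalid values introduced while editing features by hand, a quick pandas check (a sketch, assuming the same train.csv layout) would have caught it before training:

import numpy as np
import pandas as pd

df = pd.read_csv("/content/drive/My Drive/root/bus/dataset/train.csv", index_col=0)
print(df.isna().sum())                              # NaN count per column
print(np.isinf(df.select_dtypes("number")).sum())   # +/-inf count per numeric column
assert not df.isna().any().any(), "dataset contains NaN values"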

On "deep-learning - PyTorch: network not learning at all + weights too small", we found a similar question on Stack Overflow: https://stackoverflow.com/questions/58994627/
