I'm using PyTorch with Faster R-CNN on a dataset with 2 classes, about 100 training images, and 35 validation images, in a multi-node, multi-GPU environment. To simplify debugging, I'm running on a single GPU for the moment, with batch size = 1.
The problem is that the training loss is NaN from the very beginning, even with other learning rate values. My dataset is in COCO format. I checked the training annotations and they are fine.
The following is my setup:
train_dataset = CustomCOCODataset(root_dir=train_image_folder, annotations_file=train_annotations_file, transforms=transform)
val_dataset = CustomCOCODataset(root_dir=val_image_folder, annotations_file=val_annotations_file, transforms=transform)
train_sampler = DistributedSampler(train_dataset)
val_sampler = DistributedSampler(val_dataset)
batch_size = 1
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=(train_sampler is None), sampler=train_sampler)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=(val_sampler is None), sampler=val_sampler)
num_train_batches = len(train_dataloader)
num_val_batches = len(val_dataloader)
num_classes = 3  # background + cat + dog
model = fasterrcnn_resnet50_fpn(pretrained_backbone=True, num_classes=num_classes)
model = model.to(local_rank)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank])
optimizer = torch.optim.SGD(model.parameters(), lr=0.0005)
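One thing I'm not sure about: since detection targets are variable-sized, I believe the DataLoader needs a collate function like the one in torchvision's detection reference scripts instead of the default collate. A minimal sketch of what I mean (the name collate_fn is mine):

def collate_fn(batch):
    # Keep each (image, target) pair intact instead of letting the default
    # collate try to stack variable-sized targets into a single tensor.
    return tuple(zip(*batch))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                              sampler=train_sampler, collate_fn=collate_fn)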
My training and validation loop:
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (images, targets) in enumerate(train_dataloader):
        images = [image.to(local_rank) for image in images]
        targets = [{k: v.to(local_rank) for k, v in t.items()} for t in targets]
        optimizer.zero_grad()  # Clear the gradients left over from the previous step
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        losses.backward()  # Compute the gradients; here they are broadcast across the ranks
        optimizer.step()  # Update the model weights
        running_loss += losses.item()
    print(f"Epoch: {epoch} Loss: {running_loss / (i + 1)}")
    cpu_device = torch.device("cpu")
    model.eval()
    target = []
    preds = []
    metric_summary = {}
    for i, (images, targets) in enumerate(val_dataloader):
        images = [image.to(local_rank) for image in images]
        with torch.no_grad():
            outputs = model(images)
        # Skip images with no predictions
        if outputs[0]['boxes'].numel() == 0:
            continue
        #####################################
        for j in range(len(images)):  # renamed from i to avoid shadowing the batch index
            true_dict = dict()
            preds_dict = dict()
            true_dict['boxes'] = targets[j]['boxes'].detach().cpu()
            true_dict['labels'] = targets[j]['labels'].detach().cpu()
            preds_dict['boxes'] = outputs[j]['boxes'].detach().cpu()
            preds_dict['scores'] = outputs[j]['scores'].detach().cpu()
            preds_dict['labels'] = outputs[j]['labels'].detach().cpu()
            preds.append(preds_dict)
            target.append(true_dict)
        #####################################
    metric = MeanAveragePrecision()
    # Copy preds and target to the GPU
    for i in range(len(preds)):
        for key in preds[i]:
            preds[i][key] = preds[i][key].to(local_rank)  # "device" should be the correct GPU
        for key in target[i]:
            target[i][key] = target[i][key].to(local_rank)
    metric.update(preds, target)
    metric_summary = metric.compute()
    if 'map' in metric_summary:
        print(f"Epoch: {epoch} Metric summary: {metric_summary['map']}")
    if global_rank == 0:
        torch.save(model.module.state_dict(), f'modello_salvato_cat_dog_epoch_{epoch+1}.pth')
        print("Model saved!")
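To narrow down where the NaN comes from, the individual loss terms can be logged separately. A minimal debugging sketch, reusing the loop variables above (the loss_dict keys are whatever torchvision's Faster R-CNN returns):

        # Inside the training loop, right after the forward pass:
        loss_dict = model(images, targets)
        for name, value in loss_dict.items():
            if not torch.isfinite(value):
                print(f"Non-finite loss '{name}' at batch {i}: {value.item()}")
                print("Targets for this batch:", targets)
                break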
I wrote a CustomCOCODataset class, where in __getitem__ I convert the boxes as:
torchvision.ops.box_convert(torch.tensor(annotation['bbox']), in_fmt="xywh", out_fmt="xyxy")
This conversion should be correct, since I start from a COCO dataset, where the bbox format is xywh.
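As a quick sanity check, converting a sample COCO box [x, y, w, h] = [10, 20, 30, 40] should give [10, 20, 40, 60]:

import torch
import torchvision

box = torch.tensor([10., 20., 30., 40.])  # COCO format: [x, y, width, height]
print(torchvision.ops.box_convert(box, in_fmt="xywh", out_fmt="xyxy"))
# tensor([10., 20., 40., 60.])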
The following is my complete custom class:
class CustomCOCODataset(Dataset):
    def __init__(self, root_dir, annotations_file, transforms=None):
        self.root_dir = root_dir
        self.transforms = transforms
        self.coco = COCO(annotations_file)
        self.image_ids = list(self.coco.imgs.keys())
        self.image_ids = self.filter_empty_images()
        self.log_annot_issue_x = True
        self.log_annot_issue_y = True
        self.square_training = False
        self.img_size = 640  # said to be the default
    def __len__(self):
        return len(self.image_ids)

    def filter_empty_images(self):
        # Filter out the images without annotations
        valid_image_ids = []
        for img_id in self.image_ids:
            annotation_ids = self.coco.getAnnIds(imgIds=img_id)
            annotations = self.coco.loadAnns(annotation_ids)
            if annotations:
                valid_image_ids.append(img_id)
        return valid_image_ids
    def load_image_and_annotations(self, index):
        img_id = self.image_ids[index]
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = os.path.join(self.root_dir, img_info['file_name'])
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        annotation_ids = self.coco.getAnnIds(imgIds=img_info['id'])
        # Get all the annotations for this image
        annotations = self.coco.loadAnns(annotation_ids)
        # Note: boxes, orig_boxes and labels are built below but never returned
        boxes = []
        orig_boxes = []
        labels = []
        image_width = image.shape[1]
        image_height = image.shape[0]
        for annotation in annotations:
            # Extract the bounding-box coordinates
            xmin = annotation['bbox'][0]
            ymin = annotation['bbox'][1]
            xmax = annotation['bbox'][2]
            ymax = annotation['bbox'][3]
            orig_boxes.append([xmin, ymin, xmax, ymax])
            xmin_final = xmin
            xmax_final = xmax
            ymin_final = ymin
            ymax_final = ymax
            boxes.append([xmin_final, ymin_final, xmax_final, ymax_final])
        return image, annotations
    def __getitem__(self, idx):
        image, annotations = self.load_image_and_annotations(idx)
        # print("idx: ", idx, "image_size: ", image.size)
        if self.transforms:
            image = self.transforms(image)
        target_list = []
        target = {
            'boxes': [],
            'labels': [],
            'area': [],
            'iscrowd': []
        }
        image_id = torch.tensor([idx])
        for annotation in annotations:
            target = {
                'image_id': image_id,
                'boxes': torchvision.ops.box_convert(torch.tensor(annotation['bbox']), in_fmt="xywh", out_fmt="xyxy"),
                'labels': torch.tensor(annotation['category_id']),
                'area': torch.tensor(annotation['area']),
                'iscrowd': torch.tensor(annotation['iscrowd'])
            }
            target_list.append(target)
        return image, target_list
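For reference, my understanding from the torchvision documentation is that the model expects one target dict per image (not one per annotation), with all of that image's boxes stacked together. A hand-written illustration, not my actual data:

# One dict per image: 'boxes' is a FloatTensor[N, 4] in xyxy format and
# 'labels' is an Int64Tensor[N], with one entry per object in the image.
target = {
    'boxes': torch.tensor([[10., 20., 40., 60.],
                           [15., 25., 50., 70.]]),
    'labels': torch.tensor([1, 2]),
}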
Why does this happen, and how can I resolve it?