2017-08-24 2 views
1

Ich habe eine kleine Änderung an meinem Code vorgenommen, sodass er DataParallel und DistributedDataParallel nicht mehr verwendet. Der Code lautet wie folgt:

(Titel der Frage: PyTorch gibt einen CUDA-Laufzeitfehler aus)

import argparse 
import os 
import shutil 
import time 

import torch 
import torch.nn as nn 
import torch.nn.parallel 
import torch.backends.cudnn as cudnn 
import torch.distributed as dist 
import torch.optim 
import torch.utils.data 
import torch.utils.data.distributed 
import torchvision.transforms as transforms 
import torchvision.datasets as datasets 
import torchvision.models as models 

# Selectable architectures: every lowercase, non-dunder callable that the
# torchvision ``models`` namespace exposes, in alphabetical order.
model_names = sorted(
    name
    for name, member in models.__dict__.items()
    if name.islower() and not name.startswith("__") and callable(member)
)

# Command-line interface of the training script.
parser = argparse.ArgumentParser(description='PyTorch ImageNet Training')

# --- dataset and model selection -------------------------------------------
parser.add_argument(
    'data',
    metavar='DIR',
    help='path to dataset',
)
_arch_help = 'model architecture: ' + ' | '.join(model_names) + ' (default: resnet18)'
parser.add_argument(
    '--arch', '-a',
    metavar='ARCH',
    default='resnet18',
    choices=model_names,
    help=_arch_help,
)

# --- data loading and schedule ----------------------------------------------
parser.add_argument(
    '-j', '--workers',
    default=4,
    type=int,
    metavar='N',
    help='number of data loading workers (default: 4)',
)
parser.add_argument(
    '--epochs',
    default=90,
    type=int,
    metavar='N',
    help='number of total epochs to run',
)
parser.add_argument(
    '--start-epoch',
    default=0,
    type=int,
    metavar='N',
    help='manual epoch number (useful on restarts)',
)
parser.add_argument(
    '-b', '--batch-size',
    default=256,
    type=int,
    metavar='N',
    help='mini-batch size (default: 256)',
)

# --- optimizer hyper-parameters ---------------------------------------------
parser.add_argument(
    '--lr', '--learning-rate',
    default=0.1,
    type=float,
    metavar='LR',
    help='initial learning rate',
)
parser.add_argument(
    '--momentum',
    default=0.9,
    type=float,
    metavar='M',
    help='momentum',
)
parser.add_argument(
    '--weight-decay', '--wd',
    default=1e-4,
    type=float,
    metavar='W',
    help='weight decay (default: 1e-4)',
)

# --- logging, checkpoints and run mode --------------------------------------
parser.add_argument(
    '--print-freq', '-p',
    default=10,
    type=int,
    metavar='N',
    help='print frequency (default: 10)',
)
parser.add_argument(
    '--resume',
    default='',
    type=str,
    metavar='PATH',
    help='path to latest checkpoint (default: none)',
)
parser.add_argument(
    '-e', '--evaluate',
    dest='evaluate',
    action='store_true',
    help='evaluate model on validation set',
)
parser.add_argument(
    '--pretrained',
    dest='pretrained',
    action='store_true',
    help='use pre-trained model',
)

# --- distributed setup ------------------------------------------------------
parser.add_argument(
    '--world-size',
    default=1,
    type=int,
    help='number of distributed processes',
)
parser.add_argument(
    '--dist-url',
    default='tcp://224.66.41.62:23456',
    type=str,
    help='url used to set up distributed training',
)
parser.add_argument(
    '--dist-backend',
    default='gloo',
    type=str,
    help='distributed backend',
)

# Best top-1 validation precision seen so far; main() updates it after each
# epoch and overwrites it when resuming from a checkpoint.
best_prec1 = 0


def main():
    """Parse the command line, build model and data loaders, then train or evaluate.

    The parsed namespace is published as the module-level ``args`` (read by
    ``train``, ``validate`` and ``adjust_learning_rate``) and the global
    ``best_prec1`` is updated after every epoch.  A checkpoint is written
    through ``save_checkpoint`` at the end of each epoch.  With
    ``--evaluate`` a single validation pass is run and the function returns.
    """
    global args, best_prec1
    args = parser.parse_args()

    args.distributed = args.world_size > 1

    if args.distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    # The model is intentionally not wrapped in DataParallel or
    # DistributedDataParallel, so it has to be placed on the GPU explicitly
    # for EVERY architecture.  Previously only alexnet/vgg were moved in the
    # non-distributed case, which left all other models on the CPU while the
    # criterion and the targets live on CUDA.
    # NOTE(review): without DistributedDataParallel nothing in this script
    # synchronises gradients between processes when --world-size > 1.
    model.cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            # NOTE: torch.load unpickles the file -- only resume from
            # checkpoints that come from a trusted source.
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # NOTE(review): later torchvision releases renamed RandomSizedCrop/Scale
    # to RandomResizedCrop/Resize -- confirm against the installed version.
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomSizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # shuffle and sampler are mutually exclusive in DataLoader, hence the
    # shuffle flag is only set when no sampler is used.
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Scale(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
            'optimizer': optimizer.state_dict(),
        }, is_best)


def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch over ``train_loader`` and print running statistics.

    Args:
        train_loader: iterable yielding ``(input, target)`` batches.
        model: network to optimise; it is switched to train mode here.
        criterion: loss callable invoked as ``criterion(output, target)``.
        optimizer: optimizer whose ``zero_grad``/``step`` are called per batch.
        epoch: epoch index, used only in the progress line.

    Progress is printed every ``args.print_freq`` batches.  Nothing is
    returned.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # The model is on the GPU and no DataParallel wrapper is left to
        # scatter CPU batches, so the input must be moved explicitly.
        # ``non_blocking`` is the current name of the former ``async``
        # keyword, which is a reserved word from Python 3.7 on.
        images = images.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        # compute output
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss (plain floats, so the meters do
        # not keep GPU tensors alive)
        prec1, prec5 = accuracy(output.detach(), target, topk=(1, 5))
        batch = images.size(0)
        losses.update(loss.item(), batch)
        top1.update(prec1.item(), batch)
        top5.update(prec5.item(), batch)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      data_time=data_time, loss=losses, top1=top1, top5=top5))


def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the average top-1 precision.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches.
        model: network to evaluate; it is switched to eval mode here.
        criterion: loss callable invoked as ``criterion(output, target)``.

    Returns:
        The running average of the top-1 precision over all batches
        (``top1.avg``).

    Progress is printed every ``args.print_freq`` batches, followed by a
    summary line.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    # torch.no_grad() replaces the removed ``volatile=True`` Variable flag.
    with torch.no_grad():
        for i, (images, target) in enumerate(val_loader):
            # The model is on the GPU and not wrapped in DataParallel, so the
            # input must be moved explicitly.  ``non_blocking`` is the current
            # name of the former ``async`` keyword (reserved since Python 3.7).
            images = images.cuda(non_blocking=True)
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            batch = images.size(0)
            losses.update(loss.item(), batch)
            top1.update(prec1.item(), batch)
            top5.update(prec5.item(), batch)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          i, len(val_loader), batch_time=batch_time, loss=losses,
                          top1=top1, top5=top5))

    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'
          .format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Write ``state`` to ``filename`` and, if ``is_best``, copy it to 'model_best.pth.tar'."""
    torch.save(state, filename)
    if not is_best:
        return
    # Keep a separate copy of the best-scoring checkpoint.
    shutil.copyfile(filename, 'model_best.pth.tar')


class AverageMeter(object):
    """Keeps the most recent value plus a running weighted average.

    Attributes set by ``reset``/``update``: ``val`` (last value), ``sum``
    (weighted total), ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, weighted by ``n`` in the average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Step schedule: write ``args.lr`` scaled by 0.1 per 30 completed epochs into every param group."""
    completed_steps = epoch // 30
    new_lr = args.lr * (0.1 ** completed_steps)
    for group in optimizer.param_groups:
        group['lr'] = new_lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: score tensor; the top-k search runs along dimension 1, so it
            is expected to be laid out as (batch, classes).
        target: tensor of class indices, one per sample; its first dimension
            is taken as the batch size.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one single-element tensor per entry of ``topk`` holding
        the percentage of samples whose target is among the k highest scores.
    """
    maxk = max(topk)
    batch_size = target.size(0)

    # pred becomes (maxk, batch): row j holds every sample's (j+1)-th best class.
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # ``correct`` can inherit the transposed, non-contiguous layout of
        # ``pred``; flattening several rows with a bare view(-1) then raises,
        # so the slice is made contiguous first.
        correct_k = correct[:k].contiguous().view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


# Start training/evaluation only when the file is executed as a script.
if __name__ == '__main__': 
    main() 

Wenn ich diesen Code mit der AlexNet-Architektur auf einer Reihe von Bildern ausführe, erhalte ich einen seltsamen CUDA-Fehler, der wie folgt lautet:

=> creating model 'alexnet' 
THCudaCheck FAIL file=/pytorch/torch/lib/THC/THCGeneral.c line=70 error=30 : unknown error 
Traceback (most recent call last): 
    File "imagenet2.py", line 319, in <module> 
    main() 
    File "imagenet2.py", line 87, in main 
    model.cuda() 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in cuda 
    return self._apply(lambda t: t.cuda(device_id)) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 118, in _apply 
    module._apply(fn) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 124, in _apply 
    param.data = fn(param.data) 
    File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 147, in <lambda> 
    return self._apply(lambda t: t.cuda(device_id)) 
    File "/usr/local/lib/python2.7/dist-packages/torch/_utils.py", line 66, in _cuda 
    return new_type(self.size()).copy_(self, async) 
    File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 266, in _lazy_new 
    _lazy_init() 
    File "/usr/local/lib/python2.7/dist-packages/torch/cuda/__init__.py", line 85, in _lazy_init 
    torch._C._cuda_init() 
RuntimeError: cuda runtime error (30) : unknown error at /pytorch/torch/lib/THC/THCGeneral.c:70 

Befehl zum Ausführen des Codes: python imagenet.py --world-size 1 --arch 'alexnet' <image_folder>

Was habe ich falsch gemacht?

PS: Wird auf einer AWS g2.2xlarge Ubuntu-Instanz ausgeführt.

Die CUDA-Version ist wie folgt:

nvcc: NVIDIA (R) Cuda compiler driver 
Copyright (c) 2005-2016 NVIDIA Corporation 
Built on Tue_Jan_10_13:22:03_CST_2017 
Cuda compilation tools, release 8.0, V8.0.61 

Antwort

1
  1. cuDNN gibt wenig hilfreiche Fehlermeldungen aus. Testen Sie Ihr Netz zur Fehlersuche auf der CPU mit net.cpu(), oder entfernen Sie einfach die Aufrufe von net.cuda(). Dasselbe müssen Sie mit den Trainings-, Validierungs- und Ausgabevariablen tun.

  2. Es scheint, dass Sie AlexNet mit einer anderen Bildgröße als 224x224 verwendet haben. Laut Dokumentation sollte es funktionieren, solange die Bildgröße mindestens 224x224 beträgt.

  3. Dies ist wahrscheinlich ein Problem mit der Tensorform, verursacht durch einen fest codierten Parameter in der AlexNet-Implementierung von PyTorch. In vision/torchvision/models/alexnet.py steht in Zeile 44:

x = x.view(x.size(0), 256 * 6 * 6) 

Ändern Sie dies in:

x = x.view(x.size(0), -1) 

Damit sollte das Netz mit unterschiedlichen Bildgrößen arbeiten können.

  4. Ich habe diese Änderung an das GitHub-Repository gesendet, aber ich glaube, sie wurde dort noch nicht übernommen.