Cause PCIe error callbacks using AER injection

1k views Asked by At

I am trying to trigger an error callback in the Linux nvme driver by using AER injection. I've modified the AER injection source code to inject an error directly when the module is loaded, rather than from a userland program.

I've verified that I have the correct bus, dev, and fn and that the error injection completed successfully, yet the error-handling functions of the nvme driver were never called.

Here is the modification to the aer_inject.c file:

This fills out the error structure

 554    static int __init aer_inject_init(void)
 555    {
 556    
 557        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 558        int ret = misc_register(&aer_inject_device);
 559        struct aer_error_inj ae = {
 560           .bus             = 0x84,
 561           .dev             = 0x00,
 562        .fn              = 0x00, 
 563           .uncor_status    = 0x00040000,    //poisoned TLP
 564           .cor_status      = 0x0,
 565           .header_log0     = 0x0,
 566           .header_log1     = 0x1,
 567           .header_log2     = 0x2,
 568           .header_log3     = 0x3,
 569           .domain          = 0x00
 570        }; 
 571        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 572        aer_inject(&ae); 
 573        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 574        return ret;
 575    }

This is the aer_inject() function:

 320    static int aer_inject(struct aer_error_inj *einj)
 321    {
 322        struct aer_error *err, *rperr;
 323        struct aer_error *err_alloc = NULL, *rperr_alloc = NULL;
 324        struct pci_dev *dev, *rpdev;
 325        struct pcie_device *edev;
 326        unsigned long flags;
 327        unsigned int devfn = PCI_DEVFN(einj->dev, einj->fn);
 328        int pos_cap_err, rp_pos_cap_err;
 329        u32 sever, cor_mask, uncor_mask, cor_mask_orig = 0, uncor_mask_orig = 0;
 330        int ret = 0;
 331        
 332        //einj->domain = 0x0000;
 333        //einj->bus = 0x84;
 334        //devfn = 0x0;  
 335    
 336        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 337        dev = pci_get_domain_bus_and_slot((int)einj->domain, einj->bus, devfn);
 338        printk(KERN_INFO "dev->vendor %#x\n", dev->vendor);
 339        printk(KERN_INFO "dev->device %#x\n", dev->device);
 340        if (!dev) {
 341            printk(KERN_INFO "ENODEV %s %d\n", __func__, __LINE__);
 342            return -ENODEV;
 343        }
 344        rpdev = pcie_find_root_port(dev);
 345        if (!rpdev) {
 346            ret = -ENODEV;
 347            goto out_put;
 348        }
 349    
 350        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 351        pos_cap_err = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);
 352        if (!pos_cap_err) {
 353            ret = -EPERM;
 354            goto out_put;
 355        }
 356        pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_SEVER, &sever);
 357        pci_read_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK, &cor_mask);
 358        pci_read_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK,
 359                      &uncor_mask);
 360    
 361        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 362        rp_pos_cap_err = pci_find_ext_capability(rpdev, PCI_EXT_CAP_ID_ERR);
 363        if (!rp_pos_cap_err) {
 364            ret = -EPERM;
 365            goto out_put;
 366        }
 367    
 368        err_alloc =  kzalloc(sizeof(struct aer_error), GFP_KERNEL);
 369        if (!err_alloc) {
 370            ret = -ENOMEM;
 371            goto out_put;
 372        }
 373        rperr_alloc =  kzalloc(sizeof(struct aer_error), GFP_KERNEL);
 374        if (!rperr_alloc) {
 375            ret = -ENOMEM;
 376            goto out_put;
 377        }
 378    
 379        if (aer_mask_override) {
 380            cor_mask_orig = cor_mask;
 381            cor_mask &= !(einj->cor_status);
 382            pci_write_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK,
 383                           cor_mask);
 384    
 385            uncor_mask_orig = uncor_mask;
 386            uncor_mask &= !(einj->uncor_status);
 387            pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK,
 388                           uncor_mask);
 389        }
 390    
 391        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 392        spin_lock_irqsave(&inject_lock, flags);
 393    
 394        err = __find_aer_error_by_dev(dev);
 395        if (!err) {
 396            err = err_alloc;
 397            err_alloc = NULL;
 398            aer_error_init(err, einj->domain, einj->bus, devfn,
 399                       pos_cap_err);
 400            list_add(&err->list, &einjected);
 401        }
 402        err->uncor_status |= einj->uncor_status;
 403        printk(KERN_INFO "err->uncor_status %#x\n", err->uncor_status);
 404        err->cor_status |= einj->cor_status;
 405        err->header_log0 = einj->header_log0;
 406        err->header_log1 = einj->header_log1;
 407        err->header_log2 = einj->header_log2;
 408        err->header_log3 = einj->header_log3;
 409    
 410        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 411        if (!aer_mask_override && einj->cor_status &&
 412            !(einj->cor_status & ~cor_mask)) {
 413            ret = -EINVAL;
 414            printk(KERN_WARNING "The correctable error(s) is masked "
 415                    "by device\n");
 416            spin_unlock_irqrestore(&inject_lock, flags);
 417            goto out_put;
 418        }
 419        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 420        if (!aer_mask_override && einj->uncor_status &&
 421            !(einj->uncor_status & ~uncor_mask)) {
 422            ret = -EINVAL;
 423            printk(KERN_WARNING "The uncorrectable error(s) is masked "
 424                    "by device\n");
 425            spin_unlock_irqrestore(&inject_lock, flags);
 426            goto out_put;
 427        }
 428    
 429        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 430        rperr = __find_aer_error_by_dev(rpdev);
 431        if (!rperr) {
 432                printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 433            rperr = rperr_alloc;
 434            rperr_alloc = NULL;
 435            aer_error_init(rperr, pci_domain_nr(rpdev->bus),
 436                       rpdev->bus->number, rpdev->devfn,
 437                       rp_pos_cap_err);
 438            list_add(&rperr->list, &einjected);
 439        }
 440        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 441        if (einj->cor_status) {
 442            if (rperr->root_status & PCI_ERR_ROOT_COR_RCV)
 443                rperr->root_status |= PCI_ERR_ROOT_MULTI_COR_RCV;
 444            else
 445                rperr->root_status |= PCI_ERR_ROOT_COR_RCV;
 446            rperr->source_id &= 0xffff0000;
 447            rperr->source_id |= (einj->bus << 8) | devfn;
 448        }
 449        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 450        if (einj->uncor_status) {
 451                printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 452            if (rperr->root_status & PCI_ERR_ROOT_UNCOR_RCV)
 453            {
 454                printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 455                rperr->root_status |= PCI_ERR_ROOT_MULTI_UNCOR_RCV;
 456            if (sever & einj->uncor_status) {
 457                rperr->root_status |= PCI_ERR_ROOT_FATAL_RCV;
 458                if (!(rperr->root_status & PCI_ERR_ROOT_UNCOR_RCV))
 459                    rperr->root_status |= PCI_ERR_ROOT_FIRST_FATAL;
 460            } else
 461                rperr->root_status |= PCI_ERR_ROOT_NONFATAL_RCV;
 462            rperr->root_status |= PCI_ERR_ROOT_UNCOR_RCV;
 463            rperr->source_id &= 0x0000ffff;
 464            rperr->source_id |= ((einj->bus << 8) | devfn) << 16;
 465        }
 466        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 467        spin_unlock_irqrestore(&inject_lock, flags);
 468    
 469        if (aer_mask_override) {
 470            pci_write_config_dword(dev, pos_cap_err + PCI_ERR_COR_MASK,
 471                           cor_mask_orig);
 472            pci_write_config_dword(dev, pos_cap_err + PCI_ERR_UNCOR_MASK,
 473                           uncor_mask_orig);
 474        }
 475    
 476        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 477        ret = pci_bus_set_aer_ops(dev->bus);
 478        if (ret)  
 479            goto out_put;
 480        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 481        ret = pci_bus_set_aer_ops(rpdev->bus);
 482        if (ret)
 483            goto out_put;
 484    
 485        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 486        if (find_aer_device(rpdev, &edev)) {
 487            if (!get_service_data(edev)) {
 488                printk(KERN_WARNING "AER service is not initialized\n");
 489                ret = -EINVAL;
 490        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 491                goto out_put;
 492            }
 493            aer_irq(-1, edev);
 494        }
 495        else
 496            ret = -EINVAL;
 497        printk(KERN_INFO "%s %d\n", __func__, __LINE__);
 498    out_put:
 499        kfree(err_alloc);
 500        kfree(rperr_alloc);
 501        pci_dev_put(dev);
 502        return ret;
 503    }

This is the function I am attempting to hit in the nvme driver:

2563 static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
2564                 enum pci_channel_state state)
2565 {
2566 
2567                 printk(KERN_INFO "nvme_error_detected called\n");
2568                 dev_printk(KERN_ERR, &pdev->dev, "%s\n", __func__);
2569 }

Verification from kernel:

aer_inject_init 555
aer_inject_init 569
aer_inject 336
dev->vendor 0x1c58
dev->device 0x3
aer_inject 350
aer_inject 361
aer_inject 391
err->uncor_status 0x40000
aer_inject 410
aer_inject 419
aer_inject 429
aer_inject 432
aer_inject 440
aer_inject 449
aer_inject 451
aer_inject 464
aer_inject 474
aer_inject 478
aer_inject 483
aer_inject 495
aer_inject_init 571

As we can see, no kernel messages were printed from the nvme driver, even though the AER injection function ran to completion. What could be the cause of this? Thank you, and apologies for the messy code formatting.

0

There are 0 answers