Cache issue when creating shared memory between kernel space and user space

127 views Asked by At

I am writing a kernel module on NVIDIA's Jetson AGX Xavier (ARM architecture). I am creating a shared memory between kernel space and user space in the following way:

  1. Allocate memory in kernel space using kzalloc().
  2. Use dma_map_single() to get bus address for the allocated memory.
  3. Use remap_pfn_range() to map the memory to user space.
  4. In user space, use mmap() to get a pointer to this shared memory.
  5. Write data to the shared memory via memcpy().
  6. Read the shared memory in kernel space to see if it contains the data that was written from user space.

The issue I am facing is that when I read the memory from kernel space, it sometimes contains up-to-date data and sometimes does not, which makes me suspect a cache-related issue. I have used pgprot_noncached() in the kernel module to make the memory non-cached, but I am still facing this issue. The relevant portion of the kernel module code is shown below.

/*
 * Allocate the next TX1 buffer, DMA-map it for the PCI device, and map it
 * into the user-space range described by mem_struct.
 *
 * file_ptr:   not used by this function.
 * mem_struct: VMA to populate; its length (vm_end - vm_start) must not
 *             exceed BUFFER_SIZE.
 *
 * Returns 0 on success and a negative errno on failure, so that the caller
 * (presumably the mmap() path -- confirm against the file_operations table)
 * does not report success for a range that was never mapped.
 *
 * NOTE(review): pgprot_noncached() below only changes the attributes of the
 * user-space mapping. Kernel code that reads the buffer through
 * tx1_data_buffers[i] uses a different mapping of the same pages; if that
 * mapping is cacheable, kernel reads can observe stale data unless the range
 * is synchronised first (e.g. dma_sync_single_for_cpu()) or the buffer comes
 * from a coherent allocation -- confirm against the reader side.
 */
static int tx1_mem_get(struct file* file_ptr, struct vm_area_struct* mem_struct)
{
    int ret_val;
    unsigned int i;
    unsigned long start_addr;
    unsigned long page_frame_num;
    unsigned long mem_size;
    struct device* device_ptr;

    if (tx1_mem_count >= BUFFER_COUNT) {
        umtrx_ep_msg("WARNING: failed to acquire tx1 buffer. tx1 memory is full");
        return -ENOMEM;
    }

    /* Array index under which the addresses of this buffer are stored */
    i = tx1_mem_count;

    /* Validate the requested range before acquiring any resource, so that a
     * bad request needs no teardown. */
    start_addr = mem_struct->vm_start;
    mem_size = mem_struct->vm_end - mem_struct->vm_start;
    if (mem_size > BUFFER_SIZE) {
        sprintf(msg_buffer, "WARNING: couldn't map tx1 buffer %u, can't map more than %d bytes", i, BUFFER_SIZE);
        umtrx_ep_msg(msg_buffer);
        return -EINVAL;
    }

    /* Device on whose behalf the memory is DMA-mapped */
    device_ptr = &pcidev_global_ptr->dev;

    /* Allocate TX1 Buffer.
     * NOTE(review): GFP_ATOMIC forbids sleeping/reclaim; if this always runs
     * in process context, GFP_KERNEL would be less likely to fail -- confirm. */
    tx1_data_buffers[i] = kzalloc(BUFFER_SIZE, GFP_DMA | GFP_ATOMIC);
    if (tx1_data_buffers[i] == NULL) {
        sprintf(msg_buffer, "WARNING: failed to allocate tx1 buffer %u", i);
        umtrx_ep_msg(msg_buffer);
        return -ENOMEM;
    }

    /* Get the TX1 DMA address */
    tx1_dma_buffers[i] = dma_map_single(device_ptr, tx1_data_buffers[i], BUFFER_SIZE, DMA_BIDIRECTIONAL);
    if (tx1_dma_buffers[i] == DMA_MAPPING_ERROR) {
        sprintf(msg_buffer, "WARNING: failed to obtain dma memory for tx1 buffer %u", i);
        umtrx_ep_msg(msg_buffer);
        ret_val = -ENOMEM;
        goto err_free;
    }

    /* Set memory attribute flags (VM_DONTEXPAND | VM_DONTDUMP = VM_RESERVED; VM_RESERVED flag not supported in new kernel versions)*/
    mem_struct->vm_flags |= VM_READ | VM_WRITE | VM_SHARED | VM_LOCKED | VM_DONTEXPAND | VM_DONTDUMP;

    /* Make the user-space mapping non-cached */
    mem_struct->vm_page_prot = pgprot_noncached(mem_struct->vm_page_prot);

    /* Page frame number of the start of the kzalloc'ed buffer */
    page_frame_num = virt_to_phys(tx1_data_buffers[i]) >> PAGE_SHIFT;

    ret_val = remap_pfn_range(mem_struct, start_addr, page_frame_num, mem_size, mem_struct->vm_page_prot);
    if (ret_val != 0) {
        sprintf(msg_buffer, "WARNING: couldn't map tx1 buffer %u", i);
        umtrx_ep_msg(msg_buffer);
        goto err_unmap;
    }

    /* Reserve the obtained memory, so that it is not swapped out by the kernel. Kernel space and user space can both access this buffer, so it must be
     * reserved. */
    reserve_mem_buffer(tx1_data_buffers[i], (unsigned int)(BUFFER_SIZE));

    /* Memory has been obtained, and reserved. Now populate it with the physical address using GPC-DMA */
    write_phys_addr(tx1_data_buffers[i]);

    sprintf(msg_buffer, "tx1 buffer %u acquired", i);
    umtrx_ep_msg(msg_buffer);

    tx1_mem_count = tx1_mem_count + 1;

    return 0;

err_unmap:
    dma_unmap_single(device_ptr, tx1_dma_buffers[i], BUFFER_SIZE, DMA_BIDIRECTIONAL);
    tx1_dma_buffers[i] = 0;
err_free:
    kfree(tx1_data_buffers[i]);
    tx1_data_buffers[i] = NULL;
    return ret_val;
}

I have tried to use dma_mmap_attrs() API instead of remap_pfn_range() (as per my understanding, dma_mmap_attrs() creates a mapping to user space without the need for a physical address or page frame number). However, this makes no difference.

Currently SMMU is enabled (it is enabled by default on NVIDIA AGX Xavier). If I disable SMMU and use the following user space code to explicitly flush cache, then the issue is resolved.

/*
 * Clean and invalidate the data cache lines covering [start, start + size)
 * using the AArch64 "dc civac" instruction, then wait for completion.
 *
 * start: first byte of the range; need not be cache-line aligned.
 * size:  length of the range in bytes; a size of 0 is a no-op.
 *
 * Assumes CACHE_LINE_SIZE is a power of two matching the hardware line
 * size -- TODO confirm against its definition.
 */
void flush_cache(void* start, size_t size)
{
    uintptr_t addr;
    uintptr_t end;

    // Nothing to flush for an empty range
    if (size == 0) {
        return;
    }

    // Round the start address down to a cache line boundary
    addr = (uintptr_t)start & ~(uintptr_t)(CACHE_LINE_SIZE - 1);

    // Round the end address up to a cache line boundary
    end = ((uintptr_t)start + size + CACHE_LINE_SIZE - 1) & ~(uintptr_t)(CACHE_LINE_SIZE - 1);

    // Flush each cache line in the range. The "memory" clobber keeps the
    // compiler from moving the caller's preceding stores past the flush.
    for (; addr < end; addr += CACHE_LINE_SIZE) {
        __asm__ __volatile__("dc civac, %0" : : "r"(addr) : "memory");
    }

    // Ensure completion of cache maintenance operations; the clobber also
    // makes this a compiler barrier for subsequent memory accesses.
    __asm__ __volatile__("dsb sy" : : : "memory");
}

However, I can't disable SMMU due to some other limitation in my application. As per my understanding, the method I have used to create a shared memory should give me an uncached memory. What am I doing wrong here? Am I missing something obvious here?

Any help would be greatly appreciated.

Note: The size of the shared memory is 4MB. I have tried going all the way down to 4KB, but still no luck (4KB is the page size, and the smallest size that kzalloc() allocates successfully for me).

0

There are 0 answers