How to do unaligned int store on ARM Cortex M4?

104 views Asked by At

How can C code take advantage of the Cortex M4's unaligned 4-byte store instruction? *(int*)p = x mostly works, but occasionally I end up with something like this:

/* Stores a at p and b at p + 4 through uint32_t pointers obtained by casting p. */
void Store(uint8_t* p, uint32_t a, uint32_t b)
{
  /* Nothing here tells the compiler that p may be unaligned, so it is free
     to merge these two adjacent word stores into a single strd. */
  *(uint32_t*)(p + 0) = a;
  *(uint32_t*)(p + 4) = b;
}

which gcc then compiles to:

00000000 <Store>:
   0:   e9c0 1200       strd    r1, r2, [r0]
   4:   4770            bx      lr
   6:   bf00            nop

which of course gives a HardFault because strd, unlike the 4-byte str, does not support unaligned addresses. Is there a way to write the stores so that gcc knows they can be unaligned? Is there perhaps an unaligned int type?

3

There are 3 answers

2
Eric Postpischil On BEST ANSWER

Since you are using GCC, you can use a packed structure:

/* Packed (GCC attribute) wrapper around a single uint32_t; used below to
   store a 32-bit value through a pointer of unknown alignment. */
struct p { uint32_t u; } __attribute__((__packed__));

/* Stores a in the first 4 bytes at vp and b in the next 4 bytes, writing
   both through struct p. */
void Store(void *vp, uint32_t a, uint32_t b)
{
    /* View the destination as an array of packed 4-byte structs. */
    struct p *pp = vp;
    pp[0].u = a;
    pp[1].u = b;
}

This generates:

Store:
        str     r1, [r0]  @ unaligned
        str     r2, [r0, #4]      @ unaligned
        bx      lr

Note that you should consider the aliasing rules when doing this. If the destination is defined with a different type, storing the data with this structure type could require some workaround for that.

0
personal_cloud On

Two approaches that worked are memcpy, and defining an unaligned int struct such as

/* Byte-array stand-in for a 32-bit integer: the value is held as four
   uint8_t elements rather than as one uint32_t. */
typedef struct
{
  uint8_t d[4];
} my_uint32_t;

The result is then the desired separate 4-byte store instructions:

00000000 <Store>:
   0:   6001            str     r1, [r0, #0]
   2:   6042            str     r2, [r0, #4]
   4:   4770            bx      lr
   6:   bf00            nop
12
gulpr On

Your code (pointer punning) invokes undefined behaviour, as you are violating the strict-aliasing rules.

You can't access an object of one type by dereferencing a pointer to another type, except when accessing it via a char pointer.

The best option is to use memcpy, as it will always work:

/* Stores a in the first 4 bytes at p and b in the 4 bytes after it by
   copying their object representations with memcpy. */
void Store(uint8_t* p, uint32_t a, uint32_t b)
{
    memcpy(p, &a, sizeof(a));
    /* Second word goes immediately after the first (offset sizeof(a)). */
    memcpy(p + sizeof(a), &b, sizeof(b));
}

/* Stores a and b as two consecutive 32-bit words at p, writing through a
   pointer to an anonymous packed struct that holds both words. */
void Store2(uint8_t* p, uint32_t a, uint32_t b)
{
    /* Packed (GCC attribute) struct covering the whole 8-byte destination. */
    struct 
    {
        uint32_t b32[2];
    } __attribute__((packed))*x = (void *)p;
    x -> b32[0] = a;
    x -> b32[1] = b;
}


/* Stores a and b as two consecutive 32-bit words at p with a single
   assignment of a packed two-word struct. */
void Store3(uint8_t* p, uint32_t a, uint32_t b)
{
    /* Packed (GCC attribute) struct covering the whole 8-byte destination. */
    struct x
    {
        uint32_t b32[2];
    } __attribute__((packed))*x = (void *)p;

    /* Compound literal: b32[0] = a, b32[1] = b, assigned as one struct. */
    *x = (struct x){a,b};
}

/* Stores a and b as two consecutive 32-bit words at p with one memcpy from
   a temporary two-element array. */
void Store4(uint8_t* p, uint32_t a, uint32_t b)
{
    /* The compound literal builds {a, b}; all 8 bytes are copied at once. */
    memcpy(p, (uint32_t[]){a,b}, sizeof(uint32_t[2]));
}


/* Stores a at p and b at p + 4 through volatile-qualified uint32_t pointers
   obtained by casting p. */
void Store5(uint8_t* p, uint32_t a, uint32_t b)
{
  /* NOTE(review): this still dereferences a cast uint32_t pointer, like the
     question's code; only the volatile qualifier differs. */
  *(volatile uint32_t*)(p + 0) = a;
  *(volatile uint32_t*)(p + 4) = b;
}
Store:
        str     r1, [r0]  @ unaligned
        str     r2, [r0, #4]      @ unaligned
        bx      lr
Store2:
        str     r1, [r0]  @ unaligned
        str     r2, [r0, #4]      @ unaligned
        bx      lr
Store3:
        str     r1, [r0]  @ unaligned
        str     r2, [r0, #4]      @ unaligned
        bx      lr
Store4:
        sub     sp, sp, #8
        strd    r1, r2, [sp]
        mov     r2, sp
        mov     r3, r0
        ldmia   r2!, {r0, r1}
        str     r0, [r3]  @ unaligned
        str     r1, [r3, #4]      @ unaligned
        add     sp, sp, #8
        bx      lr
Store5:
        str     r1, [r0]
        str     r2, [r0, #4]
        bx      lr

https://godbolt.org/z/sYhdW8rK8

BTW clang is optimizing better: https://godbolt.org/z/d11javbW9

Store:
        str     r1, [r0]
        str     r2, [r0, #4]
        bx      lr
Store2:
        str     r1, [r0]
        str     r2, [r0, #4]
        bx      lr
Store3:
        str     r1, [r0]
        str     r2, [r0, #4]
        bx      lr
Store4:
        str     r1, [r0]
        str     r2, [r0, #4]
        bx      lr
Store5:
        str     r1, [r0]
        str     r2, [r0, #4]
        bx      lr