Replacing memcpy and .divsi3_skip_div0_test with smaller code on ARM MCU

Question

Replacing memcpy and .divsi3_skip_div0_test with smaller code on ARM MCU

301 views Asked by fadedbee At 22 December 2016 at 12:05

My entry for https://hackaday.com/2016/11/21/step-up-to-the-1-kb-challenge/ includes a couple of huge functions which are not generated from any C code which I have written.

000004e4 <.divsi3_skip_div0_test>:
 4e4:   b410        push    {r4}
 4e6:   1c04        adds    r4, r0, #0
 4e8:   404c        eors    r4, r1
 4ea:   46a4        mov ip, r4
 4ec:   2301        movs    r3, #1
 4ee:   2200        movs    r2, #0
 4f0:   2900        cmp r1, #0
 4f2:   d500        bpl.n   4f6 <.divsi3_skip_div0_test+0x12>
 4f4:   4249        negs    r1, r1
 4f6:   2800        cmp r0, #0
 4f8:   d500        bpl.n   4fc <.divsi3_skip_div0_test+0x18>
 4fa:   4240        negs    r0, r0
 4fc:   4288        cmp r0, r1
 4fe:   d32c        bcc.n   55a <.divsi3_skip_div0_test+0x76>
 500:   2401        movs    r4, #1
 502:   0724        lsls    r4, r4, #28
 504:   42a1        cmp r1, r4
 506:   d204        bcs.n   512 <.divsi3_skip_div0_test+0x2e>
 508:   4281        cmp r1, r0
 50a:   d202        bcs.n   512 <.divsi3_skip_div0_test+0x2e>
 50c:   0109        lsls    r1, r1, #4
 50e:   011b        lsls    r3, r3, #4
 510:   e7f8        b.n 504 <.divsi3_skip_div0_test+0x20>
 512:   00e4        lsls    r4, r4, #3
 514:   42a1        cmp r1, r4
 516:   d204        bcs.n   522 <.divsi3_skip_div0_test+0x3e>
 518:   4281        cmp r1, r0
 51a:   d202        bcs.n   522 <.divsi3_skip_div0_test+0x3e>
 51c:   0049        lsls    r1, r1, #1
 51e:   005b        lsls    r3, r3, #1
 520:   e7f8        b.n 514 <.divsi3_skip_div0_test+0x30>
 522:   4288        cmp r0, r1
 524:   d301        bcc.n   52a <.divsi3_skip_div0_test+0x46>
 526:   1a40        subs    r0, r0, r1
 528:   431a        orrs    r2, r3
 52a:   084c        lsrs    r4, r1, #1
 52c:   42a0        cmp r0, r4
 52e:   d302        bcc.n   536 <.divsi3_skip_div0_test+0x52>
 530:   1b00        subs    r0, r0, r4
 532:   085c        lsrs    r4, r3, #1
 534:   4322        orrs    r2, r4
 536:   088c        lsrs    r4, r1, #2
 538:   42a0        cmp r0, r4
 53a:   d302        bcc.n   542 <.divsi3_skip_div0_test+0x5e>
 53c:   1b00        subs    r0, r0, r4
 53e:   089c        lsrs    r4, r3, #2
 540:   4322        orrs    r2, r4
 542:   08cc        lsrs    r4, r1, #3
 544:   42a0        cmp r0, r4
 546:   d302        bcc.n   54e <.divsi3_skip_div0_test+0x6a>
 548:   1b00        subs    r0, r0, r4
 54a:   08dc        lsrs    r4, r3, #3
 54c:   4322        orrs    r2, r4
 54e:   2800        cmp r0, #0
 550:   d003        beq.n   55a <.divsi3_skip_div0_test+0x76>
 552:   091b        lsrs    r3, r3, #4
 554:   d001        beq.n   55a <.divsi3_skip_div0_test+0x76>
 556:   0909        lsrs    r1, r1, #4
 558:   e7e3        b.n 522 <.divsi3_skip_div0_test+0x3e>
 55a:   1c10        adds    r0, r2, #0
 55c:   4664        mov r4, ip
 55e:   2c00        cmp r4, #0
 560:   d500        bpl.n   564 <.divsi3_skip_div0_test+0x80>
 562:   4240        negs    r0, r0
 564:   bc10        pop {r4}
 566:   4770        bx  lr
 568:   2800        cmp r0, #0
 56a:   d006        beq.n   57a <.divsi3_skip_div0_test+0x96>
 56c:   db03        blt.n   576 <.divsi3_skip_div0_test+0x92>
 56e:   2000        movs    r0, #0
 570:   43c0        mvns    r0, r0
 572:   0840        lsrs    r0, r0, #1
 574:   e001        b.n 57a <.divsi3_skip_div0_test+0x96>
 576:   2080        movs    r0, #128    ; 0x80
 578:   0600        lsls    r0, r0, #24
 57a:   b407        push    {r0, r1, r2}
 57c:   4802        ldr r0, [pc, #8]    ; (588 <.divsi3_skip_div0_test+0xa4>)
 57e:   a102        add r1, pc, #8  ; (adr r1, 588 <.divsi3_skip_div0_test+0xa4>)
 580:   1840        adds    r0, r0, r1
 582:   9002        str r0, [sp, #8]
 584:   bd03        pop {r0, r1, pc}
 586:   46c0        nop         ; (mov r8, r8)
 588:   00000019    .word   0x00000019

and:

000005a4 <memcpy>:
 5a4:   b5f0        push    {r4, r5, r6, r7, lr}
 5a6:   2a0f        cmp r2, #15
 5a8:   d935        bls.n   616 <memcpy+0x72>
 5aa:   1c03        adds    r3, r0, #0
 5ac:   430b        orrs    r3, r1
 5ae:   079c        lsls    r4, r3, #30
 5b0:   d135        bne.n   61e <memcpy+0x7a>
 5b2:   1c16        adds    r6, r2, #0
 5b4:   3e10        subs    r6, #16
 5b6:   0936        lsrs    r6, r6, #4
 5b8:   0135        lsls    r5, r6, #4
 5ba:   1945        adds    r5, r0, r5
 5bc:   3510        adds    r5, #16
 5be:   1c0c        adds    r4, r1, #0
 5c0:   1c03        adds    r3, r0, #0
 5c2:   6827        ldr r7, [r4, #0]
 5c4:   601f        str r7, [r3, #0]
 5c6:   6867        ldr r7, [r4, #4]
 5c8:   605f        str r7, [r3, #4]
 5ca:   68a7        ldr r7, [r4, #8]
 5cc:   609f        str r7, [r3, #8]
 5ce:   68e7        ldr r7, [r4, #12]
 5d0:   3410        adds    r4, #16
 5d2:   60df        str r7, [r3, #12]
 5d4:   3310        adds    r3, #16
 5d6:   42ab        cmp r3, r5
 5d8:   d1f3        bne.n   5c2 <memcpy+0x1e>
 5da:   1c73        adds    r3, r6, #1
 5dc:   011b        lsls    r3, r3, #4
 5de:   18c5        adds    r5, r0, r3
 5e0:   18c9        adds    r1, r1, r3
 5e2:   230f        movs    r3, #15
 5e4:   4013        ands    r3, r2
 5e6:   2b03        cmp r3, #3
 5e8:   d91b        bls.n   622 <memcpy+0x7e>
 5ea:   1f1c        subs    r4, r3, #4
 5ec:   08a4        lsrs    r4, r4, #2
 5ee:   3401        adds    r4, #1
 5f0:   00a4        lsls    r4, r4, #2
 5f2:   2300        movs    r3, #0
 5f4:   58ce        ldr r6, [r1, r3]
 5f6:   50ee        str r6, [r5, r3]
 5f8:   3304        adds    r3, #4
 5fa:   42a3        cmp r3, r4
 5fc:   d1fa        bne.n   5f4 <memcpy+0x50>
 5fe:   18ed        adds    r5, r5, r3
 600:   18c9        adds    r1, r1, r3
 602:   2303        movs    r3, #3
 604:   401a        ands    r2, r3
 606:   d005        beq.n   614 <memcpy+0x70>
 608:   2300        movs    r3, #0
 60a:   5ccc        ldrb    r4, [r1, r3]
 60c:   54ec        strb    r4, [r5, r3]
 60e:   3301        adds    r3, #1
 610:   4293        cmp r3, r2
 612:   d1fa        bne.n   60a <memcpy+0x66>
 614:   bdf0        pop {r4, r5, r6, r7, pc}
 616:   1c05        adds    r5, r0, #0
 618:   2a00        cmp r2, #0
 61a:   d1f5        bne.n   608 <memcpy+0x64>
 61c:   e7fa        b.n 614 <memcpy+0x70>
 61e:   1c05        adds    r5, r0, #0
 620:   e7f2        b.n 608 <memcpy+0x64>
 622:   1c1a        adds    r2, r3, #0
 624:   e7f8        b.n 618 <memcpy+0x74>
 626:   46c0        nop         ; (mov r8, r8)

I am guessing that I could code far smaller, but less time-efficient, replacements myself.

Is this likely?

Where will I find the source which I need to edit? I'm guessing that I should look for the source of a libc under gcc-arm-none-eabi/lib/gcc/arm-none-eabi/4.8.3/. I think that I've found the compiled symbols, but I can't find the source.

~/gcc-arm-none-eabi$ grep -R divsi3_skip_div0_test *
Binary file lib/gcc/arm-none-eabi/4.8.3/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/thumb/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/armv6-m/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/fpu/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/armv7-ar/thumb/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/armv7-ar/thumb/softfp/libgcc.a matches
Binary file lib/gcc/arm-none-eabi/4.8.3/armv7-ar/thumb/fpu/libgcc.a matches

Alternatively, is there a way to tell gcc not to use memcpy when copying structures? (They are 10 bytes, so three Thumb instructions should do the job.) I've tried adding -mno-memcpy and -Wa,mno-memcpy but neither is recognised.

Update:

I've solved the memcpy part of this question - adding a partial, but sufficient, memcpy function stops the other from being added.

/*
 * Minimal byte-at-a-time replacement for memcpy, written to be small rather
 * than fast.
 *
 * Copies `size` bytes from `src` to `dst`, lowest address first, and returns
 * the number of bytes copied (always equal to `size`).
 *
 * Both pointers are `restrict`-qualified, so the two buffers must not overlap.
 *
 * NOTE(review): ISO C memcpy returns `dst` as a `void *`, not a byte count.
 * The return type is kept as posted so existing uses of this definition are
 * unaffected; confirm that nothing relies on the standard return value.
 */
size_t memcpy(uint8_t *restrict dst, const uint8_t *restrict src, size_t size) {
    /* Same type as `size`: avoids a signed/unsigned comparison and cannot
     * overflow before reaching `size`. */
    size_t i;
    for (i = 0; i < size; i++) {
        dst[i] = src[i];
    }
    return i;
}

It's much smaller, but less efficient, and it won't handle overlapping buffers where src < dst < src + size.

000003ec <memcpy>:
 3ec:   b510        push    {r4, lr}
 3ee:   2300        movs    r3, #0
 3f0:   4293        cmp r3, r2
 3f2:   d003        beq.n   3fc <memcpy+0x10>
 3f4:   5ccc        ldrb    r4, [r1, r3]
 3f6:   54c4        strb    r4, [r0, r3]
 3f8:   3301        adds    r3, #1
 3fa:   e7f9        b.n 3f0 <memcpy+0x4>
 3fc:   1c18        adds    r0, r3, #0
 3fe:   bd10        pop {r4, pc}

To clarify, I'm now only asking what I might do to replace the .divsi3_skip_div0_test code with less efficient, but smaller, code.

It is not clear to me where the source of this code is, or how to edit its source. It looks to be more complicated to replace than memcpy, as it does not look like a C function: its name begins with a `.`.

Original Q&A

TechQA.

Replacing memcpy and .divsi3_skip_div0_test with smaller code on ARM MCU

There are 0 answers

Related Questions in ARM

Related Questions in LIBC

Popular Questions

Trending Questions