Get the delayed-load function binding correctly written into the image executable (dlltool)

540 views Asked by At

I've been studying the delayed-load (delayimp) pipeline as a possible backend for the missing RPATH functionality on Windows, by the following example:

#include <stdio.h>

int __declspec(dllimport) foo(int arg);

int main(int argc, char* argv[])
{
    /* Feed the result of the first foo() call back into a second call,
       then print the final value. */
    int first_result = foo(argc);
    int second_result = foo(first_result);

    printf("foo() = %d\n", second_result);
    return 0;
}

Both GNU and LLVM implement delayed loading similarly with the "dlltool" (although LLVM's dlltool seems to have been merged into "lld-link"). Essentially, the task performed in LLVM's lld/COFF/DLL.cpp or binutils' dlltool.c is two-fold:

  1. Generate a jump table stub for a delayed-load function (see example below)
  2. Generate a trampoline that calls the __delayLoadHelper2 code (see example below)

Upon successful binding, __delayLoadHelper2 seems to write the resolved function address right into the executable code section:

extern "C"
FARPROC WINAPI
__delayLoadHelper2(
    PCImgDelayDescr     pidd,
    FARPROC *           ppfnIATEntry
    ) {
...
SetEntryHookBypass:
    *ppfnIATEntry = pfnRet; // access violation
...
}

To make it possible to modify the executable image, Microsoft has developed some fancy functions that temporarily add write permission to the corresponding memory region.

Now the problem is: the code to be modified is within the jump table stub that goes into the ".idata" section, and it fails to get write permission:

        if ((Characteristics & IMAGE_SCN_MEM_WRITE) == 0) {

            //
            // This delay load helper module does not support merging the delay
            // load section to a read only section because memory management
            // would not guarantee that there is commit available - and thus a
            // low memory failure path where the delay load failure hook could
            // not be safely invoked (the delay load section would still be
            // read only) might be encountered.
            //
            // It is a build time configuration problem to produce such a
            // binary so abort here and now so that the problem can be
            // identified & fixed.
            //

/* Exception thrown at 0x000000013F3B3F3F in dlltool_test_executable.exe: 0xC0000005: Access violation reading */
            __fastfail(FAST_FAIL_DLOAD_PROTECTION_FAILURE);
        }

So, currently the hard binding does not work and gives a "write access violation". I'm wondering what kind of binary configuration I am missing here?

My test config: LLVM upstream from github, BinUtils upstream from git, MSVC2019, Windows 7.

$ cat trampoline.s
# Import trampoline
#
# Calls __delayLoadHelper2(pidd, ppfnIATEntry) and then jumps to the address
# the helper returns, with the incoming integer argument registers restored.
#
# Syntax: GNU as, AT&T operand order.
# ABI:    Microsoft x64 (rcx, rdx, r8, r9 carry the integer arguments).
# In:     rax = value forwarded as the helper's second argument
#               (ppfnIATEntry) -- presumably the address of the function's
#               IAT slot, loaded by the per-function stub; TODO confirm
#               against the stub's relocations.
#         rcx, rdx, r8, r9 = the original caller's arguments, kept intact.
# Out:    never returns here; control leaves through the final indirect jmp.
# Stack:  four pushes plus the 40-byte adjustment; the 40 bytes cover the
#         callee's shadow space (four 8-byte slots) plus 8 more bytes.
#         Presumably entry is with rsp % 16 == 8, which would make rsp
#         16-byte aligned at the call -- confirm.
# NOTE(review): xmm0-xmm3 are not saved around the call, so floating-point
#         arguments of the imported function may not survive the helper --
#         confirm whether that matters for the functions imported here.
        .section        .text
        .global __tailMerge_C__Users_marcusmae_dlltool_build_import_test_lib
__tailMerge_C__Users_marcusmae_dlltool_build_import_test_lib:
        # Save the integer argument registers; the helper call may clobber them.
        pushq %rcx
        pushq %rdx
        pushq %r8
        pushq %r9
        subq  $40, %rsp
        # Second helper argument (ppfnIATEntry) comes from rax.
        movq  %rax, %rdx
        # First helper argument (pidd) is the delay-import descriptor.
        leaq  __DELAY_IMPORT_DESCRIPTOR_C__Users_marcusmae_dlltool_build_import_test_lib(%rip), %rcx
        call __delayLoadHelper2
        addq  $40, %rsp
        # Restore the arguments in reverse push order.
        popq %r9
        popq %r8
        popq %rdx
        popq %rcx
        # Tail-jump to the address returned by the helper in rax.
        jmp *%rax

# DELAY_IMPORT_DESCRIPTOR
#
# Descriptor whose address the trampoline passes to __delayLoadHelper2 as
# its first argument (pidd). It consists of eight 32-bit fields; the .rva
# fields refer to the DLL name, the module-handle slot, the IAT and the INT
# symbols of this import library.
# NOTE(review): the descriptor is emitted into a .text subsection, not a
#         data section -- confirm this placement is intended.
.section        .text$2
.global __DELAY_IMPORT_DESCRIPTOR_C__Users_marcusmae_dlltool_build_import_test_lib
__DELAY_IMPORT_DESCRIPTOR_C__Users_marcusmae_dlltool_build_import_test_lib:
        # grAttrs = 1: presumably the flag marking the fields below as RVAs
        # (TODO confirm against the delay-load header).
        .long 1 # grAttrs
        .rva    __C__Users_marcusmae_dlltool_build_import_test_lib_iname        # rvaDLLName
        .rva    __DLL_HANDLE_C__Users_marcusmae_dlltool_build_import_test_lib   # rvaHmod
        .rva    __IAT_C__Users_marcusmae_dlltool_build_import_test_lib  # rvaIAT
        .rva    __INT_C__Users_marcusmae_dlltool_build_import_test_lib  # rvaINT
        # The remaining three fields are left at zero in this listing.
        .long   0       # rvaBoundIAT
        .long   0       # rvaUnloadIAT
        .long   0       # dwTimeStamp

# Module-handle slot referenced by the descriptor's rvaHmod field:
# eight zero-initialised bytes in .data.
.section .data
__DLL_HANDLE_C__Users_marcusmae_dlltool_build_import_test_lib:
        .long   0       # Handle
        .long   0

#Stuff for compatibility
# The __IAT_ and __INT_ labels below are the targets of the descriptor's
# rvaIAT and rvaINT fields. Each label is defined after eight zero bytes
# emitted into its .idata subsection.
        .section        .idata$5
        .long   0
        .long   0
__IAT_C__Users_marcusmae_dlltool_build_import_test_lib:
        .section        .idata$4
        .long   0
        .long   0
        .section        .idata$4
__INT_C__Users_marcusmae_dlltool_build_import_test_lib:
        # Switches to .idata$2; nothing is emitted there in this listing.
        .section        .idata$2
$ objdump -d dorks00000.o

dorks00000.o:     file format pe-x86-64


Disassembly of section .text:

0000000000000000 <foo>:
   0:   ff 25 00 00 00 00       jmpq   *0x0(%rip)        # 6 <foo+0x6>
   6:   48 8d 05 00 00 00 00    lea    0x0(%rip),%rax        # d <foo+0xd>
   d:   e9 00 00 00 00          jmpq   12 <foo+0x12>
        ...
1

There are 1 answers

8
mstorsjo On BEST ANSWER

So you are generating the delay import structures using GNU dlltool, but linking against it with LLD or MS link.exe?

I think the difference here lies in the fact that GNU dlltool places the addresses that are updated at runtime within .idata, and GNU ld normally links .idata as writable, while LLD and MS link.exe normally have read-only .idata (and place the addresses that will be updated at runtime by the delay loading mechanism in .data instead).

LLD happens to have a bit of extra code to take read-write .idata sections from GNU import libraries and merge them into the rest of LLD's read-only .idata - which makes normal GNU import libraries work, but unfortunately breaks using it together with the GNU dlltool delayimport libraries.

So with LLD, just use LLD's built-in delay import mechanism, by passing e.g. -delayload:user32.dll when linking. This works when using MSVC style import libraries, but unfortunately not when using GNU style import libraries (import libraries generated by GNU dlltool or GNU ld).