Convert const char* to UTF16 from C on macOS and Windows?

My attempts seem hacky and overly convoluted. Is there a simple way to convert ASCII to UTF16 on Windows and macOS?

(Note that I can't change the prUTF16Char typedef.)

Attempt (adapted from https://stackoverflow.com/a/54376330):

Prelude

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#if defined(__APPLE__) && defined(__MACH__)
#include <xcselect.h>
#include <wchar.h>
#include <CoreFoundation/CoreFoundation.h>
typedef unsigned short int prUTF16Char;
#else
typedef wchar_t prUTF16Char;
#endif

#define WIDEN2(x) L ## x
#define WIDEN(x) WIDEN2(x)
#define PROJECT_NAME "foo"

Functions

void copy2ConvertStringLiteralIntoUTF16(const wchar_t* inputString, prUTF16Char* destination) {
    size_t length = wcslen(inputString);
#if (defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)) && defined(PLUGIN_MODE)
    wcscpy_s(destination, length + 1, inputString);
#elif defined(__APPLE__) && defined(__MACH__)
    CFRange range = CFRangeMake(0, (CFIndex)length);
    CFStringRef inputStringCFSR = CFStringCreateWithBytes(
        kCFAllocatorDefault, reinterpret_cast<const UInt8 *>(inputString),
        length * sizeof(wchar_t), kCFStringEncodingUTF32LE, false);
    CFStringGetBytes(inputStringCFSR, range, kCFStringEncodingUTF16, 0, false,
                     reinterpret_cast<UInt8 *>(destination), length * sizeof(prUTF16Char), NULL);
    destination[length] = 0; // Set NULL-terminator
    CFRelease(inputStringCFSR);
#endif
}

const prUTF16Char * to_wchar(const char* message) {
    const size_t cSize = strlen(message);
    wchar_t *w_str = new wchar_t[cSize];
#if defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
    size_t outSize;
    mbstowcs_s(&outSize, w_str, cSize, message, cSize-1);
    return w_str;
#else
    mbstowcs(w_str, message, cSize);
#endif
#if defined(__APPLE__) && defined(__MACH__)
    prUTF16Char *ut16str = new prUTF16Char[cSize];
    copy2ConvertStringLiteralIntoUTF16(w_str, ut16str);
    return ut16str;
#else
    return w_str;
#endif
}

Then I can just define a global var:

static const prUTF16Char* PROJECT_NAME_W =
#if defined(__APPLE__) && defined(__MACH__)
    to_wchar
#elif defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
    WIDEN
#endif
        (PROJECT_NAME);

And the body of a generic print function taking message:

#if WCHAR_UTF16
    wprintf(L"%s\n",
#else
    printf("%ls\n",
#endif
            message);

Full attempt:

https://github.com/SamuelMarks/premiere-pro-cmake-plugin/blob/f0d2278/src/common/logger.cpp [rewriting from C++ to C]

Error:

error: initializer element is not a compile-time constant
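
This is the C rule that a variable with static storage duration must be initialised with a constant expression: the Windows branch compiles because WIDEN(PROJECT_NAME) expands to the string literal L"foo", but the macOS branch calls to_wchar at file scope, and a function call is not a compile-time constant in C.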


EDIT: Super hacky, but with @barmak-shemirani's solution I can:

#if defined(__APPLE__) && defined(__MACH__)
extern
#elif defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
static
#endif
const prUTF16Char* PROJECT_NAME_W
#if defined(__APPLE__) && defined(__MACH__)
    ;
#elif defined(_WIN32) || defined(__WIN32__) || defined(__WINDOWS__)
    WIDEN(PROJECT_NAME);
#endif

…and only initialise and free on the extern variant.
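
For example, the initialise/free pair for the extern variant could look like this (a sketch; the function names are illustrative, and it assumes to_wchar allocates with malloc as in the answer below):

#if defined(__APPLE__) && defined(__MACH__)
const prUTF16Char* PROJECT_NAME_W = NULL;

/* Call once at startup, before PROJECT_NAME_W is used. */
void project_name_w_init(void)
{
    PROJECT_NAME_W = to_wchar(PROJECT_NAME);
}

/* Call once at shutdown to release the buffer. */
void project_name_w_free(void)
{
    free((void*)PROJECT_NAME_W);
    PROJECT_NAME_W = NULL;
}
#endif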

1 Answer

Answer from Barmak Shemirani:

message includes the null terminating character, but strlen does not count that last character, so cSize has to increase by 1.

Usually you need to call setlocale if, for example, message was typed on a non-English computer. But it's okay if message is guaranteed to be ASCII.
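
A minimal sketch of that setup (only needed if message may contain non-ASCII multibyte text):

#include <locale.h>

int main(void)
{
    /* Use the environment's locale so mbstowcs can decode
       non-ASCII multibyte input; call once at startup. */
    setlocale(LC_ALL, "");
    /* ... conversions via mbstowcs happen after this ... */
    return 0;
}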

Windows Example:

const wchar_t* to_wchar(const char* message)
{
    const size_t cSize = strlen(message) + 1; // + 1 for the null terminator
    //wchar_t* w_str = new wchar_t[cSize]; using C++?
    wchar_t* w_str = malloc(cSize * sizeof(wchar_t));

    mbstowcs(w_str, message, cSize);
    // or:
    // size_t outSize;
    // mbstowcs_s(&outSize, w_str, cSize, message, cSize);

    return w_str;
}
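
A usage sketch (assuming the to_wchar above is visible in the same file; the caller owns the buffer and must free it):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

int main(void)
{
    const wchar_t* w = to_wchar("hello"); /* to_wchar as defined above */
    wprintf(L"%ls\n", w);                 /* %ls portably prints a wide string */
    free((void*)w);                       /* release the malloc'd buffer */
    return 0;
}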

Note that wchar_t is 2 bytes on Windows and 4 bytes on POSIX systems. UTF-16 also comes in two variants, little-endian and big-endian. UTF-16 uses 2 bytes per code point for ASCII-equivalent characters, but 4 bytes (a surrogate pair) for some characters outside the Basic Multilingual Plane.
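
If code relies on those sizes, a compile-time check can catch a mismatch early. A sketch using C11's _Static_assert (it assumes the typical 2-byte/4-byte split, which is common but not guaranteed by the standard):

#include <wchar.h>

#ifdef _WIN32
_Static_assert(sizeof(wchar_t) == 2, "expected 2-byte wchar_t on Windows");
#else
_Static_assert(sizeof(wchar_t) == 4, "expected 4-byte wchar_t on this platform");
#endif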

You should consider UTF-8 output instead. Most Windows programs are prepared to read UTF-8 from a file or the network.
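
For example, on Windows a wide string can be converted to UTF-8 with the Win32 API; a minimal sketch (the function name to_utf8 is illustrative, and error handling is omitted):

#include <windows.h>
#include <stdlib.h>

/* Convert a null-terminated wide string to a malloc'd UTF-8 string. */
char* to_utf8(const wchar_t* w)
{
    /* First call computes the required size, including the terminator. */
    int n = WideCharToMultiByte(CP_UTF8, 0, w, -1, NULL, 0, NULL, NULL);
    char* s = malloc(n);
    if (s)
        WideCharToMultiByte(CP_UTF8, 0, w, -1, s, n, NULL, NULL);
    return s;
}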

Windows byte output for "123" (byte values shown in decimal):

49 00 50 00 51 00 00 00 <- little-endian
00 49 00 50 00 51 00 00 <- big-endian

Linux byte output from the above code (this won't be recognized as UTF-16 by Windows):

49 00 00 00 50 00 00 00 51 00 00 00 00 00 00 00 

You can write your own function if you are 100% certain that the message is ASCII:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
 
typedef unsigned short prUTF16Char;//remove this line later

prUTF16Char* to_wchar(const char* message)
{
    if (!message) return NULL;

    size_t len = strlen(message);
    size_t bufsize = (len + 1) * 2; /* 2 bytes per UTF-16 code unit */
    char* buf = malloc(bufsize);
    if (!buf) return NULL;

    int little_endian = 1;
    little_endian = ((char*)&little_endian)[0]; /* 1 on little-endian hosts */
    memset(buf, 0, bufsize);
    for (size_t i = 0; i < len; i++)
        buf[i * 2 + (little_endian ? 0 : 1)] = message[i]; /* ASCII byte in the low-order position */

    return (prUTF16Char*)buf;
}

prUTF16Char* wstr;
int main()
{
    wstr = to_wchar("ASCII");
    wprintf(L"%s\n", (wchar_t*)wstr); /* valid where wchar_t is 2 bytes (Windows) */
    free(wstr);
    return 0;
}