I pass a Unicode symbol to the program through a command-line argument.

$ ./program ●

The program should print the code point of this symbol.

#include <stdio.h>  

int main(int argc, char *argv[])
{
    wchar_t glyph;

    /* BUG (this is the question's point): argv[1] is a multibyte
       (UTF-8) string of chars, NOT an array of wchar_t.  This cast
       reinterprets the raw bytes "e2 97 8f 00" as one little-endian
       wchar_t, yielding 0x008f97e2 = 9410530 instead of the code
       point 0x25cf = 9679.  It is also a strict-aliasing violation
       and may read past the end of the argument string. */
    glyph = *((wchar_t *) argv[1]);
    printf("%u\n", glyph);
}

The code of the ● symbol is 9679 (hex 25cf), but the program prints 9410530. The length of the argv[1] argument is 3 bytes, not 4 (32 bits for a Unicode symbol): it contains the UTF-8 bytes e2 97 8f followed by \0, which reinterpreted as a little-endian 32-bit integer is 0x008f97e2 = 9410530. How do I get the symbol's code the correct way?

1 Answer

0
Kirill Bugaev On

A solution using mbstowcs() to convert the UTF-8 encoded character from the multibyte character string to a wide character:

#include <stdio.h>
#include <stdlib.h>
#include <locale.h>

int main(int argc, char *argv[])
{
    wchar_t u;

    /* Without this check, a missing argument makes argv[1] NULL and
       passing it to mbstowcs() is undefined behavior. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <character>\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    /* Set locale according to the environment variables so mbstowcs()
       knows the multibyte encoding in use (e.g. UTF-8). */
    if (setlocale(LC_ALL, "") == NULL) {
        perror("setlocale");
        exit(EXIT_FAILURE);
    }

    /* Convert the first multibyte character in argv[1] to a wide
       character; (size_t)-1 signals an invalid multibyte sequence. */
    if (mbstowcs(&u, argv[1], 1) == (size_t) -1) {
        perror("mbstowcs");
        exit(EXIT_FAILURE);
    }

    /* wchar_t may be a signed type; cast so the argument matches the
       %u conversion specifier exactly (mismatch is UB). */
    printf("%u\n", (unsigned) u);

    return 0;
}

Another solution decodes the UTF-8 character manually. The code is imported from st (the suckless terminal emulator).

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

/* Unicode replacement character, substituted for invalid sequences. */
#define UTF_INVALID   0xFFFD
/* Maximum length of a UTF-8 sequence in bytes. */
#define UTF_SIZ       4

typedef unsigned char uchar;
typedef uint_least32_t Rune;

#define LEN(a)          (sizeof(a) / sizeof(a)[0])
#define BETWEEN(x, a, b)    ((a) <= (x) && (x) <= (b))

/* Index 0 describes continuation bytes (10xxxxxx); indices 1..4 describe
   the leading byte of a 1..4-byte sequence.  A byte belongs to class i
   when (byte & utfmask[i]) == utfbyte[i]. */
static uchar utfbyte[UTF_SIZ + 1] = {0x80,    0, 0xC0, 0xE0, 0xF0};
static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8};
/* Smallest / largest code point encodable in i bytes — used to reject
   overlong encodings and out-of-range values. */
static Rune utfmin[UTF_SIZ + 1] = {       0,    0,  0x80,  0x800,  0x10000};
static Rune utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF};

Rune
utf8decodebyte(char c, size_t *i)
{
    /* Classify one byte: try each UTF-8 prefix pattern in turn,
       report the matching class through *i, and return the payload
       bits with the prefix stripped. */
    uchar b = (uchar)c;

    for (*i = 0; *i < LEN(utfmask); ++(*i)) {
        if ((b & utfmask[*i]) == utfbyte[*i])
            return b & ~utfmask[*i];
    }

    /* No pattern matched (*i == LEN(utfmask) signals this). */
    return 0;
}

size_t
utf8validate(Rune *u, size_t i)
{
    /* Replace overlong encodings (value outside the range an i-byte
       sequence may encode) and UTF-16 surrogates with U+FFFD. */
    int overlong = !BETWEEN(*u, utfmin[i], utfmax[i]);
    int surrogate = BETWEEN(*u, 0xD800, 0xDFFF);

    if (overlong || surrogate)
        *u = UTF_INVALID;

    /* Return the minimal UTF-8 length of the (possibly replaced)
       code point. */
    i = 1;
    while (*u > utfmax[i])
        ++i;

    return i;
}

/* Decode the first UTF-8 sequence of the clen bytes at c into *u.
   Returns the number of bytes consumed; 0 means "need more input"
   (empty or truncated sequence), 1..UTF_SIZ is a complete (possibly
   invalid, then *u == UTF_INVALID) sequence.  *u is UTF_INVALID unless
   a full sequence was decoded. */
size_t
utf8decode(const char *c, Rune *u, size_t clen)
{
    size_t i, j, len, type;
    Rune udecoded;

    *u = UTF_INVALID;
    if (!clen)
        return 0;
    /* Leading byte: payload bits in udecoded, sequence length in len. */
    udecoded = utf8decodebyte(c[0], &len);
    if (!BETWEEN(len, 1, UTF_SIZ))
        return 1;   /* invalid leading byte: consume it and resync */
    /* Accumulate 6 payload bits from each continuation byte. */
    for (i = 1, j = 1; i < clen && j < len; ++i, ++j) {
        udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type);
        if (type != 0)
            return j;   /* not a continuation byte: stop early */
    }
    if (j < len)
        return 0;   /* sequence truncated by clen: need more bytes */
    *u = udecoded;
    /* Clamp overlong/surrogate/out-of-range values to UTF_INVALID. */
    utf8validate(u, len);

    return len;
}


int main(int argc, char *argv[])
{
    Rune u;

    /* Without this check, a missing argument would pass NULL to
       utf8decode(), which dereferences it. */
    if (argc < 2) {
        fprintf(stderr, "usage: %s <character>\n", argv[0]);
        return EXIT_FAILURE;
    }

    utf8decode(argv[1], &u, UTF_SIZ);

    /* Rune is uint_least32_t, which need not be unsigned int; the cast
       keeps the %u conversion well-defined (valid code points fit). */
    printf("%u\n", (unsigned) u);

    return 0;
}