Collation with ICU doesn't match POSIX collation

320 views Asked by At

I am trying to implement cross-platform, locale-aware sorting with the ICU library, but the results don't match POSIX locale-aware sorting.

For example, here is a simple C program using ICU:

#include <stdio.h>
#include <unicode/ustring.h>
#include <unicode/ucol.h>


/*
 * Compare "a" and "A" with an ICU collator for each locale in a fixed list
 * and print the pair in collated order, one line per locale.
 *
 * Returns 0 on completion, or 1 if a UTF-8 to UTF-16 conversion fails.
 * A locale whose collator cannot be opened is reported and skipped.
 */
int main(void) {
    /* Keep the UTF-8 originals for output: printf("%s") expects char
     * strings, and passing a UChar (UTF-16) buffer to it is undefined. */
    const char *lower = "a";
    const char *upper = "A";

    UErrorCode status = U_ZERO_ERROR;
    UChar u_a[2], u_A[2];

    u_strFromUTF8(u_a, 2, NULL, lower, -1, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "failure: %s\n", u_errorName(status));
        return 1;
    }

    u_strFromUTF8(u_A, 2, NULL, upper, -1, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "failure: %s\n", u_errorName(status));
        return 1;
    }

    const char *localeList[] = {"en_US", "en_CA"};
    const size_t localeListLen = sizeof(localeList) / sizeof(localeList[0]);

    for (size_t i = 0; i < localeListLen; i++) {
        char localeId[64];
        int written = snprintf(localeId, sizeof localeId,
                               "%s@collation=posix", localeList[i]);
        if (written < 0 || (size_t)written >= sizeof localeId) {
            fprintf(stderr, "failure: locale id too long for %s\n",
                    localeList[i]);
            continue;
        }

        /* ICU functions return immediately when status already holds a
         * failure, so start every iteration from a clean state. */
        status = U_ZERO_ERROR;
        UCollator *collator = ucol_open(localeId, &status);
        if (U_FAILURE(status)) {
            fprintf(stderr, "failure: %s\n", u_errorName(status));
            continue;
        }

        if (status != U_ZERO_ERROR) {
            /* Success with a warning code (for example a fallback or
             * default warning): the collator may not be the one that
             * was requested, which matters when comparing results. */
            fprintf(stderr, "%s: warning: %s\n",
                    localeId, u_errorName(status));
        }

        UCollationResult result = ucol_strcoll(collator, u_a, -1, u_A, -1);

        printf("%s: ", localeList[i]);
        if (result == UCOL_LESS) {
            printf("%s %s\n", lower, upper);
        }
        else {
            printf("%s %s\n", upper, lower);
        }

        ucol_close(collator);
    }

    return 0;
}

It produces:

> gcc `pkg-config --libs --cflags icu-uc icu-io` -o ucol_tst ucol_tst.c && ./ucol_tst
en_US: a A
en_CA: a A

And here is a simple Python script that uses the POSIX locale:

#!/usr/bin/env python3

import locale


if __name__ == '__main__':
    # Sort the same pair of letters under each locale's collation rules
    # and print one result line per locale.
    letters = ('a', 'A')

    for locale_id in ('en_US', 'en_CA'):
        # Switch the process-wide locale before computing sort keys.
        locale.setlocale(locale.LC_ALL, locale_id)
        ordered = sorted(letters, key=locale.strxfrm)
        print("{}: {}".format(locale_id, ordered))

It produces:

> python3 col_tst.py 
en_US: ['a', 'A']
en_CA: ['A', 'a']

Why doesn't the sorting result for the en_CA locale match between ICU and POSIX? I suspect I am using the ICU library incorrectly; can anyone show the correct usage?

0

There are 0 answers