How to use gperf to create hash for a range of values?

1.3k views Asked by At

I have range of hex numbers like these

0xabcd****
0xabdc**89
0x****abcd
0xde****ab
# 50 or so more entries like these
# where * is any hex number

I need a hash a function that will accept a 4byte value and generate Y/N answer for membership.

I tried using gperf but unfortunately it doesn't interpret * as wildcard. Has anyone faced this problem before? My code is in C.

2

There are 2 answers

2
sjnarv On BEST ANSWER

If I can trust my arithmetic, each **** has 16^4 possible values, so the four wild-card specs enumerate 3 * 16^4 + 16^2 values - something on the order of 200,000 - a bit out of the remit of gperf (its docs say a "large" key set is 15,000).

Wildcards mean regular expressions for me, so why not try that? Here's a try that defines "4byte value" as a uint32_t, and presents a text encoding of such a value to the regex(3) machinery. This may well not be what you were looking for, but as it was fun to cobble up, here you go.

#include <sys/types.h>
#include <regex.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

static regex_t the_re;

int
matcher_init(void)
{
    static const char the_re_string[] = "(abcd....|abdc..89|....abcd|de....ab|"
                                         /* OP's 4, plus 46 more... */
                                         "..d80a..|..7bc5..|c6..514a|..b7585a|"
                                         "4732ecc4|7c22e4da|5a5e..63|....e866|"
                                         "..fdc367|ac....b4|70249edc|..e97e32|"
                                         "....94d8|....fa6c|4591..ff|..e4..67|"
                                         "aab285..|....f81b|15bb22ba|3cf4....|"
                                         "57d3ad86|..bd..1e|..ec67b7|..693aaf|"
                                         "323c..18|cab237cb|d4b2c6b4|2a15..2f|"
                                         "....d196|..5e..10|....b1f1|b54e9838|"
                                         "..0cf1..|5c1a..fb|....f34d|19..d34c|"
                                         "..cacb48|..4c2d09|48..bc..|f98cc7..|"
                                         "ac..2b1a|..beb5..|98..03..|..61c35e|"
                                         "....1245|61..5ca8)";
    int res;

    if ((res = regcomp(&the_re, the_re_string, REG_EXTENDED|REG_NOSUB)) != 0) {
        char ebuf[256];

        (void) regerror(res, &the_re, ebuf, sizeof ebuf);
        (void) fprintf(stderr, "regcomp failed: %s\n", ebuf);

        return -1;
    }

    return 0;
}

int
matcher_matches(uint32_t u)
{
    char ubuf[9];

    (void) sprintf(ubuf, "%08x", u);

    return regexec(&the_re, ubuf, 0, 0, 0) == 0;
}

int
main(void)
{
    int i;
    unsigned tf, iterations, matches;
    time_t start;

    uint32_t tvals[] = {
                0xabcd0000, 0xabdc0089, 0x0000abcd, 0xde0000ab,
                0x00d80a00, 0x007bc500, 0xc600514a, 0x00b7585a,
                0x4732ecc4, 0x7c22e4da, 0x5a5e0063, 0x0000e866,
                0x00fdc367, 0xac0000b4, 0x70249edc, 0x00e97e32,
                0x000094d8, 0x0000fa6c, 0x459100ff, 0x00e40067,
                0xaab28500, 0x0000f81b, 0x15bb22ba, 0x3cf40000,
                0x57d3ad86, 0x00bd001e, 0x00ec67b7, 0x00693aaf,
                0x323c0018, 0xcab237cb, 0xd4b2c6b4, 0x2a15002f,
                0x0000d196, 0x005e0010, 0x0000b1f1, 0xb54e9838,
                0x000cf100, 0x5c1a00fb, 0x0000f34d, 0x1900d34c,
                0x00cacb48, 0x004c2d09, 0x4800bc00, 0xf98cc700,
                0xac002b1a, 0x00beb500, 0x98000300, 0x0061c35e,
                0x00001245, 0x61005ca8 };

    if (matcher_init() == -1) {
        return 1;
    }

    /* test known values */

    tf = 0;

    for (i = 0; i < sizeof(tvals) / sizeof(tvals[0]); i++) {
        if (!matcher_matches(tvals[i])) {
            (void) printf("0x%08x should match; didn't...\n", tvals[i]);
            tf = 1;
        }
    }

    if (tf) {
        return 1;
    }

    /* some random probes */

    srand((time(0) << 16) | (getpid() & 0xFFFF));

    iterations = matches = 0;

    (void) time(&start);

    for (i = 0; i < 1000000; i++) {
        uint32_t u = (uint32_t) ((rand() << 16) | (rand() & 0xFFFF));

        /* printf("Test: 0x%08x\n", u); */

        if (matcher_matches(u)) {
            (void) printf("Match: %08x\n", u);
            (void) fflush(stdout);
            matches++;
        }

        iterations++;
    }

    printf("iterations: %d; matches: %d (%u seconds)\n",
                        iterations, matches,
                        (unsigned) (time(0) - start));

    return 0;
}

The responses to this reminded me of the question itself, and on reflection a more straightforward way occurred to me. Why the better answers don't just occur first, I'll never know.

Anyway, don't use regular expressions, just a value and a mask. The code above loses its matcher_init call (along with everything regex-related), and the matcher_matches call support could look like the following. The looked-for values match the additional 46 I produced for the first answer, so the same test code in the first answer's main will continue to work.

I suppose you could sort the struct vm array so that entries with fewer bits set in the mask occur first and buy a modest performance gain, but even as is this second attempt is about five times faster than the regex-based one for me.

static struct {
    uint32_t val, mask;
} vm [] = {
    { 0xabcd0000, 0xffff0000 },
    { 0xabdc0089, 0xffff00ff },
    { 0x0000abcd, 0x0000ffff },
    { 0xde0000ab, 0xff0000ff },
    { 0x00d80a00, 0x00ffff00 },
    { 0x007bc500, 0x00ffff00 },
    { 0xc600514a, 0xff00ffff },
    { 0x00b7585a, 0x00ffffff },
    { 0x4732ecc4, 0xffffffff },
    { 0x7c22e4da, 0xffffffff },
    { 0x5a5e0063, 0xffff00ff },
    { 0x0000e866, 0x0000ffff },
    { 0x00fdc367, 0x00ffffff },
    { 0xac0000b4, 0xff0000ff },
    { 0x70249edc, 0xffffffff },
    { 0x00e97e32, 0x00ffffff },
    { 0x000094d8, 0x0000ffff },
    { 0x0000fa6c, 0x0000ffff },
    { 0x459100ff, 0xffff00ff },
    { 0x00e40067, 0x00ff00ff },
    { 0xaab28500, 0xffffff00 },
    { 0x0000f81b, 0x0000ffff },
    { 0x15bb22ba, 0xffffffff },
    { 0x3cf40000, 0xffff0000 },
    { 0x57d3ad86, 0xffffffff },
    { 0x00bd001e, 0x00ff00ff },
    { 0x00ec67b7, 0x00ffffff },
    { 0x00693aaf, 0x00ffffff },
    { 0x323c0018, 0xffff00ff },
    { 0xcab237cb, 0xffffffff },
    { 0xd4b2c6b4, 0xffffffff },
    { 0x2a15002f, 0xffff00ff },
    { 0x0000d196, 0x0000ffff },
    { 0x005e0010, 0x00ff00ff },
    { 0x0000b1f1, 0x0000ffff },
    { 0xb54e9838, 0xffffffff },
    { 0x000cf100, 0x00ffff00 },
    { 0x5c1a00fb, 0xffff00ff },
    { 0x0000f34d, 0x0000ffff },
    { 0x1900d34c, 0xff00ffff },
    { 0x00cacb48, 0x00ffffff },
    { 0x004c2d09, 0x00ffffff },
    { 0x4800bc00, 0xff00ff00 },
    { 0xf98cc700, 0xffffff00 },
    { 0xac002b1a, 0xff00ffff },
    { 0x00beb500, 0x00ffff00 },
    { 0x98000300, 0xff00ff00 },
    { 0x0061c35e, 0x00ffffff },
    { 0x00001245, 0x0000ffff },
    { 0x61005ca8, 0xff00ffff }
};

int
matcher_matches(uint32_t u)
{
    size_t i;

    for (i = 0; i < sizeof(vm) / sizeof(vm[0]); i++) {
        if ((u & vm[i].mask) == vm[i].val) {
            return 1;
        }
    }
    return 0;
}

Wildcards are now the zeros in the mask field of the struct, with corresponding bits "don't care" values (set to zero) in the val field.

0
jxh On

Since you are unwilling to enumerate the values for gperf (and it seems gperf cannot handle that many inputs anyway), then you cannot use gperf for your task, So, the answer to your question is you cannot use gperf to create your hash.

My recommendation would be to forget about perfect hashing (you do not describe any requirement for perfect hashing other than your desire to use gperf). The values themselves are well distributed to use as a hash value as is.