having problems inserting words on a patricia/radix tree

196 views Asked by At

i am trying to make a code for a college aplication of a patricia/radix tree that inserts words read from each line in a txt file in the tree, so if i read a file that says

roman
romance
romantic

its suposed to be implemented like

    (root)
      |
    roman$
   /     \
 ce$      tic$

ive also added checkpoints so i could test how far the program gets before dying so my struct is set like this:

typedef struct no no; // declaration of the struct no

typedef struct no {
    char *str; // pointer to store a string
    struct no *child[26]; // Array of pointers to all possible children of the tree
    int isword; // Flag to know if its a full word
    int isroot; // Flag to know if its the root of the tree
} tree;

and my insertintree declaration:

no* InsertWord(no *r, char *str){
    printf("checkpoint 5 %s\n", str);
    //caso 1, we are at the root
    if(r->isroot){
        r->child[r->str[0] - 'a'] = InsertWord(r->str[r->str[0] - 'a'], str);
        return r;
    }
    //caso 2, found an empty node
    if(r->str == NULL){
        strcpy(r->str, str);
        r->isword = 1;
        printf("word %s inserted\n", str);
        return r;
    }
    //caso 3, word is a perfect fit for the prefix
    if(strcmp(r->str, str)){
        r->isword = 1;
        printf("word %s inserted\n", str);
        return r;
    }
    int i;
    for(i = 0; str[i] == r->str[i]; i++);
    //caso 4, the prefix fits the first part of the word, but there is still more to be inserted
    if(i == strlen(r->str)){
        r->child[str[i] - 'a'] = InsertWord(r->child[str[i] - 'a'], str + i);
        return r;
    }
    //caso 5, the word fits the prefix, but there is still more prefix
    if(i == strlen(str)){
        char *aux1;
        char *aux2;
        strncpy(aux1, r->str, i);
        strcpy(aux2, r->str + i);
        no *aux = r;
        for(int j = 0; j < 26; j++){
            free(r->filhos[j]);
        }
        r->child[aux2[0] - 'a'] = aux;
        strcpy(r->str, aux1);
        strcpy(r->child[aux2[0] - 'a'], aux2);
        return r;
    }
    //caso 6, the first part of the word fits with the first part of the preix, but then they differ
    if(i < strlen(str) && i < strlen(r->str)){
        char *aux1;
        char *aux2;
        strncpy(aux1, r->str, i);
        strcpy(aux2, r->str + i);
        no *aux = r;
        for(int j = 0; j < 26; j++){
            free(r->child[j]);
        }
        r->child[aux2[0] - 'a'] = aux;
        strcpy(r->str, aux1);
        strcpy(r->child[aux2[0] - 'a'], aux2);
        r->child[str[i] - 'a'] = InsertWord(r->child[str[i] - 'a'], str + i);
        return r;
    }
}

and my current output looks something like this

checkpoint 1
checkpoint 2
checkpoint 3 roman
checkpoint 4 roman
checkpoint 5
checkpoint 5

and then the program dies, the checkpoints 1 through 4 are all related to the functions that read the file and validate the word, and they all seem to run fine

i tried adding a dynamic allocation to the program, didnt seem to help with the problem, i am aware i should provide a minimal reproduceable example but im not really sure as to where the problem lies (the code was not originally in english i tried my best to provide a translation but if any part is still in portuguese and i didnt notice im sorry)

since it was requested of me i will add the rest of the functions that call this function, its a bit lenghty but here you go

no *ValidateWord(no *r, char *str) {
    int i;
    printf("checkpoint 3 %s\n", str);
    for (i = 0; str[i] != '\0'; i++) {
        if (str[i] < 'A' || (str[i] > 'Z' && str[i] < 'a') || str[i] > 'z') {
            return r;
        } else if (str[i] >= 'A' && str[i] <= 'Z') {
            str[i] = str[i] + 32;
        }
    }
    printf("checkpoint 4 %s\n", str);
    return InsertWord(r, str);
}

no *LoadFile(no *r) {
    printf("checkpoint 1");
    printf("Insert the name of the chosen file: ");
    char namefile[100];
    scanf("%s", namefile);
    FILE *fp1;
    fp1 = fopen(namefile, "r");
    if (fp1 == NULL) {
        printf("couldnt open the file.\n");
        return r;
    }
    char str[100];
    while (fgets(str, sizeof(str), fp1)) {
        printf("checkpoint 2 %s\n", str);
        str[strcspn(str, "\n")] = '\0';
        r = ValidateWord(r, str);
    }
    fclose(fp1);
    return r;
}

this is the main file that calls all of those

void menu(no* r) {
    int c = 0;
    printf("choose an operation:\n");
    printf("Load File - 1\nCheck Words - 2\nPrint Dictionary - 3\nLoad Stopwords File - 4\nExit - 0\n\n");
    scanf("%d", &c);
    while (c != 0) {
        if (c == 1)
            r = LoadFile(r); // Calls the LoadFile function
        else if (c == 2)
            CheckWords(r); // calls the function CheckWords
        else if (c == 3)
            PrintDictionary(r); // Calls the fucntion PrintDictionary
        else if (c == 4)
            r = LoadStopwordsFile(r); // calls the LoadStopwordsFile function
        printf("choose an operation:\n");
    printf("Load File - 1\nCheck Words - 2\nPrint Dictionary - 3\nLoad Stopwords File - 4\nExit - 0\n\n");
        scanf("%d", &c);
    }
}

int main() {
    no* r = (no*) malloc(sizeof(no)); // alocates memory for the root
    if (r == NULL) {
        printf("Error alocating memory.\n");
        return 1; // closes the aplication
    }
    r->isroot = 1; // defines the isroot flag to true
    menu(&r); // calls the menu function
    free(r); // frees the memory alocated for the root
    return 0;
}
1

There are 1 answers

9
Neil On

A Patrica tree is a special case of an implicit binary radix tree, which is more compact than what you have here, (to be pedantic, for sure.)

I'm not sure you need isroot; what is it used for? The empty tree?

isword is an out-of-band symbol for $; you do not need (or want) two symbols for the same thing. (A natural trie entry for strings which are null-terminated is to use '\0' as a terminal suffix $.)

Morin, Patrick. "Data Structures for Strings"

As an optimization, edge labels can be stored in constant size by using two pointers to a string (for the first and last elements)

Instead of duplicating and manipulating the string, another option is to not copy the string, but to just point at the substring.

Edit: In the Java code in the tutorial which you follow, it uses an out-of-band isword and copies the substring. It seems like you are allocating nodes, but not the copy of strings, which Java handles automatically.

I think the following is a good analog: the substring is stored in a flexible array member at the end of the struct; this way, a node and a copy of the string are in a contiguous memory allocation that reduces fragmentation and calls to malloc.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>

// A flexible array member for the substring.
struct node {
    struct node *child[26];
    int isword;
    char str[];
};
struct trie { struct node *root; };

// graph helper
static void graph_logic(const struct node *const node,
    const struct node *const parent, FILE *const fp) {
    fprintf(fp,
        "\tnode%p [label = \"%s\"];\n"
        "\tnode%p -> node%p [label=\"%s\"];\n",
        (const void *)node, node->isword ? "word" : "not",
        (const void *)parent, (const void *)node, node->str);
    for(struct node *const*i = node->child,
        *const*end = i + sizeof node->child / sizeof *node->child;
        i < end; i++) if(*i) graph_logic(*i, node, fp);
}
// graphs a trie in GraphViz format: https://graphviz.org/
static void graph(const struct trie *const trie, const char *const fn) {
    FILE *fp;
    assert(trie && fn);
    if(!(fp = fopen(fn, "w"))) { perror(fn); return; }
    fprintf(fp, "digraph {\n"
        "\tgraph [truecolor=true, bgcolor=transparent, fontname=modern];\n"
        "\tnode [shape=none, fontname=modern];\n"
        "\tnode%p [label=\"root\"];\n", (void *)0);
    if(trie->root) graph_logic(trie->root, 0, fp);
    fprintf(fp, "}\n");
    fclose(fp);
}

static int InsertWord(struct trie *t, char *str){
    // The tree is empty.
    if(t->root == NULL){
        struct node *r;
        const size_t len = strlen(str);
        if(!(r = malloc(sizeof t->root + len + 1))) return 0; // FAM
        *r = (struct node){0};
        r->isword = 1;
        memcpy(r->str, str, len + 1);
        t->root = r;
        printf("word %s inserted as the first word\n", str);
        return 1;
    }
    assert(0); // continue this function . . .
    /* (For each edge.)
        (For each character, check that it's a match for the edge, if it's
        not, create two nodes from the split. Free the original.)
        (Or if it's the empty string, the word is already there.)
        (May be helpful to port `getFirstMismatchLetter`.) */
    return 0;
}
static int ValidateWord(struct trie *t, char *str) {
    int i;
    assert(t && str);
    printf("ValidateWord checkpoint: %s\n", str);
    // simplified
    for (i = 0; str[i] != '\0'; i++) if(!islower(str[i])) return 0;
    return InsertWord(t, str);
}

// this inputs https://en.wikipedia.org/wiki/Radix_tree
static int test(const char *const fn) {
    FILE *fp = 0;
    int success = 1;
    if(!(fp = fopen(fn, "w"))) goto catch;
    fprintf(fp,
        "romane\n"
        "romanus\n"
        "romulus\n"
        "rubens\n"
        "ruber\n"
        "rubicon\n"
        "rubicundus\n");
    fclose(fp), fp = 0;
    if(!freopen(fn, "r", stdin)) goto catch;
    goto finally;
catch:
    success = 0;
    perror(fn);
finally:
    if(fp) fclose(fp), fp = 0;
    return success;
}

int main(void) {
    char str[100];
    struct trie t = {0};
    struct { int no; char fn[100]; } debug = { 0, { 0 } };

    // ideally, tests should be automatic
    if(!test("trie.txt")) exit(EXIT_FAILURE);

    while(fgets(str, sizeof(str), stdin)) {
        str[strcspn(str, "\n")] = '\0';
        // Print a graph so we can see what's going on.
        sprintf(debug.fn, "trie%d.gv", debug.no++), graph(&t, debug.fn);
        int is_insert = ValidateWord(&t, str);
        assert(is_insert);
    }
    // memory leak: all the nodes need to be freed from the bottom.
    free(t.root);
    return 0;
}

With one word.