i am trying to make a code for a college aplication of a patricia/radix tree that inserts words read from each line in a txt file in the tree, so if i read a file that says
roman
romance
romantic
its suposed to be implemented like
(root)
|
roman$
/ \
ce$ tic$
ive also added checkpoints so i could test how far the program gets before dying so my struct is set like this:
typedef struct no no; // declaration of the struct no
typedef struct no {
char *str; // pointer to store a string
struct no *child[26]; // Array of pointers to all possible children of the tree
int isword; // Flag to know if its a full word
int isroot; // Flag to know if its the root of the tree
} tree;
and my insertintree declaration:
no* InsertWord(no *r, char *str){
printf("checkpoint 5 %s\n", str);
//caso 1, we are at the root
if(r->isroot){
r->child[r->str[0] - 'a'] = InsertWord(r->str[r->str[0] - 'a'], str);
return r;
}
//caso 2, found an empty node
if(r->str == NULL){
strcpy(r->str, str);
r->isword = 1;
printf("word %s inserted\n", str);
return r;
}
//caso 3, word is a perfect fit for the prefix
if(strcmp(r->str, str)){
r->isword = 1;
printf("word %s inserted\n", str);
return r;
}
int i;
for(i = 0; str[i] == r->str[i]; i++);
//caso 4, the prefix fits the first part of the word, but there is still more to be inserted
if(i == strlen(r->str)){
r->child[str[i] - 'a'] = InsertWord(r->child[str[i] - 'a'], str + i);
return r;
}
//caso 5, the word fits the prefix, but there is still more prefix
if(i == strlen(str)){
char *aux1;
char *aux2;
strncpy(aux1, r->str, i);
strcpy(aux2, r->str + i);
no *aux = r;
for(int j = 0; j < 26; j++){
free(r->filhos[j]);
}
r->child[aux2[0] - 'a'] = aux;
strcpy(r->str, aux1);
strcpy(r->child[aux2[0] - 'a'], aux2);
return r;
}
//caso 6, the first part of the word fits with the first part of the preix, but then they differ
if(i < strlen(str) && i < strlen(r->str)){
char *aux1;
char *aux2;
strncpy(aux1, r->str, i);
strcpy(aux2, r->str + i);
no *aux = r;
for(int j = 0; j < 26; j++){
free(r->child[j]);
}
r->child[aux2[0] - 'a'] = aux;
strcpy(r->str, aux1);
strcpy(r->child[aux2[0] - 'a'], aux2);
r->child[str[i] - 'a'] = InsertWord(r->child[str[i] - 'a'], str + i);
return r;
}
}
and my current output looks something like this
checkpoint 1
checkpoint 2
checkpoint 3 roman
checkpoint 4 roman
checkpoint 5
checkpoint 5
and then the program dies, the checkpoints 1 through 4 are all related to the functions that read the file and validate the word, and they all seem to run fine
i tried adding a dynamic allocation to the program, didnt seem to help with the problem, i am aware i should provide a minimal reproduceable example but im not really sure as to where the problem lies (the code was not originally in english i tried my best to provide a translation but if any part is still in portuguese and i didnt notice im sorry)
since it was requested of me i will add the rest of the functions that call this function, its a bit lenghty but here you go
no *ValidateWord(no *r, char *str) {
int i;
printf("checkpoint 3 %s\n", str);
for (i = 0; str[i] != '\0'; i++) {
if (str[i] < 'A' || (str[i] > 'Z' && str[i] < 'a') || str[i] > 'z') {
return r;
} else if (str[i] >= 'A' && str[i] <= 'Z') {
str[i] = str[i] + 32;
}
}
printf("checkpoint 4 %s\n", str);
return InsertWord(r, str);
}
no *LoadFile(no *r) {
printf("checkpoint 1");
printf("Insert the name of the chosen file: ");
char namefile[100];
scanf("%s", namefile);
FILE *fp1;
fp1 = fopen(namefile, "r");
if (fp1 == NULL) {
printf("couldnt open the file.\n");
return r;
}
char str[100];
while (fgets(str, sizeof(str), fp1)) {
printf("checkpoint 2 %s\n", str);
str[strcspn(str, "\n")] = '\0';
r = ValidateWord(r, str);
}
fclose(fp1);
return r;
}
this is the main file that calls all of those
void menu(no* r) {
int c = 0;
printf("choose an operation:\n");
printf("Load File - 1\nCheck Words - 2\nPrint Dictionary - 3\nLoad Stopwords File - 4\nExit - 0\n\n");
scanf("%d", &c);
while (c != 0) {
if (c == 1)
r = LoadFile(r); // Calls the LoadFile function
else if (c == 2)
CheckWords(r); // calls the function CheckWords
else if (c == 3)
PrintDictionary(r); // Calls the fucntion PrintDictionary
else if (c == 4)
r = LoadStopwordsFile(r); // calls the LoadStopwordsFile function
printf("choose an operation:\n");
printf("Load File - 1\nCheck Words - 2\nPrint Dictionary - 3\nLoad Stopwords File - 4\nExit - 0\n\n");
scanf("%d", &c);
}
}
int main() {
no* r = (no*) malloc(sizeof(no)); // alocates memory for the root
if (r == NULL) {
printf("Error alocating memory.\n");
return 1; // closes the aplication
}
r->isroot = 1; // defines the isroot flag to true
menu(&r); // calls the menu function
free(r); // frees the memory alocated for the root
return 0;
}
A Patrica tree is a special case of an implicit binary radix tree, which is more compact than what you have here, (to be pedantic, for sure.)
I'm not sure you need
isroot
; what is it used for? The empty tree?isword
is an out-of-band symbol for$
; you do not need (or want) two symbols for the same thing. (A natural trie entry for strings which are null-terminated is to use'\0'
as a terminal suffix$
.)Instead of duplicating and manipulating the string, another option is to not copy the string, but to just point at the substring.
Edit: In the Java code in the tutorial which you follow, it uses an out-of-band
isword
and copies the substring. It seems like you are allocating nodes, but not the copy of strings, which Java handles automatically.I think the following is a good analog: the substring is stored in a flexible array member at the end of the
struct
; this way, a node and a copy of the string are in a contiguous memory allocation that reduces fragmentation and calls tomalloc
.