char* pointer will not increment in function

518 views Asked by At

I know how pointers and pointer arithmetic work, but this is confounding me.

I made a lexer a while back and now I'm modifying it so I can use a parser with it but, for some reason, it will not iterate the char* I pass to the lexer function.

I malloc()'d a char pointer and then passed it to the function where it iterates over it. It does the iteration but it resets back to the original address for some reason. I need the char* to be able to change and STAY at the address it finished at after iterating or else I cannot complete a proper parser...

main.c

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lex.h"

/*
 * Loads the file named by argv[1] into a NUL-terminated heap buffer, then
 * repeatedly asks the user whether to lex another token, calling
 * lexer_get_single_token() and print_tokens_colored() once per prompt.
 *
 * Returns EXIT_SUCCESS when the prompt loop ends normally and EXIT_FAILURE
 * when the arguments are missing or the file cannot be opened/read.
 *
 * NOTE: lexer_get_single_token() receives srcBuffer by value, so the pointer
 * held here is never advanced; every call starts scanning at the beginning
 * of the buffer again.
 */
int main(int argc, char **argv)
{
    struct lexer *token_queue = NULL;
    FILE *srcfile = NULL;
    char *srcBuffer = NULL;
    char decision[20] = "";     // initialised so the loop test never reads garbage
    long file_len = 0;
    size_t bytes_read = 0;
    int status = EXIT_FAILURE;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <source-file>\n", argc > 0 ? argv[0] : "lexer");
        goto hell;
    }

    // The file is only read, so plain "r" is enough ("r+" fails on read-only files).
    srcfile = fopen( argv[1], "r" );
    if (!srcfile) {
        fprintf(stderr, "file \"%s\" not found, is null\n", argv[1]);
        goto hell;
    }

    if (fseek(srcfile, 0, SEEK_END) == 0)   // go all the way to end of file
        file_len = ftell(srcfile);          // get total size of the file
    else
        file_len = -1;
    if (file_len < 0) {
        fprintf(stderr, "could not determine the size of \"%s\"\n", argv[1]);
        goto hell;
    }
    rewind(srcfile);        // reset the file reader to beginning

    // calloc zero-fills, so the buffer is NUL-terminated whatever fread delivers.
    srcBuffer = calloc((size_t)file_len + 1, sizeof *srcBuffer);
    if (!srcBuffer) {
        fprintf(stderr, "out of memory reading \"%s\"\n", argv[1]);
        goto hell;
    }

    // Read at most file_len bytes; the extra byte is reserved for the terminator.
    bytes_read = fread(srcBuffer, sizeof *srcBuffer, (size_t)file_len, srcfile);
    if (ferror(srcfile)) {
        fprintf(stderr, "error while reading \"%s\"\n", argv[1]);
        goto hell;
    }
    srcBuffer[bytes_read] = '\0';   // text-mode reads may return fewer bytes than ftell reported

    do {
        printf("print a token?\n");
        if (scanf("%19s", decision) != 1)
            break;      // EOF or input error: stop instead of looping forever
        lexer_get_single_token(srcBuffer, &token_queue); // tokenize baby!

        print_tokens_colored(&token_queue);
    }
    while (decision[0] == 'y') ;    // slowly simulate parser asking for another token!

    status = EXIT_SUCCESS;

hell:
    free(srcBuffer); srcBuffer = NULL;
    if (srcfile) {
        fclose(srcfile); srcfile = NULL;
    }
    lexer_destroy(&token_queue);
    return status;
}

lexer function

/* Appends `c` to `buf` (capacity `cap`) at index `*len`, always keeping one
 * byte free for the terminating NUL; characters that do not fit are dropped. */
static void wording_push(char *buf, unsigned int *len, size_t cap, char c)
{
    if ((size_t)*len + 1 < cap)
        buf[(*len)++] = c;
}

/*
 * Scans the NUL-terminated text starting at `iter`, skips whitespace and
 * comments, and appends the first token it finds to the queue `*Q` via
 * lexer_add_token(). Returns after one token has been added, or when the
 * end of the text is reached without finding one.
 *
 * iter  start of the text to scan. It is passed by value, so the position
 *       reached inside this function is NOT visible to the caller.
 * Q     token queue that receives the token.
 *
 * Lexemes longer than the internal 512-byte buffer are truncated.
 */
void lexer_get_single_token(char *iter, struct lexer **Q)
{
    char wording[512] = "";     // buffer to hold identifiers, keywords, and strings
    unsigned int i = 0;         // number of characters currently stored in wording

    const char *keywords[] = {
        "auto", "const", "double", "float", "int", "short", "struct", "unsigned",
        "break", "continue", "else", "for", "long", "signed", "switch", "void",
        "case", "default", "enum", "goto", "register", "sizeof", "typedef", "volatile",
        "char", "do", "extern", "if", "return", "static", "union", "while",
        "inline", "alignof", "_Generic", "bool", "_Bool", "true", "false"
    };
    printf("*iter == %c\n", *iter);
    while ( *iter != '\0' ) {
        while ( is_space(*iter) )
            ++iter;
        if (*iter == '\0')
            break;      // only whitespace was left; never step past the terminator

        if (*iter == '/' && iter[1] == '*') {       // found C style /**/ comment
            iter += 2;  // step over the opener first so "/*/" is not taken as a whole comment
            while ( *iter != '\0' && !(*iter == '*' && iter[1] == '/') )
                ++iter; // continuously skip until we find a */ (or the text ends)
            if (*iter != '\0')
                iter += 2;  // skip twice to pass over */ and go to the next token.
            continue;   // re-check for end of text and further whitespace/comments
        }

        if (*iter == '/' && iter[1] == '/') {       // found C++ style // comment
            while ( *iter != '\0' && *iter != '\n' )
                ++iter;     // skip until the next line which will be skipped itself.
            continue;
        }

        if (*iter == '\\' && iter[1] == '\n') {     // formatting Left slash check
            lexer_add_token(Q, LeftSlash, "\\", 2);
            iter += 2;
            return;
        }

        if (*iter == '\"') {    // found string literal, adjust for "\\" so we won't crash
            wording_push(wording, &i, sizeof wording, *iter++);     // add the first double quote to buffer
            while ( *iter != '\0' && *iter != '\"' ) {
                if (*iter == '\\' && iter[1] == '\"' && iter[-1] != '\\') {
                    wording_push(wording, &i, sizeof wording, *iter++);     // add the backslash; the escaped quote follows
                }
                wording_push(wording, &i, sizeof wording, *iter++);
            }
            if (*iter == '\"')      // absent only when the literal is unterminated
                wording_push(wording, &i, sizeof wording, *iter++);     // found the ending double quote, add that too.

            if (wording[0] != '\0') {
                lexer_add_token(Q, StringConstant, wording, i+1);
                reset_string(wording);
                return;
            }
        }

        if ( *iter == '\'' ) {  // found character literal, adjust for '\\' so we won't crash
            wording_push(wording, &i, sizeof wording, *iter++);
            int counter=0;
            while (*iter != '\0' && *iter != '\'' && counter < 2) {     // Same operation as the string literal but limit as char
                if (*iter == '\\' && iter[1] == '\'' && iter[-1] != '\\') {
                    wording_push(wording, &i, sizeof wording, *iter++);
                }
                wording_push(wording, &i, sizeof wording, *iter++);
                ++counter;
            }
            if (*iter != '\0')      // do not consume the terminator of an unterminated literal
                wording_push(wording, &i, sizeof wording, *iter++);     // add ending single quote to buffer

            if (wording[0] != '\0') {
                lexer_add_token(Q, CharConstant, wording, i+1);
                reset_string(wording);
                return;
            }
        }

        if (*iter == '0' && (iter[1] == 'x' || iter[1] == 'X')) {   // found hexadecimal constant
            wording_push(wording, &i, sizeof wording, *iter++);     // copy both 0 and x to buffer
            wording_push(wording, &i, sizeof wording, *iter++);

            while ( is_numeral(*iter) ) {
                wording_push(wording, &i, sizeof wording, *iter++);     // copy numbers and letters A to F
            }
            if ( *iter == '.' && is_numeral(iter[1]) ) {    // found hexadecimal float
                wording_push(wording, &i, sizeof wording, *iter++);
                while ( is_numeral(*iter) )
                    wording_push(wording, &i, sizeof wording, *iter++);
                if (*iter == 'p' && is_numeral(iter[1])) {  // stuff like 0x0.3p10.
                    wording_push(wording, &i, sizeof wording, *iter++);
                    while ( is_numeral(*iter) )
                        wording_push(wording, &i, sizeof wording, *iter++);
                }
                if (wording[0] != '\0') {
                    lexer_add_token(Q, NumConstantHexFloat, wording, i+1);
                    reset_string(wording);
                    return;
                }
            }
            else {      // we didn't find a decimal, so tokenize what we found as a normal hex constant
                if (wording[0] != '\0') {
                    lexer_add_token(Q, NumConstantHex, wording, i+1);
                    reset_string(wording);
                    return;
                }
            }
        }

        while ( is_numeric(*iter) ) {   // found decimal constant
            wording_push(wording, &i, sizeof wording, *iter++);
        }
        if ( *iter == '.' && is_numeric(iter[1]) ) {    // found floating point number
            wording_push(wording, &i, sizeof wording, *iter++);     // add in the decimal char
            while ( is_numeric(*iter) )
                wording_push(wording, &i, sizeof wording, *iter++);

            // add the 'e' constant for large floats as well as 'p' (power) constant
            if ( (*iter == 'p' || *iter == 'P' || *iter == 'e' || *iter == 'E') && is_numeric(iter[1]) )
            {
                wording_push(wording, &i, sizeof wording, *iter++);
                while ( is_numeric(*iter) )
                    wording_push(wording, &i, sizeof wording, *iter++);
            }
            if (*iter == 'f' || *iter == 'F')   // stuff like 2.0f, add that into the buffer!
                wording_push(wording, &i, sizeof wording, *iter++);

            if (wording[0] != '\0') {
                lexer_add_token(Q, NumConstantReal, wording, i+1);
                reset_string(wording);
                return;
            }
        }
        else {      // no decimal, consider it a natural number
            if (wording[0] != '\0') {
                lexer_add_token(Q, NumConstant, wording, i+1);
                reset_string(wording);
                return;
            }
        }

        if (is_alphabetic(*iter)) { // found an identifier or potential keyword
            while (is_potential_identifier(*iter))
                wording_push(wording, &i, sizeof wording, *iter++);

            if (wording[0] != '\0') {
                int found_keyword = 0;
                for ( size_t x=0 ; x<sizeof keywords/sizeof keywords[0] ; ++x ) {
                    if ( !strcmp(wording, keywords[x]) ) {
                        found_keyword = 1;
                        break;      // one match is enough
                    }
                }
                if (found_keyword)
                    lexer_add_token(Q, Keyword, wording, i+1);
                else lexer_add_token(Q, NumIdent, wording, i+1);
                reset_string(wording);
                return;
            }
        }

        switch ( *iter ) {  // add in individual characters
            case '=':
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, EqualCmp, "==", 3);
                }
                else lexer_add_token(Q, Equal, "=", 2);
                ++iter;
                return;
            case ';':
                lexer_add_token(Q, Semicolon, ";", 2);
                ++iter;
                return;
            case ':':
                lexer_add_token(Q, Colon, ":", 2);  // lexeme is ":" (was mistakenly ";")
                ++iter;
                return;
            case '+':   // possible uses => left unary is positive, twice unary is increment, once binary is addition
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, PlusEqual, "+=", 3);
                }
                else if (iter[1] == '+') {
                    ++iter;
                    lexer_add_token(Q, Increment, "++", 3);
                }
                else lexer_add_token(Q, Plus, "+", 2);
                ++iter;
                return;
            case '-':   // possible uses => left unary is negating, twice unary is decrement, one binary is minus
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, MinusEqual, "-=", 3);
                }
                else if (iter[1] == '-') {
                    ++iter;
                    lexer_add_token(Q, Decrement, "--", 3);
                }
                else if (iter[1] == '>') {
                    ++iter;
                    lexer_add_token(Q, Arrow, "->", 3);
                }
                else lexer_add_token(Q, Dash, "-", 2);
                ++iter;
                return;
            case '*':   // leftward unary is dereferencing ptr, binary be mult.
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, MultEqual, "*=", 3);
                }
                else lexer_add_token(Q, Asterisk, "*", 2);
                ++iter;
                return;
            case '/':   // comments were already consumed above, so this is division
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, DivEqual, "/=", 3);
                }
                else lexer_add_token(Q, DivSlash, "/", 2);
                ++iter;
                return;
            case '(':
                lexer_add_token(Q, LeftParens, "(", 2);
                ++iter;
                return;
            case ')':
                lexer_add_token(Q, RiteParens, ")", 2);
                ++iter;
                return;
            case '[':
                lexer_add_token(Q, LeftSqBracket, "[", 2);
                ++iter;
                return;
            case ']':
                lexer_add_token(Q, RightSqBracket, "]", 2);
                ++iter;
                return;
            case '{':
                lexer_add_token(Q, LeftCurlBrace, "{", 2);
                ++iter;
                return;
            case '}':
                lexer_add_token(Q, RightCurlBrace, "}", 2);
                ++iter;
                return;
            case '.':
                if (iter[1] == '.' && iter[2] == '.') {
                    iter += 2;
                    lexer_add_token(Q, Ellipses, "...", 4);
                }
                else lexer_add_token(Q, Dot, ".", 2);
                ++iter;
                return;
            case ',':
                lexer_add_token(Q, Comma, ",", 2);
                ++iter;
                return;
            case '<':
                if (iter[1] == '<') {
                    if (iter[2] == '=') {
                        lexer_add_token(Q, LeftBitShiftEqual, "<<=", 4);
                        iter += 2;
                    }
                    else {
                        lexer_add_token(Q, LeftBitShift, "<<", 3);
                        ++iter;
                    }
                }
                else if (iter[1] == '=') {
                    lexer_add_token(Q, LessEqual, "<=", 3);
                    ++iter;
                }
                else lexer_add_token(Q, LeftArrow, "<", 2);
                ++iter;
                return;
            case '>':
                if (iter[1] == '>') {
                    if (iter[2] == '=') {
                        lexer_add_token(Q, RightBitShiftEqual, ">>=", 4);
                        iter += 2;
                    }
                    else {
                        lexer_add_token(Q, RightBitShift, ">>", 3);
                        ++iter;
                    }
                }
                else if (iter[1] == '=') {
                    lexer_add_token(Q, GreaterEqual, ">=", 3);
                    ++iter;
                }
                else lexer_add_token(Q, RightArrow, ">", 2);
                ++iter;
                return;
            case '?':
                lexer_add_token(Q, QuestionMark, "?", 2);
                ++iter;
                return;
            case '#':
                lexer_add_token(Q, HashSym, "#", 2);
                ++iter;
                return;
            case '&':
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, AndEqual, "&=", 3);
                }
                else if (iter[1] == '&') {
                    ++iter;
                    lexer_add_token(Q, BoolAnd, "&&", 3);
                }
                else lexer_add_token(Q, Ampersand, "&", 2);
                ++iter;
                return;
            case '^':
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, XorEqual, "^=", 3);
                }
                else lexer_add_token(Q, Carot, "^", 2);
                ++iter;
                return;
            case '%':
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, ModuloEqual, "%=", 3);
                }
                else lexer_add_token(Q, Percent, "%", 2);
                ++iter;
                return;
            case '!':
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, NotEqual, "!=", 3);
                }
                else lexer_add_token(Q, ExclamationMark, "!", 2);
                ++iter;
                return;
            case '|':
                if (iter[1] == '=') {
                    ++iter;
                    lexer_add_token(Q, OrEqual, "|=", 3);
                }
                else if (iter[1] == '|') {
                    ++iter;
                    lexer_add_token(Q, BoolOr, "||", 3);
                }
                else lexer_add_token(Q, VerticalBar, "|", 2);
                ++iter;
                return;
            case '~':
                lexer_add_token(Q, Tilde, "~", 2);
                ++iter;
                return;
            case '@':
                lexer_add_token(Q, AtSign, "@", 2);
                ++iter;
                return;
            case '$':
                lexer_add_token(Q, DollarSign, "$", 2);
                ++iter;
                return;
            case '`':
                lexer_add_token(Q, GraveAccent, "`", 2);
                ++iter;
                return;
            default:
                break;  // unrecognised character: skipped by the ++iter below
        }
        ++iter;
    }

}

1

There is 1 answer

2
ad absurdum On BEST ANSWER

C functions pass arguments by value, so inside the function lexer_get_single_token(), iter is a copy of the pointer srcBuffer. This means that changes to the value of iter are not reflected in srcBuffer. If you want to preserve the changes made to iter inside the function, you can either return the pointer to the calling function, or add another layer of indirection.

By changing the function signature to:

char * lexer_get_single_token(char *iter, struct lexer **Q);

this function returns a pointer to char, and iter can be returned when the function is finished. The function call would then look like:

srcBuffer = lexer_get_single_token(srcBuffer, &token_queue);

After the function call, srcBuffer points to the location indicated by iter when the end of the function was reached. You should save a copy of the original srcBuffer pointer so that you can free it later.

Alternatively, you can change the function signature to:

void lexer_get_single_token(char **iter, struct lexer **Q);

Now the function call will look like:

lexer_get_single_token(&srcBuffer, &token_queue);

The code in the function will need to be modified to account for the additional indirection, but because you pass in a pointer to srcBuffer, you will be able to make modifications to srcBuffer.