I know how pointers and pointer arithmetic work, but this is confounding me.
I made a lexer a while back and now I'm modifying it so I can use a parser with it but, for some reason, it will not advance the char* that I pass to the lexer function.
I malloc()'d a char pointer and then passed it to the function where it iterates over it. It does the iteration but it resets back to the original address for some reason. I need the char* to be able to change and STAY at the address it finished at after iterating or else I cannot complete a proper parser...
main.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lex.h"
/*
 * Entry point: loads the file named by argv[1] into a NUL-terminated buffer,
 * then asks the lexer for a token and prints the token queue, repeating for
 * as long as the user answers with a word that starts with 'y'.
 *
 * NOTE: lexer_get_single_token() receives srcBuffer by value, so the scan
 * position it reaches is not visible here; every call starts again from the
 * beginning of the buffer.
 *
 * Returns 0 on success and EXIT_FAILURE when the file argument is missing,
 * the file cannot be opened, measured or read, or the buffer cannot be
 * allocated.
 */
int main(int argc, char **argv)
{
    struct lexer *token_queue = NULL;
    FILE *srcfile = NULL;
    char *srcBuffer = NULL;
    char decision[20] = "";
    long file_len = 0;
    size_t bytes_read = 0;
    int status = EXIT_FAILURE;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <source-file>\n", argc > 0 ? argv[0] : "lexer");
        goto cleanup;
    }

    /* The file is only ever read, so do not demand write permission. */
    srcfile = fopen(argv[1], "r");
    if (!srcfile) {
        fprintf(stderr, "file \"%s\" not found, is null\n", argv[1]);
        goto cleanup;
    }

    /* Measure the file: seek to the end, read the offset, then rewind. */
    if (fseek(srcfile, 0, SEEK_END) != 0 || (file_len = ftell(srcfile)) < 0) {
        fprintf(stderr, "could not determine the size of \"%s\"\n", argv[1]);
        goto cleanup;
    }
    rewind(srcfile);

    /* calloc zero-fills, so the extra byte is already the NUL terminator. */
    srcBuffer = calloc((size_t)file_len + 1, sizeof *srcBuffer);
    if (!srcBuffer) {
        fprintf(stderr, "out of memory reading \"%s\"\n", argv[1]);
        goto cleanup;
    }

    /*
     * Read at most file_len bytes so the terminator slot is never written.
     * A short count without a stream error is accepted (text-mode newline
     * translation can shrink the data); the zero-filled tail keeps the
     * buffer terminated.
     */
    bytes_read = fread(srcBuffer, sizeof *srcBuffer, (size_t)file_len, srcfile);
    if (bytes_read < (size_t)file_len && ferror(srcfile)) {
        fprintf(stderr, "error while reading \"%s\"\n", argv[1]);
        goto cleanup;
    }

    do {
        printf("print a token?\n");
        /* On EOF or a read error, stop after this pass instead of testing garbage. */
        if (scanf("%19s", decision) != 1)
            decision[0] = '\0';
        lexer_get_single_token(srcBuffer, &token_queue); // tokenize baby!
        print_tokens_colored(&token_queue);
    } while (decision[0] == 'y'); // slowly simulate parser asking for another token!

    status = 0;

cleanup:
    free(srcBuffer);
    srcBuffer = NULL;
    if (srcfile) {
        fclose(srcfile);
        srcfile = NULL;
    }
    lexer_destroy(&token_queue);
    return status;
}
lexer function
/*
 * Appends c to buf unless that would use the final slot, which is left
 * untouched so a zero-initialised buffer always stays NUL-terminated.
 * Characters that do not fit are dropped.
 */
static void wording_put(char *buf, size_t cap, unsigned int *len, char c)
{
    if ((size_t)*len + 1 < cap)
        buf[(*len)++] = c;
}

/*
 * Scans the NUL-terminated text starting at iter, skips whitespace and
 * comments, hands the first token found to lexer_add_token() for queue *Q,
 * and returns. Returns without adding anything when the end of the text is
 * reached first.
 *
 * iter is received by value: the increments below move only this function's
 * local copy, so the position reached is NOT reported back to the caller.
 *
 * Token text longer than the local 512-byte buffer is truncated.
 * The length handed to lexer_add_token() is the character count plus one,
 * presumably to include the terminator -- confirm against lexer_add_token().
 * is_space(), is_numeric(), is_numeral(), is_alphabetic(),
 * is_potential_identifier(), lexer_add_token() and reset_string() are
 * defined elsewhere.
 */
void lexer_get_single_token(char *iter, struct lexer **Q)
{
    char wording[512] = ""; // buffer to hold identifiers, keywords, and strings
    unsigned int i = 0;     // number of characters stored in wording
    static const char *const keywords[] = {
        "auto", "const", "double", "float", "int", "short", "struct", "unsigned",
        "break", "continue", "else", "for", "long", "signed", "switch", "void",
        "case", "default", "enum", "goto", "register", "sizeof", "typedef", "volatile",
        "char", "do", "extern", "if", "return", "static", "union", "while",
        "inline", "alignof", "_Generic", "bool", "_Bool", "true", "false"
    };

    printf("*iter == %c\n", *iter);
    while (*iter != '\0') {
        while (is_space(*iter))
            ++iter;
        if (*iter == '\0')
            break; // only whitespace was left; never step past the terminator

        if (*iter == '/' && iter[1] == '*') { // found C style /**/ comment
            iter += 2; // step over the opener so that "/*/" is not taken as a whole comment
            while (*iter != '\0' && !(*iter == '*' && iter[1] == '/'))
                ++iter;
            if (*iter != '\0')
                iter += 2; // pass over the closing */
            continue;      // rescan: whitespace or another comment may follow
        }
        if (*iter == '/' && iter[1] == '/') { // found C++ style // comment
            while (*iter != '\n' && *iter != '\0')
                ++iter; // stop at the newline (or end of text); the rescan skips it
            continue;
        }
        if (*iter == '\\' && iter[1] == '\n') { // formatting Left slash check
            lexer_add_token(Q, LeftSlash, "\\", 2);
            iter += 2;
            return;
        }

        if (*iter == '\"') { // found string literal
            wording_put(wording, sizeof wording, &i, *iter++); // opening double quote
            while (*iter != '\"' && *iter != '\0') {
                // A backslash-quote pair that is not itself preceded by a
                // backslash is an escaped quote: copy both characters.
                if (*iter == '\\' && iter[1] == '\"' && iter[-1] != '\\')
                    wording_put(wording, sizeof wording, &i, *iter++);
                wording_put(wording, sizeof wording, &i, *iter++);
            }
            if (*iter == '\"') // absent when the literal is unterminated
                wording_put(wording, sizeof wording, &i, *iter++);
            lexer_add_token(Q, StringConstant, wording, i + 1);
            reset_string(wording);
            return;
        }

        if (*iter == '\'') { // found character literal
            int counter = 0;
            wording_put(wording, sizeof wording, &i, *iter++); // opening single quote
            // Same operation as the string literal, limited to two steps.
            while (*iter != '\'' && *iter != '\0' && counter < 2) {
                if (*iter == '\\' && iter[1] == '\'' && iter[-1] != '\\')
                    wording_put(wording, sizeof wording, &i, *iter++);
                wording_put(wording, sizeof wording, &i, *iter++);
                ++counter;
            }
            if (*iter != '\0') // ending single quote (or whatever follows the two steps)
                wording_put(wording, sizeof wording, &i, *iter++);
            lexer_add_token(Q, CharConstant, wording, i + 1);
            reset_string(wording);
            return;
        }

        if (*iter == '0' && (iter[1] == 'x' || iter[1] == 'X')) { // found hexadecimal constant
            wording_put(wording, sizeof wording, &i, *iter++); // copy both 0 and x
            wording_put(wording, sizeof wording, &i, *iter++);
            while (is_numeral(*iter)) // presumably digits plus hex letters -- confirm
                wording_put(wording, sizeof wording, &i, *iter++);
            if (*iter == '.' && is_numeral(iter[1])) { // found hexadecimal float
                wording_put(wording, sizeof wording, &i, *iter++);
                while (is_numeral(*iter))
                    wording_put(wording, sizeof wording, &i, *iter++);
                if (*iter == 'p' && is_numeral(iter[1])) { // stuff like 0x0.3p10.
                    wording_put(wording, sizeof wording, &i, *iter++);
                    while (is_numeral(*iter))
                        wording_put(wording, sizeof wording, &i, *iter++);
                }
                lexer_add_token(Q, NumConstantHexFloat, wording, i + 1);
            } else { // no fraction, so it is a plain hex constant
                lexer_add_token(Q, NumConstantHex, wording, i + 1);
            }
            reset_string(wording);
            return;
        }

        while (is_numeric(*iter)) // found decimal constant
            wording_put(wording, sizeof wording, &i, *iter++);
        if (*iter == '.' && is_numeric(iter[1])) { // found floating point number
            wording_put(wording, sizeof wording, &i, *iter++); // the decimal point
            while (is_numeric(*iter))
                wording_put(wording, sizeof wording, &i, *iter++);
            // optional exponent marker ('e'/'E' or 'p'/'P') followed by digits
            if ((*iter == 'p' || *iter == 'P' || *iter == 'e' || *iter == 'E')
                && is_numeric(iter[1])) {
                wording_put(wording, sizeof wording, &i, *iter++);
                while (is_numeric(*iter))
                    wording_put(wording, sizeof wording, &i, *iter++);
            }
            if (*iter == 'f' || *iter == 'F') // stuff like 2.0f
                wording_put(wording, sizeof wording, &i, *iter++);
            lexer_add_token(Q, NumConstantReal, wording, i + 1);
            reset_string(wording);
            return;
        } else if (wording[0] != '\0') { // digits without a fraction: natural number
            lexer_add_token(Q, NumConstant, wording, i + 1);
            reset_string(wording);
            return;
        }

        if (is_alphabetic(*iter)) { // found an identifier or potential keyword
            while (is_potential_identifier(*iter))
                wording_put(wording, sizeof wording, &i, *iter++);
            if (wording[0] != '\0') {
                int found_keyword = 0;
                for (size_t x = 0; x < sizeof keywords / sizeof keywords[0]; ++x) {
                    if (strcmp(wording, keywords[x]) == 0) {
                        found_keyword = 1;
                        break;
                    }
                }
                if (found_keyword)
                    lexer_add_token(Q, Keyword, wording, i + 1);
                else
                    lexer_add_token(Q, NumIdent, wording, i + 1);
                reset_string(wording);
                return;
            }
        }

        switch (*iter) { // operators and punctuation, longest match first
        case '=':
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, EqualCmp, "==", 3);
            }
            else lexer_add_token(Q, Equal, "=", 2);
            ++iter;
            return;
        case ';':
            lexer_add_token(Q, Semicolon, ";", 2);
            ++iter;
            return;
        case ':':
            lexer_add_token(Q, Colon, ":", 2); // token text is ":" (was ";")
            ++iter;
            return;
        case '+': // unary plus, increment, or addition
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, PlusEqual, "+=", 3);
            }
            else if (iter[1] == '+') {
                ++iter;
                lexer_add_token(Q, Increment, "++", 3);
            }
            else lexer_add_token(Q, Plus, "+", 2);
            ++iter;
            return;
        case '-': // negation, decrement, member access through pointer, or subtraction
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, MinusEqual, "-=", 3);
            }
            else if (iter[1] == '-') {
                ++iter;
                lexer_add_token(Q, Decrement, "--", 3);
            }
            else if (iter[1] == '>') {
                ++iter;
                lexer_add_token(Q, Arrow, "->", 3);
            }
            else lexer_add_token(Q, Dash, "-", 2);
            ++iter;
            return;
        case '*': // dereference or multiplication
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, MultEqual, "*=", 3);
            }
            else lexer_add_token(Q, Asterisk, "*", 2);
            ++iter;
            return;
        case '/': // comments were already consumed above
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, DivEqual, "/=", 3);
            }
            else lexer_add_token(Q, DivSlash, "/", 2);
            ++iter;
            return;
        case '(':
            lexer_add_token(Q, LeftParens, "(", 2);
            ++iter;
            return;
        case ')':
            lexer_add_token(Q, RiteParens, ")", 2);
            ++iter;
            return;
        case '[':
            lexer_add_token(Q, LeftSqBracket, "[", 2);
            ++iter;
            return;
        case ']':
            lexer_add_token(Q, RightSqBracket, "]", 2);
            ++iter;
            return;
        case '{':
            lexer_add_token(Q, LeftCurlBrace, "{", 2);
            ++iter;
            return;
        case '}':
            lexer_add_token(Q, RightCurlBrace, "}", 2);
            ++iter;
            return;
        case '.':
            if (iter[1] == '.' && iter[2] == '.') {
                iter += 2;
                lexer_add_token(Q, Ellipses, "...", 4);
            }
            else lexer_add_token(Q, Dot, ".", 2);
            ++iter;
            return;
        case ',':
            lexer_add_token(Q, Comma, ",", 2);
            ++iter;
            return;
        case '<':
            if (iter[1] == '<') {
                if (iter[2] == '=') {
                    lexer_add_token(Q, LeftBitShiftEqual, "<<=", 4);
                    iter += 2;
                }
                else {
                    lexer_add_token(Q, LeftBitShift, "<<", 3);
                    ++iter;
                }
            }
            else if (iter[1] == '=') {
                lexer_add_token(Q, LessEqual, "<=", 3);
                ++iter;
            }
            else lexer_add_token(Q, LeftArrow, "<", 2);
            ++iter;
            return;
        case '>':
            if (iter[1] == '>') {
                if (iter[2] == '=') {
                    lexer_add_token(Q, RightBitShiftEqual, ">>=", 4);
                    iter += 2;
                }
                else {
                    lexer_add_token(Q, RightBitShift, ">>", 3);
                    ++iter;
                }
            }
            else if (iter[1] == '=') {
                lexer_add_token(Q, GreaterEqual, ">=", 3);
                ++iter;
            }
            else lexer_add_token(Q, RightArrow, ">", 2);
            ++iter;
            return;
        case '?':
            lexer_add_token(Q, QuestionMark, "?", 2);
            ++iter;
            return;
        case '#':
            lexer_add_token(Q, HashSym, "#", 2);
            ++iter;
            return;
        case '&':
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, AndEqual, "&=", 3);
            }
            else if (iter[1] == '&') {
                ++iter;
                lexer_add_token(Q, BoolAnd, "&&", 3);
            }
            else lexer_add_token(Q, Ampersand, "&", 2);
            ++iter;
            return;
        case '^':
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, XorEqual, "^=", 3);
            }
            else lexer_add_token(Q, Carot, "^", 2);
            ++iter;
            return;
        case '%':
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, ModuloEqual, "%=", 3);
            }
            else lexer_add_token(Q, Percent, "%", 2);
            ++iter;
            return;
        case '!':
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, NotEqual, "!=", 3);
            }
            else lexer_add_token(Q, ExclamationMark, "!", 2);
            ++iter;
            return;
        case '|':
            if (iter[1] == '=') {
                ++iter;
                lexer_add_token(Q, OrEqual, "|=", 3);
            }
            else if (iter[1] == '|') {
                ++iter;
                lexer_add_token(Q, BoolOr, "||", 3);
            }
            else lexer_add_token(Q, VerticalBar, "|", 2);
            ++iter;
            return;
        case '~':
            lexer_add_token(Q, Tilde, "~", 2);
            ++iter;
            return;
        case '@':
            lexer_add_token(Q, AtSign, "@", 2);
            ++iter;
            return;
        case '$':
            lexer_add_token(Q, DollarSign, "$", 2);
            ++iter;
            return;
        case '`':
            lexer_add_token(Q, GraveAccent, "`", 2);
            ++iter;
            return;
        default:
            break;
        }
        ++iter; // unrecognised character: skip it and keep scanning
    }
}
C functions pass arguments by value, so inside the function `lexer_get_single_token()`, `iter` is a copy of the pointer `srcBuffer`. This means that changes to the value of `iter` are not reflected in `srcBuffer`. If you want to preserve the changes made to `iter` inside the function, you can either return the pointer to the calling function, or add another layer of indirection.
By changing the function signature to:
    char *lexer_get_single_token(char *iter, struct lexer **Q)
this function returns a pointer to `char`, and `iter` can be returned when the function is finished. The function call would then look like:
    srcBuffer = lexer_get_single_token(srcBuffer, &token_queue);
After the function call, `srcBuffer` points to the location indicated by `iter` when the end of the function was reached. You should save a copy of the original `srcBuffer` pointer so that you can `free` it later.
Alternatively, you can change the function signature to:
    void lexer_get_single_token(char **iter, struct lexer **Q)
Now the function call will look like:
    lexer_get_single_token(&srcBuffer, &token_queue);
The code in the function will need to be modified to account for the additional indirection (for example, `**iter` to read the current character and `++*iter` to advance), but because you pass in a pointer to `srcBuffer`, you will be able to make modifications to `srcBuffer`. Here, too, keep a copy of the original pointer for `free`.