Delete comments from a C file

180 views Asked by At

Purpose of work: to learn how to work with files using the functions of the standard C library

Task: There is a file with a C program. It is necessary to delete all comments from it and write the code without comments to a new file.

Explanations and implementation peculiarities:

  1. The program file may be very large. Therefore, the whole file must NOT be read into an array beforehand.
  2. Comments can be single-line or multi-line. A single-line comment can also consist of several lines, if it is moved to the next line using the "backslash" character - "\".
  3. There are no nested comments in the C language
  4. Comments inside string constants are not taken into account
  5. A file does not necessarily represent a correct C program. For example, a comment may break without being closed
  6. Both of the following are allowed: a few new spaces and/or line feeds may appear in place of a deleted comment, and some existing non-significant delimiter characters may be missing.
  7. It is not allowed to delete data from constant strings marked with quotation marks (double and single).

Input and output data: The source file is always named test.c The output file must be named test.wc

My code:

#include <stdio.h>

/* Boolean constants and a boolean type alias.
 * NOTE(review): nothing in the code visible in this listing references
 * TRUE, FALSE or BOOL. */
#define TRUE 1
#define FALSE 0

typedef int BOOL;

/*
 * Read the next character from `in`, silently dropping every
 * backslash-newline pair (a line splice).
 *
 * Returns the character read, or EOF at end of input.  A backslash that
 * is not followed by a newline is returned as '\\'; the character that
 * followed it is pushed back so the next call sees it.
 */
int mygetc (FILE *in) {
        int ch;
        int next;

        /* Each iteration either removes one splice or stops on a lone backslash. */
        while ((ch = getc(in)) == '\\') {
                next = getc(in);
                if (next != '\n') {
                        /* Not a splice: hand the look-ahead back unless input ended. */
                        if (next != EOF)
                                ungetc(next, in);
                        break;
                }
        }
        return ch;
}

/*
 * Discard the remainder of a line comment.
 *
 * Characters are read through mygetc(), and reading stops at the first
 * '\n' or at end of input.  Returns that terminating value ('\n' or EOF).
 */
int skip_line_comment (FILE *in) {
        int ch;

        do {
                ch = mygetc(in);
        } while (ch != '\n' && ch != EOF);

        return ch;
}

/*
 * Discard the body of a block comment up to and including the closing
 * star-slash pair.
 *
 * Returns ' ' when the comment was closed, or EOF if the input ended
 * before a closing pair was found.
 */
int skip_block_comment (FILE *in) {
        int after_star = 0;     /* non-zero when the previous character was '*' */
        int ch;

        while ((ch = mygetc(in)) != EOF) {
                if (after_star && ch == '/')
                        return ' ';
                after_star = (ch == '*');
        }
        return EOF;
}

/*
 * Copy `in` to `out`, leaving out line comments and block comments.
 *
 * Every character is obtained through mygetc().  String and character
 * literals are copied through unchanged up to their closing quote.
 * A line comment is replaced by whatever skip_line_comment() returns and
 * a block comment by whatever skip_block_comment() returns; copying
 * stops as soon as either of them, or mygetc(), reports EOF.
 */
void removeComments (FILE *in, FILE *out) {
        int c;
        while ((c = mygetc(in)) != EOF) {
                if (c == '"' || c == '\'') {
                        /* Literal: remember which quote opened it and copy until it recurs. */
                        int separator = c;
                        fputc(c, out);
                        while ((c = mygetc(in)) != separator && c != EOF) {
                                fputc(c, out);
                                if (c == '\\') {
                                        /* Copy the escaped character blindly so an escaped
                                         * quote does not end the literal. */
                                        c = mygetc(in);
                                        if (c == EOF) break;
                                        fputc(c, out);
                                }
                        }
                        /* Here c is the closing quote (written below) or EOF. */
                } else if (c == '/') {
                        /* The character after '/' decides whether a comment starts. */
                        c = mygetc(in);
                        if (c == '/') c = skip_line_comment(in);
                        else if (c == '*') c = skip_block_comment(in);
                        /* Plain '/': it is written here and the character that followed
                         * it is written by the fputc at the bottom of the loop without
                         * being tested, so a quote directly after '/' is not treated as
                         * the start of a literal. */
                        else fputc('/', out);
                        
                }
                if (c == EOF) break;
                fputc(c, out);
        }
}

/*
 * Entry point: strip the comments from test.c and write the result to
 * test.wc.
 *
 * Returns 0 on success and 1 when a file cannot be opened or the output
 * cannot be flushed, after reporting the reason on stderr.
 */
int main () {
        const char inName[20]  = "test.c";
        const char outName[20] = "test.wc";
        FILE *in;
        FILE *out;

        in = fopen(inName, "r");
        if (in == NULL) {
                perror(inName);
                return 1;
        }

        out = fopen(outName, "w");
        if (out == NULL) {
                perror(outName);
                fclose(in);
                return 1;
        }

        removeComments(in, out);

        fclose(in);
        /* fclose flushes buffered output, so this is where a failed write shows up. */
        if (fclose(out) != 0) {
                perror(outName);
                return 1;
        }

        return 0;
}

.zip with tests: Google Disk

  1. Test 1 - correct
  2. Test 2 - correct
  3. Test 3 - missing "/" in the first line
  4. Test 4 - expected "* * *" in one line but got * in different lines
  5. Test 5 - correct
  6. Test 6 - problems with "", got some extra lines like """"""""\"""\" which had to be removed
  7. Test 7 - correct
  8. Test 8 - stripped out the extra lines
  9. Test 9, 10 - problem with combination / and \
  10. Test 11 - correct
  11. Test 12 - correct
  12. Test 13 - correct
2

There are 2 answers

2
rvevau On BEST ANSWER

Decided to completely rewrite the program. Now there is a separate algorithm for each case. It passes all 13 tests.

#include <stdio.h>

/* Number of bytes processFile() reads from the input per fread() call. */
#define BUFSIZE 2048
/* States of the scanner in processFile(). */
/* Ordinary program text; characters are copied to the output. */
#define CODE                        0
/* Inside a block comment, right after a '*': a '/' here closes the comment. */
#define STAR_IN_MULTI_COMMENT       1
/* Inside a line comment, right after a backslash: the next character is skipped. */
#define SLASH_IN_LINE_COMMENT       2
/* A '/' was seen in program text; the next character decides what it starts. */
#define BEGIN_COMMENT               3
/* Inside a double-quoted string literal. */
#define STRING                      4
/* Inside a block comment. */
#define MULTI_COMMENT               5
/* Inside a line comment. */
#define LINE_COMMENT                6
/* Inside a string literal, right after a backslash. */
#define SLASH_IN_STRING             7
/* Inside a single-quoted character literal. */
#define LITERAL                     8
/* Inside a character literal, right after a backslash. */
#define SLASH_IN_LITERAL            9

void processFile(FILE *filei, FILE *fileo);

/*
 * Entry point: strip the comments from test.c into test.wc.
 *
 * Returns 0 on success and 1 when a file cannot be opened or the output
 * cannot be flushed, after reporting the reason on stderr.
 */
int main() {
    char filenamei[] = "test.c";
    char filenameo[] = "test.wc";

    FILE *filei, *fileo;

    if ((filei = fopen(filenamei, "r")) == NULL) {
        perror(filenamei);
        return 1;
    }
    if ((fileo = fopen(filenameo, "w")) == NULL) {
        perror(filenameo);
        /* The input stream is already open here; do not leak it. */
        fclose(filei);
        return 1;
    }

    processFile(filei, fileo);

    fclose(filei);
    /* fclose flushes buffered output, so this is where a failed write shows up. */
    if (fclose(fileo) != 0) {
        perror(filenameo);
        return 1;
    }

    return 0;
}

/*
 * Copy `filei` to `fileo` with C comments removed.
 *
 * The input is read in chunks of BUFSIZE bytes and scanned by a small
 * state machine whose state survives from one chunk to the next, so a
 * comment, string or character literal may straddle a chunk boundary.
 *
 *  - String and character literals are copied verbatim; an unescaped
 *    newline ends them.
 *  - A line comment is dropped up to, but not including, its newline;
 *    a backslash inside it makes the following character part of the
 *    comment.
 *  - A block comment is dropped entirely, with nothing written in its
 *    place.
 *
 * Processing stops at end of input, on a read error, or on a short write.
 */
void processFile(FILE *filei, FILE *fileo) {
    size_t i, size, pOutStr;
    int state = CODE;
    char buffer[BUFSIZE];
    /* One spare byte: a '/' held back at the end of the previous chunk may
     * be emitted in front of a chunk that itself yields BUFSIZE bytes. */
    char result[BUFSIZE + 1];

    /* fread returns 0 both at end of file and on error, so this loop cannot
     * spin forever on a stream that fails without reaching EOF. */
    while ((size = fread(buffer, 1, BUFSIZE, filei)) > 0) {
        pOutStr = 0;
        i = 0;

        while (i < size) {
            char ch = buffer[i];

            switch (state) {
                case CODE:
                    if (ch == '\"') {
                        state = STRING;
                        result[pOutStr++] = ch;
                    } else if (ch == '/') {
                        /* Hold the slash back until the next character is known. */
                        state = BEGIN_COMMENT;
                    } else if (ch == '\'') {
                        state = LITERAL;
                        result[pOutStr++] = ch;
                    } else {
                        result[pOutStr++] = ch;
                    }
                    break;

                case STAR_IN_MULTI_COMMENT:
                    if (ch == '/') {
                        state = CODE;
                    } else if (ch != '*') {
                        state = MULTI_COMMENT;
                    }
                    /* Another '*' keeps us in this state. */
                    break;

                case SLASH_IN_LINE_COMMENT:
                    /* Whatever follows the backslash belongs to the comment. */
                    state = LINE_COMMENT;
                    break;

                case BEGIN_COMMENT:
                    if (ch == '*') {
                        state = MULTI_COMMENT;
                    } else if (ch == '/') {
                        state = LINE_COMMENT;
                    } else {
                        /* The slash was ordinary code: emit it and look at the
                         * current character again in CODE state, without
                         * consuming it. */
                        result[pOutStr++] = '/';
                        state = CODE;
                        continue;
                    }
                    break;

                case STRING:
                    if (ch == '"') {
                        state = CODE;
                    } else if (ch == '\\') {
                        state = SLASH_IN_STRING;
                    } else if (ch == '\n') {
                        /* Unterminated string: it ends at the newline. */
                        state = CODE;
                    }
                    result[pOutStr++] = ch;
                    break;

                case LITERAL:
                    if (ch == '\'') {
                        state = CODE;
                    } else if (ch == '\\') {
                        state = SLASH_IN_LITERAL;
                    } else if (ch == '\n') {
                        /* Unterminated character literal: it ends at the newline. */
                        state = CODE;
                    }
                    result[pOutStr++] = ch;
                    break;

                case MULTI_COMMENT:
                    if (ch == '*') {
                        state = STAR_IN_MULTI_COMMENT;
                    }
                    break;

                case LINE_COMMENT:
                    if (ch == '\\') {
                        state = SLASH_IN_LINE_COMMENT;
                    } else if (ch == '\n') {
                        /* The newline that ends the comment is kept. */
                        state = CODE;
                        result[pOutStr++] = ch;
                    }
                    break;

                case SLASH_IN_STRING:
                    state = STRING;
                    result[pOutStr++] = ch;
                    break;

                case SLASH_IN_LITERAL:
                    state = (ch != '\n') ? LITERAL : CODE;
                    result[pOutStr++] = ch;
                    break;

                default:
                    break;
            }
            i++;
        }

        if (pOutStr && fwrite(result, 1, pOutStr, fileo) != pOutStr) {
            /* Short write: nothing more can usefully be written. */
            return;
        }
    }

    /* A '/' that was the very last byte of the input is still being held
     * back; it cannot start a comment, so write it out. */
    if (state == BEGIN_COMMENT) {
        fputc('/', fileo);
    }
}
4
chqrlie On

There are multiple issues in this question:

  1. it is unclear whether the output program can differ from the source program in subtle ways such as

    • line endings (LF or CR/LF pairs)
    • non significant initial and/or trailing spaces
    • empty lines
    • suppressed escaped newline sequences (\ followed by an end-of-line sequence)
  2. the test files are encoded using the DOS end of line sequences (CR/LF) and the expected output files use an inconsistent combination of unix line endings (LF) and DOS end of line sequences. The zip file also contains macOS attribute files (in the __MACOSX/ directory), indicating that it was produced on a Mac, which uses unix line endings and does not translate line endings for files open in text mode ("r" and "w"). It is likely the expected output files were produced on a Mac or on a unix system from DOS source files, where the DOS end of line sequences were copied unchanged from the source files but the '\n' output in replacement of multiline comments were not translated to CR/LF sequences by the Standard library as they would have been on a legacy system.

  3. the test files contain many inconsistencies such as missing spaces and newlines, even missing final newlines! The problem statement should be more precise and specify that single line comments must be replaced with a single newline and multiline comments with a single space. Yet replacing multiline comments with a single space is not always appropriate: in #define a/**/(b), this would cause a(1) to be defined as expanding to (b)(1) instead of nothing. A multiline comment should only expand to a space if its absence would cause the next character to be part of the previous token, which is non trivial to implement.

  • another problem in the test files is the presence of embedded unescaped newlines inside string literals (eg: test(5).c line 8). The expected test(5).wc file seems to expect the string to end on the newline. The source file is invalid, but the behavior on such syntax errors should be specified if any expected output is to be checked.

It seems difficult to write a program that will produce the expected output precisely.

Your program is quite straightforward and seems correct, except for a few limitations and errors:

  • if compiled on a unix system, it does not recognise CR/LF sequences, so it does not handle \ followed by CR/LF as an escaped newline. This prevents the correct handling of comments where the //, /* or */ sequences are split across 2 lines with one or more escaped newlines.

  • the program does not handle the sequence /" as a / followed by the beginning of a string because the " is output by fputc(c, out) at the end of the body and only the next character is compared to ' and ". Same problem for /'.

  • the program suppresses all escaped newlines including those outside comments, which is not what the test files seem to expect.

  • it does not handle trigraphs such as ??/ that translate to \ before the escaped newline removal (support for trigraphs would be pedantic as this feature has been removed from the latest C Standard and is never used except in test suites).

  • it does not handle single quotes used in the C++ style digit separators now standard in C23 (eg: int x = 1'000;). This ugly convention makes it more difficult to parse C tokens properly. It is quite unfortunate that C23 did not standardize _ as the digit separator as in almost all other modern languages, a convention that would have been compatible with existing preprocessors and parsers as _ was already accepted in pp-numbers. 1'000 will be interpreted as starting a character constant, causing the rest of the program to be misinterpreted and causing the comment stripping to fail.

Here is a modified version of your program with corrections for the line ending issue and the /" and /' parsing error. It still suppresses the escaped newlines, ignores trigraphs and misinterprets ' as digit separators.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/*
 * Read one character from `in`, normalising line endings.
 *
 * A CR, whether or not it is followed by an LF, is returned as a single
 * '\n'.  Every other character, and EOF, is returned unchanged.
 */
int mygetc0(FILE *in) {
    int ch = getc(in);
    int next;

    if (ch != '\r')
        return ch;

    /* Swallow the LF of a CR/LF pair; anything else goes back on the stream. */
    next = getc(in);
    if (!(next == '\n' || next == EOF))
        ungetc(next, in);

    return '\n';
}

/*
 * Read the next character from `in` via mygetc0(), dropping every
 * backslash-newline pair (a line splice).
 *
 * Returns the character read, or EOF at end of input.  A backslash that
 * is not followed by a newline is returned as '\\' and the character
 * after it is pushed back for the next call.
 */
int mygetc(FILE *in) {
    int ch;

    /* Each iteration either removes one splice or stops on a lone backslash. */
    while ((ch = mygetc0(in)) == '\\') {
        int next = mygetc0(in);

        if (next != '\n') {
            /* Not a splice: hand the look-ahead back unless input ended. */
            if (next != EOF)
                ungetc(next, in);
            break;
        }
    }
    return ch;
}

/*
 * Discard the remainder of a line comment, reading through mygetc().
 *
 * Returns the value that ended the comment: '\n', or EOF if the input
 * ran out first.
 */
int skip_line_comment(FILE *in) {
    for (;;) {
        int ch = mygetc(in);

        if (ch == '\n' || ch == EOF)
            return ch;
    }
}

/*
 * Discard the body of a block comment up to and including the closing
 * star-slash pair.
 *
 * Returns ' ' when the comment was closed, or EOF if the input ended
 * before a closing pair was found.
 */
int skip_block_comment(FILE *in) {
    int ch = mygetc(in);

    while (ch != EOF) {
        if (ch != '*') {
            /* Ordinary comment text: keep scanning for a star. */
            ch = mygetc(in);
            continue;
        }
        /* Just saw a star; a slash right after it closes the comment.
         * Anything else, including another star, is re-examined above. */
        ch = mygetc(in);
        if (ch == '/')
            return ' ';
    }
    return EOF;
}

/*
 * Copy `in` to `out`, leaving out line comments and block comments.
 *
 * Every character is obtained through mygetc().  String and character
 * literals are copied unchanged up to their closing quote.  A line
 * comment is replaced by the value skip_line_comment() returns and a
 * block comment by the value skip_block_comment() returns.  Copying ends
 * at end of input, including when it is reached inside a literal or a
 * comment.
 *
 * The character that follows a plain '/' is kept in a variable and
 * dispatched on the next iteration instead of being pushed back with
 * ungetc(): mygetc() may already have a pushed-back character pending,
 * and only one character of pushback is guaranteed.
 */
void removeComments(FILE *in, FILE *out) {
    int c = mygetc(in);

    while (c != EOF) {
        if (c == '"' || c == '\'') {
            /* Literal: remember which quote opened it and copy until it recurs. */
            int separator = c;
            fputc(c, out);
            while ((c = mygetc(in)) != separator && c != EOF) {
                fputc(c, out);
                if (c == '\\') {
                    /* Copy the escaped character blindly so an escaped
                     * quote does not end the literal. */
                    c = mygetc(in);
                    if (c == EOF)
                        break;
                    fputc(c, out);
                }
            }
            if (c == EOF)
                break;
            fputc(c, out);      /* closing quote */
            c = mygetc(in);
        } else if (c == '/') {
            int next = mygetc(in);

            if (next == '/' || next == '*') {
                c = (next == '/') ? skip_line_comment(in)
                                  : skip_block_comment(in);
                if (c == EOF)
                    break;
                fputc(c, out);  /* the comment's replacement character */
                c = mygetc(in);
            } else {
                /* Plain slash: emit it and examine `next` on the next pass,
                 * so a quote or another slash right after it is still seen. */
                fputc('/', out);
                c = next;
            }
        } else {
            fputc(c, out);
            c = mygetc(in);
        }
    }
}

/*
 * Strip the comments from the file `inName` into a sibling output file.
 *
 * The output name is `inName` with a trailing ".c" removed, if present,
 * and ".wc" appended.
 *
 * Returns 0 on success.  Returns 1, after a message on stderr, when
 * either file cannot be opened, when reading the input fails, or when
 * the output cannot be written completely.
 */
int strip_file(const char *inName) {
    FILE *in;
    FILE *out;
    int len = strlen(inName);
    int res = 0;

    if (len >= 2 && !strcmp(inName + len - 2, ".c"))
        len -= 2;

    /* Room for the stem, ".wc" and the terminating NUL. */
    char outName[len + 4];
    snprintf(outName, sizeof outName, "%.*s.wc", len, inName);

    if ((in = fopen(inName, "r")) == NULL) {
        fprintf(stderr, "cannot open %s: %s\n", inName, strerror(errno));
        return 1;
    }
    if ((out = fopen(outName, "w")) == NULL) {
        fprintf(stderr, "cannot open %s: %s\n", outName, strerror(errno));
        fclose(in);
        return 1;
    }

    removeComments(in, out);

    if (ferror(in)) {
        fprintf(stderr, "error reading %s\n", inName);
        res = 1;
    }
    fclose(in);

    /* Sample the error flag before closing; fclose itself flushes buffered
     * output, so a failure there also means the output is incomplete. */
    int writeFailed = ferror(out);
    if (fclose(out) != 0 || writeFailed) {
        fprintf(stderr, "error writing %s\n", outName);
        res = 1;
    }
    return res;
}

/*
 * Entry point: strip the comments from every file named on the command
 * line, or from "test.c" when no file is named.
 *
 * Returns the bitwise OR of the strip_file() results.
 */
int main(int argc, char *argv[]) {
    int res = 0;

    /* No arguments: fall back to the default input file. */
    if (argc <= 1)
        return strip_file("test.c");

    for (int i = 1; i < argc; i++)
        res |= strip_file(argv[i]);

    return res;
}