Word-wrap issue

133 views Asked by At

I'm very new to C and programming in general, and I'm currently trying to write a program that word-wraps a paragraph of text so that no line is longer than a certain size. The readfile function reads the lines from a text file and puts them into an array of strings called text, where each element is one line of the file. I then have to write code that creates a new array of strings called newtext, where each element is a word-wrapped line no longer than the linewidth variable. My current issue is that my code generates output that is slightly off from the expected output, and I'm not certain why. Here is the expected output: expected output

And here's my output: myoutput

I've tried adjusting the end index and writing a separate loop for skipping empty spaces, but nothing seems to fix this particular error.

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Read a text file into a freshly allocated array of strings, one per line.
 *
 * filename: path of the file to read.
 * bufp:     receives the array of lines (NULL when nothing was read). Each
 *           element and the array itself are malloc'd; the caller frees them.
 *
 * Returns the number of lines stored. Returns 0 with *bufp == NULL when the
 * file cannot be opened. If memory runs out, the lines read so far are kept.
 *
 * A trailing '\n' is stripped from each line. Input lines longer than the
 * 999-character read buffer are delivered as several consecutive entries.
 */
int readfile(char* filename, char*** bufp)
{
    char** buf = NULL;
    int numlines = 0;
    char tmp[1000];

    if (bufp == NULL) {
        return 0;
    }
    *bufp = NULL;
    if (filename == NULL) {
        return 0;
    }

    FILE* fp = fopen(filename, "r");
    if (fp == NULL) {
        return 0;
    }

    while (fgets(tmp, sizeof(tmp), fp)) {
        /* Grow the pointer array 16 slots at a time. */
        if (numlines % 16 == 0) {
            char** grown = realloc(buf, ((size_t)numlines + 16) * sizeof(char*));
            if (grown == NULL) {
                break; /* keep what we already have */
            }
            buf = grown;
        }

        /* Only drop the last character when it really is the newline:
         * the final line of a file (or an over-long chunk) may not have one. */
        size_t len = strlen(tmp);
        if (len > 0 && tmp[len - 1] == '\n') {
            tmp[--len] = '\0';
        }

        buf[numlines] = malloc(len + 1);
        if (buf[numlines] == NULL) {
            break;
        }
        memcpy(buf[numlines], tmp, len + 1);
        numlines++;
    }

    fclose(fp);
    *bufp = buf;
    return numlines;
}

/* Write the first `numlines` strings of `lines` to stdout, each followed by
 * a newline. Nothing is written when `numlines` is zero or negative. */
void print_text(char** lines, int numlines) {
    int idx = 0;

    while (idx < numlines) {
        puts(lines[idx]);
        idx += 1;
    }
}

/*
 * Usage: <program> <file> <linewidth>
 *
 * Reads <file> with readfile(), splits every input line into segments of at
 * most <linewidth> characters, and prints one segment per output line.
 *
 * A segment ends at the last space that keeps it within the width. Spaces
 * between two segments are dropped. A single word longer than the width is
 * cut at exactly <linewidth> characters, so the loop always makes progress.
 * Each input line is wrapped on its own; empty input lines produce no output.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on bad arguments or out-of-memory.
 */
int main(int argc, char** argv) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s <file> <linewidth>\n",
                (argc > 0 && argv[0] != NULL) ? argv[0] : "wordwrap");
        return EXIT_FAILURE;
    }

    /* strtol (unlike atoi) lets us reject "abc", "12x", 0 and negatives. */
    char* endp = NULL;
    long linewidth = strtol(argv[2], &endp, 10);
    if (endp == argv[2] || *endp != '\0' || linewidth <= 0) {
        fprintf(stderr, "linewidth must be a positive integer, got \"%s\"\n", argv[2]);
        return EXIT_FAILURE;
    }

    char** text = NULL;
    int numlines = readfile(argv[1], &text);

    char** newtext = NULL;
    int newnumlines = 0;
    int status = EXIT_SUCCESS;

    for (int i = 0; i < numlines && status == EXIT_SUCCESS; i++) {
        const char* line = text[i];
        long length = (long)strlen(line);
        long start = 0;

        while (start < length) {
            long end;

            if (length - start <= linewidth) {
                /* Everything that is left fits on one output line. */
                end = length;
            } else {
                /* line[end] is the first character that no longer fits.
                 * Back up to the nearest space so no word is split. */
                end = start + linewidth;
                while (end > start && line[end] != ' ') {
                    end--;
                }
                /* No space found: the word is wider than the line, so cut
                 * it at the width instead of emitting an empty segment
                 * forever. */
                if (end == start) {
                    end = start + linewidth;
                }
            }

            /* Keep the old array valid if growing it fails. */
            char** grown = realloc(newtext, ((size_t)newnumlines + 1) * sizeof(char*));
            if (grown == NULL) {
                status = EXIT_FAILURE;
                break;
            }
            newtext = grown;

            size_t seglen = (size_t)(end - start);
            char* newline = malloc(seglen + 1);
            if (newline == NULL) {
                status = EXIT_FAILURE;
                break;
            }
            memcpy(newline, line + start, seglen);
            newline[seglen] = '\0';
            newtext[newnumlines++] = newline;

            /* The next segment starts at the next non-space character. */
            start = end;
            while (start < length && line[start] == ' ') {
                start++;
            }
        }
    }

    if (status == EXIT_SUCCESS) {
        for (int i = 0; i < newnumlines; i++) {
            printf("%s\n", newtext[i]);
        }
    } else {
        fprintf(stderr, "out of memory while wrapping text\n");
    }

    /* Release both the input lines and the wrapped lines. */
    for (int i = 0; i < numlines; i++) {
        free(text[i]);
    }
    free(text);

    for (int i = 0; i < newnumlines; i++) {
        free(newtext[i]);
    }
    free(newtext);

    return status;
}
3

There are 3 answers

0
greg spears On

Here is some wordwrap code that works nicely, and adheres to the rules mentioned in my comments to your question:

1.) Uses one continuous, single dimension array of text so that nothing but the end of the array will interrupt a paragraph.

2.) Ignores/strips all the existing newlines ('\n') in the source text/array. EDIT: per comments from user Fe2O3, updated code to translate newlines to spaces.

3.) Calls next_break() to do a 'lookahead' from the current offset to anticipate if the next natural break (a space, period, etc) comes before or after the wrap margin we intend to enforce.

Runnable, tested code is here. https://godbolt.org/z/hvY19Tq7x

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define TRUE 1
#define FALSE 0

/*--------------------------------------------------------------------------

    next_break()

    Scans str from its first character and returns the offset of the first
    break opportunity: a space, a newline, a tab, a hyphen, or the
    terminating NUL. A result of 0 means str begins with one of those
    characters (or is empty).
*--------------------------------------------------------------------------*/
int next_break(const char * str)
{
    const char *cursor = str;

    for(;;)
    {
        const char c = *cursor;

        if(c == '\0' || c == ' ' || c == '\n' || c == '\t' || c == '-')
            return (int)(cursor - str);
        cursor++;
    }
}

/*-------------------------------------------------------------------------------------

    wordwrap()

    Algo: copies src into a new buffer one character at a time. At every space,
    hyphen, newline or tab it calls next_break() to find how far away the following
    break opportunity is. If the text up to that next opportunity would reach the
    margin, a newline is requested at the current position. A requested newline is
    only written when the line is already mostly full (remaining/used < 0.25);
    otherwise the line continues and is cut when it reaches max_line_len
    (an "ugly" break).

    Newlines in src are handled exactly like spaces, carriage returns are dropped,
    and tabs are expanded with SPACE up to the next multiple of 4 columns.

    Inputs: const char *src     text needing word wrap formatting; NULL yields NULL.
            int max_line_len    wrap margin in characters.
            int *ugly_breaks    incremented each time a line is cut because it
                                reached max_line_len; may be NULL if not wanted.

    Returns a malloc'd, NUL-terminated buffer holding the formatted text, which the
    caller must free, or NULL if src is NULL or the buffer cannot be allocated.
*-------------------------------------------------------------------------------------*/
#ifndef SPACE
#define SPACE ' '   /* Fill character written when a tab is expanded. */
#endif

char *wordwrap(const char *src, const int max_line_len, int *ugly_breaks)
{
    size_t src_idx = 0, dest_idx = 0, src_len;
    int cur_line_len = 0, done = FALSE;
    char ch;
    char *dest;
    int new_line_needed = FALSE;

    if(!src)
        return NULL;

    /* Worst case for one source character is a tab expanded to 4 fill
     * characters followed by one newline: 5 bytes. Add 1 for the terminator,
     * which also keeps the buffer non-empty when src is "". */
    src_len = strlen(src);
    if(src_len > ((size_t)-1 - 1) / 5)
        return NULL;
    dest = malloc(src_len * 5 + 1);

    if(!dest)
    {
        printf("Memory Allocation error in wordwrap");
        return NULL;
    }

    while(!done)
    {
        ch = src[src_idx];
        switch(ch)
        {
            case 0:
                done = TRUE;
            break;

            case (char)'\n': /* translate newlines to space per comment user Fe2O3 on SO */
                ch = ' ';
                /* fallthrough: the space must be counted and is a break oppty too */
            case (char)' ':
            case (char)'-':
                dest[dest_idx++]=ch;
                cur_line_len++;
                /* Would the next break oppty put us past the margin/line limit? */
                if(cur_line_len + next_break(&src[src_idx+1]) >= max_line_len)
                {
                    /* A: Yes.  Take the break oppty here, Now*/
                    new_line_needed = TRUE;
                }
            break;

            case (char)'\r': /* and carriage return. Strip them */
            break;

            case (char)'\t': /* Tab, replace with space(s)*/

                    if(cur_line_len+1 + next_break(&src[src_idx+1]) >= max_line_len)
                    {
                        /* tab is the last character of current line.  Rare but we have to care for it.
                         * NOTE(review): if this newline request is rejected by the
                         * percent_remain test below, the tab produces no output at
                         * all and the words on either side are joined. */
                        new_line_needed = TRUE;
                    }
                    else
                    {
                        /* Replace the 4s here with any tab stop you like. 8 is standard.*/
                        int to_add = 4-((cur_line_len)%4);

                        while(to_add-- && cur_line_len < max_line_len)
                        {
                            dest[dest_idx++]=SPACE;  /* Adaptable space replacement char */
                            cur_line_len++;
                        }
                    }
            break;

            default:
                dest[dest_idx++]=ch;
                cur_line_len++;
            break;
        }

        /* Has one of our cases flagged a need for newline? */
        if(new_line_needed)
        {
            int space_remaining = (max_line_len-cur_line_len);
            double percent_remain = 0.0;

            new_line_needed = FALSE;

            /* Lower the 0.25 tolerance to get more greedy with space
            * conservation but get more ugly word breaks. Raise it and fewer
            * requested newlines are rejected, so fewer words are cut ugly --
            * unless you encounter a Huge word that is longer than your margin limit.
            */
            if(cur_line_len > 0 )
                    percent_remain = (double)space_remaining/cur_line_len;
            if(percent_remain < 0.25)
            {
                /* Not much space remaining, we can newline here */
                dest[dest_idx++]='\n';
                cur_line_len = 0;
            }
        }
        /*check for margin overflow with every loop. */
        if(cur_line_len >= max_line_len)
        {
            /* We have or will overflow with next char.
            * This is called breaking the word ugly.*/
            dest[dest_idx++]='\n';
            cur_line_len = 0;
            /* Track ugly breaks for tolerance & adjusting newline rejections*/
            if(ugly_breaks)
                (*ugly_breaks)++;
        }
        src_idx++;
    }
    dest[dest_idx++]='\0';   /* cap it */
    return dest;
}
0
Neil On

readfile is complicated: it reads multiples of fixed size (1000), hoping that this is enough, then copies this into another array of pointers. Words are scattered across multiple objects. I've modified the code below to read in all stdin at once in a single object, in exponentially increasing granularity. Then, a state-machine collapses words separated by standard isspace characters to a single space in-place.

#include <stdlib.h>
#include <stdio.h>
#include <string.h> /* memmove */

/** Reads all of stdin into one heap buffer, then rewrites it in place so that
 every run of whitespace (or NUL) bytes between words becomes a single space
 and leading/trailing whitespace is removed.
 @return The flattened, NUL-terminated text, which you must `free`. On a
 `realloc` failure or a stdin read error the buffer is freed and a null
 pointer is returned; `errno` is expected to describe the failure (haven't
 tested esp. non-POSIX). */
static char *readfile(void) {
    /* Growable buffer; capacity is doubled before each allocation, so the
     first allocation is 128 bytes. */
    struct { char *data; size_t capacity, size; } a = { 0, 64, 0 };
    /* Allocate and read exponentially more until the end of stdin. */
    size_t nread, nwant;
    do {
        char *redata;
        if(!(redata = realloc(a.data, a.capacity *= 2))) { free(a.data); return 0; }
        /* Fill the free space except for one byte kept back for the final
         terminator. */
        a.size += (nread = fread((a.data = redata) + a.size, 1, nwant = a.capacity - a.size - 1, stdin));
    } while(nread == nwant); /* A short read means end of input or an error. */
    if(ferror(stdin)) { free(a.data); return 0; }
    /* Collapse one space between words lazily. `space` is the write position
     in the compacted text; `word` marks where the current word starts in the
     not-yet-compacted text. Because `space` never gets ahead of `word`, the
     copy can be done in place with `memmove`. */
    enum { NONE, WORD, SPACE } state = NONE;
    char *space = a.data, *word = 0 /* Unrealizable. */;
    for(char *c = space, *end = c + a.size; c < end; c++) switch(*c) {
    case '\0': case ' ': case '\f': case '\n': case '\r': case '\t': case '\v': /* Null/space. */
        if(state == WORD) { /* Force: a word just ended, move it into place. */
            if(space != a.data) *space++ = ' '; /* Separator, except before the first word. */
            memmove(space, word, c - word);
            space += c - word;
        }
        state = SPACE;
        break;
    default: /* Word. */
        if(state != WORD) word = c; /* Remember where this word begins. */
        state = WORD;
        break;
    }
    if(state == WORD) { /* Final force: input ended in the middle of a word. */
        if(space != a.data) *space++ = ' ';
        memmove(space, word, a.data + a.size - word);
        space += a.data + a.size - word;
    }
    *space = '\0'; /* Nul-terminate the string; -1 ensures capacity above. */
    return a.data;
}

/* Reads all of stdin through readfile() and reports the flattened text on
 stderr. Exits with EXIT_FAILURE, after a perror() message, when readfile()
 returns a null pointer. */
int main(void) {
    char *flattened;
    int exit_code;

    fprintf(stderr, "Read:\n");
    flattened = readfile();
    if(flattened) {
        fprintf(stderr, "Flattened: %s\n", flattened);
        exit_code = EXIT_SUCCESS;
    } else {
        perror("stdin");
        exit_code = EXIT_FAILURE;
    }
    free(flattened); /* Harmless when flattened is null. */
    return exit_code;
}

Feed the program its input on stdin, ending with EOF (Ctrl-D on a terminal, or redirect a file). readfile then allocates a string big enough to hold everything and fills it with the non-whitespace words separated by single spaces. Maybe this will help transform the problem into a more readily solvable form.

0
Fe2O3 On

Writing code using dynamic memory allocation and arrays of pointers suggests that "new to C and programing in general" undersells your current progress.

I'm not going to address your code trying to find a particular problem that is causing your headache. In short, the code you've posted has too much complexity in its ancillary aspects. You've begun with reading segments of a text into separately allocated buffers, then, with all that "junior assistant" seeming to work, you face the challenge of dealing with that complexity while trying to achieve the original objective... not following the principles of KISS!

I'd written "Use a simple loop" because a "good" program could be used as a light-weight filter to quickly process files larger than available memory.

The following is intended as a tutorial in the incremental development of a solution to a task. (All new projects should evolve from a "HelloWorld.c" seed.)

In the interests of an MRE, I use a compile-time string instead of reading one character-at-a-time from a file with fgetc(). For a utility program, I find this saves a lot of time and manual manipulation for testing. And, the "logic" is in the body of a single function (main()). This response is for demonstration purposes only.

The source text used:

/* Sample paragraph used as the input text by the snippets that follow.
   Adjacent string literals are concatenated into one string; every source
   line ends in '\n', so the text also ends with a newline. */
char *str =
    "It was the best of times, it was the worst of times,\n"
    "it was the age of wisdom, it was the age of foolishness,\n"
    "it was the epoch of belief, it was the epoch of incredulity,\n"
    "it was the season of Light, it was the season of Darkness,\n"
    "it was the spring of hope, it was the winter of despair,\n"
    "we had everything before us, we had nothing before us,\n"
    "we were all going direct to Heaven, we were all going direct the other way-in short,\n"
    "the period was so far like the present period,\n"
    "that some of its noisiest authorities insisted on its being received,\n"
    "for good or for evil,\n"
    "in the superlative degree of comparison only.\n";

(The single hyphen should be an "emdash" surrounded by whitespace, but I've followed your text here. Further, the online source scraped uses "Light" where your text uses "Life". This difference is inconsequential in this instance.)

Version 1: Merely detect "word" boundaries (aka "hello world"):
(Header #includes and the main() wrapper will not be repeated below.)

#include <stdio.h>
#include <string.h> // for memmove()
#include <ctype.h> // for isspace()

// Version 1: walk the text one character at a time and write it to stdout,
// replacing every whitespace character with a line feed.
int main( void ) {
    // NOTE: the comment below is a placeholder; the string initializer (and
    // its terminating ';') must be pasted in before this compiles.
    char *str = /* as shown above */
    for( unsigned int ch; ( ch = *str++ ) != '\0'; ) // simulate fgetc()
        if( isspace( ch ) ) {
            ch = '\n'; // any whitespace becomes LF
            putchar( ch );
        } else {
            putchar( ch );
        }
    // Result: well formatted with one "word" per output line
    // A "word" includes any/all punctuation (or digits)
    return 0;
}

Version 2: Consolidate consecutive whitespace

// Alter first line source text inserting consecutive whitespace to suppress:
//  "It was the best of times, it was the worst of times,\n"
    "  \n  It was the best of times, \n\n  \n it was the worst of times,\n\n"

    // Version 2: as Version 1, but a whitespace character is only written (as
    // LF) when the previous character was not whitespace. chPrv holds the
    // previous character after whitespace has been mapped to '\n'.
    // NOTE(review): '\n ' (LF followed by a space) is a multi-character
    // constant whose value is implementation-defined. If plain '\n' was
    // intended, whitespace at the very start of the text would be suppressed
    // here as well -- confirm which was meant.
    for( unsigned int ch, chPrv = '\n '; ( ch = *str++ ) != '\0'; chPrv = ch )
        if( isspace( ch ) ) {
            ch = '\n'; // any whitespace becomes LF
            if( chPrv != '\n' ) {
                putchar( ch );
            }
        } else {
            putchar( ch );
        }
    // Result: Same as above, but consolidation of consecutive whitespace noted.

Version 3: Suppress leading whitespace

    // Version 3: as Version 2, plus oCnt counts the characters written so far.
    // While oCnt is still 0 no LF is written, so whitespace before the first
    // word produces no output.
    int oCnt = 0;
    // The initial value of chPrv is never acted on here: the test below also
    // requires oCnt != 0, and chPrv has been overwritten by then.
    for( unsigned int ch, chPrv = '\n '; ( ch = *str++ ) != '\0'; chPrv = ch )
        if( isspace( ch ) ) {
            ch = '\n'; // any whitespace becomes LF
            if( chPrv != '\n' && oCnt ) {
                putchar( ch );
                oCnt++;
            }
        } else {
            putchar( ch );
            oCnt++;
        }

Version 4: Buffer characters of a "word", only outputting after assembled

    // Version 4: same output as Version 3, but the characters of a word are
    // collected in wBuf and written in one go when the word ends.
    int oCnt = 0;
    // NOTE(review): nothing below checks wbCnt against the size of wBuf, so a
    // run of more than 64 non-whitespace characters writes past the buffer.
    char wBuf[ 64 ]; // a "word" buffer that is "big enough"
    int wbCnt = 0; // count of buffered characters
    for( unsigned int ch, chPrv = '\n '; ( ch = *str++ ) != '\0'; chPrv = ch )
        if( isspace( ch ) ) {
            ch = '\n'; // any whitespace becomes LF
            if( chPrv != '\n' && oCnt ) {
                // First whitespace after a word: flush the word, then the LF.
                if( wbCnt )
                    printf( "%.*s", wbCnt, wBuf ); // the "word" in the buffer
                wbCnt = 0;
                putchar( ch ); // still need a LF output here
                oCnt++;
            }

        } else {
        //  putchar( ch ); /* Nope! */
            wBuf[ wbCnt++] = (char)ch; /* Yep! */
            oCnt++;
        }
    // The text may end without trailing whitespace; flush what is left.
    if( wbCnt )
        printf( "%.*s\n", wbCnt, wBuf ); // last "word" left in buffer (and a LF)

Version 5: Trickier, now...
Use a single buffer and indices to accumulate multiple words.
Detect when margin overrun occurs and accommodate line breaking.
Added slashes to show extent of output string segments.

    // Version 5: build a whole output line in wBuf.
    //   wBuf[0 .. wbAt)            finished words, each followed by one space
    //   wBuf[wbAt .. wbAt + wbCnt) the word currently being read
    // When a word ends and the line including it would exceed 20 characters,
    // the finished words are printed and the new word is moved to the front
    // of the buffer to start the next line.
    int oCnt = 0;
    // NOTE(review): writes into wBuf are not bounds-checked.
    char wBuf[ 128 ]; // a "line" buffer that is "big enough"
    int wbCnt = 0;
    int wbAt = 0; // jumping index of start of "word" that could be last for output line

    puts( "/....V....X....V....X/" ); // primitive ruler of 20 character width
    for( unsigned int ch, chPrv = '\n '; ( ch = *str++ ) != '\0'; chPrv = ch )
        if( isspace( ch ) ) {
            ch = '\n';
            if( chPrv != '\n' && oCnt ) {
                if( wbAt + wbCnt > 20 ) {
                    // wbAt - 1 drops the space that follows the last finished word.
                    // NOTE(review): if the first word on a line is itself longer
                    // than 20, wbAt is 0 here and the precision is -1.
                    printf( "/%.*s/\n", wbAt - 1, wBuf ); // the "word(s)" in the buffer
                    memmove( wBuf, wBuf + wbAt, wbCnt ); // perhaps a "ring buffer" instead?
                    wbAt = wbCnt;
                    wBuf[ wbAt++] = ' ';
                    wbCnt = 0;
                } else if( wbCnt ) {
                    // The word fits: close it with a space and make it part of
                    // the finished region.
                    wBuf[ wbAt + wbCnt++] = ' ';
                    wbAt += wbCnt;
                    wbCnt = 0;
                }
                oCnt++;
            }
        } else {
            wBuf[ wbAt + wbCnt++ ] = (char)ch;
            oCnt++;
        }
    // NOTE(review): when nothing was buffered (empty or all-whitespace text)
    // wbAt + wbCnt is 0 and this reads wBuf[-1].
    if( wBuf[ wbAt + wbCnt - 1 ] == ' ' ) // did src end with LF or CRLF??
        wbCnt -= 1;

    if( wbAt + wbCnt )
        printf( "/%.*s/\n", wbAt + wbCnt, wBuf ); // last "word(s)" left in buffer (and a LF)

"Working" version: after a bit of tidying

/* #include headers omitted */
/* Word-wraps the text in str to MARGIN columns and prints each output line
 * between slashes.
 *
 * Buffer layout while a line is being built:
 *   wBuf[0 .. wbAt)            finished words, each followed by one space
 *   wBuf[wbAt .. wbAt + wbCnt) the word currently being read
 *
 * A word longer than MARGIN is printed on a line of its own; a word too long
 * for wBuf is split into buffer-sized pieces.
 */
int main( void ) {
/* char *str = ... omitted */
    enum { MARGIN = 20 }; // maximum output line width
    char wBuf[ 128 ], *fmt = "/%.*s/\n";
    int oCnt = 0, wbAt = 0, wbCnt = 0;

    // The cast keeps characters above 127 non-negative, as fgetc() would and
    // as isspace() requires. chPrv's initial value is never acted on because
    // the test below also needs oCnt != 0.
    for( unsigned int ch, chPrv = '\n'; ( ch = (unsigned char)*str++ ) != '\0'; chPrv = ch )
        if( isspace( ch ) ) {
            ch = '\n';
            if( chPrv != '\n' && oCnt ) {
                if( wbAt + wbCnt > MARGIN ) {
                    // The word just read does not fit: print the finished
                    // words (without their trailing space) and start the next
                    // line with this word. When wbAt is 0 the word is the
                    // first on its line and there is nothing to print yet.
                    if( wbAt > 0 )
                        printf( fmt, wbAt - 1, wBuf );
                    memmove( wBuf, wBuf + wbAt, wbCnt );
                    wbAt = wbCnt, wbCnt = 0;
                    wBuf[ wbAt++] = ' ';
                } else if( wbCnt ) {
                    wBuf[ wbAt + wbCnt++] = ' ';
                    wbAt += wbCnt, wbCnt = 0;
                }
                oCnt++;
            }
        } else {
            // Keep one slot free for the space that will follow this word.
            if( wbAt + wbCnt >= (int)sizeof wBuf - 1 ) {
                if( wbAt > 0 ) { // make room by emitting the finished words
                    printf( fmt, wbAt - 1, wBuf );
                    memmove( wBuf, wBuf + wbAt, wbCnt );
                    wbAt = 0;
                } else { // one word fills the whole buffer: emit this piece
                    printf( fmt, wbCnt, wBuf );
                    wbCnt = 0;
                }
            }
            wBuf[ wbAt + wbCnt++ ] = (char)ch;
            oCnt++;
        }

    // Text ended in the middle of a word that does not fit on the current line.
    if( wbCnt && wbAt && wbAt + wbCnt > MARGIN ) {
        printf( fmt, wbAt - 1, wBuf );
        memmove( wBuf, wBuf + wbAt, wbCnt );
        wbAt = 0;
    }
    // Text ended with whitespace: drop the space after the last word. The
    // length test keeps an empty buffer from being indexed at -1.
    if( wbAt + wbCnt > 0 && wBuf[ wbAt + wbCnt - 1 ] == ' ' )
        wbCnt -= 1;
    if( wbAt + wbCnt > 0 )
        printf( fmt, wbAt + wbCnt, wBuf );

    return 0;
}

Left as an exercise is "tarting up" this functionality to process from, I suggest, stdin (to act as a filter), using argv[1] to specify the output margin width desired and adding code to prevent buffer overrun if the source contains "unreasonably" long, unbroken runs of non-whitespace characters.

I'd written "simple loop" and hope this exposition of an evolution helps further you on your programming pathway.