Why do I have different line counts?

178 views Asked by At

I wrote the following programs in different programming languages to count the number of lines in a file, and it turns out that the output differs depending on the program. The strange thing is that some of the programs agree with each other. I tested them with a 6 GB UTF-8 XML file with about 146 million lines.

# Python
# Output -> 146114085 lines
import time

started_at = time.perf_counter()

# Iterating a text-mode file yields one item per line, so counting the items
# counts the lines without keeping any of them in memory.
with open('file_path') as source:
    line_total = sum(1 for _ in source)

print("{} lines".format(line_total))

finished_at = time.perf_counter()

# The measured interval includes printing the line count above.
print(f'Elapsed time: {finished_at - started_at:.3f} seconds')
// Java
// Output -> 146114085 lines (just as with python)

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

public class Main {
    /**
     * Counts the lines of the hard-coded input file and prints the count
     * followed by the elapsed time in whole seconds.
     *
     * @param args command-line arguments (unused)
     * @throws RuntimeException wrapping the {@link IOException} raised when the
     *         file cannot be opened or closed
     */
    public static void main(String[] args) {
        long startTime = System.currentTimeMillis();
        final int BUFFER_SIZE = 1024 * 1024;
        String filePath = "file_path";

        long lines;
        // try-with-resources closes the reader even if counting fails part-way.
        try (BufferedReader reader = new BufferedReader(new FileReader(filePath), BUFFER_SIZE)) {
            lines = reader.lines().count();
        } catch (IOException e) {
            // FileNotFoundException is a subclass of IOException, so one handler covers both.
            throw new RuntimeException(e);
        }

        System.out.println("The number of lines is " + lines);
        long elapsedTime = System.currentTimeMillis() - startTime;
        // Integer division: the duration is reported in whole seconds.
        System.out.println("Duration in seconds: " + elapsedTime/1000);
    }
}
// Rust
// Output -> 146113746 lines
use std::fs::File;
use std::io::{BufRead, BufReader, Error, Read};
use std::time::Instant;

/// Times a line count of the hard-coded input file and reports the duration.
fn main() {
    const BUFFER_SIZE: usize = 1024 * 1024;
    let path = "file_path";

    let timer = Instant::now();
    // A failure is printed to stdout; the timing report below still runs.
    match read_file(BUFFER_SIZE, path) {
        Ok(()) => {}
        Err(failure) => println!("{}", failure),
    }
    let elapsed = timer.elapsed();

    // `as_secs` reports whole seconds only.
    println!("The function took {} seconds to execute", elapsed.as_secs());
}

/// Counts the lines of the file at `file_path` and prints the total to stdout.
///
/// A line is a run of bytes terminated by `\n`; a final run without a trailing
/// `\n` also counts as a line. A lone `\r` is not treated as a line break.
/// The file is read as raw bytes through a buffer of `buffer_size` bytes, so no
/// UTF-8 validation is performed.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened or if a read
/// fails part-way through.
fn read_file(buffer_size: usize, file_path: &str) -> Result<(), Error> {
    let file = File::open(file_path)?;
    let mut reader = BufReader::with_capacity(buffer_size, file);

    // One buffer reused for every line avoids allocating a `String` per line.
    let mut line = Vec::new();
    // Explicit 64-bit counter: an untyped `0` would default to `i32`.
    let mut lines: u64 = 0;

    // `read_until` returns 0 only at end of file; `?` stops on a read error
    // instead of counting the error as a line.
    while reader.read_until(b'\n', &mut line)? != 0 {
        lines += 1;
        line.clear();
    }

    println!("Number of lines {}", lines);
    Ok(())
}
// C
// Output -> 146113745 lines (one line less than rust output)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Counts the '\n' bytes in the hard-coded input file, reading it in 1 MiB
 * chunks, then prints the count and the processor time used.
 *
 * Returns 0 on success and EXIT_FAILURE if the file cannot be opened, the
 * buffer cannot be allocated, or a read error occurs.
 */
int main(int argc, char *argv[]) {
    (void) argc;
    (void) argv;

    // start time (clock() measures processor time, not wall-clock time)
    clock_t start = clock();

    // File path
    const char* file_path = "file_path";

    // Open the file for reading
    FILE *fp = fopen(file_path, "r");
    if (fp == NULL) {
        perror(file_path);
        return EXIT_FAILURE;
    }

    // Allocate a buffer to hold the data
    const size_t BUFFER_SIZE = 1024*1024;
    char *buffer = malloc(BUFFER_SIZE);
    if (buffer == NULL) {
        perror("malloc");
        fclose(fp);
        return EXIT_FAILURE;
    }

    // Number of lines; wide enough for files with more than UINT_MAX lines
    unsigned long long lines = 0;

    // Read the data in chunks. Looping on fread's return value (rather than
    // on !feof) also terminates when a read error occurs.
    size_t bytes_read;
    while ((bytes_read = fread(buffer, 1, BUFFER_SIZE, fp)) > 0) {
        for (size_t i = 0; i < bytes_read; i++) {
            if (buffer[i] == '\n') {
                lines++;
            }
        }
    }

    // fread returns 0 both at end of file and on error; tell them apart.
    if (ferror(fp)) {
        perror(file_path);
        free(buffer);
        fclose(fp);
        return EXIT_FAILURE;
    }

    printf("The number of lines %llu\n", lines);

    // Clean up
    free(buffer);
    fclose(fp);

    // End
    clock_t end = clock();

    // Convert to double before dividing so the fractional part is kept
    double elapsed = (double) (end - start) / CLOCKS_PER_SEC;

    printf("Elapsed time: %f seconds", elapsed);

    return 0;
}

Finally, the wc command (wc -l file_path). Output -> 146113745 lines (just as with C).

I think the correct answer is Rust's, because it reports one more line than wc/C: the extra line is the last one, which has no line break because it runs up to the end of the file. The cases that confuse me are Java and Python.

1

There is 1 answer

0
Sebasos On

My regular expression for a line is .*?\n|.+. This works in https://regexr.com/.

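The file-reading implementations I'm using in Python and Java interpret the character '\r' as a line break, but this doesn't happen with the Rust implementation, nor with wc, and obviously not with the one I wrote in C (where the check for '\n' is explicit). This is by design: Python's text mode uses universal newlines by default, and Java's BufferedReader treats '\n', '\r' and "\r\n" all as line terminators, whereas Rust's lines() splits only on '\n'.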

But if I change the condition (buffer[i] == '\n') to ((buffer[i] == '\n') || (buffer[i] == '\r')), I get the same value as in Python and Java minus 1.