I made these different programs in different programming languages to count the number of lines of a file, and it turns out that the output differs according to the program, but the strange thing is that some programs have the same results, I was testing them with a 6gb utf-8 xml file with about 146 million lines.
# Python
# Output -> 146114085 lines
import time
lines = 0
start = time.perf_counter()
with open('file_path') as myfile:
for line in myfile:
lines += 1
print("{} lines".format(lines))
end = time.perf_counter()
elapsed = end - start
print(f'Elapsed time: {elapsed:.3f} seconds')
// Java
// Output -> 146114085 lines (just as with python)
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
public class Main {
public static void main(String[] args) {
try {
long startTime = System.currentTimeMillis();
int BUFFER_SIZE = 1024*1024;
String filePath = "file_path";
FileReader file = file = new FileReader(filePath);
BufferedReader reader = new BufferedReader(file, BUFFER_SIZE);
long lines = reader.lines().count();
reader.close();
System.out.println("The number of lines is " + lines);
long elapsedTime = System.currentTimeMillis() - startTime;
System.out.println("Duration in seconds: " + elapsedTime/1000);
} catch (FileNotFoundException e) {
throw new RuntimeException(e);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
// Rust
// Output -> 146113746 lines
use std::fs::File;
use std::io::{BufRead, BufReader, Error, Read};
use std::time::Instant;
fn main() {
let file_path = "file_path";
let buffer_size = 1024*1024;
let start = Instant::now();
if let Err(err) = read_file(buffer_size, file_path) {
println!("{}", err);
}
let duration = start.elapsed();
println!("The function took {} seconds to execute", duration.as_secs());
}
fn read_file(buffer_size: usize, file_path: &str) -> Result<(), Error> {
let file = File::open(file_path)?;
let reader = BufReader::with_capacity(buffer_size, file);
let lines = reader.lines().fold(0, |sum, _| sum + 1);
println!("Number of lines {}", lines);
Ok(())
}
// C
// Output -> 146113745 lines (one line less than rust output)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
int main(int argc, char *argv[]) {
// start time
clock_t start = clock();
// File path
const char* file_path = "file_path";
// Open the file for reading
FILE *fp = fopen(file_path, "r");
// Allocate a buffer to hold the data
const size_t BUFFER_SIZE = 1024*1024;
char *buffer = malloc(BUFFER_SIZE);
// Declare the number of lines variable
unsigned int lines = 0;
// Read the data in chunks
while (!feof(fp)) {
// Read a chunk of data from the file
size_t bytes_read = fread(buffer, 1, BUFFER_SIZE, fp);
// Process the data here...
for (int i = 0; i < bytes_read; i++) {
if (buffer[i] == '\n') {
lines++;
}
}
}
printf("The number of lines %u\n", lines);
// Clean up
free(buffer);
fclose(fp);
// End
clock_t end = clock();
// Calculate the elapsed time in seconds
double elapsed = (double) ((end - start) / CLOCKS_PER_SEC);
printf("Elapsed time: %f seconds", elapsed);
return 0;
}
Finally, the command wc
Output -> 146113745 lines (just as with C)
wc -l file_path
I think the correct answer is Rust's, because it has one more than wc/C, and it is the last line that has no line change as it reaches the end of the file. The cases that cause me confusion are java and python.
My Regex expression for a line is
.*?\\n|.+. This works in https://regexr.com/.For some reason in the file reading implementation I'm using in Python and Java the character
'\r'is interpreted as a line feed, but this doesn't happen with the Rust implementation, nor the wc one and obviously neither with the one I made in C (even when it is explicit).But if I change the conditional
((buffer[i] == '\n')for((buffer[i] == '\n') || (buffer[i] == '\r'))I get the same value as in Python and Java minus 1.