Save simulation data to binary in C

163 views Asked by At

(EDITED) I have a particle simulator in C where I generate files containing the particle state (time, position, and velocity components) at each integration step. The code works well, but the resulting text files can easily grow to gigabytes of data, with billions of lines. My initial idea was to save those files in binary instead, in order to save space. I'm using GSL for the ODEs, and I'm trying to write the binary file with fwrite. The following is a minimal working example:

#include <math.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <gsl/gsl_errno.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_odeiv2.h>
#include <gsl/gsl_rng.h>

int uniformFieldODE(double t, const double s[], double f[], void *params);

/*
 * Integrates the motion of noPrimaries charged particles in uniform E and B
 * fields and stores the trajectory of particle N in "lorentz<N>.bin".
 *
 * Each particle starts from a random position and velocity drawn from
 * gsl_rng_uniform. After every output step of length dt one record holding
 * t and the six state components is appended to the particle's file.
 *
 * Returns EXIT_SUCCESS when every particle was integrated and stored,
 * EXIT_FAILURE when a file could not be created or written, the driver could
 * not be allocated, or the integrator reported an error.
 */
int main() {

    int noPrimaries = 1;

    double q = 1.0;
    double m = 1.0;

    double B[3] = { 0.0, 0.0, 1.0 };
    double E[3] = { 0.1, 0.0, 0.1 };

    double x0[3];                               // Initial position
    double v0[3];                               // Initial velocity

    double t0 = 0.0;
    double tf = 5000.0;
    double dt = 0.05;                           // Interval between stored records

    double h = 1.0e-06;                         // Initial step size handed to the driver
    double epsAbs = 1.0e-08;
    double epsRel = 1.0e-10;

    const char *file = "lorentz";               // Base name of the output files

    int exitCode = EXIT_SUCCESS;

    const gsl_rng_type *T = gsl_rng_ranlxs0;
    gsl_rng *r = gsl_rng_alloc(T);
    gsl_rng_set(r, (unsigned long)time(NULL));

    for (int i = 0; i < noPrimaries; i++) {

        x0[0] = gsl_rng_uniform(r);
        x0[1] = gsl_rng_uniform(r);
        x0[2] = gsl_rng_uniform(r);
        v0[0] = gsl_rng_uniform(r);
        v0[1] = gsl_rng_uniform(r);
        v0[2] = gsl_rng_uniform(r);

        // Build "<file><i+1>.bin" in a fixed buffer; snprintf bounds the write
        // and its return value tells us if the name had to be truncated.
        char fileName[64];
        int nameLen = snprintf(fileName, sizeof fileName, "%s%d.bin", file, i + 1);
        if (nameLen < 0 || (size_t)nameLen >= sizeof fileName) {
            fprintf(stderr, "Error: output file name is too long\n");
            exitCode = EXIT_FAILURE;
            break;
        }

        // Initial conditions & parameters
        const int dimension = 6;          // number of differential equations
        double s[dimension];              // Initial state vector
        int status;                       // status of driver function

        // Parameter block read by uniformFieldODE: q, m, Bx, By, Bz, Ex, Ey, Ez
        double paramsB[8];
        paramsB[0] = q;
        paramsB[1] = m;
        paramsB[2] = B[0];
        paramsB[3] = B[1];
        paramsB[4] = B[2];
        paramsB[5] = E[0];
        paramsB[6] = E[1];
        paramsB[7] = E[2];

        s[0] = x0[0]; s[1] = x0[1]; s[2] = x0[2];
        s[3] = v0[0]; s[4] = v0[1]; s[5] = v0[2];

        // File creation for state storing
        FILE *data = fopen(fileName, "wb");
        if (data == NULL) {
            fprintf(stderr, "Error: unable to open %s for writing\n", fileName);
            exitCode = EXIT_FAILURE;
            break;
        }

        // Integrator configuration
        double t, t_next;
        gsl_odeiv2_system odeSystem;
        odeSystem.function = uniformFieldODE;
        odeSystem.dimension = dimension;
        odeSystem.params = paramsB;

        t = t0;
        gsl_odeiv2_driver *drv;
        drv = gsl_odeiv2_driver_alloc_y_new(&odeSystem, gsl_odeiv2_step_rk8pd, h, epsAbs, epsRel);
        if (drv == NULL) {
            fprintf(stderr, "Error: unable to allocate the ODE driver\n");
            fclose(data);
            exitCode = EXIT_FAILURE;
            break;
        }

        for (t_next = t0 + dt; t_next <= tf; t_next += dt) {

            status = gsl_odeiv2_driver_apply(drv, &t, t_next, s);
            if (status != GSL_SUCCESS) {
                printf("Error: status = %d\n", status);
                exitCode = EXIT_FAILURE;
                break;
            }

            // The record is formatted as text and then the WHOLE 100-byte
            // buffer is written, so each record is followed by whatever bytes
            // sit after the terminating NUL. This is the behaviour the
            // question is about and is intentionally left as is.
            char str[100];
            snprintf(str, sizeof str, "%.5e %.5e %.5e %.5e %.5e %.5e %.5e\n", t, s[0], s[1], s[2], s[3], s[4], s[5]);
            if (fwrite(str, sizeof str, 1, data) != 1) {
                fprintf(stderr, "Error: unable to write to %s\n", fileName);
                exitCode = EXIT_FAILURE;
                break;
            }
        }
        gsl_odeiv2_driver_free(drv);

        // fclose flushes buffered records, so a failure here means lost data.
        if (fclose(data) != 0) {
            fprintf(stderr, "Error: unable to close %s\n", fileName);
            exitCode = EXIT_FAILURE;
        }
        if (exitCode != EXIT_SUCCESS) {
            break;
        }
    }

    return exitCode;
}

/*
 * Right-hand side of the equations of motion for a charged particle in
 * uniform electric and magnetic fields, in the form expected by
 * gsl_odeiv2_system.function.
 *
 *   t      - independent variable; it does not appear in the equations
 *   s      - state vector: s[0..2] position, s[3..5] velocity
 *   f      - output: derivatives of s (velocity, then acceleration)
 *   params - array of 8 doubles: q, m, Bx, By, Bz, Ex, Ey, Ez
 *
 * Always returns GSL_SUCCESS.
 */
int uniformFieldODE(double t, const double s[], double f[], void *params){

    /* Indices into the state / derivative vectors. */
    enum { POS_X, POS_Y, POS_Z, VEL_X, VEL_Y, VEL_Z };

    (void)t; /* avoid unused parameter warning */

    const double *p = params;

    /* Charge-to-mass ratio q / m. */
    const double chargeToMass = p[0] / p[1];

    const double bx = p[2], by = p[3], bz = p[4];
    const double ex = p[5], ey = p[6], ez = p[7];

    /* d(position)/dt is the velocity. */
    f[POS_X] = s[VEL_X];
    f[POS_Y] = s[VEL_Y];
    f[POS_Z] = s[VEL_Z];

    /* d(velocity)/dt = (q/m) * (v x B + E), written out per component. */
    f[VEL_X] = chargeToMass * (bz*s[VEL_Y] - by*s[VEL_Z] + ex);
    f[VEL_Y] = chargeToMass * (bx*s[VEL_Z] - bz*s[VEL_X] + ey);
    f[VEL_Z] = chargeToMass * (by*s[VEL_X] - bx*s[VEL_Y] + ez);

    return GSL_SUCCESS;
}

The problem is that the output still comes out as text, even though the file has a .bin extension. Some unreadable characters show up at the beginning of the lines, but otherwise the output is the same as that of a text file:

1.00000e-04 7.14907e+08 7.14919e+08 2.14475e+08 -1.30229e+08 -9.46995e+06 -9.50139e+06
5せ2.00000e-04 7.14894e+08 7.14918e+08 2.14474e+08 -1.30245e+08 -9.31686e+06 -9.43359e+06

I'm sure that there's a simple way of doing this, but I cannot figure it out. I'll really appreciate any inputs.

From my original post I got feedback that binary might not be the way to go to preserve all the data. So my goal here is to find an output format that is small compared to pure text files.

The files will later be used for data analysis via e.g. Python with numpy and/or pandas.

3

There are 3 answers

1
Bodo On BEST ANSWER

Brad Lanam's answer mentions using libz to compress the output. Unfortunately it does not show an example.

If your program would write text output to stdout, not to a file, like

printf(str, "%.5e %.5e %.5e %.5e %.5e %.5e %.5e\n", t, s[0], s[1], s[2], s[3], s[4], s[5]);

you could use a pipe like

yourprogram | gzip > output.txt.gz

In Python you can probably use the gzip or zlib modules to process a compressed input stream, or you can use gzip in a pipe again, like

gzip -dc output.txt.gz | python_program
0
Brad Lanam On

Edit: Consider using libz to write out a compressed data file.

A very non-portable method is to simply write the entire data structure out to disk. Note that pointers to other data items would need to be written out separately.

Advantages:

  • Very fast to write and read.
  • Saved double precision would be better than text.

Disadvantages:

  • Extremely non-portable. The data may not be transferable to other machines. With doubles in the structure, I wouldn't even try. Using only the integer types defined in <inttypes.h>, and using the same compiler on all machines, with care, I could probably make a semi-portable binary data file.
  • The data cannot be examined manually.
  • All compiler generated padding within the structure will remain. And as Serge pointed out, there may not be a lot of space saved.

This method is best for data that needs to be temporarily saved on the same machine.

Binary data files used to be much more common when computers were much smaller. They did indeed save a lot of space. But developers and users often want to work with the data, and proprietary and un-documented binary data files are not useful.

Saving character strings in binary data files was either fixed length (limitations and space wasted), or variable length (hard to re-write, makes updating the binary data much more difficult). You could also use a fixed length block with variable length strings within it.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Record that is written to and read back from disk as raw bytes. */
typedef struct {
  double    vala;
  double    valb;
} data_t;

/*
 * Writes one data_t to "out.dat", reads it back from the same stream and
 * compares the two copies byte for byte with memcmp.
 *
 * The memcmp result is reported on stderr. Returns 0 when the copies match,
 * 1 when they differ or when any file operation fails.
 */
int
main (int argc, char *argv [])
{
  data_t    d;
  data_t    dt;
  FILE      *fh;
  int       rc;

  /* The command line is not used. */
  (void) argc;
  (void) argv;

  d.vala = 14.6;
  d.valb = 15.6;

  /* "wb+" so the same stream can be rewound and read back. */
  fh = fopen ("out.dat", "wb+");
  if (fh == NULL) {
    fprintf (stderr, "fail: unable to open file\n");
    return 1;
  }

  if (fwrite (&d, sizeof (data_t), 1, fh) != 1) {
    fprintf (stderr, "fail: unable to write\n");
    fclose (fh);
    return 1;
  }
  if (fseek (fh, 0, SEEK_SET) != 0) {
    fprintf (stderr, "fail: unable to seek\n");
    fclose (fh);
    return 1;
  }
  if (fread (&dt, sizeof (data_t), 1, fh) != 1) {
    fprintf (stderr, "fail: unable to read\n");
    fclose (fh);
    return 1;
  }
  if (fclose (fh) != 0) {
    fprintf (stderr, "fail: unable to close file\n");
    return 1;
  }

  rc = memcmp (&d, &dt, sizeof (data_t));
  fprintf (stderr, "comparison: rc: %d\n", rc);

  /* memcmp may return any non-zero value; map it to a valid exit status. */
  return rc != 0 ? 1 : 0;
}
0
chqrlie On

The data are still in text form because you convert them with sprintf. File extensions are only hints for the user and/or operating system to determine how to handle them, they do not determine the file contents.

Consider writing the binary data instead:

        /* Advance the solution one output step at a time and store each
         * state as raw doubles instead of formatted text. */
        for (t_next = t0 + dt; t_next <= tf; t_next += dt) {
            status = gsl_odeiv2_driver_apply(drv, &t, t_next, s);
            if (status != GSL_SUCCESS) {
                printf("Error: status = %d\n", status);
                break;
            }
            /* One record is 7 doubles: t followed by the 6 state components.
             * fwrite returns the number of items written, so a short count
             * means the record is incomplete. */
            if (fwrite(&t, sizeof t, 1, data) != 1 ||
                fwrite(s, sizeof s[0], 6, data) != 6) {
                printf("Error: write failed\n");
                break;
            }
        }

Note however that you will need to use the same approach when reading the data back from the binary file and the machine on which you read the data must use the same representation and endianness as the one that wrote it. Representation of double values is almost certainly IEEE-754 on all your systems, but endianness may differ.