C++ program needs to much memory to run and crashes. How to avoid this?

77 views Asked by At

I want to find ANY repeating combinations of k taken from n across all vector of vectors of 20 int values each. That vector of vectors is all about 50.000+ in size.

For example this is the SQLite piece of code that finds all repeating combos of 5 across all rows:

    DROP TABLE IF EXISTS unpivot;
CREATE TABLE IF NOT EXISTS unpivot AS
SELECT *
FROM (
        SELECT DrawId, N1 as n_value FROM draws union all
        SELECT DrawId, N2 as n_value FROM draws union all
        SELECT DrawId, N3 as n_value FROM draws union all
        SELECT DrawId, N4 as n_value FROM draws union all
        SELECT DrawId, N5 as n_value FROM draws union all
        SELECT DrawId, N6 as n_value FROM draws union all
        SELECT DrawId, N7 as n_value FROM draws union all
        SELECT DrawId, N8 as n_value FROM draws union all
        SELECT DrawId, N9 as n_value FROM draws union all
        SELECT DrawId, N10 as n_value FROM draws union all
        SELECT DrawId, N11 as n_value FROM draws union all
        SELECT DrawId, N12 as n_value FROM draws union all
        SELECT DrawId, N13 as n_value FROM draws union all
        SELECT DrawId, N14 as n_value FROM draws union all
        SELECT DrawId, N15 as n_value FROM draws union all
        SELECT DrawId, N16 as n_value FROM draws union all
        SELECT DrawId, N17 as n_value FROM draws union all
        SELECT DrawId, N18 as n_value FROM draws union all
        SELECT DrawId, N19 as n_value FROM draws union all
        SELECT DrawId, N20 as n_value FROM draws
     ) as T;
     CREATE TABLE IF NOT EXISTS COMB5 AS SELECT n1, n2, n3, n4, n5, count(*) as total
FROM
    (
    SELECT up1.n_value as n1, up2.n_value as n2, up3.n_value as n3, up4.n_value as n4, up5.n_value as n5
    FROM unpivot up1
    JOIN unpivot up2
      ON up1.DrawId = up2.DrawId
     AND up1.n_value < up2.n_value
     JOIN unpivot up3
      ON up2.DrawId = up3.DrawId
     AND up2.n_value < up3.n_value
    JOIN unpivot up4
        ON up3.DrawId = up4.DrawId
    AND up3.n_value < up4.n_value
    JOIN unpivot up5
        ON up4.DrawId = up5.DrawId
    AND up4.n_value < up5.n_value
   ) T
GROUP BY n1, n2, n3, n4, n5
ORDER BY total desc;

Input sample:

Date,Hour,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15,N16,N17,N18,N19,N20
2020-08-16, 15:00,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
2020-08-15, 22:40,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
2020-08-15, 15:00,18,19,20,23,24,25,26,27,49,50,51,52,53,54,55,56,57,58,59,60
2020-08-14, 22:40,1,2,23,24,25,26,27,47,69,70,71,72,73,74,75,76,77,78,79,80
2020-08-14, 15:00,23,24,25,26,27,140,144,145,146,147,148,92,93,94,95,96,97,98,99,100

Output sample:

n1, n2, n3, n4, n5, total
23, 24, 25, 26, 27, 4

Which means (for this example) that the only combination of 5 numbers that is repeating in two or more rows is: 23,24,25,26,27 and it is found 4 times in total across all table (and this is shown by total column with the value of 4 in this case). It is count only if all those numbers are found all together.

Image showing this more clearly:

enter image description here

I can modify that SQLite working code above to find all repeating combinations of 6 numbers, 7 numbers and so on. And it works fine for each combination size I want, even for those with 15 or 18 numbers on each of them.

The only big problem is that this is very time consuming and it takes a lot of time to complete its job.

That is why I am trying to create a program that is doing the same thing but more faster than these SQLite commands.

So, I did this one below:

#include <fstream>
#include <sstream>
#include <iostream>
#include <vector>
#include <string>
#include <algorithm>
#include <cstring>
#include <unordered_map>
#include <map>
#include <tuple>
#include <set>


using namespace std;


vector<vector<int>> comb(vector<int> &myVec, int K) {
    /* Returning all combinations from a vector as a vector of vectors. */
    vector<vector<int>> all_combos; //vector to store all combos
    vector<int> cmb; //vector to store one single combo!
    std::string bitmask(K, 1); // K leading 1's
    bitmask.resize((int) myVec.size(), 0); // N-K trailing 0's

    // print integers and permute bitmask
    do {
        for (int i = 0; i < (int) myVec.size(); ++i) // [0..N-1] integers
        {
            if (bitmask[i]) {
                cmb.push_back(myVec[i]);
            }
        }//cmb finished to be generated!
        //store cmb.
        all_combos.push_back(cmb);
        cmb.clear();
    } while (std::prev_permutation(bitmask.begin(), bitmask.end()));
    return all_combos;
}

std::string join(const std::vector<int> & sequence, const std::string & separator)
{
    std::string result;
    for(size_t i = 0; i < sequence.size(); ++i)
        result += to_string(sequence[i]) + ((i != sequence.size()-1) ? separator : "");
    return result;
}

vector<vector<int>> extrageri;

int main() {
    std::ifstream inFile("./DrawsDB.csv");
    if (inFile.is_open())
    {
        std::string line;
        while( std::getline(inFile,line) )
        {
            std::stringstream ss(line);

            std::string Date, Hour, N1, N2, N3, N4, N5, N6, N7, N8, N9, N10, N11, N12, N13, N14, N15, N16, N17, N18, N19, N20;
            std::getline(ss,Date,',');
            std::getline(ss,Hour,',');
            std::getline(ss,N1,',');
            std::getline(ss,N2,',');
            std::getline(ss,N3,',');
            std::getline(ss,N4,',');
            std::getline(ss,N5,',');
            std::getline(ss,N6,',');
            std::getline(ss,N7,',');
            std::getline(ss,N8,',');
            std::getline(ss,N9,',');
            std::getline(ss,N10,',');
            std::getline(ss,N11,',');
            std::getline(ss,N12,',');
            std::getline(ss,N13,',');
            std::getline(ss,N14,',');
            std::getline(ss,N15,',');
            std::getline(ss,N16,',');
            std::getline(ss,N17,',');
            std::getline(ss,N18,',');
            std::getline(ss,N19,',');
            std::getline(ss,N20,',');


            vector<int> draw;
            draw.push_back(stoi(N1));
            draw.push_back(stoi(N2));
            draw.push_back(stoi(N3));
            draw.push_back(stoi(N4));
            draw.push_back(stoi(N5));
            draw.push_back(stoi(N6));
            draw.push_back(stoi(N7));
            draw.push_back(stoi(N8));
            draw.push_back(stoi(N9));
            draw.push_back(stoi(N10));
            draw.push_back(stoi(N11));
            draw.push_back(stoi(N12));
            draw.push_back(stoi(N13));
            draw.push_back(stoi(N14));
            draw.push_back(stoi(N15));
            draw.push_back(stoi(N16));
            draw.push_back(stoi(N17));
            draw.push_back(stoi(N18));
            draw.push_back(stoi(N19));
            draw.push_back(stoi(N20));

            extrageri.push_back(draw);

        }
    }
    inFile.close();

    for(int cmb=1; cmb<21; cmb++){
        std::unordered_map<string, set<int>> combos;
        ofstream MyFile("REPEATING_COMBOS_OF_"+ to_string(cmb)  +".csv");
        for(int i=0; i<extrageri.size(); i++){
            vector<vector<int>> temp_draws = std::vector<vector<int>>(extrageri.begin() + i+1, extrageri.end());
            for(auto & temp_draw : temp_draws){
                std::vector<int> v_intersection;
                std::set_intersection(extrageri[i].begin(), extrageri[i].end(),
                                      temp_draw.begin(), temp_draw.end(),
                                      std::back_inserter(v_intersection));

                if(v_intersection.size()==cmb){
                    string stringval = join(v_intersection, ",");
                    combos[stringval].insert(i);
                }else if(v_intersection.size()>cmb){
                    vector<vector<int>> groups = comb(v_intersection, cmb);
                    for(auto & group : groups){
                        string stringval = join(group, ",");
                        combos[stringval].insert(i);
                    }
                }
            }
        }


        for (const auto& keyvaluepair : combos)
        {
            // .first to access key, .second to access value
            MyFile << keyvaluepair.first <<","<<keyvaluepair.second.size()+1<<endl;

        }

        MyFile.close();
    }


    return 0;
}

Which is working very well until it reaches Combinations of 6. Then it crashes. Also, during this time and until it get crashed it uses all the available RAM. The SQLite command works fine on 8GB RAM PC and retrieves even repeating Combinations of 15 or even 18 while this program crashes because not enough RAM memory available even when running it on 16GB RAM PC I have.

When the full amount of RAM is used then it crashes and this error is thrown:

terminate called after throwing an instance of 'std::bad_alloc'
  what():  std::bad_alloc
Abort

I found more info about this bad_alloc: here, here, here, here and here and all of them are saying that this is caused by not enough memory available and not by the program logic itself.

My question is:

What is the problem with my C++ program and why it crashes and why SQLite commands runs very well and not get into "not enough memory" problems while this C++ program does?

Thank you very much in advance!

P.S. For the C++ sample input remove the first row (Date,Hour,N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14,N15,N16,N17,N18,N19,N20) which is the .csv header I posted in order to make it clear which column is which.

Update:

Maybe it is possible to do it using python and pandas?

I tried to do it here:

The very fast way to find repeating combinations in Python using pandas?

//Tags added too.

0

There are 0 answers