Optimize RAM usage of dictionary

600 views Asked by At

I can use the dictionary to find the frequency of an element in a string array. But when the number of elements in an array reaches around 50 million, the RAM usage for my program is around 8-9 GB. This is way too high compared to what I expected. My dictionary is a Dictionary<string, int>, 50 million key-value pairs (if there is no duplicate key) will only cost me around 2 - 2.5GB. I wonder where I was wrong.

The part to get the frequency of elements:

public IEnumerable<string> GetTopTenStrings(string path)
{
    // Dictionary to store the result
    Dictionary<string, int> result = new Dictionary<string, int>();

    var txtFiles = Directory.EnumerateFiles(Path, "*.dat");

    int i = 1;

    foreach (string currentFile in txtFiles)
    {
        using (FileStream fs = File.Open(currentFile, FileMode.Open,
            FileAccess.Read, FileShare.Read))
        using (BufferedStream bs = new BufferedStream(fs))
        using (StreamReader buffer = new StreamReader(bs))
        {
            Console.WriteLine("Now we are at the file {0}", i);

            // Store processed lines                        
            string storage = buffer.ReadToEnd();

            Process(result, storage);  

            i++;
        }
    }

    // Sort the dictionary and return the needed values
    var final = result.OrderByDescending(x => x.Value).Take(10);

    foreach (var element in final)
    {
        Console.WriteLine("{0} appears {1}", element.Key, element.Value);
    }

    var test = final.Select(x => x.Key).ToList();

    Console.WriteLine('\n');

    return test;
}

The function to add key value to the dictionary:

public void Process(Dictionary<string, int> dict, string storage)
{
    List<string>lines = new List<string>();

    string[] line = storage.Split(";");

    foreach (var item in line.ToList())
    {
        if(item.Trim().Length != 0)
        {
            if (dict.ContainsKey(item.ToLower()))
            {
                dict[item.ToLower()]++;
            }
            else
            {
                dict.Add(item.ToLower(), 1);
            }
        }
    }
}
1

There are 1 answers

7
Charlieface On

There are significant performance and memory improvements possible here:

  • We can improve memory usage by holding the whole file in one big string, and using ReadOnlyMemory<char> to hold references to it.
  • Pre-initialize the dictionary to some large size, so that resizes are not necessary.
  • Instead of .Trim() use string.IsNullOrWhiteSpace or similar.
  • Instead of .ToLower() use a case-insensitive comparer on the dictionary.
    If you were using strings you can use StringComparer, but we need a custom comparer for ReadOnlyMemory<char>.
  • Don't use .ToList() unnecessarily.
  • BufferedStream seems unnecessary.
  • Close the file before processing the string.
  • We can use CollectionsMarshal.GetValueRefOrAddDefault to avoid a double lookup on the dictionary.
  • Instead of using OrderByDescending.Take, which would require sorting the entire list, we can use an insertion sort on a fixed sized list.
    • We just binary-search each value's location, and then insert at the new location.
    • Binary search returns the two's-complement (negation) of the index if the value isn't found.

This all assumes that the number of duplicate strings isn't very high. If the number of duplicates is high then it would make more sense to split each string and Intern the results using a StringPool.

public IEnumerable<string> GetTopTenStrings(string path)
{
    // Dictionary to store the result
    var result = new Dictionary<ReadOnlyMemory<char>, int>(someLargeCapacityHere, new MemoryComparer());

    int i = 1;

    foreach (string currentFile in Directory.EnumerateFiles(Path, "*.dat"))
    {
        string str;
        using (FileStream fs = File.Open(currentFile, FileMode.Open,
            FileAccess.Read, FileShare.Read))
        using (StreamReader sr = new StreamReader(fs))
        {
            Console.WriteLine($"Now we are at the file {i}: {currentFile}");
            str = sr.ReadToEnd();

            i++;
        }
        Process(result, str);
    }

    // Sort the dictionary and return the needed values
    var final = TopByOrderDesc(result, 10);

    foreach (var element in final)
    {
        Console.WriteLine("{0} appears {1}", element.Key, element.Value);
    }

    var test = final.Select(x => x.Key.ToString()).ToList();

    Console.WriteLine('\n');

    return test;
}
public void Process(Dictionary<ReadOnlyMemory<char>, int> dict, string str)
{
    var startIndex = 0;
    while (true)
    {
        // search for separator
        var endIndex = str.IndexOf(';', startIndex);
        if (endIndex <= 0)   // not found
            endIndex = str.Length;   // go til the end of string
        if (endIndex - startIndex > 0)    // if non-zero
        {
            var mem = str.AsMemory(startIndex, endIndex - startIndex);
            if (!MemoryExtensions.IsWhiteSpace(mem.Span))    // and not whitespace
            {
                ref var val = ref CollectionsMarshal.GetValueRefOrAddDefault(dict, mem, out _);  // get ref of KVP location in dictionary
                val++;    // increment location by 1
            }
        }
        if (endIndex == str.Length)    // finished string
            break;
        startIndex = endIndex + 1;    // otherwise move to next char
    }
}
public List<KeyValuePair<ReadOnlyMemory<char>, int>> TopByOrderDesc(Dictionary<ReadOnlyMemory<char>, int> source, int top)
{
    var list = new List<KeyValuePair<ReadOnlyMemory<char>, int>>(top + 1);  //pre-initialize
    var comparer = Comparer<KeyValuePair<ReadOnlyMemory<char>, int>>.Create(
        (kvp1, kvp2) => kvp2.Value.CompareTo(kvp1.Value)
    );    // !!! Reverse comparer !!!

    foreach (var item in source)
    {
        var index = list.BinarySearch(item, comparer);
        if (index < 0)
            index = ~index;

        if (index < top) // no point inserting last one
        {
            if (list.Count == top)
                list.RemoveAt(top - 1);

            list.InsertAt(index, item);
        }
    }
    return list;
}
class MemoryComparer : IEqualityComparer<ReadOnlyMemory<char>>
{
    public StringComparison Comparison {get; set;} = StringComparison.OrdinalIgnoreCase;

    public bool Equals(ReadOnlyMemory<char> a, ReadOnlyMemory<char> b) =>
        MemoryExtensions.Equals(b.Span, a.Span, Comparison);

    public int GetHashCode(ReadOnlyMemory<char> o) =>
        string.GetHashCode(o.Span, Comparison);
}